// Session setup for the table/figure programs defined below.
// Reset the Stata session before the programs are (re)defined.
clear all
// Option bundles appended to esttab calls as $opts (the table programs below use $opts;
// opts1 and opts4 are not referenced in the visible part of this file).
// The three bundles are identical except for the numeric display format:
// %9.3f (opts), %9.1f (opts1) and %9.4f (opts4).
global opts  a f plain coll(none) nodep nomti c(b(star fmt(%9.3f)) se(abs par fmt(%9.3f))) star(* .10 ** .05 *** .01) noobs
global opts1 a f plain coll(none) nodep nomti c(b(star fmt(%9.1f)) se(abs par fmt(%9.1f))) star(* .10 ** .05 *** .01) noobs
global opts4 a f plain coll(none) nodep nomti c(b(star fmt(%9.4f)) se(abs par fmt(%9.4f))) star(* .10 ** .05 *** .01) noobs
// Graph scheme used by every figure program.
set scheme s1mono
// Turn off output paging; NOTE: perm also stores this preference for future Stata sessions.
set more off, perm
// Placeholder directories: each program does cd "$data" before loading .dta files
// and cd "$paper" before writing .tex/.pdf output. Replace with real paths before running.
global data 	"DATA DIRECTORY"
global paper 	"PAPER DIRECTORY"

	// Summary statistics table (writes t_means.tex in $paper).
	// Column 1 uses every observation in retaking_splines.dta; columns 2-5 are built by
	// duplicating the relevant observations with -expand- and tagging each copy in `col'.
	program define t_means 
	{

	cd "$data"
	use retaking_splines.dta, clear

	// Subgroup flags. lowscore is 1 when either multiple of 100 bracketing mvw lies in [700,1500].
	generate byte all = 1
	generate int threshold0 = 100*(int(mvw/100))
	generate int threshold1 = 100*(int(mvw/100)+1)
	generate byte lowscore = inrange(threshold0,700,1500)|inrange(threshold1,700,1500)
	drop threshold*
	generate byte lowinc = inrange(income,5,45)
	generate byte urm = inlist(race,1,3,4,5,6)

	// Stack the column samples: copy0 = months>=7 sample, then copies of that sample
	// restricted to low scorers (copy1), low income (copy2) and URM (copy3).
	expand 2 if (months>=7), generate(copy0)
	expand 2 if lowscore&(copy0==1), generate(copy1)
	expand 2 if lowinc&(copy0==1)&(copy1==0), generate(copy2)
	expand 2 if urm&(copy0==1)&(copy1==0)&(copy2==0), generate(copy3)
	// The most specific copy flag wins, exactly as in a sequence of replace statements.
	generate byte col = cond(copy3,5,cond(copy2,4,cond(copy1,3,cond(copy0,2,1))))
	drop copy*

	generate white = (race==7)
	generate asian = (race==2)
	replace inc_base = inc_base/1000

	// Row labels for the table come from the variable labels.
	label variable white "White"
	label variable urm "URM"
	label variable asian "Asian"
	label variable lowinc "Low income"
	label variable coll4 "Four year college"
	label variable coll2 "Two year college"
	label variable gradrate "Colleges graduation rate"
	label variable inc_base "Colleges mean earnings (000s)"
	label variable months "Months available to retake"

	// One matrix per panel: rows are variables, columns are the five samples.
	// Each matrix is seeded with a dummy row/column that is sliced off at the end.
	local panel_a female white urm asian lowinc feewaiver
	local panel_b mvw maxmvw retook takes months
	local panel_c coll4 coll2 gradrate inc_base
	foreach p in a b c {
		matrix t_means`p' = (0,0,0,0,0,0)
		foreach v of varlist `panel_`p'' {
			matrix `v' = (0)
			matrix rownames `v' = "`: variable label `v''"
			forvalues c = 1/5 {
				quietly summarize `v' if col==`c'
				matrix `v'`c' = (r(mean))
				matrix `v' = (`v',`v'`c')
			}
			matrix t_means`p' = (t_means`p' \ `v')
		}
		matrix t_means`p' = t_means`p'[2...,2...]
	}

	// Sample sizes per column, counted on non-missing mvw.
	matrix t_meansN = (0,0,0,0,0,0)
	matrix N = (0)
	matrix rownames N = "N"
	forvalues c = 1/5 {
		quietly summarize mvw if col==`c'
		matrix N`c' = (r(N))
		matrix N = (N,N`c')
	}
	matrix t_meansN = (t_meansN \ N)
	matrix t_meansN = t_meansN[2...,2...]

	// Assemble the LaTeX table: hand-written header/footers, esttab fragments for each panel.
	cd "$paper"
	local fragment append fragment plain collabels(none) nodepvars nomtitles

	file open  t	using t_means.tex, write replace
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Summary Statistics}" _n "\label{t_means}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{5}{c}}" _n "\midrule" _n ///
					"&			&\multicolumn{4}{c}{Regression discontinuity sample}\\" ///
					"\cmidrule{3-6}" _n ///
					"&All		&All		&Low	&Low&\\" ///
					"&students	&students	&scoring&income&URM\\" ///
					"&(1)&(2)&(3)&(4)&(5)\\" _n ///
					"\midrule" _n ///
					"(A) Demographics\\" _n "\cmidrule{1-1}" _n
	file close t
	esttab matrix(t_meansa, fmt(2 2 2 2 2 2 2 2)) using t_means.tex, `fragment'

	file open  t	using t_means.tex, write append
	file write t	"\cmidrule{1-1}" _n "(B) SAT-taking\\" _n "\cmidrule{1-1}" _n
	file close t
	esttab matrix(t_meansb, fmt(2 2 2 2 2)) using t_means.tex, `fragment'

	file open  t	using t_means.tex, write append
	file write t	"\cmidrule{1-1}" _n "(C) College enrollment\\" _n "\cmidrule{1-1}" _n
	file close t
	esttab matrix(t_meansc, fmt(2 2 2 2)) using t_means.tex, `fragment'

	file open  t	using t_means.tex, write append
	file write t 	"\\" _n
	file close t
	esttab matrix(t_meansN, fmt(%11.0fc %11.0fc %11.0fc)) using t_means.tex, `fragment'

	file open  t 	using t_means.tex, write append
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
					"\footnotesize Notes: Listed above are mean values of key variables, with standard deviations of select variables in parentheses. " ///
					"Column 1 consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores. " ///
					"Columns 2-5 limit that sample to students who took their first SAT by November of senior year. " ///
					"Columns 3-5 respectively include students whose first score places them nearest to a threshold below 1600, those with family income below 50,000, and underrepresented minority students." ///
					"\end{tabular*}" _n "\end{table}" _n 
	file close t
	// (stored estimates are intentionally not cleared here)
	}
	end	

	// Determinants of retaking (writes t_retake.tex in $paper).
	// Seven OLS regressions of the retake indicator on progressively richer covariate sets,
	// all with cohort fixed effects and standard errors clustered by high school.
	program define t_retake
	{
	cd "$data"
	use retaking_splines.dta if (mvw!=.), clear
	
	// Covariates. The reference groups are the categories without an indicator below.
	// NOTE(review): income/race/education codes are taken as given; confirm against the codebook.
	g byte asian = race==2
	g byte urm = inlist(race,1,3,4,5,6)
	g byte other = inlist(race,0,8)
	g byte lowinc = inrange(income,5,45)
	g byte medinc = inrange(income,50,125)
	g byte missinc = (income==0)
	
	// Parental education indicators are built but not used by the regressions below.
	g byte momba = (motheduc>=7)
	g byte dadba = (fatheduc>=7)
	g byte missmom = (motheduc==0)
	g byte missdad = (fatheduc==0)
	// Rescale the first score to hundreds of points and treat a missing fee waiver as "no waiver".
	replace mvw = mvw/100
	replace feewaiver = 0 if feewaiver==.
	
	// Labels double as row names because esttab is called with the label option.
	label var urm "URM"
	label var asian "Asian"
	label var lowinc "Low income"
	label var medinc "Middle income"
	label var momba "Mother has BA"
	label var dadba "Father has BA"
	label var mvw "First score (100s)"
	label var months "Months to retake"
		
	// Each model is stored as estN and carries its R-squared as scalar R for the table footer.
	regress retook lowinc medinc missinc i.cohort, cluster(highschool)
	eststo est1
	estadd scalar R = e(r2)

	regress retook urm asian other i.cohort, cluster(highschool)
	eststo est2
	estadd scalar R = e(r2)

	regress retook female i.cohort, cluster(highschool)
	eststo est3
	estadd scalar R = e(r2)

	regress retook lowinc medinc missinc urm asian other female i.cohort, cluster(highschool)
	eststo est4
	estadd scalar R = e(r2)
	
	regress retook lowinc medinc missinc urm asian other female mvw i.cohort, cluster(highschool)
	eststo est5
	estadd scalar R = e(r2)
	
	regress retook lowinc medinc missinc urm asian other female mvw feewaiver i.cohort, cluster(highschool)
	eststo est6
	estadd scalar R = e(r2)
	
	regress retook lowinc medinc missinc urm asian other female mvw feewaiver months i.cohort, cluster(highschool)
	eststo est7
	estadd scalar R = e(r2)	
	
	cd "$paper"
	file open  t	using t_retake.tex, replace write
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Predictors of SAT Retaking}" _n "\label{t_retake}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{7}{c}}" _n "\midrule" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)\\" _n ///
					"\midrule" _n
	file close t
	// keep() must list coefficient names that exist in at least one model: estout stops with
	// "coefficient ... not found" otherwise (unless relax is given). The previous list named
	// mvw2 and monthstoretake, which no model contains; the months regressor is called months.
	esttab est1 est2 est3 est4 est5 est6 est7  using t_retake.tex, l keep(lowinc medinc urm asian female mvw feewaiver months) s(R, l("" "R$^2$")f(2) lay(`""' @)) $opts
	file open  t 	using t_retake.tex, append write
	// NOTE(review): the observation count in the notes is hard-coded, not taken from e(N).
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
			"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
			"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by high school are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
			"Each column regresses an indicator for retaking the SAT on the variables shown. " ///
			"Months to retake represents the number of months between a student's first SAT take and June of senior year. " ///
			"All regressions include cohort fixed effects, as well as indicators for missing income or race, so that high income and White students are the reference groups. " /// 
			"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores. " ///
			"Each regression uses 13,656,612 observations." ///
			"\end{tabular*}" _n "\end{table}" _n 
	file close t
	estimates clear		
		
	}
	end

	// Raw first stage graph (writes f_rawfs.pdf in $paper):
	// retake rate by first score on the left axis, share of observations at each score on the right axis.
	program define f_rawfs
	{
	cd "$data"
	use retook mvw months using retaking_splines.dta if (mvw!=.)&(months>=7), clear

	// One row per score: mean of retook and the fraction of the sample at that score.
	generate byte count = 1
	collapse (mean) retook (sum) count, by(mvw)
	egen N = total(count)
	replace count = count/N

	cd "$paper"
	twoway scatter retook mvw, ///
			ylabel(0(0.1)0.7) msize(tiny) mcolor(black) mfcolor(black) legend(off) msymbol(O) connect(i) ///
			xlabel(600(100)2400, labsize(vsmall)) ///
			xline(600(100)2400, lwidth(vvthin) lstyle(dot)) ///
			xtitle(, size(medsmall) height(5)) ///
			ytitle("Retook SAT", size(medsmall) height(4)) ///
		|| scatter count mvw, ///
			msymbol(none) connect(l) lpattern(dash) lwidth(thin) yaxis(2) ///
			ytitle("Density", size(medsmall) axis(2)) ///
			ylabel(0(0.05)0.05, axis(2))
	graph export f_rawfs.pdf, replace
	}
	end

	// Stacked first stage graph (writes f_fs.pdf in $paper):
	// mean retake rate at each value of distance, plotted for distance in [-50,40].
	program define f_fs
	{
	cd "$data"
	use retook distance months using retaking_stacked.dta if (distance!=.)&(months>=7), clear
	collapse (mean) retook, by(distance)

	cd "$paper"
	twoway scatter retook distance if inrange(distance,-50,40), ///
			ylabel(0.59 0.60, format(%4.2f)) mcolor(black) mfcolor(black) legend(off) ///
			xlabel(-50(10)40, labsize(small)) ///
			xline(0, lwidth(thin) lstyle(dot)) ///
			xtitle("Distance of first SAT score to nearest multiple of 100", size(medsmall) height(5)) ///
			ytitle("Retook SAT", size(medsmall) height(4))
	graph export f_fs.pdf, replace
	}
	end

	// Density of first scores (writes f_density.pdf in $paper).
	program define f_density
	{

	cd "$data"
	use mvw months using retaking_splines.dta if (mvw!=.)&(months>=7), clear

	// Shift scores down by 5 before binning in widths of 10; the top score (2400, now 2395)
	// is moved to 2405 instead.
	replace mvw = mvw-5
	replace mvw = 2405 if mvw==2395

	histogram mvw, width(10) fcolor(white) lwidth(vvthin) ylabel(minmax) ///
			xlabel(600(100)2400, labsize(vsmall)) ///
			xline(600(100)2400, lwidth(vthin) lstyle(dot)) ///
			xtitle("First SAT score", height(5) size(medsmall))

	cd "$paper"
	graph export f_density.pdf, replace
	}
	end
	
	// Covariate balance table (writes t_cov.tex in $paper).
	// For each bandwidth (40, 60, 80, 100) and each covariate in $y, runs an IV regression of
	// the covariate on retook (instrumented by below) with threshold-specific distance slopes,
	// absorbing threshold and cohort-by-months cells and clustering on the first score.
	program define t_cov
	{
	cd "$data"
	use retook below *distance cohort months threshold mvw income race *educ female using retaking_stacked.dta if (months>=7), clear
	// Outcome list; the variables themselves are created just below.
	global y lowinc urm anyba female
	
	g byte lowinc = inrange(income,5,45)
	g byte urm = inlist(race,1,3,4,5,6)
	g byte momba = (motheduc>=7)
	g byte dadba = (fatheduc>=7)
	g byte anyba = momba|dadba
	
	// dc indexes cohort-by-months cells (absorbed as a fixed effect).
	egen int dc = group(cohort months)
	// Threshold-specific running-variable terms, matched later by the pattern t*00_*.
	forval x=700(100)2300 {
		g byte t`x'_distance		= (threshold==`x')*distance
		g byte t`x'_below_distance	= (threshold==`x')*below_distance
	}
	
	forval x=40(20)100 {
		preserve
		keep if inrange(distance,-1*`x',`x'-10)
		// First call caches the residualized variables that the cache(use) calls reuse.
		reghdfe $y t*00_* retook below, a(threshold dc) vce(cluster mvw) cache(save)
		foreach y of varlist $y {
			// Store the model only when the IV fit succeeded. Storing unconditionally after a
			// captured failure would file whatever is still in e() under this outcome's name,
			// and the table would silently report the wrong model.
			capture reghdfe `y' t*00_* (retook = below), a(threshold dc) vce(cluster mvw) cache(use)
			local rc = _rc
			if `rc'==0 {
				est sto `y'_`x'
			}
			else {
				di as error "t_cov: reghdfe failed for `y' at bandwidth `x' (rc=`rc'); no estimate stored"
			}
		}
		restore
	}

	cd "$paper"	
	file open  t	using t_cov.tex, replace write
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Covariate Balance}" _n "\label{t_cov}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{4}{c}}" _n "\midrule" _n ///
					"&Low	&	&Mom or dad	&\\" _n ///
					"&income&URM	&has B.A.	&Female\\" _n ///
					"&(1)&(2)&(3)&(4)\\" _n ///
					"\midrule" _n 
	file close t	
	// One table row per bandwidth; the wildcard picks the stored models whose names end in that bandwidth.
	esttab *40 using t_cov.tex, k(retook) coef(retook "Bandwidth = 40") s(, lay(`""')) $opts  
	esttab *60 using t_cov.tex, k(retook) coef(retook "Bandwidth = 60") s(, lay(`""')) $opts  
	esttab *80 using t_cov.tex, k(retook) coef(retook "Bandwidth = 80") s(, lay(`""')) $opts  
	esttab *00 using t_cov.tex, k(retook) coef(retook "Bandwidth = 100") s(,) $opts  
	file open  t 	using t_cov.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year." ///
					"\end{tabular*}" _n "\end{table}" _n 
	file close t
	estimates clear	
	
	}
	end
	
	// Covariate balance figures (writes f_cov.pdf in $paper).
	// For every threshold 1000-2300 and every covariate in $y, estimates the IV coefficient on
	// retook within that threshold's subsample and plots the coefficients in a 2x2 panel.
	program define f_cov
	{
	
	cd "$data"
	use retook below *distance cohort months threshold mvw income race *educ female using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	global y lowinc urm anyba female
	
	generate byte all = 1
	generate byte lowinc = inrange(income,5,45)
	generate byte urm = inlist(race,1,3,4,5,6)
	generate byte momba = (motheduc>=7)
	generate byte dadba = (fatheduc>=7)
	generate byte anyba = momba|dadba
			
	egen dc = group(cohort months)
	forvalues thr = 1000(100)2300 {
		// A threshold-named copy of retook, labelled with the threshold, so each model's
		// coefficient gets its own position and axis label in coefplot.
		generate byte retook_`thr' = retook
		label variable retook_`thr' "`thr'"
		foreach y of varlist $y {
			preserve
			keep if threshold==`thr'
			reghdfe `y' *distance (retook_`thr' = below), absorb(threshold dc) vce(cluster mvw)
			estimates store `y'`thr'
			restore
		}
	}
	
	cd "$paper"	
	// Panel-specific y-axis title and temporary graph file for each covariate.
	local ytitle_lowinc "Low income"
	local ytitle_urm "URM"
	local ytitle_anyba "Mom or dad has B.A."
	local ytitle_female "Female"
	local gph_lowinc a
	local gph_urm b
	local gph_anyba c
	local gph_female d
	foreach y in lowinc urm anyba female {
		// Stored model names for this covariate, in threshold order.
		local models
		forvalues thr = 1000(100)2300 {
			local models `models' `y'`thr'
		}
		coefplot `models', ///
				keep(retook_*) vertical legend(off) nooffset msize(small)  ///
				xtitle("Threshold near first SAT score", height(4)) ///
				ytitle("`ytitle_`y''", height(4)) ///
				coeflabels(,labsize(vsmall)) yline(0) ylabel(-1 0 1) ///
				mcolor(gs0) msymbol(O) ciopts(lcolor(gs0) lwidth(vthin)) ///
				saving(`gph_`y''.gph, replace)
	}
	graph combine a.gph b.gph c.gph d.gph, rows(2)
	graph export f_cov.pdf, replace	
	// Remove the temporary panel files.
	foreach panel in a b c d {
		erase `panel'.gph
	}
	estimates clear
	}
	end

	// First stage table (writes t_fs.tex in $paper).
	// For the full sample and six subgroups: first stage (retook on below), reduced form
	// (takes on below), IV (takes on retook instrumented by below), plus complier and mean
	// characteristics attached to the IV model as scalars.
	program define t_fs
	{
	cd "$data"
	use retook takes below *distance months cohort threshold mvw income race female using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	
	// dc indexes cohort-by-months cells (absorbed as a fixed effect).
	egen int dc = group(cohort months)
	// Threshold-specific running-variable terms, matched later by the pattern t*00_*.
	forval x=700(100)2300 {
		g byte t`x'_distance		= (threshold==`x')*distance
		g byte t`x'_below_distance	= (threshold==`x')*below_distance
	}

	// Subgroup indicators, one per table column.
	// NOTE(review): highinc selects income==130 only, while the table notes say "above 100,000";
	// confirm against the income coding.
	g byte all = 1	
	g byte lowscore = inrange(threshold,700,1500)
	g byte highscore = inrange(threshold,1600,2300)
	g byte lowinc = inrange(income,5,45)
	g byte highinc = inrange(income,130,130)
	g byte nonurm = inlist(race,2,7)
	g byte urm = inlist(race,1,3,4,5,6)

	// x_Y0 = x*(1-retook): outcome of the IV regressions used for the complier characteristics.
	g byte oneminusretook = 1-retook
	foreach y of varlist mvw lowinc urm female {
		g `y'_Y0 = `y'*oneminusretook
	}	

	foreach g of varlist all lowscore highscore lowinc highinc urm nonurm {	
		preserve
		keep if `g'==1	
		// First call caches the residualized variables that the cache(use) calls reuse.
		reghdfe retook takes below t*00_* oneminusretook *_Y0, a(threshold dc) vce(cluster mvw) cache(save)
		reghdfe retook 		below t*00_*		, a(threshold dc) vce(cluster mvw) cache(use)
		est sto a_`g'
		reghdfe takes 		below t*00_*		, a(threshold dc) vce(cluster mvw) cache(use)
		est sto b_`g'
		foreach x in mvw lowinc urm female {
			// Clear the local first and fill it only after a successful fit. Otherwise a failed
			// (captured) regression would leave _b[oneminusretook] from the previous outcome, or
			// the local from the previous subgroup, and that stale number would be tabulated.
			local ccm_`x'
			cap reghdfe `x'_Y0 t*00_* (oneminusretook = below), a(threshold dc) vce(cluster mvw) cache(use)
			local rc = _rc
			if `rc'==0 {
				local ccm_`x' = _b[oneminusretook]
			}
			else {
				di as error "t_fs: complier regression failed for `x' in subgroup `g' (rc=`rc'); statistic left blank"
			}
			sum `x'
			local mean_`x' = r(mean)
		}
		reghdfe takes t*00_* (retook = below)		, a(threshold dc) vce(cluster mvw) ffirst cache(use)
		est sto c_`g'
		estadd scalar fstat = e(widstat)
		// A cleared ccm local makes the matching estadd fail; capture then simply leaves that
		// statistic out of the model, so the table cell stays empty.
		cap estadd scalar ccm_mvw = `ccm_mvw'
		cap estadd scalar ccm_low = `ccm_lowinc'
		cap estadd scalar ccm_urm  = `ccm_urm'
		cap estadd scalar ccm_fem = `ccm_female'
		cap estadd scalar mean_mvw = `mean_mvw'
		cap estadd scalar mean_low = `mean_lowinc'
		cap estadd scalar mean_urm  = `mean_urm'
		cap estadd scalar mean_fem = `mean_female'

		restore
	}
		
	cd "$paper"	
	file open  t	using t_fs.tex, replace write
	file write t	"\begin{table}[htbp!] \centering \small" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Round-Number Thresholds and SAT Retaking}" _n "\label{t_fs}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{7}{c}}" _n "\midrule" _n ///
					"&All		&Low	&High	 	&Low	&High	&	&	\\" _n ///
					"&students	&scoring&scoring	&income	&income	&URM	&Non-URM\\" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)\\" _n ///
					"\midrule" _n 
	file close t	
	// Rows: a_* = first stage, b_* = reduced form, c_* = IV (with first-stage F).
	esttab a* using t_fs.tex, k(below) coef(below "Retook (FS)") s(, lay(`""')) $opts  
	esttab b* using t_fs.tex, k(below) coef(below "Takes (RF)") s(, lay(`""')) $opts  
	esttab c* using t_fs.tex, k(retook) coef(retook "Takes (IV)") s(fstat, l("" "First stage F-statistic") f(1) lay(`""' @)) $opts 
	file open  t 	using t_fs.tex, append write
	file write t	"\cmidrule{1-1}" "Complier characteristics\\ " _n "\cmidrule{1-1}" 
	file close t
	// drop(*) suppresses all coefficients so only the added scalars are printed.
	esttab c* using t_fs.tex, drop(*) s(ccm_mvw ccm_low ccm_urm ccm_fem, l("First SAT score" "Low income" "URM" "Female") f(%4.0f 2 2 2) lay(@ @ @ @)) $opts
	file open  t 	using t_fs.tex, append write
	file write t	"\cmidrule{1-1}" "Mean characteristics\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab c* using t_fs.tex, drop(*) s(mean_mvw mean_low mean_urm mean_fem N, l("First SAT score" "Low income" "URM" "Female" "" "N") f(%4.0f 2 2 2 %11.0fc) lay(@ @ @ @ `""' @)) $opts
	file open  t 	using t_fs.tex, append write
	// Two text fixes in the notes: removed the stray ", SAT retaking" fragment in the
	// third-row sentence, and added the missing space after "(White or Asian)." so it no
	// longer runs into the next sentence.
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient in the first two rows is an estimate of the impact of scoring below a multiple of 100 on retaking behavior. " ///
					"Each coefficient in the third row is an instrumental variables estimate of the impact of initial retaking on the total number of retakes, where initial retaking is instrumented with an indicator for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Columns 2 and 3 split the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Columns 4 and 5 split the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Columns 6 and 7 split the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian). " ///
					"The bottom panels show complier characteristics and mean characteristics for each subgroup. " ///
					"\end{tabular*}" _n "\end{table}" _n 
	file close t
	estimates clear	
	}
	end

	// First stage coefficients graph (writes f_fscoeffs.pdf in $paper):
	// one pooled regression with a separate below-threshold indicator per threshold,
	// then a coefficient plot of those indicators.
	program define f_fscoeffs
	{
	
	cd "$data"
	use retook below *distance cohort months threshold mvw using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	
	egen dc = group(cohort months)
	forvalues thr = 700(100)2300 {
		// Threshold-specific discontinuity indicator, labelled with the threshold for the x axis.
		generate byte below_`thr'		= (threshold==`thr')*below
		label variable below_`thr' "`thr'"
		// Threshold-specific running-variable terms.
		generate byte t`thr'_distance		= (threshold==`thr')*distance
		generate byte t`thr'_below_distance	= (threshold==`thr')*below_distance
	}
	
	reghdfe retook below_* t*_distance, absorb(threshold dc) vce(cluster mvw)
	estimates store r

	cd "$paper"	
	// below_* also matches below_distance, so that term is dropped from the plot explicitly.
	coefplot r, ///
			keep(below_*) drop(below_distance) vertical legend(off) nooffset msize(vsmall)  ///
			ytitle("Estimated discontinuity in retake rates", height(5)) ///
			coeflabels(,labsize(vsmall)) yline(0) ylabel(0 0.12) ///
			mcolor(gs0) msymbol(O) ciopts(lcolor(gs0) lwidth(vthin))
	graph export f_fscoeffs.pdf, replace	
	}
	end

	// Maximum SAT scores table (writes t_max.tex in $paper).
	// IV estimates of retook (instrumented by below) on ten score outcomes, for the full
	// sample and six subgroups; one table row per sample.
	program define t_max
	{
	cd "$data"
	use mvw m v w gain* max* retook below *distance cohort months threshold mvw income race using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	global y gainmvw max2mvw maxmvw up150 upm upv upw up1 up2 up3

	// Outcomes built from the section gains.
	// NOTE(review): the comparisons below evaluate to 1 when a gain or maxmvw is missing
	// (Stata treats missing as larger than any number); confirm these are never missing here.
	generate gainmvw = gainm+gainv+gainw
	generate max2mvw = mvw + max(gainm,0) + max(gainv,0) + max(gainw,0)
	generate byte up150 = (maxmvw-mvw)>=150
	generate byte upm = (gainm>0)
	generate byte upv = (gainv>0)
	generate byte upw = (gainw>0)
	generate byte up1 = (upm+upv+upw)>=1
	generate byte up2 = (upm+upv+upw)>=2
	generate byte up3 = (upm+upv+upw)>=3

	egen int dc = group(cohort months)
	// Threshold-specific running-variable terms, matched later by the pattern t*00_*.
	forvalues thr = 700(100)2300 {
		generate byte t`thr'_distance		= (threshold==`thr')*distance
		generate byte t`thr'_below_distance	= (threshold==`thr')*below_distance
	}
	
	// Subgroup indicators, one per table row.
	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist all lowscore highscore lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// First call caches the residualized variables that the cache(use) calls reuse.
		reghdfe $y t*00_* retook below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach outcome of varlist $y {
			reghdfe `outcome' t*00_* (retook = below)		, absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `grp'_`outcome'
		}
		restore
	}

	cd "$paper"	
	file open  t	using t_max.tex, write replace
	file write t	"\begin{sidewaystable}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Retaking and SAT Scores}" _n "\label{t_max}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{10}{c}}" _n "\midrule" _n ///
					"&Mean gain,	&Superscore gain&Superscore	&150+ point	&\multicolumn{3}{c}{Raised score in}&\multicolumn{3}{c}{Raised score in at least}\\" _n ///
					"&1st retake	&by 2nd take	&by last take	&increase	&Math&Reading&Writing	&1 section&2 sections&3 sections\\" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)&(8)&(9)&(10)\\" _n ///
					"\midrule" _n ///
					"(A) All students\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab all* using t_max.tex, keep(retook) coeflabels(retook "All") stats(,) $opts

	file open  t 	using t_max.tex, write append
	file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab lowsc* using t_max.tex, keep(retook) coeflabels(retook "Lower scoring") stats(, layout(`""')) $opts
	esttab highsc* using t_max.tex, keep(retook) coeflabels(retook "Higher scoring") stats(,) $opts

	file open  t 	using t_max.tex, write append
	file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab lowinc* using t_max.tex, keep(retook) coeflabels(retook "Low income") stats(, layout(`""')) $opts
	esttab highinc* using t_max.tex, keep(retook) coeflabels(retook "High income") stats(,) $opts

	file open  t 	using t_max.tex, write append
	file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab urm* using t_max.tex, keep(retook) coeflabels(retook "URM") stats(, layout(`""')) $opts
	esttab nonurm* using t_max.tex, keep(retook) coeflabels(retook "Non-URM") stats(,) $opts

	file open  t 	using t_max.tex, write append
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{8.4in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Panel C splits the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{sidewaystable}" _n 
	file close t
	estimates clear	
	
	}
	end

	// Maximum SAT scores, reduced form (writes t_maxrf.tex in $paper).
	// Regresses four score outcomes directly on below (no instrumenting), for the full
	// sample and six subgroups; one table row per sample.
	program define t_maxrf
	{
	cd "$data"
	use mvw m v w gain* max* below *distance cohort months threshold mvw income race using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	global y gainmvw max2mvw maxmvw up150

	generate gainmvw = gainm+gainv+gainw
	generate max2mvw = mvw + max(gainm,0) + max(gainv,0) + max(gainw,0)
	generate byte up150 = (maxmvw-mvw)>=150

	egen int dc = group(cohort months)
	// Threshold-specific running-variable terms, matched later by the pattern t*00_*.
	forvalues thr = 700(100)2300 {
		generate byte t`thr'_distance		= (threshold==`thr')*distance
		generate byte t`thr'_below_distance	= (threshold==`thr')*below_distance
	}
	
	// Subgroup indicators, one per table row.
	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist all lowscore highscore lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// Both calls pass reghdfe's old option, as in the cached first call.
		reghdfe $y t*00_* below, absorb(threshold dc) vce(cluster mvw) cache(save) old
		foreach outcome of varlist $y {
			reghdfe `outcome' t*00_* below, absorb(threshold dc) vce(cluster mvw) cache(use) old
			estimates store `grp'_`outcome'
		}
		restore
	}

	cd "$paper"	
	file open  t	using t_maxrf.tex, write replace
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Missing Round Number Thresholds and SAT Scores (Reduced Form)}" _n "\label{t_maxrf}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{4}{c}}" _n "\midrule" _n ///
					"&Mean gain,	&Superscore gain&Superscore		&150+ point	\\" _n ///
					"&1st retake	&by 2nd take	&by last take	&increase	\\" _n ///
					"&(1)&(2)&(3)&(4)\\" _n ///
					"\midrule" _n ///
					"(A) All students\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab all* using t_maxrf.tex, keep(below) coeflabels(below "All") stats(,) $opts

	file open  t 	using t_maxrf.tex, write append
	file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab lowsc* using t_maxrf.tex, keep(below) coeflabels(below "Lower scoring") stats(, layout(`""')) $opts
	esttab highsc* using t_maxrf.tex, keep(below) coeflabels(below "Higher scoring") stats(,) $opts

	file open  t 	using t_maxrf.tex, write append
	file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab lowinc* using t_maxrf.tex, keep(below) coeflabels(below "Low income") stats(, layout(`""')) $opts
	esttab highinc* using t_maxrf.tex, keep(below) coeflabels(below "High income") stats(,) $opts

	file open  t 	using t_maxrf.tex, write append
	file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}" 
	file close t
	esttab urm* using t_maxrf.tex, keep(below) coeflabels(below "URM") stats(, layout(`""')) $opts
	esttab nonurm* using t_maxrf.tex, keep(below) coeflabels(below "Non-URM") stats(,) $opts

	file open  t 	using t_maxrf.tex, write append
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is a reduced form estimate of the impact of one's first SAT score missing a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Panel C splits the sample into students with family incomes below \textdollar 50,000 and above \textdollar 100,000. " ///
					"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{table}" _n 
	file close t
	estimates clear	
	
	}
	end

	// Maximum SAT score coefficients graph (writes f_maxmvwcoeffs.pdf in $paper):
	// threshold-by-threshold IV estimates of retook on maxmvw, for thresholds 1000-2300.
	program define f_maxmvwcoeffs
	{
	
	cd "$data"
	use maxmvw retook below *distance cohort months threshold mvw using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	
	egen dc = group(cohort months)
	// Fit one model per threshold and collect the stored names, in threshold order, for coefplot.
	local models
	forvalues thr = 1000(100)2300 {
		// A threshold-named copy of retook, labelled with the threshold, so each model's
		// coefficient gets its own position and axis label in coefplot.
		generate byte retook_`thr' = retook
		label variable retook_`thr' "`thr'"
		preserve
		keep if threshold==`thr'
		reghdfe maxmvw *distance (retook_`thr' = below), absorb(threshold dc) vce(cluster mvw)
		estimates store r`thr'
		restore
		local models `models' r`thr'
	}
	
	cd "$paper"	
	coefplot `models', ///
			keep(retook_*) vertical legend(off) nooffset msize(small)  ///
			xtitle("Threshold near first SAT score", height(6)) ///
			ytitle("Impact of retaking on superscores", height(6)) ///
			coeflabels(,labsize(small)) yline(0) ylabel(0 50 100 150 200 250 300) ///
			mcolor(gs0) msymbol(O) ciopts(lcolor(gs0) lwidth(vthin))
	graph export f_maxmvwcoeffs.pdf, replace	
	estimates clear
	}
	end

	// College enrollment
	program define t_coll
	{
	// Writes t_coll.tex: IV estimates of retaking (instrumented by below) on the
	// outcomes listed in global y, for the full sample and six subgroups, with a
	// control complier mean reported under each estimate.
	cd "$data"
	use coll4 coll2 gradrate inc_baseline retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	global y coll4 coll2 gradrate gr50 gr80 inc_baseline inc_base50 inc_base65

	// Indicator outcomes. inc_base abbreviates inc_baseline at this point and is
	// divided by 1000 before the 50 and 65 cutoffs are applied.
	generate byte gr50 = (gradrate>.5)
	generate byte gr80 = (gradrate>.8)
	replace inc_base = inc_base/1000
	generate byte inc_base50 = (inc_baseline>50)
	generate byte inc_base65 = (inc_baseline>65)

	// Each outcome multiplied by the non-retaker indicator; these feed the
	// control complier mean regressions below.
	generate byte oneminusretook = 1-retook
	foreach outcome of varlist $y {
		generate `outcome'_Y0 = `outcome'*oneminusretook
	}
	egen dc = group(cohort months)
	compress

	// Threshold-specific slopes in distance on either side of each cutoff
	forvalues cut = 700(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	// Sample and subgroup indicators, one per table row
	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist all lowscore highscore lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// First call lists every variable used by the cache(use) calls that follow
		reghdfe $y *_Y0 t*00_* oneminusretook retook below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach outcome of varlist $y {
			// Coefficient on oneminusretook is kept as the control complier mean
			reghdfe `outcome'_Y0 t*00_* (oneminusretook = below)	, absorb(threshold dc) vce(cluster mvw) cache(use)
			local ccmean = _b[oneminusretook]
			// Main IV estimate; the mean above is attached as scalar ccm
			reghdfe `outcome' t*00_* (retook = below)		, absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `grp'_`outcome'
			estadd scalar ccm = `ccmean'
		}
		restore
	}

	cd "$paper"
	// Table header and the panel A heading are written under a single open;
	// the file is closed before each esttab call so esttab can append its rows.
	file open  t	using t_coll.tex, replace write
	file write t	"\begin{sidewaystable}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Retaking and College Enrollment}" _n "\label{t_coll}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{8}{c}}" _n "\midrule" _n ///
					"&\multicolumn{2}{c}{College type}	&\multicolumn{3}{c}{College's graduation rate}	&\multicolumn{3}{c}{College's mean earnings}	\\" _n ///
					"&Four-year&Two-year			&Overall & $>$50\%  & $>$80\% 			&Overall &$>$ \textdollar 50,000 &$>$ \textdollar 65,000\\" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)&(8)\\" _n ///
					"\midrule" _n
	file write t	"(A) All students\\ " _n "\cmidrule{1-1}"
	file close t
	esttab all* using t_coll.tex, k(retook) coef(retook "All") s(ccm, l("Control complier mean") f(2) lay(@)) $opts

	// Panel B: split by initial score
	file open  t 	using t_coll.tex, append write
	file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowsc* using t_coll.tex, k(retook) coef(retook "Lower scoring") s(ccm, l("Control complier mean") f(2) lay(@ `""')) $opts
	esttab highsc* using t_coll.tex, k(retook) coef(retook "Higher scoring") s(ccm, l("Control complier mean") f(2) lay(@)) $opts

	// Panel C: split by income
	file open  t 	using t_coll.tex, append write
	file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowinc* using t_coll.tex, k(retook) coef(retook "Low income") s(ccm, l("Control complier mean") f(2) lay(@ `""')) $opts
	esttab highinc* using t_coll.tex, k(retook) coef(retook "High income") s(ccm, l("Control complier mean") f(2) lay(@)) $opts

	// Panel D: split by race/ethnicity
	file open  t 	using t_coll.tex, append write
	file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}"
	file close t
	esttab urm* using t_coll.tex, k(retook) coef(retook "URM") s(ccm, l("Control complier mean") f(2) lay(@ `""')) $opts
	esttab nonurm* using t_coll.tex, k(retook) coef(retook "Non-URM") s(ccm, l("Control complier mean") f(2) lay(@)) $opts

	// Closing rule and table notes
	file open  t 	using t_coll.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{8.4in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Panel C splits the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{sidewaystable}" _n
	file close t
	estimates clear

	}
	end

	// College enrollment (reduced form)
	program define t_collrf
	{
	// Writes t_collrf.tex: reduced-form regressions of the outcomes listed in
	// global y on the below indicator, for the full sample and six subgroups.
	cd "$data"
	use coll4 coll2 gradrate inc_baseline retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear
	global y coll4 coll2 gradrate gr50 gr80 inc_baseline inc_base50 inc_base65

	// Indicator outcomes. inc_base abbreviates inc_baseline at this point and is
	// divided by 1000 before the 50 and 65 cutoffs are applied.
	generate byte gr50 = (gradrate>.5)
	generate byte gr80 = (gradrate>.8)
	replace inc_base = inc_base/1000
	generate byte inc_base50 = (inc_baseline>50)
	generate byte inc_base65 = (inc_baseline>65)

	egen dc = group(cohort months)
	compress

	// Threshold-specific slopes in distance on either side of each cutoff
	forvalues cut = 700(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	// Sample and subgroup indicators, one per table row
	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist all lowscore highscore lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// First call lists every variable used by the cache(use) calls that follow
		reghdfe $y t*00_* below, absorb(threshold dc) vce(cluster mvw) cache(save) old
		foreach outcome of varlist $y {
			reghdfe `outcome' t*00_* below, absorb(threshold dc) vce(cluster mvw) cache(use) old
			estimates store `grp'_`outcome'
		}
		restore
	}

	cd "$paper"
	// Table header and the panel A heading are written under a single open;
	// the file is closed before each esttab call so esttab can append its rows.
	file open  t	using t_collrf.tex, replace write
	file write t	"\begin{sidewaystable}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Missing Round Number Thresholds and College Enrollment (Reduced Form)}" _n "\label{t_collrf}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{8}{c}}" _n "\midrule" _n ///
					"&\multicolumn{2}{c}{College type}	&\multicolumn{3}{c}{College's graduation rate}	&\multicolumn{3}{c}{College's mean earnings}	\\" _n ///
					"&Four-year&Two-year			&Overall & $>$50\%  & $>$80\% 			&Overall &$>$ \textdollar 50,000 &$>$ \textdollar 65,000\\" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)&(8)\\" _n ///
					"\midrule" _n
	file write t	"(A) All students\\ " _n "\cmidrule{1-1}"
	file close t
	esttab all* using t_collrf.tex, k(below) coef(below "All") s(,) $opts4

	// Panel B: split by initial score
	file open  t 	using t_collrf.tex, append write
	file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowsc* using t_collrf.tex, k(below) coef(below "Lower scoring") s(, lay(`""')) $opts4
	esttab highsc* using t_collrf.tex, k(below) coef(below "Higher scoring") s(,) $opts4

	// Panel C: split by income
	file open  t 	using t_collrf.tex, append write
	file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowinc* using t_collrf.tex, k(below) coef(below "Low income") s(, lay(`""')) $opts4
	esttab highinc* using t_collrf.tex, k(below) coef(below "High income") s(,) $opts4

	// Panel D: split by race/ethnicity
	file open  t 	using t_collrf.tex, append write
	file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}"
	file close t
	esttab urm* using t_collrf.tex, k(below) coef(below "URM") s(, lay(`""')) $opts4
	esttab nonurm* using t_collrf.tex, k(below) coef(below "Non-URM") s(,) $opts4

	// Closing rule and table notes
	file open  t 	using t_collrf.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{8.4in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is a reduced form estimate of the impact of one's first SAT score missing a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Panel C splits the sample into students with family incomes below \textdollar 50,000 and above \textdollar 100,000. " ///
					"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{sidewaystable}" _n
	file close t
	estimates clear

	}
	end
		
	// 4-year college coefficients graph
	program define f_coll4coeffs
	{
	// Writes f_coll4coeffs.pdf: one IV estimate of retaking on coll4 per
	// threshold from 1000 to 2300, plotted with coefplot.

	cd "$data"
	use coll4 retook below *distance cohort months threshold mvw ///
		using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear

	egen dc = group(cohort months)

	// Collects the stored estimate names in threshold order for coefplot
	local models
	forvalues cut = 1000(100)2300 {
		// A separately named copy of retook, labelled with the threshold value
		generate byte retook_`cut' = retook
		label variable retook_`cut' "`cut'"
		preserve
		keep if threshold==`cut'
		reghdfe coll4 *distance (retook_`cut' = below), absorb(threshold dc) vce(cluster mvw)
		estimates store r`cut'
		restore
		local models `models' r`cut'
	}

	cd "$paper"
	coefplot `models', ///
				keep(retook_*) vertical legend(off) nooffset msize(small)  ///
				xtitle("Threshold near first SAT score", height(5)) ///
				ytitle("Impact of retaking on 4-year college enrollment", height(5)) ///
				coeflabels(,labsize(small)) yline(0) ylabel(-0.5 -0.25 0 0.25 0.5 0.75 1 1.25) ///
				mcolor(gs0) msymbol(O) ciopts(lcolor(gs0) lw(vthin))
	graph export f_coll4coeffs.pdf, replace
	estimates clear
	}
	end

	// Robustness to BW choice of main results
	program define f_bw 
	{
	// Writes f_bw_coll4.pdf, f_bw_gr.pdf and f_bw_inc.pdf: IV estimates of
	// retaking on coll4, gradrate and inc_baseline at bandwidths 30 to 100, in a
	// 2x2 grid of panels (all, lowscore, lowinc, urm) per outcome.
	//
	// Changes relative to the earlier version of this program:
	//  - figures are written to the paper directory like every other program
	//    in this file (previously they were left in the data directory);
	//  - the twelve hand-written coefplot calls are generated by one loop;
	//  - all three figures use the same panel title size;
	//  - the renamed endogenous variable is referred to by its full name.

	cd "$data"
	use coll4 gradrate inc_baseline retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta if (months>=7), clear
	global y coll4 gradrate inc_baseline

	replace inc_baseline = inc_baseline/1000
	egen int dc = group(cohort months)

	// Threshold-specific slopes in distance on either side of each cutoff
	forvalues cut = 700(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte lowinc = inrange(income,5,45)
	generate byte urm = inlist(race,1,3,4,5,6)

	// One IV regression per subgroup x bandwidth x outcome. The endogenous
	// variable is renamed per bandwidth so that each stored coefficient has a
	// distinct name (retook_30 ... retook_100) for coefplot to pick up.
	foreach g of varlist all lowscore lowinc urm {
	forvalues bw = 30(10)100 {
		preserve
		keep if (`g'==1)&inrange(distance,-1*`bw',`bw'-10)
		rename retook retook_`bw'
		reghdfe $y t*00_* retook_`bw' below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach y of varlist $y {
			reghdfe `y' t*00_* (retook_`bw' = below), absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `y'_`g'_`bw'
		}
		restore
	}
	}

	// Panel graphs and the exported PDFs are written to the paper directory
	cd "$paper"

	local title_all      "(A) All students"
	local title_lowscore "(B) Lower scoring"
	local title_lowinc   "(C) Low income students"
	local title_urm      "(D) Under-represented minorities"

	// Output file stems, in the same order as the outcomes in global y, and
	// the panel file names a.gph ... l.gph used across the three figures
	local stems   coll4 gr inc
	local letters a b c d e f g h i j k l

	local o = 0
	local k = 0
	foreach y of global y {
		local ++o
		local stem : word `o' of `stems'
		local panels
		foreach g in all lowscore lowinc urm {
			local ++k
			local gph : word `k' of `letters'

			// Stored estimates for this outcome and subgroup, by bandwidth
			local models
			forvalues bw = 30(10)100 {
				local models `models' `y'_`g'_`bw'
			}

			coefplot `models' ///
				, keep(retook_*) vertical legend(off) nooffset xtitle("Bandwidth (SAT points)", height(4)) ytitle("Impact of Retaking", height(4)) ///
				coeflabels(retook_30="30" retook_40="40" retook_50="50" retook_60="60" retook_70="70" retook_80="80" retook_90="90" retook_100="100") ///
				yline(0) title("`title_`g''", size(medsmall)) ///
				mcolor(gs0) msymbol(O) ciopts(lcolor(gs0)) ///
				saving(`gph'.gph, replace)
			local panels `panels' `gph'.gph
		}
		graph combine `panels', row(2)
		graph export f_bw_`stem'.pdf, replace
	}

	}
	end

	// Robustness checks
	program define t_robust
	{
	// Writes t_robust_maxmvw.tex, t_robust_coll4.tex and t_robust_gradrate.tex:
	// IV estimates of retaking on each outcome under six specifications
	// (bandwidths 40, 60, 80, 100; bandwidth 60 with covariate fixed effects;
	// bandwidth 60 with unclustered robust standard errors), for the full
	// sample and six subgroups.
	//
	// Changes relative to the earlier version of this program:
	//  - an unseeded random 0.1 percent subsample (keep if runiform()<.001) was
	//    applied right after loading the data. It made the tables
	//    non-reproducible and contradicted the table notes, which describe the
	//    full sample, so it has been removed;
	//  - the six estimation blocks and three table writers are generated by
	//    loops. Stored estimate names and their order are unchanged.

	cd "$data"
	use maxmvw coll4 gradrate retook below *distance cohort months threshold mvw income race motheduc female ///
		using retaking_stacked.dta if (months>=7), clear
	global y maxmvw coll4 gradrate

	// x groups the covariates absorbed in the covariates specification
	egen int x = group(income race motheduc female)
	egen int dc = group(cohort months)
	forvalues cut = 700(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	// Specifications in table column order: estimate-name suffix and bandwidth.
	// Suffix 60c adds x to the absorbed effects; 60r uses vce(robust).
	local suffixes 40 60 80 100 60c 60r
	local widths   40 60 80 100 60  60

	foreach g of varlist all lowscore highscore lowinc highinc urm nonurm {
		local s = 0
		foreach sfx of local suffixes {
			local ++s
			local bw : word `s' of `widths'

			local fe threshold dc
			if "`sfx'"=="60c" local fe threshold dc x
			local se cluster mvw
			if "`sfx'"=="60r" local se robust

			preserve
			// Window runs from -bw to bw-10, e.g. -60 to 50 for bandwidth 60
			keep if (`g'==1)&inrange(distance,-`bw',`bw'-10)
			reghdfe $y t*00_* retook below, a(`fe') vce(`se') cache(save)
			foreach y of varlist $y {
				reghdfe `y' t*00_* (retook = below), a(`fe') vce(`se') cache(use)
				est sto `g'_`y'_`sfx'
			}
			restore
		}
	}

	cd "$paper"

	// Caption text and the outcome description used in each table's notes
	local cap_maxmvw   "SAT Superscore"
	local cap_coll4    "Four-Year College Enrollment"
	local cap_gradrate "College's Graduation Rate"
	local on_maxmvw    "SAT superscore"
	local on_coll4     "four-year college enrollment"
	local on_gradrate  "the chosen college's graduation rate"

	foreach y of global y {
		local tex  t_robust_`y'.tex
		local cap  "`cap_`y''"
		local what "`on_`y''"

		// Header and panel A heading; the file is closed before each esttab
		// call so esttab can append its rows
		file open  t	using `tex', replace write
		file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
						"\caption{Robustness Checks: `cap'}" _n "\label{t_robust_`y'}" _n ///
						"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{6}{c}}" _n "\midrule" _n ///
						"&(1)&(2)&(3)&(4)&(5)&(6)\\" _n ///
						"\midrule" _n
		file write t	"(A) All students\\ " _n "\cmidrule{1-1}"
		file close t
		esttab all_`y'_* using `tex', k(retook) coef(retook "All") s(,) $opts

		file open  t 	using `tex', append write
		file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}"
		file close t
		esttab lowscore_`y'_* using `tex', k(retook) coef(retook "Lower scoring") s(, lay(`""')) $opts
		esttab highscore_`y'_* using `tex', k(retook) coef(retook "Higher scoring") s(,) $opts

		file open  t 	using `tex', append write
		file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}"
		file close t
		esttab lowinc_`y'_* using `tex', k(retook) coef(retook "Low income") s(, lay(`""')) $opts
		esttab highinc_`y'_* using `tex', k(retook) coef(retook "High income") s(,) $opts

		file open  t 	using `tex', append write
		file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}"
		file close t
		esttab urm_`y'_* using `tex', k(retook) coef(retook "URM") s(, lay(`""')) $opts
		esttab nonurm_`y'_* using `tex', k(retook) coef(retook "Non-URM") s(, lay(`""')) $opts

		// Specification rows and table notes
		file open  t 	using `tex', append write
		file write t 	"Bandwidth&40&60&80&100&60&60\\   " _n ///
				"Covariates&N&N&N&N&Y&N\\   " _n ///
				"Clustered s.e.&Y&Y&Y&Y&Y&N\\   " _n ///
				"\midrule" _n "\end{tabular*}" _n ///
				"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
				"\footnotesize Notes: Heteroskedasticity robust standard errors (clustered by first SAT score in all but the final column) are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
				"Each coefficient is an instrumental variables estimate of the impact of SAT retaking on `what', where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
				"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
				"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
				"Panel C splits the sample into students with family incomes below 50,000 and above 100,000. " ///
				"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
				"\end{tabular*}" _n "\end{table}" _n
		file close t
	}

	estimates clear

	}
	end

	// College enrollment effects among low-scorers, by income and race
	program define t_collhet
	{
	// Writes t_collhet.tex: IV estimates of retaking on the outcomes listed in
	// global y, restricted to thresholds 700-1500 and split by income and by
	// race/ethnicity, with a control complier mean under each estimate.
	cd "$data"
	use coll4 coll2 gradrate inc_baseline retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50)&inrange(threshold,700,1500), clear
	global y coll4 coll2 gradrate gr50 gr80 inc_baseline inc_base50 inc_base65

	// Indicator outcomes. inc_base abbreviates inc_baseline at this point and is
	// divided by 1000 before the 50 and 65 cutoffs are applied.
	generate byte gr50 = (gradrate>.5)
	generate byte gr80 = (gradrate>.8)
	replace inc_base = inc_base/1000
	generate byte inc_base50 = (inc_baseline>50)
	generate byte inc_base65 = (inc_baseline>65)

	// Each outcome multiplied by the non-retaker indicator; these feed the
	// control complier mean regressions below.
	generate byte oneminusretook = 1-retook
	foreach outcome of varlist $y {
		generate `outcome'_Y0 = `outcome'*oneminusretook
	}
	egen dc = group(cohort months)
	compress

	// Threshold-specific slopes, only for the thresholds kept in this sample
	forvalues cut = 700(100)1500 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	// Subgroup indicators, one per table row
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// First call lists every variable used by the cache(use) calls that follow
		reghdfe $y *_Y0 t*00_* oneminusretook retook below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach outcome of varlist $y {
			// Coefficient on oneminusretook is kept as the control complier mean
			reghdfe `outcome'_Y0 t*00_* (oneminusretook = below)	, absorb(threshold dc) vce(cluster mvw) cache(use)
			local ccmean = _b[oneminusretook]
			// Main IV estimate; the mean above is attached as scalar ccm
			reghdfe `outcome' t*00_* (retook = below)		, absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `grp'_`outcome'
			estadd scalar ccm = `ccmean'
		}
		restore
	}

	cd "$paper"
	// Table header and the panel A heading are written under a single open;
	// the file is closed before each esttab call so esttab can append its rows.
	file open  t	using t_collhet.tex, replace write
	file write t	"\begin{sidewaystable}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Retaking and College Enrollment among Lower Scoring Students}" _n "\label{t_collhet}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{8}{c}}" _n "\midrule" _n ///
					"&\multicolumn{2}{c}{College type}	&\multicolumn{3}{c}{College's graduation rate}	&\multicolumn{3}{c}{College's mean earnings}	\\" _n ///
					"&Four-year&Two-year			&Overall & $>$50\%  & $>$80\% 			&Overall &$>$ \textdollar 50,000 &$>$ \textdollar 65,000\\" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)&(8)\\" _n ///
					"\midrule" _n
	file write t	"\cmidrule{1-1}" "(A) By income\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowinc* using t_collhet.tex, k(retook) coef(retook "Low income") s(ccm, l("Control complier mean") f(2) lay(@ `""')) $opts
	esttab highinc* using t_collhet.tex, k(retook) coef(retook "High income") s(ccm, l("Control complier mean") f(2) lay(@)) $opts

	// Panel B: split by race/ethnicity
	file open  t 	using t_collhet.tex, append write
	file write t	"\cmidrule{1-1}" "(B) By race/ethnicity\\ " _n "\cmidrule{1-1}"
	file close t
	esttab urm* using t_collhet.tex, k(retook) coef(retook "URM") s(ccm, l("Control complier mean") f(2) lay(@ `""')) $opts
	esttab nonurm* using t_collhet.tex, k(retook) coef(retook "Non-URM") s(ccm, l("Control complier mean") f(2) lay(@)) $opts

	// Closing rule and table notes
	file open  t 	using t_collhet.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{8.4in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who took their first SAT by November of senior year and whose first score was nearest to a threshold of at most 1500. " ///
					"Panel A splits the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Panel B splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{sidewaystable}" _n
	file close t
	estimates clear

	}
	end

	// College enrollment effects among high-scorers, by income and race
	program define t_collhet1
	{
	// Writes t_collhet1.tex: IV estimates of retaking on college enrollment outcomes,
	// restricted to thresholds 1600-2300 and estimated separately by income and race group.

	// Stacked sample: months>=7, distance within [-60,50], thresholds 1600-2300
	cd "$data"
	use coll4 coll2 gradrate inc_baseline retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta ///
		if (months>=7)&inrange(distance,-60,50)&inrange(threshold,1600,2300), clear

	// Outcome list; stored in global y, which therefore remains set after the program ends
	global y coll4 coll2 gradrate gr50 gr80 inc_baseline inc_base50 inc_base65

	// Derived outcomes: graduation-rate cutoffs, then inc_baseline divided by 1000 and its cutoffs
	generate byte gr50 = (gradrate>.5)
	generate byte gr80 = (gradrate>.8)
	replace inc_baseline = inc_baseline/1000
	generate byte inc_base50 = (inc_baseline>50)
	generate byte inc_base65 = (inc_baseline>65)

	// Outcome times (1-retook); the IV coefficient on oneminusretook is stored below as "ccm"
	generate byte oneminusretook = 1-retook
	foreach outcome of varlist $y {
		generate `outcome'_Y0 = `outcome'*oneminusretook
	}
	egen dc = group(cohort months)
	compress

	// One distance term and one below_distance term per threshold
	forvalues cut=1600(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	// Subgroup indicators
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// First call lists every variable used below and saves the reghdfe cache for the cache(use) calls
		reghdfe $y *_Y0 t*00_* oneminusretook retook below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach outcome of varlist $y {
			reghdfe `outcome'_Y0 t*00_* (oneminusretook = below), absorb(threshold dc) vce(cluster mvw) cache(use)
			local ccm = _b[oneminusretook]
			reghdfe `outcome' t*00_* (retook = below), absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `grp'_`outcome'
			estadd scalar ccm = `ccm'
		}
		restore
	}

	// Table header and panel A heading (written in a single file session)
	cd "$paper"
	file open  t	using t_collhet1.tex, replace write
	file write t	"\begin{sidewaystable}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Retaking and College Enrollment among Higher Scoring Students}" _n "\label{t_collhet1}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{8}{c}}" _n "\midrule" _n ///
					"&\multicolumn{2}{c}{College type}	&\multicolumn{3}{c}{College's graduation rate}	&\multicolumn{3}{c}{College's mean earnings}	\\" _n ///
					"&Four-year&Two-year			&Overall & $>$50\%  & $>$80\% 			&Overall &$>$ \textdollar 50,000 &$>$ \textdollar 65,000\\" _n ///
					"&(1)&(2)&(3)&(4)&(5)&(6)&(7)&(8)\\" _n ///
					"\midrule" _n
	file write t	"\cmidrule{1-1}" "(A) By income\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowinc* using t_collhet1.tex, keep(retook) coeflabels(retook "Low income") stats(ccm, labels("Control complier mean") fmt(2) layout(@ `""')) $opts
	esttab highinc* using t_collhet1.tex, keep(retook) coeflabels(retook "High income") stats(ccm, labels("Control complier mean") fmt(2) layout(@)) $opts

	// Panel B heading and rows
	file open  t 	using t_collhet1.tex, append write
	file write t	"\cmidrule{1-1}" "(B) By race/ethnicity\\ " _n "\cmidrule{1-1}"
	file close t
	esttab urm* using t_collhet1.tex, keep(retook) coeflabels(retook "URM") stats(ccm, labels("Control complier mean") fmt(2) layout(@ `""')) $opts
	esttab nonurm* using t_collhet1.tex, keep(retook) coeflabels(retook "Non-URM") stats(ccm, labels("Control complier mean") fmt(2) layout(@)) $opts

	// Table notes and closing environment
	file open  t 	using t_collhet1.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{8.4in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who took their first SAT by November of senior year and whose first score was nearest to a threshold of at least 1600. " ///
					"Panel A splits the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Panel B splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{sidewaystable}" _n
	file close t
	estimates clear

	}
	end
	
	// Score sends
	program define t_sends
	{
	// Writes t_sends.tex: IV estimates of retaking on score sends, split into three
	// categories built from sends, sends_gr50 and sends_gr80, for the full sample and by subgroup.

	cd "$data"
	use sends* retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta ///
		if (months>=7)&inrange(distance,-60,50), clear

	// Outcome list; stored in global y, which therefore remains set after the program ends
	global y gr1 gr2 gr3

	// Three send categories as differences of the sends* counts (table columns 0-0.5, 0.5-0.8, 0.8-1)
	generate gr1 = sends - sends_gr50
	generate gr2 = sends_gr50-sends_gr80
	generate gr3 = sends_gr80
	drop sends*

	// Outcome times (1-retook); the IV coefficient on oneminusretook is stored below as "ccm"
	generate byte oneminusretook = 1-retook
	foreach outcome of varlist $y {
		generate `outcome'_Y0 = `outcome'*oneminusretook
	}
	egen int dc = group(cohort months)
	compress

	// One distance term and one below_distance term per threshold
	forvalues cut=700(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	// Sample and subgroup indicators
	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist all lowscore highscore lowinc highinc urm nonurm {
		preserve
		keep if `grp'==1
		// First call lists every variable used below and saves the reghdfe cache for the cache(use) calls
		reghdfe $y *_Y0 t*00_* oneminusretook retook below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach outcome of varlist $y {
			reghdfe `outcome'_Y0 t*00_* (oneminusretook = below), absorb(threshold dc) vce(cluster mvw) cache(use)
			local ccm = _b[oneminusretook]
			reghdfe `outcome' t*00_* (retook = below), absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `grp'_`outcome'
			estadd scalar ccm = `ccm'
		}
		restore
	}

	// Table header and panel A heading (written in a single file session)
	cd "$paper"
	file open  t	using t_sends.tex, replace write
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Retaking and College Applications}" _n "\label{t_sends}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{3}{c}}" _n "\midrule" _n ///
					"&\multicolumn{3}{c}{Score sends to colleges, by B.A. completion rate}\\" _n ///
					"&0-0.5&0.5-0.8 &0.8-1\\" _n ///
					"&(1)&(2)&(3)\\" _n ///
					"\midrule" _n
	file write t	"(A) All students\\ " _n "\cmidrule{1-1}"
	file close t
	esttab all* using t_sends.tex, keep(retook) coeflabels(retook "All") stats(ccm, labels("Control complier mean") fmt(2) layout(@)) $opts

	// Panel B: by initial score
	file open  t 	using t_sends.tex, append write
	file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowsc* using t_sends.tex, keep(retook) coeflabels(retook "Lower scoring") stats(ccm, labels("Control complier mean") fmt(2) layout(@ `""')) $opts
	esttab highsc* using t_sends.tex, keep(retook) coeflabels(retook "Higher scoring") stats(ccm, labels("Control complier mean") fmt(2) layout(@)) $opts

	// Panel C: by income
	file open  t 	using t_sends.tex, append write
	file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowinc* using t_sends.tex, keep(retook) coeflabels(retook "Low income") stats(ccm, labels("Control complier mean") fmt(2) layout(@ `""')) $opts
	esttab highinc* using t_sends.tex, keep(retook) coeflabels(retook "High income") stats(ccm, labels("Control complier mean") fmt(2) layout(@)) $opts

	// Panel D: by race/ethnicity
	file open  t 	using t_sends.tex, append write
	file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}"
	file close t
	esttab urm* using t_sends.tex, keep(retook) coeflabels(retook "URM") stats(ccm, labels("Control complier mean") fmt(2) layout(@ `""')) $opts
	esttab nonurm* using t_sends.tex, keep(retook) coeflabels(retook "Non-URM") stats(ccm, labels("Control complier mean") fmt(2) layout(@)) $opts

	// Table notes and closing environment
	file open  t 	using t_sends.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Panel C splits the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{table}" _n
	file close t
	estimates clear

	}
	end

	// Remove states one at a time
	program define t_states
	{
	// Robustness check: re-estimates the IV effect of retaking on coll4 and gradrate
	// while dropping one state at a time (state codes listed in the numlist below),
	// for the full sample and for the lowscore, lowinc and urm subgroups.
	// Output: t_states.tex, one esttab fragment per (outcome, group), one column per dropped state.
	cd "$data"
	use coll4 gradrate retook below *distance cohort months threshold mvw income race state using retaking_stacked.dta if (months>=7)&inrange(distance,-60,50), clear

	// Outcome list; stored in global y, which therefore remains set after the program ends
	global y coll4 gradrate 	
	egen int dc = group(cohort months)
	// One distance term and one below_distance term per threshold
	forval x=700(100)2300 {
		g byte t`x'_distance		= (threshold==`x')*distance
		g byte t`x'_below_distance	= (threshold==`x')*below_distance
	}
	
	// Sample and subgroup indicators
	g byte all = 1	
	g byte lowscore = inrange(threshold,700,1500)
	g byte lowinc = inrange(income,5,45)
	g byte urm = inlist(race,1,3,4,5,6)

	foreach state of numlist 6 36 48 42 12 34 13 25 51 37 24 18 {
	foreach g of varlist all lowscore lowinc urm {	
		preserve
		// Subgroup sample excluding the current state
		keep if (`g'==1)&(state!=`state')	
		// First call lists every variable used below and saves the reghdfe cache for the cache(use) calls
		reghdfe $y t*00_* retook below, a(threshold dc) vce(cluster mvw) cache(save)
		foreach y of varlist $y {
			reghdfe `y' t*00_* (retook = below)		, a(threshold dc) vce(cluster mvw) cache(use)
			est sto `g'_`y'_`state'
		}
		restore
	}
	}
	
	// Write to the paper directory like every other table program in this file.
	// $opts appends to the target file and nothing here writes a header with "replace",
	// so remove any stale copy first; otherwise each re-run stacks duplicate rows.
	cd "$paper"
	capture erase t_states.tex

	esttab all_coll4* 	using t_states.tex, k(retook) coef(retook "All") s(,) $opts  	 
	esttab lowscore_coll4* 	using t_states.tex, k(retook) coef(retook "Lower scoring") s(,) $opts    
	esttab lowinc_coll4* 	using t_states.tex, k(retook) coef(retook "Low income") s(,) $opts    
	esttab urm_coll4* 	using t_states.tex, k(retook) coef(retook "URM") s(,) $opts   

	esttab all_grad* 	using t_states.tex, k(retook) coef(retook "All") s(,) $opts  	 
	esttab lowscore_grad* 	using t_states.tex, k(retook) coef(retook "Lower scoring") s(,) $opts    
	esttab lowinc_grad* 	using t_states.tex, k(retook) coef(retook "Low income") s(,) $opts    
	esttab urm_grad* 	using t_states.tex, k(retook) coef(retook "URM") s(,) $opts   

	estimates clear	
		
	}
	end	

	// Total scores matter
	program define f_mv
	{
	// Writes f_mv.pdf: mean of retook at each value of m+v, drawn separately for
	// first takes with w missing (panel A) and w non-missing (panel B).

	// Tick marks and reference lines shared by both panels
	local ticks 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600

	foreach panel in a b {
		// Panel-specific sample restriction on w and panel title
		if "`panel'"=="a" {
			local wcond "(w==.)"
			local header "(A) First take lacked writing section"
		}
		else {
			local wcond "(w!=.)"
			local header "(B) First take had writing section"
		}

		cd "$data"
		use retook m v w months using retaking_splines.dta if (m!=.)&(v!=.)&`wcond'&(months>=7), clear
		generate mv = m+v
		collapse retook, by(mv)

		// Temporary .gph files are saved in the paper directory and removed after combining
		cd "$paper"
		scatter retook mv, ylabel(0(0.1)0.8) msize(tiny) mcolor(black) mfcolor(black) m(O) legend(off) ///
							xlabel(`ticks', labsize(small)) ///
							 xline(`ticks', lw(vvthin) lstyle(dot)) ///
							xtitle("First SAT score (math + reading)", size(medsmall) height(5)) ytitle("Retook SAT", size(medsmall) height(4)) ///
							title("`header'", size(medium)) saving(`panel'.gph, replace)
	}

	graph combine a.gph b.gph, col(1) xsize(8.5) ysize(11) ycommon
	graph export f_mv.pdf, replace
	foreach panel in a b {
		rm `panel'.gph
	}

	}
	end	
	
	// Individual subject scores
	program define f_mvw
	{
	// Writes f_mvw.pdf: mean of retook at each value of the first m, v and w scores
	// (one panel per subject), using observations with mvw non-missing and months>=7.

	cd "$data"
	use retook m v w mvw months using retaking_splines.dta if (mvw!=.)&(months>=7), clear
	cd "$paper"

	// Parallel lists: score variable, temporary graph file, panel letter, subject name in lower/upper case
	local scorevars m v w
	local files     a b c
	local letters   A B C
	local lc_names  math reading writing
	local uc_names  Math Reading Writing
	local ticks     200 300 400 500 600 700 800

	forvalues i=1/3 {
		local score  : word `i' of `scorevars'
		local file   : word `i' of `files'
		local letter : word `i' of `letters'
		local lc     : word `i' of `lc_names'
		local uc     : word `i' of `uc_names'

		// Collapse to score-level means, plot, then return to the student-level data
		preserve
		collapse retook, by(`score')
		scatter retook `score', ylabel(0.3(0.1)0.7) msize(tiny) mcolor(black) mfcolor(black) m(O) legend(off) ///
							xlabel(`ticks', labsize(small)) ///
							 xline(`ticks', lw(vvthin) lstyle(dot)) ///
							xtitle("First `lc' score", size(medsmall) height(5)) ytitle("Retook SAT", size(medsmall) height(4)) ///
							title("(`letter') `uc' score", size(medium)) saving(`file'.gph, replace)
		restore
	}

	graph combine a.gph b.gph c.gph, col(1) xsize(6.5) ysize(11) ycommon
	graph export f_mvw.pdf, replace
	foreach file of local files {
		rm `file'.gph
	}

	}
	end	

	// Degree completion
	program define t_ba6
	{
	// Writes t_ba.tex: IV estimates of retaking on maxmvw, coll4, gradrate and ba6,
	// using observations with ba6 non-missing, for the full sample and by subgroup.

	cd "$data"
	use mvw maxmvw coll4 gradrate ba4 ba6 retook below *distance cohort months threshold mvw income race ///
		using retaking_stacked.dta ///
		if (months>=7)&inrange(distance,-60,50)&(ba6!=.), clear

	// Outcome list; stored in global y, which therefore remains set after the program ends
	global y maxmvw coll4 gradrate ba6

	egen int dc = group(cohort months)
	// One distance term and one below_distance term per threshold
	forvalues cut=700(100)2300 {
		generate byte t`cut'_distance		= (threshold==`cut')*distance
		generate byte t`cut'_below_distance	= (threshold==`cut')*below_distance
	}

	// Sample and subgroup indicators
	generate byte all = 1
	generate byte lowscore = inrange(threshold,700,1500)
	generate byte highscore = inrange(threshold,1600,2300)
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	foreach grp of varlist all lowscore highscore lowinc highinc urm nonurm {
		preserve
		keep if `grp'
		// First call lists every variable used below and saves the reghdfe cache for the cache(use) calls
		reghdfe $y t*00_* retook below, absorb(threshold dc) vce(cluster mvw) cache(save)
		foreach outcome of varlist $y {
			reghdfe `outcome' t*00_* (retook = below), absorb(threshold dc) vce(cluster mvw) cache(use)
			estimates store `grp'_`outcome'
		}
		restore
	}

	// Table header and panel A heading (written in a single file session)
	cd "$paper"
	file open  t	using t_ba.tex, replace write
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Retaking and College Completion}" _n "\label{t_ba}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{4}{c}}" _n "\midrule" _n ///
					"&Maximum		&Four-year	&College's	& Earned B.A. \\" _n ///
					"&SAT score	&college		&grad. rate	&	in 6 years	\\" _n ///
					"&(1)&(2)&(3)&(4)\\" _n ///
					"\midrule" _n
	file write t	"(A) All students\\ " _n "\cmidrule{1-1}"
	file close t
	esttab all* using t_ba.tex, keep(retook) coeflabels(retook "All") stats(,) $opts

	// Panel B: by initial score
	file open  t 	using t_ba.tex, append write
	file write t	"\cmidrule{1-1}" "(B) By initial score\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowsc* using t_ba.tex, keep(retook) coeflabels(retook "Lower scoring") stats(,) $opts
	esttab highsc* using t_ba.tex, keep(retook) coeflabels(retook "Higher scoring") stats(,) $opts

	// Panel C: by income
	file open  t 	using t_ba.tex, append write
	file write t	"\cmidrule{1-1}" "(C) By income\\ " _n "\cmidrule{1-1}"
	file close t
	esttab lowinc* using t_ba.tex, keep(retook) coeflabels(retook "Low income") stats(,) $opts
	esttab highinc* using t_ba.tex, keep(retook) coeflabels(retook "High income") stats(,) $opts

	// Panel D: by race/ethnicity
	file open  t 	using t_ba.tex, append write
	file write t	"\cmidrule{1-1}" "(D) By race/ethnicity\\ " _n "\cmidrule{1-1}"
	file close t
	esttab urm* using t_ba.tex, keep(retook) coeflabels(retook "URM") stats(,) $opts
	esttab nonurm* using t_ba.tex, keep(retook) coeflabels(retook "Non-URM") stats(,) $opts

	// Table notes and closing environment
	file open  t 	using t_ba.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
					"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
					"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by first SAT score are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
					"Each coefficient is an instrumental variables estimate of the impact of SAT retaking, where retaking is instrumented with indicators for scoring below a multiple of 100. " ///
					"The sample consists of all SAT-takers from the high school classes of 2006-08 who had valid math, reading and writing scores and who took their first SAT by November of senior year. " ///
					"Panel B splits the sample into students with first scores nearest to the 700-1500 thresholds and those nearest to the 1600-2300 thresholds. " ///
					"Panel C splits the sample into students with family incomes below 50,000 and above 100,000. " ///
					"Panel D splits the sample into students who are underrepresented minorities (Black, Hispanic or Native American) and those who are not (White or Asian)." ///
					"\end{tabular*}" _n "\end{table}" _n
	file close t
	estimates clear

	}
	end

	// Income and race gaps 
	program define t_gaps
	{
	// Writes t_gaps.tex: OLS gaps in retook, mvw, maxmvw and coll4 by income group
	// (panel A) and by race group (panel B), each with cohort fixed effects and
	// standard errors clustered by highschool. Below each panel the table reports the
	// outcome mean for the omitted (reference) group.
	cd "$data"
	use retaking_splines.dta if (mvw!=.), clear
	
	// Group indicators. Omitted categories: income==130 in the income regression,
	// race in (2,7) in the race regression (urm and other cover the remaining race codes listed here).
	g byte urm = inlist(race,1,3,4,5,6)
	g byte other = inlist(race,0,8)
	g byte lowinc = inrange(income,5,45)
	g byte medinc = inrange(income,50,125)
	g byte missinc = (income==0)
	
	label var urm "URM"
	label var lowinc "Low income"
	label var medinc "Middle income"

	foreach y of varlist retook mvw maxmvw coll4 {
		// Income gaps, plus the mean for the omitted income group
		reg `y' lowinc medinc missinc i.cohort, cluster(highschool)
		eststo inc_`y'
		sum `y' if income==130
		estadd scalar mu = r(mean)
		// Race gaps, plus the mean for the omitted race group.
		// The mean is taken over race 2 and 7 so that it matches the regression's
		// reference group and the "Non-URM mean" label; it was previously race==7 only.
		reg `y' urm other	i.cohort, cluster(highschool)
		eststo race_`y'
		sum `y' if inlist(race,2,7)
		estadd scalar mu = r(mean)
	}	
		
	// Table header
	cd "$paper"
	file open  t	using t_gaps.tex, replace write
	file write t	"\begin{table}[htbp!] \centering" _n "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" _n ///
					"\caption{Income and Race Gaps}" _n "\label{t_gaps}" _n ///
					"\begin{tabular*}{1\textwidth}{@{\extracolsep{\fill}}l*{4}{c}}" _n "\midrule" _n ///
					"&&First&Final&Four-year \\" _n ///
					"&Retook&SAT score&superscore&college \\" _n ///
					"&(1)&(2)&(3)&(4)\\" _n ///
					"\midrule" _n
	file close t
	// Panel A: income ($opts appends each esttab fragment to the file)
	file open  t	using t_gaps.tex, append write
	file write t	"(A) Income \\ \cmidrule{1-1}"
	file close t
	esttab inc* using t_gaps.tex, l keep(lowinc medinc) s(mu, l("High income mean")) $opts
	// Panel B: race
	file open  t	using t_gaps.tex, append write
	file write t	"\cmidrule{1-1} (B) Race \\ \cmidrule{1-1}"
	file close t
	esttab race* using t_gaps.tex, l keep(urm) s(mu, l("Non-URM mean")) $opts
	// Table notes and closing environment.
	// NOTE(review): the observation count in the notes is hard-coded text, not computed here.
	file open  t 	using t_gaps.tex, append write
	file write t 	"\midrule" _n "\end{tabular*}" _n ///
			"\begin{tabular*}{1\textwidth}{p{6.3in}}" _n ///
			"\footnotesize Notes: Heteroskedasticity robust standard errors clustered by high school are in parentheses (* p$<$.10 ** p$<$.05 *** p$<$.01). " ///
			"Each column regresses the listed outcome on the demographic group indicators. " ///
			"All regressions include cohort fixed effects, as well as indicators for missing income or race, so that high income and non-URM students are the reference groups. " /// 
			"Below each column is the mean outcome for the reference group in each panel. The sample consists of all SAT-takers from the high school classes of 2006-14 who had valid math, reading and writing scores. " ///
			"Each regression uses 13,656,612 observations." ///
			"\end{tabular*}" _n "\end{table}" _n 
	file close t
	estimates clear		
	}
	end
	
	// Calculations for gap closing
	program define gapcalcs
	{
	// Prints the summary statistics behind the gap-closing calculations:
	// first from the HSLS student file, then from retaking_splines.dta.
	// Nothing is saved; all results go to the Results window/log.

	// ---- HSLS calculations underlying gap-closing estimates ----

	// NOTE(review): "clear all" also drops programs and stored results, not only data;
	// confirm this is intended when gapcalcs is run alongside the other programs.
	clear all
	set maxvar 10000
	// NOTE(review): absolute, machine-specific path; unlike the rest of the file it does not use $data.
	use "C:\Users\jgoodma1\Dropbox\Retaking\HSLS\data\hsls_16_student_v1_0.dta", clear
	keep 	W1STUDENT X3HSCOMPSTAT X1RACE X1FAMINCOME /// 
			S2SATNUM X4ATNDCLG16FB X4PS1LEVEL X4HS2PSMOS

	// Income groups (defined only where X1FAMINCOME>0)
	generate lowinc = inrange(X1FAMINCOME,1,3)  if X1FAMINCOME>0
	generate highinc= inrange(X1FAMINCOME,6,13) if X1FAMINCOME>0
	drop X1FAMINCOME

	// Race groups (defined only where X1RACE>0)
	generate urm 	= inlist(X1RACE,1,3,4,5) if X1RACE>0
	generate nonurm= inlist(X1RACE,2,8) 	 if X1RACE>0
	drop X1RACE

	// Number of SAT takes: 0, 1, or 2+ (defined only where S2SATNUM is in 0-3)
	generate took0 = (S2SATNUM==0) if inrange(S2SATNUM,0,3)
	generate took1 = (S2SATNUM==1) if inrange(S2SATNUM,0,3)
	generate took2 = (S2SATNUM>=2) if inrange(S2SATNUM,0,3)
	drop S2SATNUM

	// Four-year college enrollment
	generate coll4 = (X4PS1LEVEL==1)&inrange(X4HS2PSMOS,0,6) if inrange(X4PS1LEVEL,-7,3)
	drop X4ATNDCLG16FB X4PS1LEVEL X4HS2PSMOS

	// Keep only high school graduates for whom we observe test takes and college enrollment status
	keep if (X3HSCOMPSTAT==1)&(took0!=.)&(coll4!=.)
	drop X3HSCOMPSTAT

	// For each group: distribution of takes, then coll4 mean at 0, 1 and 2+ takes
	foreach grp in nonurm urm highinc lowinc {
		sum took* if (`grp'==1)
		forvalues k=0/2 {
			sum coll4 if (took`k'==1)&(`grp'==1)
		}
	}

	// ---- CB calculations underlying gap-closing estimates ----

	cd "$data"
	use retaking_splines.dta, clear
	generate byte lowinc = inrange(income,5,45)
	generate byte highinc = inrange(income,130,130)
	generate byte nonurm = inlist(race,2,7)
	generate byte urm = inlist(race,1,3,4,5,6)

	// For each group: retake rate, then coll4 mean for single takers and for retakers
	foreach grp in nonurm urm highinc lowinc {
		sum retook if (`grp'==1)
		sum coll4 if (`grp'==1)&(takes==1)
		sum coll4 if (`grp'==1)&(takes>=2)
	}

	}
	end
	
*t_means
*t_retake
*t_gaps
*f_rawfs
*f_fs
*f_density
*t_cov
*f_cov
*t_fs
*f_fscoeffs
*t_max (split off t_max1)
*t_maxrf
*f_maxmvwcoeffs
*t_coll
*t_collrf
*f_coll4coeffs
*f_bw
*t_robust
*t_collhet
*t_collhet1
*t_sends
*t_states
*f_mv
*f_mvw
*t_ba6	
