/*******************************************************************************

Replication file 1: Place, Peers and the Teenage Years

Author:		Nathan Deutscher
			
Date:		3 May 2019

This program is the first of two replication files.  Steps are:
	1. Sample selection: use parental postcode histories to identify movers.
	2. Calculate independent variables: predicted outcomes and peer groups.
	3. Descriptive statistics.
	4. Estimate the main models.
	5. Validation exercises.
	6. Data appendix.
	7. Peers analysis.
	
*******************************************************************************/

*------------------------------------------------------------------------------*
* 	0. PRELIMINARIES													   
*------------------------------------------------------------------------------*

clear all

	version 13.1

	* Which machine are we on: home (offsite copy) or ATO

	global location "home"
	*global location="ATO"

	* Root folder for each machine

	global home_path "/Users/Nathan/Documents/1. Research - ATO mirror/Project 2 (FOR WEB)"
	global ATO_path ""

	* Close any open log, then append to the log for today

	capture log close
	log using "${${location}_path}/5. Output/Log - $S_DATE", text append

	* Counter used to number the statements written to the in-text file

	global statement_num 1

	* Move to the working data folder

	cd "${${location}_path}/2. Data (working)"

	* Session settings

	set more off, permanently
	set scheme s1color, permanently
	set max_memory 200g, permanently
	set maxvar 30000, permanently
	set matsize 10000

	* Precision multipliers (home dataset smaller)

	global home_pm 100
	global ATO_pm 1

*------------------------------------------------------------------------------*
* 	1. SELECT SAMPLE FOR CAUSAL PLACE WORK		 										   
*------------------------------------------------------------------------------*

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	1A. Generate outcome file									   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear all
use "${${location}_path}/1. Data (MASTER)/mobility"

	* Build derived variables with the companion do-file
	do derived_variables

	* Retain identifiers, parent variables and the outcome families used below
	keep int_id fyob sex expat *p1* *p2* num_kids* *r_total* *r_hdo* *r_depend* family_type d_age m_age has_spouse

	* Birth cohorts 1978 to 1991 only
	keep if inrange(fyob, 1978, 1991)

	* Income components that are not needed
	drop *r_wages* *r_private* *r_disposable*
	
* TABLE 1: Family Characteristics in Sample and Population
* Builds the matrix bmarking (family-type shares, median parent ages at birth,
* family-size shares and mean, then three sample counts) and exports it.

	* Flag for the base cohorts; the data were already cut to fyob 1978-1991 earlier,
	* so this flag is 1 for every remaining observation.
	gen byte basecohorts=(fyob>=1978 & fyob<=1991)
	
	** Family structure **

	* One dummy per family type; their means over linked children are the shares.
	tabulate family_type if p1_int_id!=. & basecohorts==1, gen(family_type) 
	tabstat family_type? if p1_int_id!=. & basecohorts==1, stat(mean) format(%9.1g) save
		* r(StatTotal) is read after the drop, so this relies on drop leaving r() intact.
		drop family_type?
		matrix fam_type=r(StatTotal)'
	
	** Age at birth **

	tabstat d_age m_age if p1_int_id!=. & basecohorts==1, stat(median) format(%9.3g) save
		* Removes every variable with an m_ or d_ prefix, including the two just summarised.
		drop m_* d_* 
		matrix p_age=r(StatTotal)'
	
	** Family size **

	* num_kids followed by one character: presumably family-size dummies - confirm in derived_variables.
	tabstat num_kids? if p1_int_id!=. & basecohorts==1, stat(mean) format(%9.1g) save
		drop num_kids?
		matrix fam_size=r(StatTotal)'
	
	tabstat num_kids if p1_int_id!=. & basecohorts==1, stat(mean) format(%9.1g) save
		matrix fam_size_mean=r(StatTotal)'
		
	** Form matrix, add sample sizes **
	
	matrix bmarking = fam_type \ p_age \ fam_size \ fam_size_mean  
	
	* Row 1: all children in memory; row 2: children linked to a primary parent.
	di _N
	count if p1_int_id!=. & basecohorts==1
		matrix sample_sizes = _N \ r(N)
	
	* Row 3: number of distinct primary parents (one tagged observation per p1_int_id).
	egen tmp_fam_tag=tag(p1_int_id)
	count if tmp_fam_tag==1	& basecohorts==1
		matrix sample_sizes = sample_sizes \ r(N)
		matrix rownames sample_sizes = Numkids Numlinkedkids Numfamilies
		
	matrix bmarking = bmarking \ sample_sizes	
		
	** List and output the matrix **

	matrix list bmarking, format(%9.1gc)

	* outtable is a user-written command; it must be installed for this line to run.
	outtable using "${${location}_path}/5. Output/Tables/T1 - Bmarking", replace mat(bmarking) format(%9.1gc)
	
* Drop expats		
* Reports the share of observations flagged as expats, records it in the
* in-text statements file, then removes those observations.
	
	* r(N) from this count is reused by the display and the file write below.
	count if expat==1
	di "Proportion expats: `=100*r(N)/_N'"
	
	* Append a numbered statement and advance the statement counter.
	file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
	file write intext "Statement ${statement_num}: The proportion of the sample dropped because they are possibly expats is: " (100*r(N)/_N) " per cent" _n(2)
	global statement_num=${statement_num}+1
	file close intext
	
	* Observations with a missing expat flag are dropped too, since only 0 is kept.
	keep if expat==0 

* Benchmark window for parent income: first year, last year, number of years

	global p_syear 1991
	global p_eyear 2001
	global p_span = ${p_eyear} - ${p_syear} + 1

* Household income: yearly row totals for the parents and for the child plus spouse
* (the missing option leaves the total missing when every component is missing)

	order _all, sequential

	forvalues yr = 1991/2015 {
		egen p_h_total`yr' = rowtotal(p?_r_total`yr'), missing
		egen c_h_total`yr' = rowtotal(c_r_total`yr' c_s_r_total`yr'), missing
	}

* Parent income and ranks
* Averages each parent income over the benchmark window and produces separate
* rank variables for mothers and fathers.

	* Sequential ordering makes the year ranges used in rowtotal below contiguous.
	order _all, sequential
	* Placeholders; presumably the inputs read by the ranks program - confirm there.
	gen p_inc=.
	gen c_inc=.
	
	* Mum and dad income and ranks
	
	* Window total divided by the number of window years (p_span).
	egen p1_life_total=rowtotal(p1_r_total${p_syear}-p1_r_total${p_eyear}), missing
	gen p1_inc=p1_life_total/$p_span
	
	egen p2_life_total=rowtotal(p2_r_total${p_syear}-p2_r_total${p_eyear}), missing
	gen p2_inc=p2_life_total/$p_span
	
	* Each pass loads p_inc with parent 1 income where parent 1 has the given sex
	* code and parent 2 income otherwise (this assumes parent 2 is the other sex -
	* confirm), ranks it, and stores the result under a sex-specific name.
	quietly forvalues parentsex=1/2 {		
		replace p_inc=p1_inc 	if p1_sex==`parentsex'
		replace p_inc=p2_inc 	if p1_sex!=`parentsex'
		
		* ranks is defined outside this view; it is expected to create p_rank.
		ranks, parents(1) children(0)
		
		rename p_rank psex`parentsex'_rank
	}
	
	* Sex code 1 is treated as female elsewhere in this file, so pass 1 is the mother.
	rename psex1_rank mum_rank
	rename psex2_rank dad_rank
	
	* Individual yearly totals are no longer needed; household totals are kept.
	drop *r_total*
	
	* Parent household income ranks
	* Averages parent household income over the benchmark window, ranks it with
	* the external ranks program (expected to create p_rank) and derives the
	* squared rank plus bottom- and top-decile indicators.
	
	egen p_h_life_total=rowtotal(p_h_total${p_syear}-p_h_total${p_eyear}), missing
	replace p_inc=p_h_life_total/$p_span

	ranks, parents(1) children(0)
	gen p_rank2=p_rank^2
	* Stata treats missing as larger than any number, so an unguarded
	* comparison would put every missing rank in the top decile. The
	* indicators are left missing when the rank itself is missing.
	gen byte p_p10=(p_rank<=10) if !missing(p_rank)
	gen byte p_p90=(p_rank>90) if !missing(p_rank)
					
* Child income, ranks at 16, 18, 20, 22, 24, 26, 28, 30 and p10/20 and p80/90 outcomes
* For each age, c_inc is loaded with household income from the year in which
* each cohort reaches that age, ranked with the external ranks program
* (expected to create c_rank), and the rank is stored under an age-specific name.

	forvalues age=16(2)30 {
		forvalues cohort=1978(1)1996 {
			local year=`cohort'+`age'		
			* Income variables only exist for 1991 to 2015
			if `year'<1991 | `year'>2015 {
				continue
			}
			replace c_inc=c_h_total`year' if fyob==`cohort'		
		}	

		ranks, parents(0) children(1)
		rename c_rank c`age'_rank
		* Keep the income level at 24 for later descriptive use
		if `age'==24 {
			gen c24_inc=c_inc
		}
		* Reset before the next age so no income carries over
		replace c_inc=.
	}
	
	* Percentile indicators at age 24. Stata treats missing as larger than any
	* number, so an unguarded comparison would code a missing rank as being in
	* the top 20 and top 10 per cent (and as not being in the bottom groups).
	* The indicators are left missing whenever the rank is missing.
	gen byte c_p20=(c24_rank<=20) if !missing(c24_rank)
	gen byte c_p80=(c24_rank>80) if !missing(c24_rank)
	
	gen byte c_p10=(c24_rank<=10) if !missing(c24_rank)
	gen byte c_p90=(c24_rank>90) if !missing(c24_rank)

* University indicator at age 24 and number of dependent children in 2015

	gen byte c24_uni=.

	* The indicator is 1 when the hdo amount in the age-24 year is positive and
	* not missing; only years 2000 to 2015 are used
	forvalues cohort = 1978/1996 {
		local year24 = `cohort' + 24
		if inrange(`year24', 2000, 2015) {
			replace c24_uni=(c_r_hdo`year24'>0 & c_r_hdo`year24'!=.) if fyob==`cohort'
		}
	}

	* Log checks on the dependent-children variables
	summarize c_r_depend_child2015, detail
	summarize c_r_depend_child20??

	gen byte c_kids = c_r_depend_child2015
	
* Region of the tax return lodged in the year the child turns 24

	gen income_year = fyob + 24

	merge 1:1 int_id income_year using "${${location}_path}/1. Data (MASTER)/Return_locations", keepusing(SA4_id) keep(master match) nogenerate
	drop income_year

	rename SA4_id kidlocationat24_id

* Capital city indicator at age 24 (left missing when the GCC code is empty)

	do SA4_to_GCC_kids

	gen c24_urban = (substr(kidlocationat24_GCCid,2,1)=="G" | substr(kidlocationat24_GCCid,1,1)=="8") if kidlocationat24_GCCid!=""

* Log summaries of the non-income outcomes, by cohort and overall

foreach v in c24_uni has_spouse c_kids {
	tabstat `v', by(fyob)
}

foreach v in c24_uni has_spouse c_kids {
	summarize `v'
}
	
* TABLE 2: Comparison of Data with Chetty and Hendren (2018a)
* Works on a preserved copy: estimates a region-specific rank-rank line, takes
* the fitted child rank at parent rank 25 for each origin region, and tabulates
* its distribution together with region sizes. The data in memory are restored
* afterwards and saved as mobility_causal.
	
	preserve 
	
		* Attach the origin region of the primary parent.
		merge m:1 p1_int_id fyob using "${${location}_path}/1. Data (MASTER)/parents_moves1", keep(master match) keepusing(orig_mapid) nogen
		
		* Child income is averaged over 2011 to 2015 for this comparison.
		global c_syear=2011
		global c_eyear=2015
		global c_span=(${c_eyear}-${c_syear}+1)
		
		egen c_h_life_total=rowtotal(c_h_total${c_syear}-c_h_total${c_eyear}), missing
		replace c_inc=c_h_life_total/$c_span
		
		* ranks is defined outside this view; it is expected to create c_rank.
		ranks, parents(0) children(1)
		
		* Restrict to the 1978-1982 cohorts with a linked primary parent.
		keep if fyob>=1978 & fyob<=1982
		keep if p1_int_id!=.
		
		* Region-specific intercepts and slopes in parent rank.
		reg c_rank i.orig_mapid##c.p_rank, 
		* Overwrite parent rank with 25 so the prediction is the fitted rank at p25.
		drop p_rank
		gen p_rank=25
		
		predict AM, xb
		* One row per region, carrying the region size N.
		bysort orig_mapid: gen N=_N
		bysort orig_mapid: keep if _n==1
		
		keep orig_mapid AM N
		
		* Unweighted, then weighted by region size, then the sizes themselves.
		tabstat AM, stat(mean p10 p50 p90) format(%9.2g) save
			matrix AM=r(StatTotal)
	
		tabstat AM [fw=N], stat(mean p10 p50 p90) format(%9.2g) save
			matrix AM_w=r(StatTotal)
	
		tabstat N, stat(mean p10 p50 p90) format(%9.2g) save
			matrix N=r(StatTotal)
			
		matrix samples = AM \ AM_w \ N
		matrix list samples, format(%9.3gc)
	
		outtable using "${${location}_path}/5. Output/Tables/T2 - Samples", replace mat(samples) format(%9.1gc)
		
		* r(min) from this summarize feeds the in-text statement below.
		sum N
		
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: The smallest region size is: " (r(min)) " children" _n(2)
		global statement_num=${statement_num}+1
		file close intext
	
	restore
		
* Saved into the current working directory.
save "mobility_causal", replace

* Parent ranks: one record per primary parent and birth cohort, with the number
* of children and the number of girls in that parent-cohort cell

	gen byte female = (sex==1)

	keep p1_int_id fyob female p_rank mum_rank dad_rank

	sort p1_int_id fyob
	by p1_int_id fyob: gen byte p_kids = _N
	by p1_int_id fyob: egen byte p_girls = total(female)
	by p1_int_id fyob: drop if _n>1

save "parent_ranks", replace
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	1B. Add primary parent movements to outcome file								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

use "mobility_causal", clear

merge m:1 p1_int_id fyob using "${${location}_path}/1. Data (MASTER)/parents_moves1", keep(master match) nogenerate

* Number of moves: 0, 1 or 2 depending on which destination ids are present.
* Left missing when there is no origin, or a second destination without a first.

gen byte moves = cond(dest_mapid==., 0, cond(dest2_mapid==., 1, 2)) if orig_mapid!=. & !(dest_mapid==. & dest2_mapid!=.)

* Years spent in each location (inclusive of both end points)

	gen int orig_duration = orig_end_year - orig_start_year + 1
	gen int dest_duration2 = dest_end_year - dest_start_year + 1
	gen int dest2_duration3 = dest2_end_year - dest2_start_year + 1

* Child age when each destination spell starts

	gen int age_at_move = dest_start_year - fyob
	gen int age_at_move2 = dest2_start_year - fyob

* Years between leaving one location and appearing in the next;
* a move is flagged as precise when that gap is exactly one year

	gen int gap = dest_start_year - orig_end_year
	gen int gap2 = dest2_start_year - dest_end_year

	gen byte precise = (gap==1)
	gen byte precise2 = (gap2==1)

* Remove every variable whose name contains total

drop *total*

* Sample sizes by number of moves

table moves

save "mobility_causal", replace
	
*------------------------------------------------------------------------------*
* 	2. PERMANENT RESIDENTS AND PEERS											   
*------------------------------------------------------------------------------*

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	2A. Predicted outcomes for permanent residents								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear 
use "mobility_causal"

set matsize 10000

	* Keep non-movers and one-time movers, and only the variables needed here
	keep if moves==0|moves==1
	keep int_id sex p1_int_id orig_mapid fyob p_rank* c24_rank c26_rank c28_rank c30_rank c_p?0 c*uni c*kids has_spouse c*urban kidlocationat24_id moves
	
*** Originally looped over a few options ***

	* Permanent residents only
	keep if moves==0
		
	* Region-cohort sizes, ranks and placeholders for predicted values
	* (computed before the dummy rows are added, so they reflect real children only)

		bysort orig_mapid fyob: gen region_cohort_size=_N
		bysort orig_mapid fyob: egen peer_rank=mean(p_rank)
		
		foreach outcome in 24 24sex1 24sex2 26 28 30 p20 p80 p10 p90 uni24 kids spouse urban {
			gen y`outcome'_cs=.
			gen y`outcome'_cs_se=.
		}	

	* Prediction set (dummy observations, 100 for each region-cohort)

		bysort orig_mapid fyob: gen tmp_tag=(_n==1)

		* expand 101 keeps the tagged row and adds 100 copies flagged prediction==1
		expand 101 if tmp_tag==1, gen(prediction)
			replace int_id=. if prediction==1
			replace p1_int_id=. if prediction==1
			
		* The dummy rows are copies of a real child, so every outcome used as a
		* dependent variable must be blanked on them; otherwise the copies enter
		* the estimation sample with invented parent ranks. The original list
		* covered only the rank and percentile outcomes; the university, kids,
		* spouse and urban outcomes are added here.
		foreach var of varlist c*_rank c_p20 c_p80 c_p10 c_p90 c24_uni c_kids has_spouse c24_urban {
			replace `var'=. if prediction==1
		}	
			
		* Dummy rows sort first within each region-cohort and get parent ranks 1 to 100
		gsort orig_mapid fyob -prediction
		by orig_mapid fyob: replace p_rank=_n if prediction==1
		
	* Regressions and peer rank calculation
	* Every outcome is regressed on region-by-cohort intercepts and
	* region-by-cohort slopes in parent rank. Fitted values and their standard
	* errors are then copied onto the dummy prediction rows.
	
		* Mean rank predictions (blanked for cohorts too young to be observed at that age)
	
		foreach a of numlist 24 26 28 30 {
			reg c`a'_rank i.orig_mapid#i.fyob i.orig_mapid#i.fyob#c.p_rank
			estimates store reg`a'
			predict tmp_fit if prediction==1
				replace y`a'_cs=tmp_fit if prediction==1
				replace y`a'_cs=. if fyob>2015-`a'
			predict tmp_fit_se if prediction==1, stdp
				replace y`a'_cs_se=tmp_fit_se if prediction==1
				replace y`a'_cs_se=. if fyob>2015-`a'
			drop tmp*
		}
		
		* Gender specific predictions
		
		foreach s of numlist 1 2 {
			reg c24_rank i.orig_mapid#i.fyob i.orig_mapid#i.fyob#c.p_rank if sex==`s'
			estimates store reg24sex`s'
			predict tmp_fit if prediction==1
				replace y24sex`s'_cs=tmp_fit if prediction==1
			predict tmp_fit_se if prediction==1, stdp
				replace y24sex`s'_cs_se=tmp_fit_se if prediction==1
			drop tmp*
		}
		
		* Percentile predictions (run quietly)
		
		quietly foreach q in p20 p80 p10 p90 {
			reg c_`q' i.orig_mapid#i.fyob i.orig_mapid#i.fyob#c.p_rank
			estimates store reg`q'
			predict tmp_fit if prediction==1
				replace y`q'_cs=tmp_fit if prediction==1
			predict tmp_fit_se if prediction==1, stdp
				replace y`q'_cs_se=tmp_fit_se if prediction==1
			drop tmp*
		}
		
		* Probability went to uni / number of children / has spouse / urban
		* The two lists are parallel: dependent variable and the label used in
		* the stored estimate and placeholder names
		
		local depvars c24_uni c_kids has_spouse c24_urban
		local labels uni24 kids spouse urban
		
		forvalues k = 1/4 {
			local dep : word `k' of `depvars'
			local lab : word `k' of `labels'
			reg `dep' i.orig_mapid#i.fyob i.orig_mapid#i.fyob#c.p_rank
			estimates store reg`lab'
			predict tmp_fit if prediction==1
				replace y`lab'_cs=tmp_fit if prediction==1
			predict tmp_fit_se if prediction==1, stdp
				replace y`lab'_cs_se=tmp_fit_se if prediction==1
			drop tmp*
		}
		
	* Reduce to the dummy prediction rows and publish every prediction twice,
	* once with an orig prefix and once with a dest prefix, so the same file can
	* be merged on either the origin or the destination region

		drop if prediction!=1
		keep orig_mapid fyob p_rank y*cs y*cs_se peer_rank region_cohort_size

		gen dest_mapid = orig_mapid

		local outcome_list 24 24sex1 24sex2 26 28 30 p20 p80 p10 p90 uni24 kids spouse urban

		foreach side in orig dest {
			foreach o of local outcome_list {
				gen `side'_outcomes`o' = y`o'_cs
				gen `side'_outcomes`o'_se = y`o'_cs_se
			}
			gen `side'_peer_rank = peer_rank
			gen `side'_size = region_cohort_size
		}

		keep orig* dest* fyob p_rank peer_rank region_cohort_size

save "predicted_outcomes", replace
		
* Save the matrices
* For every stored regression, write its variance matrix, scaled by the ratio
* of residual degrees of freedom to observations, to a dataset named V plus
* the regression label

clear

local stored_models 24 24sex1 24sex2 26 28 30 p20 p80 p10 p90 uni24 kids spouse urban

foreach m of local stored_models {
	clear
	estimates restore reg`m'
	matrix V = (e(df_r)/e(N))*e(V)
	svmat double V, names(V)
	save "V`m'", replace
}

		
* Attach the predicted outcomes to the master file and form destination minus
* origin differences

use "mobility_causal", clear

	* Matched on region, cohort and parent rank; once for the origin, once for the destination

	foreach side in orig dest {
		merge m:1 `side'_mapid fyob p_rank using "predicted_outcomes", keepusing(`side'_outcomes* `side'_size `side'_peer_rank) keep(master match) nogenerate
	}

	* Difference in predictions, with a standard error that combines the two
	* prediction standard errors as the root of the sum of squares

	local outcome_list 24 24sex1 24sex2 26 28 30 p20 p80 p10 p90 uni24 kids spouse urban

	foreach o of local outcome_list {
		gen Delta_odps`o' = dest_outcomes`o' - orig_outcomes`o'
		gen Delta_odps`o'_se = sqrt(dest_outcomes`o'_se^2 + orig_outcomes`o'_se^2)
	}

	gen Delta_peer_rank = dest_peer_rank - orig_peer_rank
	gen Delta_size = dest_size - orig_size

save "mobility_causal", replace

*------------------------------------------------------------------------------*
* 	3. DESCRIPTIVE STATISTICS							   
*------------------------------------------------------------------------------*	

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	3A. Test of the linear rank-rank specification								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear
use "mobility_causal"

* APPENDIX FIGURE A1 - Rank-rank relationships
* Plots mean child rank at 24 by parent income decile for the 20 largest origin
* regions against the national pattern, using the 1978 cohort of permanent residents.

	keep c24_rank p_rank fyob moves orig_mapid

	* Decile 1 covers parent ranks 1 to 10, and so on.
	gen p_decile=ceil(p_rank/10)
	
	drop if p_decile==.
	
	keep if fyob==1978 & moves==0
		
	* Region size, cell size and cell mean for each decile within region.
	bysort orig_mapid: gen total_pop=_N	
	bysort p_decile orig_mapid: gen pop=_N
	bysort p_decile orig_mapid: egen mean_c24_rank=mean(c24_rank)
	
	* National counterparts, computed over the same restricted sample.
	bysort p_decile: gen nat_pop=_N
	bysort p_decile: egen nat_mean_c24_rank=mean(c24_rank)
	
	* One row per decile-region cell.
	bysort p_decile orig_mapid: keep if _n==1
	
	* Largest regions first; order numbers blocks of 10 rows, so this numbering
	* matches regions only if every region has all 10 deciles present.
	gsort -total_pop orig_mapid p_decile
	gen order=ceil(_n/10)
	
	keep if order<=20
	
	* Report the smallest cell behind any plotted point.
	sum pop
	di "Minimum observation number is `=`r(min)''"
	list orig_mapid p_decile if pop==`r(min)'
		
		#delimit ;
			graph twoway
				(scatter nat_mean_c24_rank p_decile, msym(O) mcolor(ebblue*0.5))
					(lfit nat_mean_c24_rank p_decile, lcolor(ebblue*0.5) lpattern(dash))
				(scatter mean_c24_rank p_decile, msym(O) mcolor(ebblue))
					(lfit mean_c24_rank p_decile, lcolor(ebblue))
			,
			by(orig_mapid, note(""))
			xtitle("Parent income decile")
			ytitle("Expected rank")
			xlabel(,grid)
			ylabel(,grid)
			legend(order(1 "National" 3 "Regional"))
			;
		#delimit cr
		graph export "${${location}_path}/5. Output/Charts/FA1_Linear_spec.png", replace

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	3B. How important is place anyway?  Back of the envelope...							   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*	
	
clear
use "predicted_outcomes"

	* Distribution across region-cohorts of the predicted rank at 24 for a
	* child whose parents are at rank 25

	keep if p_rank==25
	keep orig_outcomes24
	
	sum orig_outcomes24, detail
	
	* Kept in globals because the data are replaced below
	global p10=r(p10)
	global p50=r(p50)
	
	* Each line written below carries its own Statement label, so the counter
	* is advanced after every line. Without this all four lines share one
	* number and the next statement in the file reuses it.
	file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
	file write intext "Statement ${statement_num}: The p10 region-cohort for kids born into p25 had expected percentile rank of: " (round(r(p10),0.01)) _n(2)
	global statement_num=${statement_num}+1
	file write intext "Statement ${statement_num}: The p50 region-cohort for kids born into p25 had expected percentile rank of: " (round(r(p50),0.01)) _n(2)
	global statement_num=${statement_num}+1
	file close intext
	
clear
use "mobility_causal"

	* Translate the two ranks into income levels using the 1991 cohort at age 24

	keep if fyob==1991
	keep c24_inc c24_rank
	
	keep if c24_rank==round(${p10})|c24_rank==round(${p50})
	
	sum c24_inc 	if c24_rank==round(${p10})
	
	* A leading space is added to the trailing text so it no longer runs into the number
	file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
	file write intext "Statement ${statement_num}: And this translates into mean income of: " (round(r(mean),1)) " for the first" _n(2)
	global statement_num=${statement_num}+1
	file close intext

	sum c24_inc 	if c24_rank==round(${p50})
	
	file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
	file write intext "Statement ${statement_num}: And: " (round(r(mean),1)) " for the second" _n(2)
	global statement_num=${statement_num}+1
	file close intext
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	3C. Variables used elsewhere								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear 
use "mobility_causal"

	* Analysis samples for the first and second move: a precisely dated move,
	* more than one year in both locations, and a move of at least 15 (units
	* as defined in the moves file).
	* NOTE(review): dest_duration and dest2_duration are not created in this
	* file; the variables generated earlier are dest_duration2 and
	* dest2_duration3. These references presumably resolve through variable
	* abbreviation, or to variables supplied by the moves file - confirm.
	gen byte move_smpl=(precise==1 & orig_duration>1 & dest_duration>1 & move_distance>=15 & move_distance!=.)
	gen byte move_smpl2=(precise2==1 & dest_duration2>1 & dest2_duration>1 & move_distance2>=15 & move_distance2!=.)

	* Early cut point

	global early_cutpoint=11
	 
	* Age covariates
	* Stata treats missing as larger than any number. The comparisons below are
	* guarded so that children without a move age (for example permanent
	* residents) keep missing values instead of being coded as older movers
	* with the maximum late exposure.
	
	gen byte older=(age_at_move>24) if !missing(age_at_move)
	
	* Years past age 25 at the move, floored at zero
	gen post_exposure=(age_at_move-25)
		replace post_exposure=0 if post_exposure<0
			
	* Years of exposure between the early cut point and 25, bounded to that range
	gen late_exposure=25-age_at_move
		replace late_exposure=25-${early_cutpoint} if late_exposure>25-${early_cutpoint} & !missing(late_exposure)
		replace late_exposure=0 if late_exposure<0
		
	* Years of exposure before the early cut point, floored at zero
	gen early_exposure=(${early_cutpoint}-age_at_move)
		replace early_exposure=0 if early_exposure<0
	
	* Age covariates for kink at 15
	
	gen late_exposure15=25-age_at_move
		replace late_exposure15=25-15 if late_exposure15>25-15 & !missing(late_exposure15)
		replace late_exposure15=0 if late_exposure15<0
				
	gen early_exposure15=(15-age_at_move)
		replace early_exposure15=0 if early_exposure15<0
	
	* Parent decile

	gen byte couple=(family_type<3)
	gen p_decile=ceil(p_rank/10)
	
	* Family fixed effects and sizes
	
	egen long family_sex_groups=group(p1_int_id sex)
	
	bysort p1_int_id: gen fam_size=_N
	table fam_size
	
	* Locations
	* Two missing ids compare as equal, so an unknown child location is
	* excluded explicitly; it is reported through the unknown flag instead.
	
	gen byte inorig=(kidlocationat24_id==orig_mapid & kidlocationat24_id!=.)
	gen byte indest=(kidlocationat24_id==dest_mapid & kidlocationat24_id!=.)
	gen byte elsewhere=(kidlocationat24_id!=. & inorig==0 & indest==0)
	gen byte unknown=(kidlocationat24_id==.)
	
	* This should drop nothing at ATO 
	
	table age_at_move
	keep if age_at_move<=35 | age_at_move==.

save "mobility_causal", replace
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	3D. Permanent residents versus movers 		   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
	
clear 
use "mobility_causal"

	* Children whose parent has no classified move history are excluded.
	drop if moves==.

* APPENDIX TABLE B1: Summary statistics for permanent and moving residents
	* Parent outcomes, size, type, child outcomes

	* Permanent residents first, then movers in the analysis sample; the two
	* transposed result matrices are placed side by side.
	tabstat p_inc p_rank couple num_kids c24_inc c24_rank if moves==0, statistics(mean sd median) columns(statistics) save
		matrix define perm=r(StatTotal)'
		
	tabstat p_inc p_rank couple num_kids c24_inc c24_rank if moves==1 & move_smpl==1, statistics(mean sd median) columns(statistics) save
		matrix define mover=r(StatTotal)'	
		
	matrix perm_movers = perm, mover
	matrix list perm_movers

	outtable using "${${location}_path}/5. Output/Tables/TB1 - Perm_v_movers", replace mat(perm_movers) format(%9.3gc)
	
	* Both counts are written under one statement number with suffixes A and B.
	file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		count if moves==0
		file write intext "Statement ${statement_num}A: The number of permanent residents is " (r(N)) "." _n(1)
		count if moves==1 & move_smpl==1
		file write intext "Statement ${statement_num}B: And the number of movers is " (r(N)) "." _n(2)
		global statement_num=${statement_num}+1
	file close intext
	
* APPENDIX TABLE B2: summary statistics for child regions, origins, destinations and the difference
	* Child outcomes, peer ranks and peer size 

	* Temporary rename so the Delta variables follow the same naming pattern
	* as the orig and dest variables inside the loop; undone after the loop.
	rename Delta_odps24 Delta_outcomes24

	* All children with a classified move history.
	tabstat orig_outcomes24 orig_peer_rank orig_size, statistics(mean sd median) columns(statistics) save	
		matrix define all=r(StatTotal)'
		
	* Movers in the analysis sample: destination, origin and their difference.
	foreach place in dest orig Delta {
		tabstat `place'_outcomes24 `place'_peer_rank `place'_size if moves==1 & move_smpl==1, statistics(mean sd median) columns(statistics) save
			matrix define `place'=r(StatTotal)'
	}

	rename Delta_outcomes24 Delta_odps24

	matrix places = all, orig, dest, Delta

	* NOTE(review): the first file name starts with TB1 although the heading
	* above says Table B2 - confirm the intended name.
	outtable using "${${location}_path}/5. Output/Tables/TB1 - Place", replace mat(places) format(%9.3gc)
	outtable using "${${location}_path}/5. Output/Tables/TB2 - Orig_dest_diffs", replace mat(Delta) format(%9.3gc)

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	3E. Features by age at move	   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*	

use "mobility_causal", clear

	* One-time movers from the 1991 cohort

	keep if moves==1 & fyob==1991

	* Cell statistics by age at move: mean parent rank, per cent couple
	* families, and number of children moving

	sort age_at_move
	by age_at_move: egen tmp_mean_p_rank = mean(p_rank)
	by age_at_move: egen tmp_mean_couple = mean(couple)
	replace tmp_mean_couple = tmp_mean_couple*100
	by age_at_move: gen tmp_num_movers = _N

	* One plotted point per age at move

	egen tmp_tag = tag(age_at_move)

* APPENDIX FIGURE A3: Family characteristics by age at move

	graph twoway ///
		(scatter tmp_mean_p_rank age_at_move, msym(O) mcolor(ebblue)) ///
		(scatter tmp_mean_couple age_at_move, msym(X) mcolor(ebblue)) ///
		(scatter tmp_num_movers age_at_move, msym(Oh) mcolor(black) yaxis(2)) ///
		if tmp_tag==1 ///
		, ///
		xtitle("Age of child when parents move") ///
		ytitle("Rank / %", axis(1)) ///
		ytitle("", axis(2)) ///
		xlabel(,grid) ///
		ylabel(,grid) ///
		legend(label(1 "Mean parental rank (LHS)") label(2 "Per cent couple families (LHS)") label(3 "Number of children moving (RHS)") )

	graph export "${${location}_path}/5. Output/Charts/FA3_Differences_by_age_at_move1991.png", replace
			
*------------------------------------------------------------------------------*
* 	4. ANALYSIS - EXPOSURE TO PLACE EFFECTS									   
*------------------------------------------------------------------------------*	

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	4A. Precision								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear 
use "mobility_causal"

	* One-time movers only; this dataset stays in memory for the regressions that follow.
	keep if moves==1
	
	* Precision 

	* Mean absolute origin-destination difference in the analysis sample,
	* used as the reference line in the figure below.
	gen tmp_abs_Delta_odps24=abs(Delta_odps24)
	sum tmp_abs_Delta_odps24 if move_smpl==1, detail
	global mean_Delta=r(mean)
	drop tmp*
		
	* Horizontal axis for the percentile plot, stored in the first 100 rows.
	gen x_rank=_n if _n<101
	
	* Percentiles of the standard error of the difference, also stored in the leading rows.
	pctile y_se=Delta_odps24_se if move_smpl==1, nq(100)
	
	pctile yp80_se=Delta_odpsp80_se if move_smpl==1, nq(100)
	pctile yp20_se=Delta_odpsp20_se if move_smpl==1, nq(100)
	
	* Log-only checks of the size of the p80 and p20 differences.
	gen tmp_abs_Delta_odpsp80=abs(Delta_odpsp80)
	sum tmp_abs_Delta_odpsp80 if move_smpl==1, detail
	
	gen tmp_abs_Delta_odpsp20=abs(Delta_odpsp20)
	sum tmp_abs_Delta_odpsp20 if move_smpl==1, detail
	
	drop tmp*
	
* APPENDIX FIGURE D1: Precision of Delta term
	
	#delimit ;
		graph twoway
			(scatter y_se x_rank, mcolor(ebblue))
			,
			ytitle("Percentile rank points")
			xtitle("Percentile")
			xlabel(,grid)
			yline(${mean_Delta}, lpattern(dash))
			ylabel(,grid)
		;
	#delimit cr	
	
	graph export "${${location}_path}/5. Output/Charts/FD1_Distribution_delta_se.png", replace
	
	* Share of one-time movers that are in the analysis sample, in per cent.
	count if move_smpl==1
	di `=r(N)/_N*100'
	
* Some sample sizes relating to the coefficients
* The standard error threshold is 2 times the location-specific precision multiplier.

table age_at_move if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}	

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	4B. Baseline regression I  								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

* Drop existing matrices

matrix drop _all

* Define regression covariates
* One global per outcome suffix; the empty suffix yields the generic global
* baselineI_covars, which refers to the generic variables Delta_odps and
* orig_outcomes created below.
	
	foreach outcome in "24" "" "uni24" "kids" "spouse" "urban" {
		#delimit ;
		global baselineI_covars`outcome'="ibn.age_at_move
			i.age_at_move#c.p_rank
			ibn.fyob
			i.age_at_move#c.Delta_odps`outcome'
			i.fyob#c.orig_outcomes`outcome'
			i.fyob#c.Delta_odps`outcome'"
			;
		#delimit cr
	}
	
* Regression 
* Generic copies of the age-24 variables, so the generic covariate list can be used.
	
	gen orig_outcomes=.
	gen Delta_odps=.

	replace orig_outcomes=orig_outcomes24
	replace Delta_odps=Delta_odps24
	
* Baseline regression I
* Movers in the analysis sample whose Delta standard error is below the threshold.
* MT_errors is defined outside this view; presumably it adjusts the standard
* errors using the saved matrix named in vmodel - confirm. Its failure is
* tolerated because of capture.

reg c24_rank $baselineI_covars if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons   
capture noisily MT_errors, model(base_I) vmodel(V24) modelcovars($baselineI_covars) delta(Delta_odps) orig(orig_outcomes)

* Storing the estimates also creates the sample marker _est_Exp_baselineI used below.
estimates store Exp_baselineI

	* Convert coefficients to variables
	* coef2var is defined outside this view; the code below relies on it creating
	* baseline_coef, baseline_upper, baseline_lower and baseline_N.
	
	gen int x_age_at_move=_n if _n<40
	quietly coef2var , name(baseline) sample(_est_Exp_baselineI) disruption(0)
	
	* Disruption variables
	
	gen fc_coef=.
	gen fc_se=.
			
	quietly sum age_at_move if _est_Exp_baselineI==1
			
	* From one above the smallest age at move in the estimation sample up to the
	* largest: the linear combination is the age intercept plus the age slope at
	* parent rank 50, plus the 1991 cohort intercept and the 1991 origin slope at
	* origin outcome 50. Results are stored in the row whose number equals the age.
	forvalues i=`=`r(min)'+1'(1)`r(max)' {			
		lincom `i'.age_at_move + 50*`i'.age_at_move#c.p_rank+1991.fyob+50*1991.fyob#c.orig_outcomes
			replace fc_coef=`r(estimate)' if _n==`i'
			replace fc_se=`r(se)' if _n==`i'		
	}
	
	* Bounds at plus and minus 1.96 standard errors.
	gen fc_upper=fc_coef+1.96*fc_se
	gen fc_lower=fc_coef-1.96*fc_se
	
	* Constant and slope of coefficients
	* This regression replaces the active estimates; the stored copy is restored later.
			
	reg baseline_coef x_age_at_move if x_age_at_move>24, cformat(%9.3f)	
	
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: If we fit a line to the estimated regression coefficients b_m for m={25,...,34} we get a slope coefficient of " (_b[x_age_at_move]) " (s.e. " (_se[x_age_at_move]) ")" _n(2)
		global statement_num=${statement_num}+1
		file close intext
	
	* Mean coefficient for moves after age 24, kept in del1 for the net cost section.
	sum baseline_coef if x_age_at_move>24	
	global del1 = r(mean)
	
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: The mean selection effect is " (r(mean)) "." _n(2)
		global statement_num=${statement_num}+1
		file close intext
		
**************
*** CHARTS ***
**************	
* Both charts plot point estimates with spikes between the lower and upper
* bounds against the age of the child at the move, and are saved to the working
* directory as well as exported as png.

* FIGURE 1: Exposure effects
	
	#delimit ;
		graph twoway
			(rspike baseline_upper baseline_lower x_age_at_move, color(gs8))
			(scatter baseline_coef x_age_at_move, mcolor(ebblue))	
			if x_age_at_move<=40
			,
			ytitle("Coefficient on predicted rank in destination")
			xtitle("Age of child when parents move")
			xlabel(,grid)
			ylabel(,grid)
			legend(off)
			saving(Exposure_effects, replace)
		;
	#delimit cr
	graph export "${${location}_path}/5. Output/Charts/F1_Exposure_effects.png", replace
	
* APPENDIX FIGURE A2: Disruption effects	
	
	#delimit ;
		graph twoway
			(rspike fc_upper fc_lower x_age_at_move, color(gs8))
			(scatter fc_coef x_age_at_move, mcolor(ebblue))		
			if x_age_at_move<=40
			,
			ytitle("Disruption cost of moving (percentile rank points)")
			xtitle("Age of child when parents move")
			xlabel(,grid)
			ylabel(,grid)
			legend(off)
			saving(Disruption_effects, replace)
		;
	#delimit cr
	graph export "${${location}_path}/5. Output/Charts/FA2_Disruption_effects.png", replace	
	
	* Fill in the cell count for age 34 by hand; baseline_N comes from coef2var.
	* NOTE(review): why age 34 needs a manual count is not visible here - confirm.
	count if age_at_move==34 & move_smpl==1 & Delta_odps24_se<2*${${location}_pm}
	replace baseline_N=r(N) 	if x_age_at_move==34
	
**************************
*** NET COST OF MOVING ***
**************************
* Combines a disruption component and a location component, both computed from
* the baseline I coefficients, into net_move_cost and reports summary figures.
	
	* Bring the baseline I estimates back as the active results.
	estimates restore Exp_baselineI
	
	* Disruption costs (selection effects first, then individual effect)
		
		gen dis_cost=0
		gen dis_selection=0 
		
		* Selection term: the age intercept plus slope times parent rank,
		* averaged over the ten ages 25 to 34.
		quietly forvalues age=25(1)34 {
			replace dis_selection=dis_selection+(_b[`age'.age_at_move]+_b[`age'.age_at_move#c.p_rank]*p_rank)
		}
		
		replace dis_selection=dis_selection/10
		
		* Individual term at the actual age at move, net of the selection term.
		quietly forvalues age=2(1)34 {
			replace dis_cost=(_b[`age'.age_at_move]+_b[`age'.age_at_move#c.p_rank]*p_rank)-dis_selection if age_at_move==`age'
		}
		
	* Location costs (or benefits)
	* The age-specific coefficient on Delta, net of the mean coefficient held in
	* del1, times the size of Delta. Ages above 30 keep the initial value of 0.
	
		gen loc_cost=0
		
		forvalues age=2(1)30 {
			replace loc_cost=(_b[`age'.age_at_move#c.Delta_odps]-${del1})*Delta_odps if age_at_move==`age'
		}

	* Summarize
	
		gen net_move_cost=loc_cost+dis_cost

		* Regression sample: mean net cost and the share with a positive value.
		sum net_move_cost if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}
			local tmp_movers=r(N)
		
			file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
			file write intext "Statement ${statement_num}: Net move cost is " (r(mean)) " percentile rank points." _n(1)
			
			count if net_move_cost>0 & move_smpl==1 & Delta_odps24_se<2*${${location}_pm}
			file write intext _tab "and only "(100*r(N)/`tmp_movers') " per cent are positive." _n(2)
			
			global statement_num=${statement_num}+1
			file close intext
		
		* Same figures for moves to a destination with a higher predicted outcome.
		sum net_move_cost if move_smpl==1 & Delta_odps24_se<2*${${location}_pm} & Delta_odps24>0
			local tmp_movers=r(N)
			
			file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
			file write intext "Statement ${statement_num}: Net move cost for positive moves is " (r(mean)) " percentile rank points." _n(1)
			
			count if net_move_cost>0 & move_smpl==1 & Delta_odps24_se<2*${${location}_pm} & Delta_odps24>0
			file write intext _tab "and only "(100*r(N)/`tmp_movers') " per cent are positive." _n(2)
			
			global statement_num=${statement_num}+1
			file close intext
		
	* Check for log file what it looks like for full sample	
	* The second line also includes observations where Delta_odps is missing,
	* because a missing value compares as greater than zero.
		
		sum net_move_cost 
		sum net_move_cost if Delta_odps>0
		
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 		4C. Baseline regression II 							   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear 
use "mobility_causal"

	* One-time movers only
	keep if moves==1
	
	* Define covariates	
	* Parametric version of the exposure model: the effect of Delta is linear in
	* early, late and post exposure, with a separate shift for older movers.
	* The empty suffix also defines a generic global, which is not used below.

	foreach outcome in "24" "" {
		#delimit ;
			global baselineII_covars`outcome'="ibn.age_at_move 
				i.age_at_move#c.p_rank
				ibn.fyob 
				i.fyob#c.orig_outcomes`outcome' 
				i.older#c.Delta_odps`outcome'					
				c.early_exposure#c.Delta_odps`outcome'
				c.late_exposure#c.Delta_odps`outcome'
				c.post_exposure#c.Delta_odps`outcome'
				i.fyob#c.Delta_odps`outcome'"
			;
		#delimit cr
	}

	* MT_errors is defined outside this view; its failure is tolerated by capture.
	reg c24_rank $baselineII_covars24 if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons 
	capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
	estimates store Exp_baselineII

	* Test coefficients are equal

	testparm c.early_exposure#c.Delta_odps24 c.late_exposure#c.Delta_odps24, equal
	
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: The p-value of the test that early and late exposure effects are equal is " (r(p)) "." _n(2)
		global statement_num=${statement_num}+1
		file close intext
	
	* Construct my estimate of total exposure effects
	* The model above was fitted with the Delta_odps24 terms and the reloaded
	* data contain no generic Delta_odps variable, so the coefficients are
	* referenced by the names they were estimated under. The weights are the
	* widths of the early window (11 years) and the late window (25 minus 11).
	
	di _b[c.early_exposure#c.Delta_odps24]*11+_b[c.late_exposure#c.Delta_odps24]*14
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 		4D. Precision-based tests							   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*	

capture noisily {	
	
	* Re-estimate baseline II under progressively tighter cut-offs on the
	* standard error of Delta_odps24 (cut-off = multiplier * 0.5 * precision
	* multiplier), collecting the stored model names for the table below
	
	local precision_models ""
	
	foreach mult of numlist 1000 5 4 3 2 {
		capture noisily reg c24_rank $baselineII_covars24 if move_smpl==1 & Delta_odps24_se<`mult'*0.5*${${location}_pm}, nocons 
		capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
		estimates store Exp_pre`mult'
		local precision_models "`precision_models' Exp_pre`mult'"
	}
	
	* APPENDIX TABLE D1
	
	esttab Exp_baselineII `precision_models' using "${${location}_path}/5. Output/Tables/TD1 - Exp_precision", se fixed keep(*early* *late* *post*) b(%9.3f) replace 

}
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 		4E. Model tests								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
	
	* Covariates for the model with a single linear exposure slope
	* (exposure = early_exposure + late_exposure, generated below)
	#delimit ;
			global baseline_linear="ibn.age_at_move
				i.age_at_move#c.p_rank
				ibn.fyob
				i.fyob#c.orig_outcomes24
				i.older#c.Delta_odps24
				c.exposure#c.Delta_odps24
				c.post_exposure#c.Delta_odps24
				i.fyob#c.Delta_odps24"
				;
	#delimit cr
	
	gen exposure=late_exposure+early_exposure 
	
	* Linear exposure model with CH length sample
	* (restricted to moves at age 9 or older)
	* Coefficients are referenced as c.exposure#c.Delta_odps24, the exact name
	* of the term in $baseline_linear; an unsuffixed Delta_odps does not
	* identify a term in this regression.
	
	reg c24_rank $baseline_linear if move_smpl==1 & Delta_odps24_se<2*${${location}_pm} & age_at_move>=9, nocons  
		di _b[c.exposure#c.Delta_odps24]
		di _b[c.exposure#c.Delta_odps24]*25
		di _b[c.exposure#c.Delta_odps24]*20
		
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}A: A linear model with CH length implies a total exposure effects of " (_b[c.exposure#c.Delta_odps24]*25) "." _n(2)
		file write intext "Statement ${statement_num}B: Or " (_b[c.exposure#c.Delta_odps24]*20) " if you take their conservative approach." _n(2)
		global statement_num=${statement_num}+1
		file close intext
	
	* Linear exposure model 
	* (full precisely-estimated movers sample)
	
	reg c24_rank $baseline_linear if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons 
	capture noisily MT_errors, model(base_IIlinear) vmodel(V24) modelcovars($baseline_linear) delta(Delta_odps24) orig(orig_outcomes24)
	estimates store Exp_baseline_linear
	
	* Piecewise linear exposure model 
	* For each candidate kink age 10-16 the early/late exposure variables are
	* rebuilt around that kink and baseline II is re-estimated; preserve/restore
	* puts the original exposure variables back after every iteration.
	
	forvalues kink=10(1)16 {
		preserve
			
			drop early_exposure late_exposure
			
			* Years of exposure between the kink and age 25 (bounded to [0, 25-kink])
			gen late_exposure=25-age_at_move
				replace late_exposure=25-`kink' if late_exposure>25-`kink'
				replace late_exposure=0 if late_exposure<0
				
			* Years of exposure before the kink (floored at 0)
			gen early_exposure=(`kink'-age_at_move)
				replace early_exposure=0 if early_exposure<0
			
			reg c24_rank $baselineII_covars24 if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons 
			capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
			estimates store Exp_baselineII`kink'
			
		restore
	}	
	
	* APPENDIX TABLE B3 - Different parametrisations of exposure effects
	* NOTE(review): Exp_baselineI is not stored in this section; it is assumed
	* to be left in memory by an earlier section -- confirm.
	
	estimates table Exp_baselineI Exp_baseline_linear Exp_baselineII10 Exp_baselineII11 Exp_baselineII12 Exp_baselineII13 Exp_baselineII14 Exp_baselineII15 Exp_baselineII16, stats(r2 ar2 aic bic)
	
	#delimit ;
		esttab Exp_baselineI Exp_baseline_linear Exp_baselineII10 Exp_baselineII11 Exp_baselineII12 Exp_baselineII13 Exp_baselineII14 Exp_baselineII15 Exp_baselineII16 
		using "${${location}_path}/5. Output/Tables/TB3 - Exp_models", 
		r2(%12.7g) ar2(%12.7g) aic(%12.7gc) bic(%12.7gc) obslast b(%9.3f) fixed se keep(*exposure*)  replace 
		;
	#delimit cr

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 		4F. Other outcome variables							   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
	
clear 
use "mobility_causal"

	keep if moves==1
		
	* Give the alternative outcomes a common c_<outcome> naming pattern so the
	* loop below can address them uniformly
	
	rename c24_urban c_urban
	rename c24_uni c_uni24
	rename has_spouse c_spouse
	
	*** Baseline I regression, one pass per outcome ***
	
	foreach y in uni24 kids spouse urban {
		
		* Precision of the predicted-outcome differences (for the log)

		sum Delta_odps`y'_se, detail
		
		* Baseline regression I for this outcome
		
		reg c_`y' ${baselineI_covars`y'} if move_smpl==1, nocons   
		capture noisily MT_errors, model(base_I) vmodel(V`y') modelcovars(${baselineI_covars`y'}) delta(Delta_odps`y') orig(orig_outcomes`y')

		* The stored name is reused, so each pass overwrites the previous one
		estimates store Exp_baselineI

		* Turn the coefficients into plottable variables
	
		capture drop x_age_at_move
		gen int x_age_at_move=_n if _n<40
		quietly coef2var, name(baseline`y') sample(_est_Exp_baselineI) disruption(0)
		
		* Plot the coefficients against age at move and save the panel
		
		graph twoway ///
			(scatter baseline`y'_coef x_age_at_move, mcolor(ebblue)) ///
			if x_age_at_move<=40 & x_age_at_move>-10 ///
			, ///
			ytitle("Coefficient on difference in predicted outcomes") ///
			xtitle("Age of child when parents move") ///
			xlabel(,grid) ///
			ylabel(,grid) ///
			legend(off) ///
			saving(`y', replace)
		graph export "${${location}_path}/5. Output/Charts/F2_Exp_effects_`y'.png", replace	
				
	}
	
	* FIGURE 2 - Exposure effects for other outcomes (four saved panels combined)
	
	graph combine uni24.gph kids.gph spouse.gph urban.gph 
	graph export "${${location}_path}/5. Output/Charts/F2_Exp_effects_other.png", replace	
	
	drop baseline*
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 		4G. SUBGROUPS				   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear 
use "mobility_causal"

	keep if moves==1

* Indicator variables for the subgroups compared in Table 3

	gen byte Male=(sex==2)
	gen byte Female=(sex==1)
	gen byte Poorer=(p_rank<=50)
	gen byte Richer=(p_rank>50)
	gen byte Dest_worse=(Delta_odps24<0)
	gen byte Dest_better=(Delta_odps24>0)

* Shared pieces: estimation sample and the early-vs-late equality test

	local precise "move_smpl==1 & Delta_odps24_se<2*${${location}_pm}"
	local equal_test "c.early_exposure#c.Delta_odps24 c.late_exposure#c.Delta_odps24"

* Full-sample benchmark

	reg c24_rank $baselineII_covars24 if `precise', nocons 
	capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
	estimates store Exp_baselineII
	testparm `equal_test', equal
	estadd scalar p=r(p)
	
	local table3_models "Exp_baselineII"
	
* One regression per subgroup, each with its own equality-test p-value

	foreach grp in Male Female Poorer Richer Dest_worse Dest_better {
		quietly reg c24_rank $baselineII_covars24 if `precise'  & `grp'==1, nocons 
		capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
		estimates store Exp_sub_`grp'	
		
		testparm `equal_test', equal
		estadd scalar p=r(p)
		
		local table3_models "`table3_models' Exp_sub_`grp'"
	}

* TABLE 3 - Heterogeneity in exposure effects	
	
	esttab `table3_models' ///
		using "${${location}_path}/5. Output/Tables/T3 - Exp_sub", ///
		se fixed keep(*early* *late* *post*) b(%9.3f) scalars(p) replace 
	estimates drop _all
	
*** Exploring by quintile
	
* Plot axis: rows 1-100 double as parent-rank positions

gen x_prank=_n	if _n<=100	

* Holders for the point estimate, 95% bounds and N, for each exposure stage

foreach stage in early late {
	foreach suffix in "" _upper _lower _N {
		gen y`stage'_quint`suffix'=.
	}
}

* One regression per parent-rank quintile (bottom, top]; results are written
* to the row at the quintile midpoint (10, 30, ..., 90)
	
forvalues top=20(20)100 {

	local bottom=`top'-20
	local mid=`top'-10

	reg c24_rank $baselineII_covars24 if `precise'  & (p_rank>`bottom' & p_rank<=`top'), nocons 
	capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
	estimates store Exp_quint_`top'
	
	foreach stage in early late {
		local term "c.`stage'_exposure#c.Delta_odps24"
		replace y`stage'_quint=_b[`term']								if x_prank==`mid'
		replace y`stage'_quint_upper=_b[`term']+1.96*_se[`term']		if x_prank==`mid'
		replace y`stage'_quint_lower=_b[`term']-1.96*_se[`term']		if x_prank==`mid'
		replace y`stage'_quint_N=e(N)									if x_prank==`mid'
	}
}		
	
* FIGURE 3 - Heterogeneity by parent income quintile
	
	graph twoway ///
		(rspike ylate_quint_upper ylate_quint_lower x_prank, color(ebblue*0.5)) ///
			(scatter ylate_quint x_prank, mcolor(ebblue)) ///
		(rspike yearly_quint_upper yearly_quint_lower x_prank, color(green*0.5)) ///
			(scatter yearly_quint x_prank, mcolor(green)) ///
		, ///
		ytitle("Exposure effect") ///
		xtitle("Parent rank") ///
		xlabel(,grid) ///
		ylabel(,grid) ///
		legend(off)
	graph export "${${location}_path}/5. Output/Charts/F3_morecont_quint.png", replace	
		
*------------------------------------------------------------------------------*
* 	5. VALIDATION EXERCISES										   
*------------------------------------------------------------------------------*

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	5A. Specification and age at observation exercises						   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

	*** FIXED EFFECT CONTROLS ***

	* One absorbed cell per origin x parent decile x cohort x age at move
	egen long fe_groups=group(orig_mapid p_decile fyob age_at_move)

	* Unsuffixed working copies; these are overwritten in the age loop below

	gen orig_outcomes=orig_outcomes24
	gen Delta_odps=Delta_odps24

	global feI_covars="ib2.age_at_move#c.Delta_odps i.fyob#c.Delta_odps"
	global feII_covars="i.older#c.Delta_odps c.early_exposure#c.Delta_odps c.late_exposure#c.Delta_odps c.post_exposure#c.Delta_odps i.fyob#c.Delta_odps"
	
	* Fixed-effects analogues of baseline I and II; the specification I
	* coefficients are also converted to variables for Appendix Figure C1
	
	foreach spec in I II {
	
		areg c24_rank ${fe`spec'_covars} if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, absorb(fe_groups) 
		capture noisily MT_errors, model(fe_`spec') vmodel(V24) modelcovars(${fe`spec'_covars}) delta(Delta_odps) orig(orig_outcomes) absorb(fe_groups)
		estimates store Exp_fe`spec'

		if "`spec'"=="I" {
			coef2var , name(baseline_fe) sample(_est_Exp_feI) disruption(0)
		}
	}
		
	*** AGE AT OBSERVATION ***
	
	* Re-run baseline II with outcomes measured at ages 26, 28 and 30
	
	forvalues obs_age=26(2)30 {
	
		replace orig_outcomes=orig_outcomes`obs_age'
		replace Delta_odps=Delta_odps`obs_age'
					
		reg c`obs_age'_rank ${baselineII_covars} if move_smpl==1 & Delta_odps`obs_age'_se<2*${${location}_pm}, nocons 
		capture noisily MT_errors, model(base_II) vmodel(V`obs_age') modelcovars($baselineII_covars) delta(Delta_odps) orig(orig_outcomes) obsage(`obs_age')
		estimates store Exp_altage`obs_age'
		
	}
	
	*** Repeat baseline specifications ***
	
	* Put the working copies back on the age-24 measures first

	replace orig_outcomes=orig_outcomes24
	replace Delta_odps=Delta_odps24
	
	reg c24_rank $baselineI_covars if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons   
	capture noisily MT_errors, model(base_I) vmodel(V24) modelcovars($baselineI_covars) delta(Delta_odps) orig(orig_outcomes)
	estimates store Exp_baselineI
	
	gen int x_age_at_move=_n if _n<40
	quietly coef2var , name(baseline) sample(_est_Exp_baselineI) disruption(0)
	
	reg c24_rank $baselineII_covars24 if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons 
	capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars24) delta(Delta_odps24) orig(orig_outcomes24)
	estimates store Exp_baselineII
	
	*** APPENDIX TABLE C1 - Alternative specifications ***
	
	esttab Exp_baselineII Exp_feII Exp_altage* using "${${location}_path}/5. Output/Tables/TC1 - Exp_specs", se fixed ///
		keep(*early* *late* *post*) b(%9.3f) replace 
	
	*** APPENDIX FIGURE C1 - Alternative specifications ***
	
	* Offset the fixed-effects series slightly so the two sets of spikes do not overlap
	
	gen x_age_at_move2=x_age_at_move+0.25

	graph twoway ///
		(rspike baseline_upper baseline_lower x_age_at_move, color(gs8)) ///
		(scatter baseline_coef x_age_at_move, color(black)) ///
		(rspike baseline_fe_upper baseline_fe_lower x_age_at_move2, color(gs8)) ///
		(scatter baseline_fe_coef x_age_at_move2, color(ebblue)) ///
		if x_age_at_move<=30 ///
		, ///
		ytitle("Coefficient on prediced rank in destination") ///
		xtitle("Age of child when parents move") ///
		xlabel(,grid) ///
		ylabel(,grid) ///
		legend(order(2 "Baseline regression" 4 "Fixed effects regression"))
	graph export "${${location}_path}/5. Output/Charts/FC1_Exposure_effects_fe.png", replace
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	5B. Family fixed effects					   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

	* Rebuild the three exposure variables from scratch around the early cutpoint
	
	drop *exposure

	global early_cutpoint=11
	 
	* Age covariates
		
	* Years moved after age 25 (floored at 0)
	gen post_exposure=(age_at_move-25)
		replace post_exposure=0 if post_exposure<0
			
	* Years between the cutpoint and age 25 (bounded to [0, 25-cutpoint])
	gen late_exposure=25-age_at_move
		replace late_exposure=25-${early_cutpoint} if late_exposure>25-${early_cutpoint}
		replace late_exposure=0 if late_exposure<0
		
	* Years before the cutpoint (floored at 0)
	gen early_exposure=(${early_cutpoint}-age_at_move)
		replace early_exposure=0 if early_exposure<0

	replace orig_outcomes=orig_outcomes24
	replace Delta_odps=Delta_odps24

	* Regressions 
	* Two absorbed effects (family; family_sex_groups) crossed with three
	* precision cut-offs (2, 1.75, 1.5). Stored as Exp_famII_se1-3 and
	* Exp_famsexII_se1-3, in that order.

	foreach fe in p1_int_id family_sex_groups {
	
		if "`fe'"=="p1_int_id" local stub "famII"
		else local stub "famsexII"
		
		local k=0
		foreach cut in 2 1.75 1.5 {
			local ++k
			
			areg c24_rank $baselineII_covars if move_smpl==1 & fam_size<=5 & Delta_odps24_se<`cut'*${${location}_pm}, absorb(`fe') 
			capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars) delta(Delta_odps) orig(orig_outcomes) absorb(`fe')
			estimates store Exp_`stub'_se`k'
		}
	}

	* TABLE C2 - Exposure effects with family fixed effects
	
	esttab Exp_baselineII Exp_famII* Exp_famsexII* using "${${location}_path}/5. Output/Tables/TC2 - Exp_famfe", se fixed ///
		keep(*early* *late* *post* *older#c.Delta_odps) b(%9.3f) replace 

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	5C. Displacements								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

clear
use "${${location}_path}/1. Data (MASTER)/parents_moves2"

	* Cohorts 1978-1991 with a known parent id and postcode
	keep if fyob>=1978 & fyob<=1991 & p1_int_id!=. & postcode!=.

* Keep unique records (ie dump twins)

	bysort p1_int_id fyob postcode start_year end_year: keep if _n==1	
	
* Keep postcodes ended by a new postcode 
* A spell counts as ended by a move when the next spell (in start_year order)
* begins the year after this one ends. The within-group order is requested
* explicitly via (start_year) so that [_n+1] is guaranteed to be the next
* spell, rather than relying on a separate earlier sort.

	bysort p1_int_id fyob (start_year): gen moved_postcode=(end_year[_n]==(start_year[_n+1]-1) & end_year[_n]!=.)
	
	keep if moved_postcode==1
	drop moved*
	
	* Length of the spell in years, inclusive of both end points
	gen postcode_duration=end_year-start_year+1

* Save for later linking

	save "tmp_pcs", replace	
	
* Count by families, not kids, and identify shocks to outflows

	bysort p1_int_id postcode start_year end_year: keep if _n==1

	* K_zt = number of families leaving postcode z in year t
	collapse (count) K_zt=p1_int_id, by(postcode end_year)
	save "tmp_pc_outflows", replace

* Generate file of postcode outflows

	* Row for each postcode and end year
	* (24 rows per postcode, end_year = 1992, ..., 2015)

	keep postcode
	bysort postcode: keep if _n==1

	expand 24
	bysort postcode: gen end_year=1991+_n

	* Bring in observed outflows (K_zt), calculate mean (K_t) and normalise
	* Postcode-years with no recorded outflow are set to zero before averaging
	
	merge 1:1 postcode end_year using "tmp_pc_outflows", keepusing(K_zt) keep(master match) nogen
		replace K_zt=0 if K_zt==.
	
	* K_z = the postcode's mean yearly outflow over the 24-year grid
	bysort postcode: egen K_z=mean(K_zt)
	
	* k_zt = outflow relative to the postcode's own mean
	gen k_zt=K_zt/K_z

	save "tmp_pc_outflows_norm", replace
	
*** Analysis ***

* Displacement-shock analysis: link each mover to the normalised outflow from
* the postcode they left, then re-estimate exposure effects (IV and non-IV) on
* samples restricted to progressively larger outflow shocks.

clear
use "mobility_causal"

	keep if moves==1
	
	keep if fyob>=1978
	
	* Unsuffixed names so that $baselineII_covars (unsuffixed version) applies
	
	rename orig_outcomes24 orig_outcomes
	rename Delta_odps24 Delta_odps 
	
	* Merge key: presumably the last year in the origin postcode -- TODO confirm
	* against how end_year is defined in parents_moves2
	
	gen end_year=fyob+age_at_move-1
	
	merge m:1 p1_int_id fyob end_year using "tmp_pcs", keep(master match) keepusing(postcode) nogen
	merge m:1 postcode end_year using "tmp_pc_outflows_norm", keepusing(k_zt K_zt) keep(master match) nogen

	* Exclude all small outflows if running at ATO

	if "${location}"=="ATO" {
		drop if K_zt<10 
	}
	
	* Shock quantile
	* Percentile of the normalised outflow; observations without one are dropped

	fastxtile shock_quantile=k_zt, nq(100)
	drop if shock_quantile==.
	
	* Generate instruments
	* Means of Delta_odps / orig_outcomes within origin postcode x parent decile

	bysort postcode p_decile: egen instr_Delta_odps=mean(Delta_odps)
	bysort postcode p_decile: egen instr_orig_outcomes=mean(orig_outcomes)
	
	* Generate clusters
	
	egen long postcode_decile_year=group(postcode p_decile end_year)
	
* IV regression 

	* Hand-built dummies and interactions. NOTE: the globals below select these
	* through wildcards (age_at_move_*, fyob_19??, ..._19??), so the
	* specification depends on exactly which variables exist under these names.

	forvalues age=2(1)34 {
		gen age_at_move_`age'=(age_at_move==`age')
		gen age_at_move_p_rank_`age'=p_rank*(age_at_move==`age')
	}
	
	forvalues year=1978(1)1991 {
		gen orig_outcomes_`year'=orig_outcomes*(fyob==`year')
		gen instr_orig_outcomes_`year'=instr_orig_outcomes*(fyob==`year')
		gen fyob_`year'=(fyob==`year')
	}

	* Cohort interactions with Delta_odps are built for 1978-1990 only (1991 omitted)

	forvalues year=1978(1)1990 {
		gen Delta_odps_`year'=Delta_odps*(fyob==`year')
		gen instr_Delta_odps_`year'=instr_Delta_odps*(fyob==`year')
	}

	* Exogenous covariates, endogenous covariates and their instruments for
	* ivregress; each endogenous term has a like-for-like instrument built from
	* the instr_* variables

	#delimit ;
		global baselineII_ex_covars="age_at_move_*
			fyob_19??" 
		;
		global baselineII_en_covars="orig_outcomes_19??
			i.older#c.Delta_odps
			c.early_exposure#c.Delta_odps
			c.late_exposure#c.Delta_odps
			c.post_exposure#c.Delta_odps
			Delta_odps_19??"
		;
		global baselineII_instr="instr_orig_outcomes_19??
			i.older#c.instr_Delta_odps
			c.early_exposure#c.instr_Delta_odps
			c.late_exposure#c.instr_Delta_odps
			c.post_exposure#c.instr_Delta_odps
			instr_Delta_odps_19??"
		;
	#delimit cr

	* Plot axis: rows 1-10 hold the thresholds 50, 55, ..., 95

	gen x_quantile=(45+5*_n) if _n<11 

	* Result holders, filled in row by row below

	gen shock_N=.
	gen shock_nonIVN=.
	
	* NOTE(review): the shock_gammaCH* holders are created but never filled or
	* plotted in this section
	
	foreach period in early late {
		gen shock_gamma_nonIV`period'=.
		gen shock_gamma_nonIV`period'_se=.
		
		gen shock_gamma`period'=.
		gen shock_gamma`period'_se=.
		
		gen shock_gammaCH`period'=.
		gen shock_gammaCH`period'_se=.
	}
	
	* IV estimates for shocks at or above each threshold; the whole loop sits
	* inside capture noisily so a failure is reported without stopping the do-file
	
	capture noisily {
		forvalues quantile=50(5)95 {

			noisily display "Calculating for shock quantiles above: `quantile'"
			
			* IV specification
			ivregress 2sls c24_rank ${baselineII_ex_covars} (${baselineII_en_covars}=${baselineII_instr}) if move_smpl==1 & Delta_odps24_se<2*${${location}_pm} & shock_quantile>=`quantile', cluster(postcode_decile_year) perfect
			
			replace shock_N=e(N) if x_quantile==`quantile'
			
			foreach period in early late {
				replace shock_gamma`period'=_b[`period'_exposure#c.Delta_odps] if x_quantile==`quantile'
				replace shock_gamma`period'_se=_se[`period'_exposure#c.Delta_odps] if x_quantile==`quantile'
			}
		}
	}

	* Same thresholds, estimated by OLS with the baseline II covariates

	capture noisily {
		forvalues quantile=50(5)95 {

			noisily display "Calculating for shock quantiles above: `quantile'"
			
			* Non-IV specification
			
			reg c24_rank ${baselineII_covars} if move_smpl==1 & Delta_odps24_se<2*${${location}_pm} & shock_quantile>=`quantile', nocons 
			capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($baselineII_covars) delta(Delta_odps) orig(orig_outcomes)
			
			replace shock_nonIVN=e(N) if x_quantile==`quantile'
			
			foreach period in early late {
				replace shock_gamma_nonIV`period'=_b[`period'_exposure#c.Delta_odps] if x_quantile==`quantile'
				replace shock_gamma_nonIV`period'_se=_se[`period'_exposure#c.Delta_odps] if x_quantile==`quantile'
			}
			
		}
	}
	
	* 95% bounds (estimate +/- 1.96 standard errors) for plotting
	
	foreach period in early late {
		gen shock_gamma_nonIV`period'_upper=shock_gamma_nonIV`period'+1.96*shock_gamma_nonIV`period'_se
		gen shock_gamma_nonIV`period'_lower=shock_gamma_nonIV`period'-1.96*shock_gamma_nonIV`period'_se
	
		gen shock_gamma`period'_upper=shock_gamma`period'+1.96*shock_gamma`period'_se
		gen shock_gamma`period'_lower=shock_gamma`period'-1.96*shock_gamma`period'_se
	}

	* APPENDIX FIGURE C2 - Displacement shocks
	* Plots the late-exposure coefficients only; the non-IV and IV series are
	* drawn in the same colours

	#delimit ;
		graph twoway
			(scatter shock_gamma_nonIVlate x_quantile, mcolor(ebblue))
				(rspike shock_gamma_nonIVlate_upper shock_gamma_nonIVlate_lower x_quantile, color(gs8))
			(scatter shock_gammalate x_quantile, mcolor(ebblue))
				(rspike shock_gammalate_upper shock_gammalate_lower x_quantile, color(gs8))
			,
			ytitle("Exposure effect")
			xtitle("Sample: displacement shocks above percentile x of distribution")
			xlabel(,grid)
			ylabel(,grid)
			legend(off)
		;
	#delimit cr
	graph export "${${location}_path}/5. Output/Charts/FC2_Exposure_effects_(shocks).png", replace

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	5D. Placebo testing - cohorts								   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

* Placebo cohorts: attach predicted outcomes for 13 cohorts centred on the
* child's own (offsets -6 to +6; t=7 is the true cohort).

clear 
use "mobility_causal"

	keep if moves==1
	keep if fyob>=1978
	
* Bring in the placebo cohort predictions

	* Park the true variables under an underscore prefix so that the merges
	* below can bring in orig_outcomes24 / dest_outcomes24 afresh for each
	* placebo cohort

	foreach var of varlist fyob dest_outcomes* orig_outcomes* {
		rename `var' _`var'
	}

	quietly forvalues t=1(1)13 {
		noisily di "Loop `t'"
		* Placebo birth cohort: true cohort shifted by t-7 years
		gen fyob=_fyob+`t'-7
			merge m:1 orig_mapid fyob p_rank using "predicted_outcomes", keepusing(orig_outcomes24) keep(master match) nogen  
			merge m:1 dest_mapid fyob p_rank using "predicted_outcomes", keepusing(dest_outcomes24) keep(master match) nogen  
			* Replace missings as zeroes
			* NOTE(review): missing_t is set only when BOTH predictions are
			* missing, while each prediction is zero-filled on its own --
			* confirm that a one-sided missing should stay unflagged
			gen missing_t`t'=(orig_outcomes24==. & dest_outcomes24==. & _orig_outcomes24!=.)
			replace orig_outcomes24=0 if orig_outcomes24==. & _orig_outcomes24!=.
			replace dest_outcomes24=0 if dest_outcomes24==. & _orig_outcomes24!=.
					
		* Store this cohort's variables under a _t<t> suffix, freeing the
		* names for the next iteration
		gen Delta_odps_t`t'=dest_outcomes24-orig_outcomes24
		rename orig_outcomes24 orig_outcomes_t`t'
		rename dest_outcomes24 dest_outcomes_t`t'
		rename fyob fyob_t`t'	
	}

	* Restore the true variables. The list is not expanded here ("in", not
	* "of varlist"), so the wildcard entries are passed to rename as patterns
	* (_dest_outcomes* -> dest_outcomes*).

	foreach var in fyob dest_outcomes* orig_outcomes* {
		rename _`var' `var'
	}

* Holders for coefficients and standard errors (sep = one regression per
* placebo cohort, sim = all cohorts in one regression), plus sample sizes

	foreach series in sep sim {
		foreach stage in child teen {
			foreach stat in coef se {
				gen `series'_`stage'_`stat'=.
			}
		}
	}	

	foreach series in sep sim {
		gen `series'_N=.
	}
	
* Separate regressions
	
	* Plot axis: rows 1-13 hold the lags -6, ..., +6
	
	gen x_t=_n-7 if _n<14

	quietly forvalues lag=1/13 {
		
		#delimit ;
		global sep_covars="ibn.age_at_move i.age_at_move#c.p_rank ibn.fyob
				i.fyob#c.orig_outcomes_t`lag'
				i.fyob#c.Delta_odps_t`lag'
				i.older#c.Delta_odps_t`lag'
				c.early_exposure#c.Delta_odps_t`lag'
				c.late_exposure#c.Delta_odps_t`lag'
				c.post_exposure#c.Delta_odps_t`lag'";
		#delimit cr		
		
		reg c24_rank $sep_covars if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons 
		capture noisily MT_errors, model(base_II) vmodel(V24) modelcovars($sep_covars) delta(Delta_odps_t`lag') orig(orig_outcomes_t`lag') 
		
		* Results for lag index `lag' go in row `lag'
		
		local early_term "c.early_exposure#c.Delta_odps_t`lag'"
		local late_term "c.late_exposure#c.Delta_odps_t`lag'"
		
		replace sep_child_coef=_b[`early_term'] if _n==`lag'
		replace sep_child_se=_se[`early_term'] if _n==`lag'
		
		replace sep_teen_coef=_b[`late_term'] if _n==`lag'
		replace sep_teen_se=_se[`late_term'] if _n==`lag'
		
		replace sep_N=e(N) if _n==`lag'
		
	}

* Simultaneous regression

	* Window of cohorts entering jointly: `width' cohorts centred on t=7

	local width=13
	local first=(7-(`width'-1)/2)
	local last=(7+(`width'-1)/2)

	global sim_covars="ibn.age_at_move i.age_at_move#c.p_rank ibn.fyob i.fyob#c.orig_outcomes_t7 i.fyob#c.Delta_odps_t7"

	* Append each cohort's missing flag, origin prediction and exposure terms

	forvalues lag=`first'(1)`last' {
		global sim_covars "${sim_covars} i.missing_t`lag' orig_outcomes_t`lag' i.older#c.Delta_odps_t`lag' c.early_exposure#c.Delta_odps_t`lag' c.late_exposure#c.Delta_odps_t`lag' c.post_exposure#c.Delta_odps_t`lag'" 
	}

	reg c24_rank $sim_covars if move_smpl==1 & Delta_odps24_se<2*${${location}_pm}, nocons 
	capture noisily MT_errors, model(cohort_simul) vmodel(V24) modelcovars($sim_covars) delta(Delta_odps_t) orig(orig_outcomes_t)

	* Test significance of true cohort
	
	testparm c.late_exposure#c.Delta_odps_t7
	
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: The p-value for the true cohort coefficient being different from zero is: " (r(p)) "" _n(2)
		global statement_num=${statement_num}+1
		file close intext
	
	* Late-exposure terms of every cohort except the true one (t=7)
	
	local coeflist ""
	
		forvalues lag=1/13 {
			if `lag'!=7 {
				local coeflist "`coeflist' c.late_exposure#c.Delta_odps_t`lag'"
			}
		}
	
	* Test significance of false cohorts
	
	testparm `coeflist'
	
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: The p-value for the false cohort coefficients being different from zero is: " (r(p)) "" _n(2)
		global statement_num=${statement_num}+1
		file close intext
		
	* Copy the simultaneous late-exposure estimates into rows 1-13
		
	forvalues lag=1/13 {
		replace sim_teen_coef=_b[late_exposure#c.Delta_odps_t`lag'] if _n==`lag'
		replace sim_teen_se=_se[late_exposure#c.Delta_odps_t`lag'] if _n==`lag'
	}

	* 95% bounds for both series

	foreach series in sep sim {
		gen `series'_teen_coef_upper=`series'_teen_coef+1.96*`series'_teen_se
		gen `series'_teen_coef_lower=`series'_teen_coef-1.96*`series'_teen_se
	}

* APPENDIX FIGURE 3 - EVENT STUDY

	graph twoway ///
		(rspike sep_teen_coef_upper sep_teen_coef_lower x_t, color(gs8)) ///
		(scatter sep_teen_coef x_t, color(black) symbol(T)) ///
		(rspike sim_teen_coef_upper sim_teen_coef_lower x_t, color(gs8)) ///
		(scatter sim_teen_coef x_t, color(ebblue) symbol(O)) ///
		, ///
		ytitle("Exposure effect") ///
		xtitle("Lag") ///
		yline(0, lpattern(dash)) ///
		xlabel(,grid) ///
		ylabel(,grid) ///
		legend(order(2 "Separate" 4 "Simultaneous"))
	graph export "${${location}_path}/5. Output/Charts/FC3_Exposure_effects_(Event).png", replace

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	5E. Placebo testing - gender						   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*	
	
clear 
use "mobility_causal"

	keep if moves==1
	
	* Correlations
	* (between the sex-specific predicted outcomes at origin)
	
	corr orig_outcomes24sex1 orig_outcomes24sex2
	
	* Create own and opposite gender
	* "own" takes the prediction for the child's own sex code, "opp" the
	* prediction for the other sex code

	foreach var in orig_outcomes Delta_odps {
	
		gen `var'own=.
			replace `var'own=`var'24sex1		if sex==1
			replace `var'own=`var'24sex2		if sex==2

		gen `var'opp=.
			replace `var'opp=`var'24sex1		if sex==2
			replace `var'opp=`var'24sex2		if sex==1
	
	}
	
	* Matching standard errors of the own- and opposite-sex Delta_odps
	
	gen Delta_odps_se_own=.
		replace Delta_odps_se_own=Delta_odps24sex1_se		if sex==1
		replace Delta_odps_se_own=Delta_odps24sex2_se		if sex==2
	
	gen Delta_odps_se_opp=.
		replace Delta_odps_se_opp=Delta_odps24sex1_se		if sex==2
		replace Delta_odps_se_opp=Delta_odps24sex2_se		if sex==1
		
	* Covariates
	* Cohort controls (i.fyob# terms) always use the own-gender predictions;
	* only the exposure terms switch between own and opp
	
	foreach outcome in "own" "opp" {
		#delimit ;
			global baselineII_covarsCH`outcome'="ibn.age_at_move 
				i.age_at_move#c.p_rank
				ibn.fyob 
				i.fyob#c.orig_outcomesown
				i.older#c.Delta_odps`outcome'
				c.early_exposure#c.Delta_odps`outcome'
				c.late_exposure#c.Delta_odps`outcome'
				c.post_exposure#c.Delta_odps`outcome'
				i.fyob#c.Delta_odpsown"
				;
		#delimit cr
	}

	* Two gender households
	* Difference between the largest and smallest sex code within a family:
	* 1 when both codes (1 and 2) are present, 0 when only one is. The
	* within-family order is requested explicitly via (sex) so that sex[1] and
	* sex[_N] are guaranteed to be the extremes, rather than relying on a
	* separate earlier sort.
	
	bysort p1_int_id (sex): gen byte two_genders=sex[_N]-sex[1]
	
	* Regressions
			
	* Baseline
			
		* Own gender
		reg c24_rank ${baselineII_covarsCHown} if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm}, nocons  
		estimates store Exp_sex_1
				
		* Opposite gender
		reg c24_rank ${baselineII_covarsCHopp} orig_outcomesopp if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm} & Delta_odps_se_opp<2*${${location}_pm}, nocons  
		estimates store Exp_sex_2
			
		* Both
		reg c24_rank ${baselineII_covarsCHown} ${baselineII_covarsCHopp} orig_outcomesopp if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm} & Delta_odps_se_opp<2*${${location}_pm}, nocons  
		estimates store Exp_sex_3
	
	* With family fixed effects
	
		* Own gender
		areg c24_rank ${baselineII_covarsCHown} if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm}, absorb(p1_int_id)  
		estimates store Exp_sex_4
				
		* Opposite gender
		areg c24_rank ${baselineII_covarsCHopp} orig_outcomesopp if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm} & Delta_odps_se_opp<2*${${location}_pm}, absorb(p1_int_id)   
		estimates store Exp_sex_5
			
		* Both
		areg c24_rank ${baselineII_covarsCHown} ${baselineII_covarsCHopp} orig_outcomesopp if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm} & Delta_odps_se_opp<2*${${location}_pm}, absorb(p1_int_id)   
		estimates store Exp_sex_6
	
		* Both, restricted to families with children of both genders
		areg c24_rank ${baselineII_covarsCHown} ${baselineII_covarsCHopp} orig_outcomesopp if move_smpl==1 & Delta_odps_se_own<2*${${location}_pm} & Delta_odps_se_opp<2*${${location}_pm} & two_genders==1, absorb(p1_int_id)   
		estimates store Exp_sex_7
	
	*** APPENDIX TABLE C3 - Gender placebo tests ***
	
	#delimit ;
	esttab Exp_sex_* using "${${location}_path}/5. Output/Tables/TC3 - Exp_sex", se fixed 
	keep(*early* *late* *post*) b(%9.3f) replace 
	;
	#delimit cr	
	
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* 	5F. Placebo testing - distribution							   
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*	

clear
use "mobility_causal"

	* Keep the observations flagged moves==1
	keep if moves==1

	* Multiply every variable whose name contains p10, p20, p80 or p90 by 100
	quietly foreach v of varlist *p10* *p20* *p80* *p90* {
		replace `v'=100*`v'
	}

	* Distribution of the standard errors attached to each Delta_odps measure
	foreach m in 24 p20 p80 {
		summarize Delta_odps`m'_se, detail
	}

	* Distribution of the absolute p20 and p80 changes
	foreach q in p20 p80 {
		generate abs_Delta_odps`q'=abs(Delta_odps`q')
		summarize abs_Delta_odps`q', detail
	}

	* Correlation of the mean-rank change (Delta_odps24) with the p20 and p80 changes:
	* first unrestricted, then under a looser and a tighter pair of SE cut-offs.
	* capture noisily: a failing correlation is reported but does not stop the run.
	foreach q in p20 p80 {
		capture noisily correlate Delta_odps24 Delta_odps`q'
		capture noisily correlate Delta_odps24 Delta_odps`q' if move_smpl==1 & Delta_odps`q'_se<10*${${location}_pm} & Delta_odps24_se<2*${${location}_pm}
		capture noisily correlate Delta_odps24 Delta_odps`q' if move_smpl==1 & Delta_odps`q'_se<2*${${location}_pm} & Delta_odps24_se<1.5*${${location}_pm}
	}
	
	* Define covariates
	* Follow CH and have cohort controls as always based on distributional predictions
	* (NOTE(review): "CH" presumably refers to Chetty and Hendren - confirm)
	*
	* The loop below builds two globals from the same template:
	*   outcome = "dist" -> ${baselineII_covarsdist}, exposure terms use Delta_odpsdist
	*   outcome = ""     -> ${baselineII_covars},     exposure terms use Delta_odps
	* In BOTH globals the i.fyob interactions are hard-coded to orig_outcomesdist and
	* Delta_odpsdist; only the older/early/late/post terms switch with `outcome'.
	* `outcome' is expanded when each global is defined, so the stored text holds
	* no local-macro references.
	
	foreach outcome in "dist" "" {
		#delimit ;
			global baselineII_covars`outcome'="ibn.age_at_move 
				i.age_at_move#c.p_rank
				ibn.fyob 
				i.fyob#c.orig_outcomesdist 
				i.older#c.Delta_odps`outcome'
				c.early_exposure#c.Delta_odps`outcome'
				c.late_exposure#c.Delta_odps`outcome'
				c.post_exposure#c.Delta_odps`outcome'
				i.fyob#c.Delta_odpsdist"
				;
		#delimit cr
	}

	* Variables the globals refer to:
	*   orig_outcomes / Delta_odps         - copies of the *24 versions
	*   orig_outcomesdist / Delta_odpsdist - created as missing here and overwritten
	*                                        for each outcome in the regression loop below
	gen orig_outcomes=orig_outcomes24
	gen Delta_odps=Delta_odps24
	gen orig_outcomesdist=.
	gen Delta_odpsdist=.

	* Regressions, one set of three per distributional outcome
		* Note kept from the original: MT standard errors only available for the
		* first specification, the others use more than one model

	foreach pctl in p10 p20 p80 p90 {

			* Point the generic "dist" variables at this outcome's predictions
			replace orig_outcomesdist=orig_outcomes`pctl'
			replace Delta_odpsdist=Delta_odps`pctl'

			* Sample restrictions:
			*   smpl_dist - movers sample with this outcome's SE under its cut-off
			*   smpl_mean - as above, and the mean-rank (24) SE under its cut-off
			local smpl_dist "move_smpl==1 & Delta_odps`pctl'_se<10*${${location}_pm}"
			local smpl_mean "`smpl_dist' & Delta_odps24_se<2*${${location}_pm}"

			* (1) Outcome
			regress c_`pctl' ${baselineII_covarsdist} if `smpl_dist', noconstant
			estimates store Exp_dist_`pctl'1

			* (2) Mean rank (placebo)
			regress c_`pctl' ${baselineII_covars} orig_outcomes if `smpl_mean', noconstant
			estimates store Exp_dist_`pctl'2

			* (3) Both
			regress c_`pctl' ${baselineII_covarsdist} ${baselineII_covars} orig_outcomes if `smpl_mean', noconstant
			estimates store Exp_dist_`pctl'3

	}
	
	*** APPENDIX TABLE C4 - Distributional predictions ***

	* Export every stored Exp_dist_* model, keeping only the coefficients whose
	* names contain early, late or post, with standard errors and 3 decimals
	esttab Exp_dist_* using "${${location}_path}/5. Output/Tables/TC4 - Exp_dist", ///
		se fixed keep(*early* *late* *post*) b(%9.3f) replace
	
*------------------------------------------------------------------------------*
* 	6. DATA APPENDIX 													   
*------------------------------------------------------------------------------*	

clear
use "${${location}_path}/1. Data (MASTER)/mobility"

	* Only the ids, birth year, link source and the age* variables are used in this section
	keep p1_int_id fyob link_source age*

* APPENDIX FIGURE E1 - Age at event

preserve

	* Rename age_fyofa to age_in_fyfa when it exists (capture suppresses the error otherwise)
	capture rename age_fyofa age_in_fyfa

	keep if fyob==1980

	* Rows are ages 1-30; columns are the three events
	matrix age_at_event = J(30,3,0)
	matrix colnames age_at_event = Registration First_address First_GNAF_address

	* Count the observations at each age for each event.
	* capture noisily: an error inside the loop is displayed but does not stop the do-file.
	capture noisily forvalues a=1/30 {
		local col=0
		foreach ev in fyor fyfa GNAF {
			local ++col
			count if age_in_`ev'==`a'
			matrix age_at_event[`a',`col']=r(N)
		}
	}

	matrix list age_at_event

	* Replace the data in memory with the matrix: one observation per age
	clear
	svmat age_at_event, name(event)

	local col=0
	foreach ev in fyor fyfa GNAF {
		local ++col
		rename event`col' age_in_`ev'
	}

	generate age=_n

	* Add a 31st observation (age is missing there) to hold the column totals
	set obs 31

	foreach v of varlist age_in_fyor age_in_fyfa age_in_GNAF {
		* Accumulate the total over ages 1-30 into the extra row
		replace `v'=0 if age==.
		forvalues a=1/30 {
			replace `v'=`v'+`v'[`a'] if age==.
		}
		* Round every count, total included, to the nearest 100
		replace `v'=round(`v',100)
	}

	export excel using "${${location}_path}/5. Output/Charts/FE1_Age_at_event", firstrow(var) replace

restore
	
* APPENDIX FIGURE E2 - Link rates

preserve

	* One row per birth cohort 1970-2000 (row = fyob - 1969):
	*   column 1 (Pop)        - all observations in the cohort
	*   column 2 (Linked_pop) - observations with a non-missing p1_int_id
	matrix link_rates = J(31,2,0)
	matrix colnames link_rates = Pop Linked_pop

	quietly forvalues yr=1970/2000 {
		local row=`yr'-1969

		count if fyob==`yr'
		matrix link_rates[`row',1]=r(N)

		count if fyob==`yr' & p1_int_id!=.
		matrix link_rates[`row',2]=r(N)
	}

	matrix list link_rates

	outtable using "${${location}_path}/5. Output/Tables/TE2 - Link rates", replace mat(link_rates) format(%9.1gc)

* STATEMENT: about link rates and sources
* Appends two numbered statements to the in-text statements file: the share of
* the 1978-1991 cohorts with a non-missing p1_int_id, and the breakdown of
* those links by link_source (1-4).
	
	keep if fyob>=1978 & fyob<=1991
	
	* Store the linked count immediately after -count-: r() holds only the most
	* recent results, so it should not be read after other commands have run
	count if p1_int_id!=.
	local tmp_denom=r(N)
	
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		file write intext "Statement ${statement_num}: For the 1978-1991 cohorts the link rate is: " (100*`tmp_denom'/_N) " per cent" _n(2)
		global statement_num=${statement_num}+1
		file close intext
	
		* Shares below are relative to the linked count stored in tmp_denom;
		* each r(N) is read in the statement directly after its -count-
		file open intext using "${${location}_path}/5. Output/Text/In_text_statments.txt", write append
		
		count if p1_int_id!=. & link_source==1
			file write intext "Statement ${statement_num}: Of these links:" _n(1) _tab (100*r(N)/`tmp_denom') " per cent are address based." _n(1)
		count if p1_int_id!=. & link_source==2
			file write intext _tab (100*r(N)/`tmp_denom') " per cent are FTA based." _n(1)
		count if p1_int_id!=. & link_source==3
			file write intext _tab (100*r(N)/`tmp_denom') " per cent are postcode based." _n(1)
		count if p1_int_id!=. & link_source==4
			file write intext _tab (100*r(N)/`tmp_denom') " per cent are sibling based." _n(2)	
		
		global statement_num=${statement_num}+1
		file close intext
	
restore

*------------------------------------------------------------------------------*
* 	7. RUN THE PEERS ANALYSIS								   
*------------------------------------------------------------------------------*

* Reset Stata's memory, then hand over to the separate peer-effects do-file.
* The path is still built from the ${location} globals set in section 0.
clear all
do "${${location}_path}/3. Code/Peer_effects.do"
