/*
This program creates additional variables 
	needed, selects the samples for analysis,
	and further restricts the sample to meet 
	annual minimum earnings thresholds.

Two supplementary data files are needed to run 
	this program:
	1. annual cpi-urs inflation adjustment 
		factors ("CPI_URS_19782017.dta")
	2. annual state-level minimum wage 
		data ("VZ_state_annual.dta")
*/


* Start the log for this step; an existing log with the same name is overwritten.
log using ${dopath}/p2_SampleSelection.log, replace


**************************************
*1. CREATE ADDITIONAL VARIABLES
**************************************
* Read the long-format file (the reshape to long is done in 1.DataPrep.do).
use ${mydatapath}/GSFadminlong.dta, clear

* Only observations with male==1 are retained; the flag is constant afterwards,
* so it is removed from the data.
drop if male!=1
drop male

* Display the date variables as daily dates.
format deathdate birthdate mh_date1 mh_date2 first_admin_birthdate sipp_panel_beg_date %td


**create longitudinal schooling variables
* educ_long is the schooling category reached by each calendar year. Everyone
* starts in category 1 and is moved up once year reaches the relevant
* completion year. Later replaces overwrite earlier ones, so their order matters.
gen educ_long=1
* The personid-year sort set here is what later by-group and [_n-1]/[_n+1]
* references depend on.
sort personid year
replace educ_long=2 if educ_5cat>=2 & year>=year_end_hs
replace educ_long=3 if educ_5cat==3 & year>=year_end_posths
replace educ_long=4 if educ_5cat>=4 & year>=year_bach
replace educ_long=5 if educ_5cat>=5 & year>=year_end_posths
* Stata treats missing as larger than any number, so educ_5cat>=k is true when
* educ_5cat is missing. Undo whatever the lines above assigned in that case.
* missing() also catches extended missing codes (.a-.z).
replace educ_long=. if missing(educ_5cat)

* Map the schooling category to years of schooling: 1->10, 2->12, 3->14, 4->16, 5->18.
gen educyears1=.
replace educyears1=10 if educ_long==1
replace educyears1=12 if educ_long==2
replace educyears1=14 if educ_long==3
replace educyears1=16 if educ_long==4
replace educyears1=18 if educ_long==5


**create a second, alternative years-of-schooling variable that smooths the discrete jumps
gen educyears2=educyears1
*10-12: the last year at 10 before the person switches to 12 is set to 11
by personid: replace educyears2=11 if educyears2==10 & educyears2[_n+1]==12
*12-14: years strictly between the start and end of post-HS schooling are set to 13
replace educyears2=13 if educ_5cat==3 & year>year_beg_posths & year<year_end_posths
*12-16: years strictly between the start of post-HS schooling and the bachelor year are set to 13
replace educyears2=13 if educ_5cat>=4 & year>year_beg_posths & year<year_bach
* The 12-16 rule also fires when educ_5cat is missing (missing >= 4 is true),
* which would give a schooling value to someone with unknown education.
* Reset those observations, mirroring the educ_long reset above.
replace educyears2=. if missing(educ_5cat)


**longitudinal marriage variable
* The eight marital-history dates are handled as four (start, end) pairs:
* (mh_date1, mh_date2), (mh_date3, mh_date4), and so on. The name stubs below
* give the year variables their names, e.g. marriedyear / marriedendyear.
* NOTE(review): presumably odd-numbered dates start a marriage and even-numbered
* dates end it - confirm against the codebook.
local spells married married2 married3 married4

* Calendar year of every marital-history date.
local k = 1
foreach stub of local spells {
	local kend = `k' + 1
	gen `stub'year = year(mh_date`k')
	gen `stub'endyear = year(mh_date`kend')
	local k = `k' + 2
}

* Start at 0 and walk through the pairs in order: the indicator is switched on
* from the start year when the odd-numbered mh code equals 1, and switched off
* from the end year when the even-numbered mh code is above 1. Later pairs
* overwrite earlier ones.
gen married = 0
local k = 1
foreach stub of local spells {
	local kend = `k' + 1
	replace married = 1 if mh`k'==1 & year>=`stub'year
	replace married = 0 if mh`kend'>1 & year>=`stub'endyear
	local k = `k' + 2
}

**create age variable
* Year, month and quarter (1-4) of the birth date. Months 1-3 fall in
* quarter 1, 4-6 in quarter 2, 7-9 in quarter 3 and 10-12 in quarter 4.
gen birthyear = year(birthdate)
gen birthmonth = month(birthdate)
gen birthquarter = ceil(birthmonth/3)

* The same three pieces for the SIPP panel start date.
gen sippyear = year(sipp_panel_beg_date)
gen sippmonth = month(sipp_panel_beg_date)
gen sippquarter = ceil(sippmonth/3)

* Age in years is the difference in calendar years. Age in quarters is four
* times that, shifted by the gap between the panel-start quarter and the
* birth quarter.
gen age = year - birthyear
gen ageq = 4*age
gen quarterdiff = sippquarter - birthquarter
* Age (difference in calendar years) at the panel start date.
gen age_start = sippyear - birthyear
replace ageq = ageq + quarterdiff
summarize age
summarize quarterdiff
summarize ageq

* Squared terms.
gen age2 = age^2
gen ageq2 = ageq^2

*log earnings
* Natural log of the FICA earnings variable; zero or negative values give missing.
gen log_total_der_fica_ = ln(total_der_fica)
* Total earnings are the sum of the FICA and non-FICA components, so the total
* is missing whenever either component is missing.
gen total_der = total_der_nonfica + total_der_fica
gen log_total_der = ln(total_der)

* Store the prepared file before any sample restriction is applied.
save ${mydatapath}/GSFadminlongprepDOA.dta, replace






**************************************
*2. SAMPLE SELECTION
**************************************
**FIRST, IDENTIFY ALL INDIVIDUALS WITH 34 YEARS OF NON-MISSING DATA AND *POSITIVE* DER EARNINGS
* Years after 2011 are not used.
drop if year>2011

* Number of observations a fully observed person must contribute.
* NOTE(review): 34 presumably corresponds to 1978-2011 - confirm.
local nyears 34

*count non-missing observations per person for each variable, so a balanced sample can be kept
* The k-th source variable is counted into the k-th counter variable.
local srcvars log_total_der educyears2 age married foreign_born hispanic state birthdate
local cntvars logearningsobsBOTH educyears2obs ageobs marriedobs foreignobs hispanicobs stateobs birthdateobs
local npairs : word count `srcvars'
forvalues k = 1/`npairs' {
	local src : word `k' of `srcvars'
	local cnt : word `k' of `cntvars'
	by personid: egen `cnt' = count(`src')
	tabulate `cnt'
}

* Youngest and oldest age observed for each person.
by personid: egen minage = min(age)
by personid: egen maxage = max(age)

* Non-missing counts for the education timing variables.
by personid: egen hsobs = count(year_end_hs)
by personid: egen bachobs = count(year_bach)
by personid: egen begposthsobs = count(year_beg_posths)
by personid: egen endposthsobs = count(year_end_posths)

*identify individuals with two schooling changes
* A change is a year after 1978 whose educyears2 differs from the previous
* observation of the same person. The year>1978 condition keeps the first
* observation, which has no predecessor, from counting as a change.
by personid: gen schoolchange2 = (educyears2!=educyears2[_n-1] & year>1978)
by personid: egen schoolchanges2 = total(schoolchange2)
tabulate schoolchanges2

*identify individuals without missing data for the education timing variables
* Each flag is 0 only when the person is in the relevant education category
* and a timing variable that category needs is not observed in every year.
gen HSvars = !(educ_5cat>=2 & hsobs<`nyears')
gen SomeCollvars = !(educ_5cat==3 & (begposthsobs<`nyears' | endposthsobs<`nyears'))
gen Bachvars = !(educ_5cat>=4 & (begposthsobs<`nyears' | bachobs<`nyears' | endposthsobs<`nyears'))
gen Gradvars = !(educ_5cat==5 & (begposthsobs<`nyears' | endposthsobs<`nyears'))

* Requirements shared by both sample indicators: every listed variable is
* observed in all years, age stays within 16-65, and the timing flags are all 1.
local balanced "logearningsobsBOTH==`nyears' & educyears2obs==`nyears' & ageobs==`nyears' & marriedobs==`nyears' & foreignobs==`nyears' & hispanicobs==`nyears' & stateobs==`nyears' & birthdateobs==`nyears' & minage>=16 & maxage<=65"
local timingok "HSvars==1 & SomeCollvars==1 & Bachvars==1 & Gradvars==1"

*create the sample indicator variable - 1+ changes
gen insampleBOTH2_1ch = (`balanced' & schoolchanges2>=1 & `timingok')
*create the sample indicator variable - 2+ changes
gen insampleBOTH2_2ch = (`balanced' & schoolchanges2>=2 & `timingok')


**SECOND, comparison sample: only requires positive earnings data AFTER FINISHING SCHOOL
* A year counts as post-school when it lies after the end of post-HS schooling
* (education category above 2) or after the end of high school (category 2).
gen postschool = (educ_5cat>2 & year>year_end_posths) | (educ_5cat==2 & year>year_end_hs)
* Number of post-school years, and how many of them have non-missing log earnings.
* The second count is only filled in on post-school observations.
by personid: egen postschoolyears = total(postschool)
by personid: egen postschoolearnobsBOTH = count(log_total_der) if postschool==1
* 1 on observations where the two counts agree, then spread to every
* observation of the person.
gen nomisspostschoolBOTH = (postschoolyears==postschoolearnobsBOTH)
by personid: egen nomisspostBOTH = max(nomisspostschoolBOTH)
* NOTE(review): nomisspostBOTH is not part of waterfall_samp below; presumably
* it is applied in a later program - confirm.

* Baseline sample: complete covariates and education timing in all 34 years,
* ages within 16-65, not currently enrolled, and at least 27 at the panel start.
gen waterfall_samp = ( ///
	minage>=16 & maxage<=65 & age_start>=27 ///
	& educyears2obs==34 & ageobs==34 & marriedobs==34 & foreignobs==34 ///
	& hispanicobs==34 & stateobs==34 & birthdateobs==34 ///
	& HSvars==1 & SomeCollvars==1 & Bachvars==1 & Gradvars==1 ///
	& current_enroll_coll!=1 & current_enroll_hs!=1 ///
	)






**************************************
*3. ADDITIONAL VARIABLE CREATION AND ADJUSTMENT
**************************************
*drop observations flagged as currently enrolled in college or in high school
drop if current_enroll_coll==1 | current_enroll_hs==1


*adjust earnings data for inflation
* Build a year-level factor that rescales each year to the 1999 level of the
* index: the 1999 value of avg divided by that year's value of avg.
preserve
use ${mydatapath}/CPI_URS_19782017.dta, clear
gen cpi_in_1999 = avg if year==1999
egen cpi_base = max(cpi_in_1999)
gen factor99 = cpi_base/avg
tempfile cpi_factors
save "`cpi_factors'"
restore

* Attach the factor by year; years that exist only in the CPI file are not kept.
merge m:1 year using "`cpi_factors'", keepusing(factor99) keep(master match) generate(_mergeAdjFactors)

* Earnings rescaled by the factor, in levels and in logs.
gen total_der_fica_cpi = factor99*total_der_fica
gen log_total_der_fica_cpi = ln(total_der_fica_cpi)

gen total_der_cpi = factor99*total_der
gen log_total_der_cpi = ln(total_der_cpi)


*additional variables needed for regressions
* Indicators for race codes 2 and 3.
gen black = race==2
gen otherrace = race==3

* Cubic and quartic terms in quarterly age.
forvalues p = 3/4 {
	gen ageq`p' = ageq^`p'
}
gen educyears_sq2 = educyears2^2

gen birthquarter2 = birthquarter^2
gen birthyear2 = birthyear^2

**Restrict sample to individuals with annual earnings equal to at least MW*800 hours
* The minimum-wage file has several rows per year (it is state-level), so it
* cannot be merged on year as it stands. merge m:m pairs rows by their position
* within each year, which is arbitrary and can create duplicate master rows
* when the using file has more rows for a year than the master. Instead, reduce
* the file to one row per year and merge m:1.
preserve
use year mean_fed_mw using "${mydatapath}/VZ_state_annual.dta", clear
duplicates drop
* Stop with an error if mean_fed_mw is not constant within a year, since the
* year-level merge would then be ambiguous.
isid year
tempfile fed_mw_by_year
save "`fed_mw_by_year'"
restore

merge m:1 year using "`fed_mw_by_year'", keepusing(mean_fed_mw) gen(_mergeMWdata)
drop if _mergeMWdata==2
sum mean_fed_mw if year==1990

* Annual earnings floor: 800 hours at the mean federal minimum wage.
gen earnings_thresh=mean_fed_mw*800
* Stata treats a missing value as larger than any number, so a bare
* total_der>=earnings_thresh would count missing earnings as meeting the floor.
* Missing earnings are treated as not meeting it. This cannot alter the samples
* that already require 34 non-missing log earnings.
gen gte_thresh=(total_der>=earnings_thresh) & !missing(total_der)
* 1 only for persons who meet the floor in every one of their observations.
bysort personid: egen always_gte_earnings_thresh=min(gte_thresh)





**************************************
*4. SUBSET DATA TO THE BASELINE SAMPLE AND MAIN SAMPLE
**************************************
* Baseline file: observations flagged by waterfall_samp.
drop if waterfall_samp!=1
compress
save ${mydatapath}/GSFadminlongDOA_19782011Baseline.dta, replace


* Main file: within the baseline sample, keep observations that carry at
* least one of the two schooling-change sample flags.
drop if insampleBOTH2_1ch!=1 & insampleBOTH2_2ch!=1
summarize insample*
compress
save ${mydatapath}/GSFadminlongDOA_19782011.dta, replace

* Earnings-restricted file: within the main sample, keep persons whose
* earnings reach the threshold in every observation.
drop if always_gte_earnings_thresh!=1
compress
save ${mydatapath}/GSFadminlongDOA_19782011EarnRestrict.dta, replace





log close
