

*	Daniel Kamhöfer, Hendrik Schmitz
*	Reanalyzing Zero Returns to Education in Germany
*	February, 2015


*************************
*** Data manipulation ***
*************************

************************************************************************************
	*** Prepare instruments ***

*Load the external spreadsheet; its first row supplies the variable names
cd "$extern"
import excel using external_information, firstrow clear

*School-supply instruments: number of schools relative to the state area (scaled by 1,000),
*built for the academic and the intermediate track
foreach track in acad inter {
	generate num_`track'_schools_area = school_num_`track' / (state_area / 1000)
}

*Schools per 10,000 students per year and state (for Appendix Figure A2), same two tracks
foreach track in acad inter {
	generate num_`track'_schools_students = school_num_`track' / (students_total / 10000)
}

*Store the instruments in the edit folder
cd "$edit"
save schooling_instruments.dta, replace



************************************************************************************
	*** SOEP data ***

*Load the merged SOEP file from the edit folder
cd "$edit"
use 01_merging.dta

*Restrict the data to the variables used in this do-file
keep persnr wbula bsschla gebjahr wp12401 labgro06 wtatzeit labnet06 is8806 ///
	wp87 wp94 wp95 migback msbil vsbil wp11721 wp11724 wbilzeit ///
	wpsbil wpsbila wpsbilo wpbbila wpbbilo wpbbil01 wpbbil02 wp07 f96t90s

*Readable names for state of residence, state of school and year of birth
rename (wbula bsschla gebjahr) (bula state_school year_birth)

*Gender dummy: 1 if wp12401 equals 2, 0 if it equals 1, missing for any other value
generate female = cond(wp12401 == 2, 1, cond(wp12401 == 1, 0, .))
label define female 0 "male" 1 "female"
label values female female
label variable female "Female"

*Respondent's age, measured relative to 2006
generate age = 2006 - year_birth
label variable age "age"

*Year in which the respondent is 10 years old; stored as a string because it is
*concatenated with the state string into the state/year identifier further below
generate year_age10 = year_birth + 10
tostring year_age10, replace

*Gross monthly income
rename labgro06 income
label variable income "Gross income"

*Gross hourly wage: monthly income divided by 4.3 and by the hours in wtatzeit
*Negative hours are set to missing first
replace wtatzeit = . if wtatzeit < 0
generate wage = income / 4.3 / wtatzeit

*Log gross hourly wage for strictly positive wages; a wage of exactly zero has no
*logarithm and is assigned the value 0.000001 instead
generate log_wage = log(wage) if wage > 0 & !missing(wage)
replace log_wage = 0.000001 if wage == 0

*The net measures are built in exactly the same way (output suppressed)
quietly {
	*Net monthly income
	rename labnet06 income_net
	label variable income_net "Net income"

	*Net hourly wage
	generate wage_net = income_net / 4.3 / wtatzeit

	*Log net hourly wage, with the same treatment of zero wages as above
	generate log_wage_net = log(wage_net) if wage_net > 0 & !missing(wage_net)
	replace log_wage_net = 0.000001 if wage_net == 0
}

*Skill level (1 to 4) matched to the ISCO-88 code in is8806
*Codes below 0 or from 9000 upwards, and missing codes, leave job_skill missing
generate job_skill = cond(is8806 >= 2000 & is8806 < 3000, 4, ///
	cond((is8806 >= 1000 & is8806 < 2000) | (is8806 >= 3000 & is8806 < 4000), 3, ///
	cond((is8806 >= 0 & is8806 < 1000) | (is8806 >= 4000 & is8806 < 8000), 2, ///
	cond(is8806 >= 8000 & is8806 < 9000, 1, .))))

*Binary self-rated health status: 1 if wp87 is above 0 and at most 2,
*missing if wp87 is negative, 0 in all remaining cases
generate srh_good = cond(wp87 < 0, ., cond(wp87 > 0 & wp87 <= 2, 1, 0))

*Obesity: Body Mass Index (BMI) >= 30
*BMI is computed as wp95 divided by the square of (wp94 / 100)
*NOTE(review): presumably wp95 is weight in kg and wp94 height in cm -- confirm against the codebook
*Adjust missing values: the codes -1, -2 and -3 become .a, .b and .c
mvdecode wp94 wp95, mv(-1=.a\-2=.b\-3=.c)
*Build the indicator. Stata ranks missing values above every number, so an unguarded
*"BMI >= 30" evaluates to 1 whenever height or weight is missing and would flag those
*respondents as obese. The if-qualifier leaves obesity missing whenever either input
*is missing or not strictly positive.
generate obesity = (wp95 / ((wp94 / 100) * (wp94 / 100)) >= 30) if wp94 > 0 & wp95 > 0 & !missing(wp94, wp95)

*Migration indicator: 1 if migback is 2 or larger, 0 if smaller, missing if migback is missing
generate migration = cond(missing(migback), ., migback >= 2)
label variable migration "Migration Background"

*Parental education dummies (mother from msbil, father from vsbil):
*1 for the codes 2, 3 and 4; 0 for the codes 1, 5, 6 and 7; missing for everything else
foreach parent in mother father {
	local src = cond("`parent'" == "mother", "msbil", "vsbil")
	generate edu_`parent' = cond(inlist(`src', 2, 3, 4), 1, cond(inlist(`src', 1, 5, 6, 7), 0, .))
	label variable edu_`parent' "Education `parent'"
}

*Number of siblings: sum of wp11721 and wp11724, with negative codes counted as zero
foreach part in wp11721 wp11724 {
	replace `part' = 0 if `part' < 0
}
generate siblings = wp11721 + wp11724
label variable siblings "Number of siblings"

*From here on we work with the state of last school attendance
*The codes -1, -2 and -3 are set to missing
mvdecode state_school, mv(-1=.\-2=.\-3=.)
*Flag respondents whose state of school is missing
generate state_school_mis = (state_school == .)
*Where the state of school is missing, the state of residence during the first interview (bula) serves as proxy
quietly {
	*Entry k of the list is the state_school code assigned to bula == k. The two variables
	*are coded differently from 10 upwards; bula 7 and bula 10 both map to 7 because the
	*state of school information does not distinguish between the states of
	*Rhineland-Palatinate and Saarland
	local school_code 1 2 3 4 5 6 7 8 9 7 11 13 12 16 14 15
	forvalues res = 1/16 {
		local target : word `res' of `school_code'
		replace state_school = `target' if state_school == . & bula == `res'
	}
}

*String abbreviation of the state of school for the codes 0 to 9;
*all other codes keep the string "." produced by converting a missing value
quietly {
	generate state_string = .
	tostring state_string, replace
	*Entry (code + 1) of the list is the abbreviation for state_school == code
	local abbrev BE SH HH NI HB NW HE RP BW BY
	forvalues code = 0/9 {
		local pos = `code' + 1
		local st : word `pos' of `abbrev'
		replace state_string = "`st'_" if state_school == `code'
	}
}

*Identifier for each state/year pair: state abbreviation followed by the year at age 10
generate edu_ID = state_string + year_age10

*Dummy indicating the compulsory schooling reform
*compulsory is 1 for birth cohorts from the state's cutoff year onwards and 0 for earlier
*cohorts. It stays missing for states of school other than 1 to 9 and for respondents
*whose year of birth is missing.
quietly {
	*Cutoff birth years, listed in the order state_school = 1, 2, ..., 9
	local first_cohort 1941 1934 1947 1943 1953 1953 1953 1953 1949
	generate compulsory = .
	forvalues s = 1/9 {
		local cutoff : word `s' of `first_cohort'
		*Stata ranks missing values above every number, so without the !missing() guard
		*a respondent with unknown birth year would satisfy "year_birth >= cutoff" and
		*be coded as affected by the reform
		replace compulsory = (year_birth >= `cutoff') if state_school == `s' & !missing(year_birth)
	}
	label variable compulsory "Compulsory Reform"
}


*Years of education: wbilzeit (non-negative values only), reduced by one year for
*respondents with compulsory == 0
generate edu_years = cond(compulsory == 0, wbilzeit - 1, wbilzeit) if wbilzeit >= 0

*Generating years of schooling
quietly {
	*The number of years of education is not asked directly. The primary researchers generated a proxy for it from measured characteristics such as the school track, but without taking the compulsory schooling reform into account. The steps below rebuild the schooling component in the same way while respecting the reform.
	*Both helper variables start at -1, meaning "not assigned yet"
	generate school = -1
	generate occupa = -1

	*East-German degrees: entry k of each list is the value assigned to code k
	local east_school 1 2 4 2 0
	forvalues k = 1/5 {
		local val : word `k' of `east_school'
		replace school = `val' if school == -1 & wpsbilo == `k'
	}
	local east_occupa 1 2 3 2
	forvalues k = 1/4 {
		local val : word `k' of `east_occupa'
		replace occupa = `val' if occupa == -1 & wpbbilo == `k'
	}

	*Foreign degrees: code k is mapped to k - 1, only where nothing has been assigned so far
	forvalues k = 1/3 {
		replace school = `k' - 1 if school == -1 & wpsbila == `k'
	}
	forvalues k = 1/2 {
		replace occupa = `k' - 1 if occupa == -1 & wpbbila == `k'
	}

	*All samples with finished education in Germany; this overrides the assignments above
	*Code 6 gives 0, the codes 1 to 4 are taken over unchanged, and code 5 gives 2
	*unless wpsbila equals 2, in which case it gives 1
	replace school = cond(wpsbil == 6, 0, cond(wpsbil == 5, cond(wpsbila == 2, 1, 2), wpsbil)) ///
		if (wpsbil >= 1 & wpsbil <= 4) | wpsbil == 5 | wpsbil == 6

	*Matching of the school values with the time of education
	*wpsbil == 7 always yields -2; for school == 1 with wpsbil == 1 the reform matters:
	*8 years before the reform, 9 years after it; anything unmatched stays at -1
	generate schoolyears = cond(wpsbil == 7, -2, ///
		cond(school == 0, 7, ///
		cond(school == 2, 10, ///
		cond(school == 3, 12, ///
		cond(school == 4, 13, ///
		cond(school == 1 & wpsbil == 1 & compulsory == 0, 8, ///
		cond(school == 1 & wpsbil == 1 & compulsory == 1, 9, -1)))))))
}

*School type indicators based on wpsbil (0 whenever wpsbil has any other value or is missing)
*haupt: codes 1, 5 and 6; mittel: codes 2 and 3; gym: code 4
generate haupt = inlist(wpsbil, 1, 5, 6)
generate mittel = inlist(wpsbil, 2, 3)
generate gym = (wpsbil == 4)

*Vocational training dummy: wpbbil01 between 1 and 6
generate vocation = inrange(wpbbil01, 1, 6)
label variable vocation "Voc. Training"
*University degree dummy: wpbbil02 between 1 and 4
generate uni = inrange(wpbbil02, 1, 4)
label variable uni "Uni Degree"

*Recode the codes -1, -2 and -3 to .a, .b and .c in all variables
*(this could not be done earlier because the years-of-education step needs the original codes)
quietly mvdecode _all, mv(-1=.a\-2=.b\-3=.c)

*Attach the number of intermediate and academic schools (instruments) as well as the average number of students per school by track (additional controls for the robustness check), matched on the state/year identifier
*The merge indicator is not kept
cd "$edit"
merge m:1 edu_ID using schooling_instruments.dta, nogenerate

*Re-scaling of the instruments for an easier interpretation (currently switched off)
*replace num_inter_schools_area = num_inter_schools_area * 200
*replace num_acad_schools_area = num_acad_schools_area * 200

*The sample is restricted to West-German non-city states
*Code 98 carries no state information (only "Germany Until 1949")
drop if state_school == 98
*Removing the codes 12 to 16 keeps West-German respondents only
drop if inrange(state_school, 12, 16)
*City states (Hamburg, Bremen, Berlin West, Berlin East, Berlin No Further Details)
drop if inlist(state_school, 0, 2, 4, 11, 18)

*Age restriction: keep respondents who were 10 years old between 1950 and 1980
*Observations with a missing year are removed as well
destring year_age10, replace
keep if inrange(year_age10, 1950, 1980)
*Drop "No School Degree Yet"
drop if wpsbil == 7

*Drop observations without years of education, year of birth or state of school
drop if missing(edu_years, year_birth, state_school)

*Sets of control dummy variables, one dummy per birth year and per state of school
quietly tabulate year_birth, generate(year_birth_d)
quietly tabulate state_school, generate(state_school_d)

count

************************************************************************************
	*** Wage data set ***

*Up to this point the wage and the skills samples are built from the same variables. Observations without wage information are excluded only now, because this is needed for the wage regressions alone.
*For the reasons given in the paper, the wage sample is made more homogeneous and outliers are dropped.
*Keep a copy of the current data for the skills regressions
preserve
*Wage information must be available
keep if log_wage != .
*Only full-time or regular part-time employees
keep if inlist(wp07, 1, 2)
*The hourly gross wage is derived from the monthly income, the weeks per month and the average hours per week; extremely low and extremely high hourly wages are removed to guard against flaws in that calculation
keep if inrange(wage, 5, 60)

count

*Save the wage sample
cd "$edit"
save 02_clear_wage.dta, replace

count

************************************************************************************
	*** Cognitive skills data set ***

*Bring back the preserved data, so that observations dropped for the wage sample are available again and the number of observations is larger
restore

*Skills test score
rename f96t90s skills_score

*A test score of zero stands for respondents who refused to participate; treat it as missing
replace skills_score = . if skills_score == 0

*Log of the test score, to make the interpretation easier
generate skills_score_ln = log(skills_score)

*Keep only observations with a test score, i.e. drop those where e.g. the ultra-short intelligence test was not taken
keep if !missing(skills_score_ln)

*Save the skills sample
cd "$edit"
save 03_clear_skills.dta, replace


