* Session setup: switch off output paging, close any log that is still open
* (ignoring the error when none is) and start a fresh log for this run.
set more off

capture log close
log using "Output/census", replace

// Do-file to clean census data
*** POPULATION
** 2001, stored in SS_bevolking2001.dta
* Imports the 2001 population sheet, names its columns, derives shares of the
* total number of inhabitants, filters rows on column D and saves the
* sector-level file sorted by niscode and sectorcode.
clear
* The path uses forward slashes, as the Output paths in this file already do:
* Stata accepts them on every platform, whereas backslashes only resolve on Windows.
import excel "Source/Census 2001/N_T_3001.ese België-y01.xls", sheet("Sheet1") cellrange(A4:O23181)

* Names (column D keeps its import name; it is only used for the row filter below)
rename A niscode
rename B sectorcode
rename C naamvandesector

rename E mannenbelg
rename F vrouwenbelg
rename G totaalbelg
rename H mannenvreemd
rename I vrouwenvreemd
rename J totaalvreemd
rename K mannentotaal
rename L vrouwentotaal
rename M totaalinwoners
rename N oppervlaktesector
label variable oppervlaktesector "Oppervlakte van de statistische sector in ca"
rename O bevolkingsdichtheid

* Relative: every count from mannenbelg through vrouwentotaal is divided by the
* total number of inhabitants. The results are proportions (not multiplied by
* 100); a zero or missing total gives a missing value.
foreach var of varlist mannenbelg-vrouwentotaal {
	generate `var'rel = `var'/totaalinwoners
}

* Clean: strip spaces from the sector code, recode sector "Z" to "9999" and keep
* only rows where D equals 0. The "9999" rows are forced to D=0 first so that
* they survive the filter.
* NOTE(review): what column D encodes is not visible in this file - confirm.
replace sectorcode=subinstr(sectorcode," ","",.)
replace sectorcode="9999" if sectorcode=="Z"
replace D=0 if sectorcode=="9999"
drop if D~=0
drop D

sort niscode sectorcode

* Labels
label variable totaalinwoners "Total number of citizens, 2001"
* Unit corrected to square kilometer: the rescaled copy created below divides
* this variable by 100 and is labelled in 100 inhabitants per square kilometer.
label variable bevolkingsdichtheid "Inhabitants/square kilometer, 2001"
label variable totaalvreemdrel "% foreign inhabitants, 2001"
label variable mannentotaalrel "% male, 2001"
label variable mannenbelgrel "% Belgian male, 2001"
label variable vrouwenbelgrel "% Belgian female, 2001"
label variable totaalbelgrel "% Belgian, 2001"
label variable mannenvreemdrel "% foreign male, 2001"
label variable vrouwenvreemdrel "% foreign female, 2001"
label variable vrouwentotaalrel "% female, 2001"

* Population density rescaled to 100 inhabitants per square kilometer
gen bevolkingsdichtheid100=bevolkingsdichtheid/100
label variable bevolkingsdichtheid100 "100 inhabitants/square kilometer, 2001"
order bevolkingsdichtheid100, after(bevolkingsdichtheid)

save "Output/SS_bevolking2001.dta", replace

	// Municipality level: sum the sector counts and sector areas per niscode
	collapse (sum) totaalbelg totaalvreemd oppervlaktesector, by(niscode)

	* Municipal population = Belgian + foreign inhabitants
	local inwoners (totaalbelg+totaalvreemd)

	* 10000 * inhabitants per unit of area; this equals 100 inhabitants per
	* square kilometer if the area is in centiares, as its sector-level label states
	generate bevolkingsdichtheid100_nis = 10000*(`inwoners'/oppervlaktesector)

	* Proportion of Belgian inhabitants in the municipality
	generate totaalbelgrel_nis = totaalbelg/`inwoners'

	save "Output/NIS_bevolking2001.dta", replace
	

*** HOUSEHOLDS
** 2001, stored in SS_huishoudens2001.dta
* Imports the 2001 household sheet and names its columns: single men, single
* women, households of 2 up to 8 persons, and the household totals. Column D
* keeps its import name; it is used further down as a row filter.
clear
* The path uses forward slashes, as the Output paths in this file already do:
* Stata accepts them on every platform, whereas backslashes only resolve on Windows.
import excel "Source/Census 2001/N_T_3007A.ese België-y01.xls", sheet(" ") cellrange(A4:P22395)

rename A niscode
rename B sectorcode
rename C naamvandesector

rename E alleenman
rename F alleenvrouw
rename G huishouden2
label variable huishouden2 "Huishouden met 2 personen"
rename H huishouden3
rename I huishouden4
rename J huishouden5
rename K huishouden6
rename L huishouden7
rename M huishouden8
label variable huishouden8 "Huishouden met 8 personen of meer"
rename N totparticulierehuishoudens
rename O totcollectievehuishoudens
rename P totaalhuishoudens


* One-person households = single men + single women. The variable is placed
* right before huishouden2 so that the range huishouden1-huishouden8 is contiguous.
gen huishouden1=alleenman+alleenvrouw
order huishouden1,before(huishouden2)
	// Municipality level, computed on a copy: the sector-level data in memory
	// is put back by the restore at the end
	// NOTE(review): this aggregation runs on the rows as imported, before the
	// column D filter that is applied to the sector-level data further down -
	// confirm that the unfiltered rows are meant to be included.
	preserve

	* Sum the household counts per municipality
	collapse (sum) totparticulierehuishoudens huishouden1-huishouden8, by(niscode)

	* Average household size: size-weighted sum of the counts divided by the
	* number of private households (the 8-or-more class is counted as 8)
	generate hh_average = 0
	forvalues size = 1/8 {
		replace hh_average = hh_average + `size'*huishouden`size'
	}

	* Proportion of private households of each size
	forvalues size = 1/8 {
		generate huishouden`size'rel = huishouden`size'/totparticulierehuishoudens
	}
	replace hh_average = hh_average/totparticulierehuishoudens

	keep niscode huishouden*rel hh_average

	* Suffix _nis marks the municipality-level variables
	foreach v of varlist huishouden*rel hh_average {
		rename `v' `v'_nis
	}

	save "Output/NIS_huishoudens2001.dta", replace
	restore


* Sector level: strip spaces from the sector code, recode sector "Z" to "9999"
* and keep only rows where D equals 0 (the "9999" rows are forced to D=0 first)
replace sectorcode = subinstr(sectorcode, " ", "", .)
replace sectorcode = "9999" if sectorcode == "Z"
replace D = 0 if sectorcode == "9999"
drop if D != 0
drop D

sort niscode sectorcode

* Average household size: size-weighted sum of the counts divided by the number
* of private households (the 8-or-more class is counted as 8)
generate hh_average = 0
forvalues size = 1/8 {
	replace hh_average = hh_average + `size'*huishouden`size'
}

* Proportion of private households of each size
forvalues size = 1/8 {
	generate huishouden`size'rel = huishouden`size'/totparticulierehuishoudens
}
replace hh_average = hh_average/totparticulierehuishoudens

* Sector code "ZZZZ" is mapped to "9999" as well
replace sectorcode = subinstr(sectorcode, " ", "", .)
replace sectorcode = "9999" if sectorcode == "ZZZZ"

keep niscode sectorcode huishouden*rel hh_average

* Labels: sizes 1 to 7 follow one pattern, the last class is 8 or more
forvalues size = 1/7 {
	label variable huishouden`size'rel "% household of `size', 2001"
}
label variable huishouden8rel "% household of >7, 2001"

save "Output/SS_huishoudens2001.dta", replace


	

clear


*** EDUCATION
** 2001, stored in SS_onderwijs2001.dta
* First of the two education sheets (file ATF2): imports it, names the columns,
* applies the row filter on column D, removes the "9999" sectors and saves the
* result as a temporary file that is merged into the other sheet afterwards.
* The path uses forward slashes, as the Output paths in this file already do:
* Stata accepts them on every platform, whereas backslashes only resolve on Windows.
import excel "Source/Census 2001/N_T_3022ATF2.ese België-y01.xls", sheet("     ") cellrange(A5:J22396)
rename A niscode
rename B sectorcode
rename C naamvandesector

rename E postsecundairniethoger
rename F hogeronderwijs
rename G opleidingbelgischnietgekend
rename H buitenlandsdiploma
rename I geendiploma
rename J opleidinggeenantwoord

* Strip spaces from the sector code, recode sector "Z" to "9999" and keep only
* rows where D equals 0 (the "9999" rows are forced to D=0 first).
* NOTE(review): what column D encodes is not visible in this file - confirm.
replace sectorcode=subinstr(sectorcode," ","",.)
replace sectorcode="9999" if sectorcode=="Z"
replace D=0 if sectorcode=="9999"
drop if D~=0
drop D

* The "9999" sectors are not kept in the education data
drop if sectorcode=="9999"

save "Output/SS_onderwijs2001TEMP.dta", replace

clear

** The education data are spread over 2 Excel files; they are merged here
* Second education sheet (file ATF1): imports it, names the columns, applies the
* same row filter, merges in the temporary file built from the other sheet,
* derives the share variables and saves the sector-level education file.
* The path uses forward slashes, as the Output paths in this file already do:
* Stata accepts them on every platform, whereas backslashes only resolve on Windows.
import excel "Source/Census 2001/N_T_3022ATF1.ese België-y01.xls", sheet("     ") cellrange(A5:O22396)
rename A niscode
rename B sectorcode
rename C naamvandesector

rename E totaalnietmeerschoolgaand
rename F totaalbelgischdiploma
label variable totaalbelgischdiploma "Dit omvat de kolommen lageronderwijs tem opleidingbelgischnietgekend"
rename G lageronderwijs
rename H ASOlager
rename I TSOlager
rename J KSOlager
rename K BSOlager
rename L ASOhoger
rename M TSOhoger
rename N KSOhoger
rename O BSOhoger

* Strip spaces from the sector code, recode sector "Z" to "9999" and keep only
* rows where D equals 0 (the "9999" rows are forced to D=0 first)
replace sectorcode=subinstr(sectorcode," ","",.)
replace sectorcode="9999" if sectorcode=="Z"
replace D=0 if sectorcode=="9999"
drop if D~=0
drop D

sort niscode sectorcode

* The "9999" sectors are not kept in the education data
drop if sectorcode=="9999"

* Append the columns of the other sheet; unmatched rows from either side are kept
merge 1:1 niscode sectorcode using "Output/SS_onderwijs2001TEMP.dta"
drop _merge

* Shares among people who left school and answered the education question.
* The range relies on the column order after the merge: the counts of this
* sheet followed by those of the temporary file, up to geendiploma.
foreach var of varlist totaalbelgischdiploma-geendiploma {
	generate `var'rel = `var'/(totaalnietmeerschoolgaand-opleidinggeenantwoord)
}

* Non-response is expressed relative to everybody who left school
gen opleidinggeenantwoordrel=opleidinggeenantwoord/totaalnietmeerschoolgaand

label variable lageronderwijsrel "% lower education"
label variable ASOlagerrel "% high school (ASO, first 3 years)"
label variable TSOlagerrel "% high school (TSO, first 3 years)"
label variable KSOlagerrel "% high school (KSO, first 3 years)"
label variable BSOlagerrel "% high school (BSO, first 3 years)"
label variable ASOhogerrel "% high school (ASO, 6 years)"
label variable TSOhogerrel "% high school (TSO, 6 years)"
label variable KSOhogerrel "% high school (KSO, 6 years)"
label variable BSOhogerrel  "% high school (BSO, 6 years)"
label variable postsecundairniethogerrel "% post-high school, not higher education"
label variable hogeronderwijsrel "% higher education"
label variable opleidingbelgischnietgekendrel "% Belgian degree unknown"
label variable buitenlandsdiplomarel "% foreign degree"
label variable geendiplomarel "% no degree"
label variable opleidinggeenantwoordrel "% education: no answer to the question"

* High school shares summed over the four tracks
gen seclagerrel=ASOlagerrel+TSOlagerrel+KSOlagerrel+BSOlagerrel
label variable seclagerrel "% high school, 3 years"
gen sechogerrel=ASOhogerrel+TSOhogerrel+KSOhogerrel+BSOhogerrel
label variable sechogerrel "% high school, 6 years"

* Composition of high school degrees by track. The common denominator (all
* high school degrees, lower and higher cycle) is written once in a local macro.
local secundair (ASOlager+ASOhoger+TSOlager+TSOhoger+BSOlager+BSOhoger+KSOlager+KSOhoger)
gen ASOrel=(ASOlager+ASOhoger)/`secundair'
label variable ASOrel "% of high school: general"
gen TSOrel=(TSOlager+TSOhoger)/`secundair'
label variable TSOrel "% of high school: technical"
gen BSOrel=(BSOlager+BSOhoger)/`secundair'
label variable BSOrel "% of high school: professional"
gen KSOrel=(KSOlager+KSOhoger)/`secundair'
label variable KSOrel "% of high school: arts"

* Remainder after lower, high school and higher education
gen anderdiplomarel=1-seclagerrel-sechogerrel-lageronderwijsrel-hogeronderwijsrel
label variable anderdiplomarel "% other degree"

* Show the negative remainders, then set them to zero
tab anderdiplomarel if anderdiplomarel<0
replace anderdiplomarel=0 if anderdiplomarel<0

rename opleidingbelgischnietgekend oplbelgnietgek

save "Output/SS_onderwijs2001.dta", replace
erase "Output/SS_onderwijs2001TEMP.dta"

 // Municipality level: drop the sector-level shares, sum the counts per
 // niscode and rebuild the same shares from the municipal totals
	drop *rel
	collapse (sum) totaalnietmeerschoolgaand-opleidinggeenantwoord, by(niscode)

	* Shares among people who left school and answered the education question
	local answered (totaalnietmeerschoolgaand-opleidinggeenantwoord)
	foreach count of varlist totaalbelgischdiploma-geendiploma {
		generate `count'rel = `count'/`answered'
	}

	* Non-response relative to everybody who left school
	generate opleidinggeenantwoordrel = opleidinggeenantwoord/totaalnietmeerschoolgaand

	* High school shares summed over the four tracks
	generate seclagerrel = ASOlagerrel+TSOlagerrel+KSOlagerrel+BSOlagerrel
	label variable seclagerrel "% high school, 3 years"
	generate sechogerrel = ASOhogerrel+TSOhogerrel+KSOhogerrel+BSOhogerrel
	label variable sechogerrel "% high school, 6 years"

	* Composition of high school degrees by track, over a common denominator
	local secundair (ASOlager+ASOhoger+TSOlager+TSOhoger+BSOlager+BSOhoger+KSOlager+KSOhoger)
	local tracks ASO TSO BSO KSO
	local names general technical professional arts
	forvalues k = 1/4 {
		local track : word `k' of `tracks'
		local name : word `k' of `names'
		generate `track'rel = (`track'lager+`track'hoger)/`secundair'
		label variable `track'rel "% of high school: `name'"
	}

	* Remainder after lower, high school and higher education
	generate anderdiplomarel = 1-seclagerrel-sechogerrel-lageronderwijsrel-hogeronderwijsrel
	label variable anderdiplomarel "% other degree"

	* Show the negative remainders, then set them to zero
	tabulate anderdiplomarel if anderdiplomarel<0
	replace anderdiplomarel = 0 if anderdiplomarel<0

	* Suffix _nis on every variable except niscode
	foreach v of varlist totaalnietmeerschoolgaand- anderdiplomarel {
		rename `v' `v'_nis
	}

	save "Output/NIS_onderwijs2001.dta", replace
	
	
		
	
	
*** INCOME
** 2004, stored in SS_inkomen2004
* Imports the 2004 income sheet with every cell read as text, names the columns,
* blanks out the "non disponible" entries, keeps the identifiers together with
* the number of tax returns and the mean and median income, converts the text to
* numbers where possible and saves the sector-level income file.
* The path uses a forward slash, as the Output paths in this file already do:
* Stata accepts it on every platform, whereas a backslash only resolves on Windows.
import excel "Source/SECPUB_Y2005 inkomens 2004 aanslagjaar 2005 - via JE.xls", sheet("DFS.SECTOR.SECPUB.Y2005.versieD") cellrange(A4:J21379) allstring clear

rename A niscode
rename B sectorcode
rename C naamvandesector
rename D aantalaangiften
label variable aantalaangiften "Total number of tax returns"
rename E totaalbelastbaarnettoinkomen
rename F gemiddeldinkomen
label variable gemiddeldinkomen "Mean income"
rename G mediaaninkomen
label variable mediaaninkomen "Median income"
rename H interkwartielverschil
label variable interkwartielverschil "Interquartile difference of tax returns"
rename I interkwartielcoef
rename J interkwartielasymmetrie

drop if missing(sectorcode) /* removes the empty rows and the row holding text */

* Dropping "non disponible" through the ignore option of destring does not work
* for this year, hence this loop
foreach var of varlist totaalbelastbaarnettoinkomen gemiddeldinkomen mediaaninkomen interkwartielverschil interkwartielcoef interkwartielasymmetrie {
	replace `var'="." if `var'=="non disponible"
}

* NOTE(review): dateincome is not in the keep list below, so it does not reach
* the saved file - confirm whether it should be kept.
generate dateincome=2004
*destring, ignore("," "  ") replace

keep niscode sectorcode aantalaangiften gemiddeldinkomen mediaaninkomen
* destring without a variable list is applied to every remaining variable
destring,replace

*replace sectorcode=subinstr(sectorcode,"O","0",.) /* letter O was entered instead of digit 0; this line is needed to match the datasets from the website */
replace sectorcode="A0MA" if sectorcode=="AOMA" /*avoids problem with merge->not sure if still the case in this db*/

sort niscode sectorcode

save "Output/SS_inkomen2004.dta", replace

	// Municipality level: per niscode, the mean of the sector mean incomes and
	// the median of the sector median incomes, using the number of tax returns
	// as frequency weights
	collapse (mean) gemiddeldinkomen (median) mediaaninkomen [fweight=aantalaangiften], by(niscode)
	summarize

	* Suffix _nis marks the municipality-level variables
	rename gemiddeldinkomen gemiddeldinkomen_nis
	rename mediaaninkomen mediaaninkomen_nis

	save "Output/NIS_inkomen2004.dta", replace
	
	
* Merge 2001, sector level: education as master, population and households
* added one-to-one on niscode and sectorcode; the merge indicators are dropped
use "Output/SS_onderwijs2001.dta", clear
merge 1:1 niscode sectorcode using "Output/SS_bevolking2001.dta", generate(_merge_bev)
merge 1:1 niscode sectorcode using "Output/SS_huishoudens2001.dta", generate(_merge_hh)
drop _merge_bev _merge_hh

save "Output/census2001", replace

* Merge 2001, municipality level: same sources, matched one-to-one on niscode
use "Output/NIS_onderwijs2001.dta", clear
merge 1:1 niscode using "Output/NIS_bevolking2001.dta", generate(_merge_bev)
merge 1:1 niscode using "Output/NIS_huishoudens2001.dta", generate(_merge_hh)
drop _merge_bev _merge_hh

save "Output/NIS_census2001", replace

log close
