********************************************************************************
************************ TEGEMEO TECHNOLOGY ADOPTION ***************************
********************************************************************************
 
di "hello"

*Set the base directory (the folder that contains the "data" subfolder used below)
local maindir  "/path/to/folder"

*Set ado file location
*sysdir set PERSONAL "\\client\c$\stataado"
*sysdir set PLUS "\\client\c$\stataado"


*These user-written packages are required in some part of this file.
*Each one is installed only when its command cannot be found on the ado-path, so that
*re-running the script neither needs an internet connection nor aborts when the
*installed copy differs from the copy currently on SSC.
capture which filelist
if _rc ssc install filelist
capture which sdecode
if _rc ssc install sdecode
*labutil2 is probed through labmvs, the command this file calls further down
*NOTE(review): assumes labmvs is shipped by labutil2 - confirm
capture which labmvs
if _rc ssc install labutil2

*****************
** CLEAN FILES **
*****************

*This cleaning is tailored to a specific file structure. See documentation for help.

*Session settings. Some variable names are prefixes of other variable names, so
*abbreviated variable references are switched off; output pausing is switched off too.
set more off
set varabbrev off

*Build a catalogue of every dta file found below the raw Tegemeo folder
cd "`maindir'/data/raw/tegemeo"
filelist, pat("*.dta") save("`maindir'/data/tegemeo_datasets.dta") replace

*Split the directory of each catalogued file into its path components
*(dirname1, dirname2, ...) and write the catalogue back to disk
use "`maindir'/data/tegemeo_datasets.dta", clear
split dirname, parse("/")
save "`maindir'/data/tegemeo_datasets.dta", replace

*Create the folder tree for the intermediate and cleaned files, mirroring the raw data folders.
*The catalogue of dta files (one observation per file) must be in memory at this point.
*window stopbox rusure "Do the directories already exist for the cleaned files?" `"Pressing "No" will result in errors if they do exist."'
*make_dir==1 switches on the directory creation here and the file cleaning further down
local make_dir=1
if `make_dir'==1 {
	keep dirname2 dirname3 dirname4
*	Turn levels 3 and 4 into paths relative to the tegemeo folder
	replace dirname3 = dirname2 + "/" + dirname3
	replace dirname4 = dirname3 + "/" + dirname4
*	One observation per distinct folder is enough
	bysort dirname2 dirname3 dirname4: keep if _n == 1
*	Fixed part of the tree. Parents are listed before their children because each
*	mkdir call creates a single level. Folders that already exist are skipped, so
*	the script can be re-run without aborting on an existing directory.
	foreach newdir in "`maindir'/data/intermediate" "`maindir'/data/cleaned" "`maindir'/data/intermediate/tegemeo" "`maindir'/data/intermediate/processed" "`maindir'/data/cleaned/tegemeo" {
		mata: st_local("dir_exists", strofreal(direxists(st_local("newdir"))))
		if `dir_exists' == 0 mkdir "`newdir'"
	}
*	Data-driven part of the tree, again from the top level down
	foreach var in dirname2 dirname3 dirname4 {
		levelsof `var', local(levels)
		foreach lev of local levels {
			local newdirI = "`maindir'/data/intermediate/tegemeo/" + "`lev'"
			mata: st_local("dir_exists", strofreal(direxists(st_local("newdirI"))))
			if `dir_exists' == 0 mkdir "`newdirI'"
*			Only the top level (dirname2) is mirrored under the processed folder
			if "`var'" == "dirname2" {
				local newdirP = "`maindir'/data/intermediate/processed/" + "`lev'"
				mata: st_local("dir_exists", strofreal(direxists(st_local("newdirP"))))
				if `dir_exists' == 0 mkdir "`newdirP'"
			}
		}
	}
	clear
*window stopbox note "Directories created. Continuing with script."
}

*window stopbox rusure "Do you want to clean the files?" `"Pressing "Yes" will create new files in the "/data/cleaned/" directory."'
*Clean every raw dta file listed in the catalogue and save a copy with the suffix
*_cleaned under data/intermediate/tegemeo, in the same subfolder as the raw file.
*NOTE(review): this step is switched on by make_dir, the same flag as the directory creation
if `make_dir'==1{
	use "`maindir'/data/tegemeo_datasets.dta", clear
*	Number of observations in the catalogue = number of dta files to clean
	local obs = _N
*	Loop through each dta file
	forvalues i=1/`obs' {
*		Load only the catalogue row that describes file i
		use "`maindir'/data/tegemeo_datasets.dta" in `i', clear
*		f is the raw file, newf the matching location in the intermediate tree
*		(the first character of dirname is skipped when building both paths)
		local f = "`maindir'/data/raw/tegemeo" + substr(dirname , 2, .) + "/" + filename
		local newf = "`maindir'/data/intermediate/tegemeo" + substr(dirname , 2, .) + "/" + filename
		local filenamemac = filename
*		Directory, used below to decide whether a year variable has to be added
		local datacheck=dirname[1]
*		Top-level folder of the file, used as the survey year
		local yearsave=dirname2[1]
*		Open dta file
		use "`f'", clear
*		Tegemeo codes missing as -888 and not applicable as -889, replace with extended missing
		mvdecode _all, mv(-888=.a \ -889=.n)
*		Grab any other missing values through their value labels
		labmvs _all, mv("not recorded" .m "notrecorded" .m "not recor*" .m)
		labmvs _all, mv("*not applicable*" .n "NA" .n "*N*/*A*" .n)
*		Convert value-labelled variables to trimmed, lower-case strings (eases appending later)
		ds, has(vallabel)
		if "`r(varlist)'" != "" {
			foreach v of varlist `r(varlist)' {
				sdecode `v', replace
				replace `v' = trim(itrim(lower(`v')))
			}
		}
*		Drop all value labels
		label drop _all
*		If the season variable exists, rename it harvest (named harvest in other data sets)
		capture confirm variable season
		if _rc == 0 rename season harvest
*		Destring all variables (only strings that hold numbers are converted, other strings are left alone)
		foreach v of varlist _all {
			destring `v', replace
		}
*		Harmonize the 1997 year with other years
		if "`yearsave'" == "1997" {
*			Recode year only when it exists and is still a string
			capture confirm string variable year
			if _rc == 0 {
				replace year = "1996" if year == "1995-96"
				replace year = "1997" if year == "1996-97"
				destring year, replace
			}
*			Same guard for harvest: a string comparison on a numeric variable would abort the run
			capture confirm string variable harvest
			if _rc == 0 {
				replace harvest = "main" if harvest == "main harvest"
				replace harvest = "short" if harvest == "small harvest"
			}
		}
*		Harmonize the 2000 year with other years
		if "`yearsave'" == "2000" {
*			Guarded like the 1997 branch: year may already be numeric after the destring above
			capture confirm string variable year
			if _rc == 0 {
				replace year = "2000" if year == "99/2000"
				destring year, replace
			}
		}
*		Files in a directory whose name contains "Data" but not "General" get a year
*		variable (taken from the top-level folder name) when they do not have one yet
		if strmatch(strupper("`datacheck'"), "*GENERAL*") == 0 & strmatch(strupper("`datacheck'"), "*DATA*") == 1 {
			capture confirm variable year
			if _rc {
				gen year = "`yearsave'"
				label var year "Year survey took place (Note: Harvest took place year before)"
				destring year, replace
			}
		}
*		Try and make fertilizer more uniform: remove all blanks from type and unit
		foreach v in ferttype fertunit {
			capture confirm variable `v'
			if _rc == 0 {
				gen `v'temp = subinstr(`v'," ","",.)
				order `v'temp, after(`v')
				drop `v'
				rename `v'temp `v'
			}
		}
*		Drop the comment, de and enum variables when present
		foreach v in comment de enum {
			capture confirm variable `v'
			if _rc == 0 {
*				local filenamenodta = subinstr("`filename'",".dta","",.)
*				rename `v' `v'_`filenamenodta'
				drop `v'
			}
		}
*		Save the resultant file next to its intermediate location, replace
		local filenamenodta = subinstr("`newf'",".dta","",.)
		save "`filenamenodta'_cleaned.dta", replace
	}
}
*Clear Stata when cleaning is done
clear
*Display message when cleaning is done
*capture window stopbox note "File cleaning has finished"

****************************************************************
********************** DATA PREPARATION ************************ 
****************************************************************

***************
** RAIN DATA **
***************

*Open rain data (same as used by Suri (2011))
use "`maindir'/data/raw/tegemeo/General/Extract/General Data/Tapra_hh_seasonal_rain_1996_2010.dta", clear

*The 1996 data-availability fractions carry the year in the middle of the name;
*move it to the end so that reshape can treat them as stubs
rename main96frac mainfrac96
rename short96frac shortfrac96

*One row per household and two-digit year
reshape long main short mainfrac shortfrac mainstress shortstress, i(hhid) j(year, string)

*Expand the two-digit years ("96" ... "10") to four digits, then make year numeric
forvalues fullyear = 1996/2010 {
	local shortyear = substr("`fullyear'", 3, 2)
	replace year = "`fullyear'" if year == "`shortyear'"
}
destring year, replace

*Stack the two seasons: the original rows carry the main season, the copies
*(duplicate == 1) carry the short season; copies without short-season rain are dropped
expand 2, gen(duplicate)
drop if duplicate == 1 & short == .

*One variable per measure, with harvest telling the seasons apart
gen rainfall = cond(duplicate == 1, short, main)
gen stress = cond(duplicate == 1, shortstress, mainstress)
gen frac = cond(duplicate == 1, shortfrac, mainfrac)
gen harvest = cond(duplicate == 1, "short", "main")

*Household averages over all years, separately for each season
sort hhid harvest year
by hhid harvest: egen avgrainfall=mean(rainfall)
by hhid harvest: egen avgstress=mean(stress)

*Remove the season-specific source variables and tidy the variable order
drop duplicate main short mainstress shortstress mainfrac shortfrac
order gpsmiss, last
order harvest, after(year)

*Label variables
label var harvest "Harvest season"
label var rainfall "Rainfall(mm), year is end of season (i.e. Season is Year-1 to Year)"
label var stress "Fraction of 20 day periods with <40mm rain for season"
label var frac "For 1996, fraction of data included in estimate for season (incomplete data)"
label var avgrainfall "Average of the rainfalls (mm) for the given years for the season"
label var avgstress "Average fraction of 20 day periods with <40mm for season"

*Only the main-season rainfall and its household average are carried forward
keep if harvest == "main"
keep hhid year rainfall avgrainfall

*Save the full panel
save "`maindir'/data/intermediate/processed/rainfall.dta", replace

*Save one extract per survey round. The 1997 extract also holds the 1996 rows.
preserve
keep if inlist(year, 1996, 1997)
save "`maindir'/data/intermediate/processed/1997/rain97.dta", replace
restore

foreach round in 2000 2004 2007 2010 {
	local suffix = substr("`round'", 3, 2)
	preserve
	keep if year == `round'
	save "`maindir'/data/intermediate/processed/`round'/rain`suffix'.dta", replace
	restore
}

clear


*********************
** LOCATIONAL DATA **
*********************

***1997***
*The 1997 household identifier file is carried over unchanged
copy "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/hhidfinl97_cleaned.dta" "`maindir'/data/intermediate/processed/1997/hhidfinl97_cleaned.dta", replace

*Build the list of districts with their province and zone(s).
*It is combined with the fertilizer types later, in the fertilizer section.
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/hhidfinl97_cleaned.dta", clear
keep dist prov zone

*One row per distinct district/province/zone combination
bysort dist prov zone: keep if _n == 1

*A district can lie in more than one zone; number the zones within each district
*and go wide so that zone1, zone2, ... sit on one row per district
bysort dist (prov zone): gen zone_no = _n
reshape wide zone, i(prov dist) j(zone_no)

*Temporarily save the file with districts, provinces, and zones
tempfile distprovzone
save "`distprovzone'"

***2000, 2004, 2007, 2010***
*For these rounds the identifier file is copied after dropping the variables listed per year
*copy "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/hhidfinal10_cleaned.dta" "`maindir'/data/intermediate/processed/2010/hhidfinal10_cleaned.dta", replace
local drop2000 sample intview super
local drop2004 memnum sample benefic ngo program intview surdate super
local drop2007 hh1 deg1 min1 sec1 hh2 deg2 min2 sec2 surdate memnum snum masl intview
local drop2010 mem hh1 dd_ns hh2 dd_ew masl intview surdate snum shhpanel othintvi

foreach round in 2000 2004 2007 2010 {
	local suffix = substr("`round'", 3, 2)
	use "`maindir'/data/intermediate/tegemeo/`round'/Extract/`round' Data/hhidfinal`suffix'_cleaned.dta", clear
	drop `drop`round''
	save "`maindir'/data/intermediate/processed/`round'/hhidfinal`suffix'_cleaned.dta", replace
}

******************
** DEMOGRAPHICS **
******************

***Helpers shared by the yearly demographic sections below***

*demog_groups_by_age: flag every member as boy, girl, man, woman or older man.
*Expects a numeric variable age and a string variable gender in memory.
*Stata ranks numeric missing values above every number, so the open-ended comparisons
*(age > 16, age >= 40) must exclude missing ages explicitly. Otherwise members without
*an age would be counted as women or older men instead of being left out of the sums.
capture program drop demog_groups_by_age
program define demog_groups_by_age
*	Boys and girls: 16 or younger
	gen boys = cond(age <= 16 & gender == "male", 1, 0)
	gen girls = cond(age <= 16 & gender == "female", 1, 0)
*	Men: 17 to 39
	gen men = cond(40 > age & age > 16 & gender == "male", 1, 0)
*	Women: 17 or older
	gen women = cond(age > 16 & !missing(age) & gender == "female", 1, 0)
*	Older men: 40 or older
	gen oldermen = cond(age >= 40 & !missing(age) & gender == "male", 1, 0)
end

*demog_household_totals: collapse the member-level flags to one row per household,
*add the household size and label the result.
*Expects hhid, the five group flags and agecalc, agemiss, gendermiss in memory.
capture program drop demog_household_totals
program define demog_household_totals
	collapse (sum) boys girls men women oldermen agecalc agemiss gendermiss, by(hhid)

*	Total household size (sum of each demographic group)
	gen hhsize = boys + girls + men + women + oldermen

	label var boys "Number of boys 16 or younger in household"
	label var girls "Number of girls 16 or younger in household"
	label var men "Number of men 17-39 in household"
	label var women "Number of women 17 or older in household"
	label var oldermen "Number of men 40 or older household"
	label var hhsize "Number of people in household"
	label var agecalc "number of members whose age was calculated = year of survey - year born"
	label var agemiss "number of members with missing age/estimated age (treated as 0 in sum)"
	label var gendermiss "number of members with missing genders"
end

***1997***
*Age is recorded in brackets (strings) in 1997, so the groups are built from the bracket names
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/demog97_cleaned.dta", clear

keep hhid age sex

*NOTE(review): no year of birth is used in this round, yet agecalc is set equal to agemiss
*rather than 0 - confirm that this is intended
gen agecalc = missing(age)
gen agemiss = missing(age)
gen gendermiss = missing(sex)

*Boys and girls: 16 or younger
gen boys = cond(age == "under 6 years"& sex == "male",1,0)
replace boys = 1 if age == "6-16 years" & sex == "male"
gen girls = cond(age == "under 6 years" & sex == "female",1,0)
replace girls = 1 if (age == "6-16 years" & sex == "female")

*Men: 17 to 39
gen men = cond(age == "17-39 years" & sex == "male",1,0)

*Women: 17 or older
gen women = cond(age == "17-39 years" & sex == "female",1,0)
replace women = 1 if (age == "over 40 years old" & sex == "female")

*Older men: 40 or older
gen oldermen = cond(age == "over 40 years old" & sex == "male",1,0)

demog_household_totals

save "`maindir'/data/intermediate/processed/1997/demographics97_cleaned.dta", replace

***2000***
use "`maindir'/data/intermediate/tegemeo/2000/Extract/2000 Data/demog00_cleaned.dta", clear

ren d3 age
ren d2 gender

*Missing is "person unavailable to give information"
replace age = "" if age == "person unavailable to give information"
replace gender = "" if gender == "person unavailable to give information"
destring age, replace

*NOTE(review): as in 1997, agecalc equals agemiss although no age is calculated here - confirm
gen agecalc = missing(age)
gen agemiss = missing(age)
gen gendermiss = missing(gender)

demog_groups_by_age
demog_household_totals

save "`maindir'/data/intermediate/processed/2000/demographics00_cleaned.dta", replace

***2004***
*Members are spread over three files; two record the year of birth, one records the age
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/demoga_a04_cleaned.dta", clear

keep hhid ad01 ad02
rename ad01 yearborn
rename ad02 gender

tempfile temp1demog2004
save "`temp1demog2004'"

use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/demoga04_cleaned.dta", clear

keep hhid age da02
rename da02 gender

tempfile temp2demog2004
save "`temp2demog2004'"

use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/demogc04_cleaned.dta", clear

keep hhid dc01 dc02
rename dc01 yearborn
rename dc02 gender

append using "`temp1demog2004'" "`temp2demog2004'"

*agecalc: age has to be derived from the year of birth; agemiss: neither is available
gen agecalc = missing(age) & !missing(yearborn)
gen agemiss = missing(age) & missing(yearborn)
gen gendermiss = missing(gender)

replace age = (2004 - yearborn) if missing(age)

demog_groups_by_age
demog_household_totals

save "`maindir'/data/intermediate/processed/2004/demographics04_cleaned.dta", replace

***2007 and 2010***
*Both rounds have the same layout: two member files, each with year of birth, gender and age
foreach round in 2007 2010 {
	local suffix = substr("`round'", 3, 2)

	use "`maindir'/data/intermediate/tegemeo/`round'/Extract/`round' Data/demog`suffix'_cleaned.dta", clear

	keep hhid da01 da02 age
	ren da01 yearborn
	ren da02 gender

	tempfile temp1demog`round'
	save "`temp1demog`round''"

	use "`maindir'/data/intermediate/tegemeo/`round'/Extract/`round' Data/demoga`suffix'_cleaned.dta", clear

*	hhid has to be kept in both files: members without it would all be collapsed
*	into a single household with a missing identifier
	keep hhid ad01 ad02 age
	ren ad01 yearborn
	ren ad02 gender

	append using "`temp1demog`round''"

*	agecalc: age has to be derived from the year of birth; agemiss: neither is available
	gen agecalc = missing(age) & !missing(yearborn)
	gen agemiss = missing(age) & missing(yearborn)
	gen gendermiss = missing(gender)

	replace age = (`round' - yearborn) if missing(age)

	demog_groups_by_age
	demog_household_totals

	save "`maindir'/data/intermediate/processed/`round'/demographics`suffix'_cleaned.dta", replace
}


***************
** CROP DATA **
***************

***1997***
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/croplev97_cleaned.dta", clear

replace crop = lower(crop)

*Keep the maize records of the main harvest.
*Suri only uses 1997, so rows from any other year are removed as well
*(NEED TO CHECK this as year may be off).
keep if strpos(crop, "maize") > 0 & year == 1997 & harvest == "main"

save "`maindir'/data/intermediate/processed/1997/croplev97_cleaned.dta", replace

***2000***
use "`maindir'/data/intermediate/tegemeo/2000/Extract/2000 Data/croplev00_cleaned.dta", clear

replace crop = lower(crop)

*Keep the maize records of the main harvest
keep if strpos(crop, "maize") > 0 & harvest == "main"

save "`maindir'/data/intermediate/processed/2000/croplev00_cleaned.dta", replace

***2004***
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/croplev04_cleaned.dta", clear

replace crop=lower(crop)

*Only need maize data
keep if strpos(crop,"maize")

*Only the main harvest is used
keep if (harvest == "main")

*Attach the conversion factor for each seed unit; rows whose unit has no factor are dropped.
*The path now includes the data folder, like every other processed file in this script.
*NOTE(review): sd04_convert.dta is not created in the part of the script reviewed here;
*presumably it is produced elsewhere - confirm it exists before this point is reached
merge m:1 sunit using "`maindir'/data/intermediate/processed/2004/sd04_convert.dta", keep(match) nogenerate

*Seed quantity expressed with the conversion factor (factor times reported quantity)
gen seedkg = skgconvert*sqt

save "`maindir'/data/intermediate/processed/2004/croplev04_cleaned.dta", replace

***2007 and 2010***
*Both rounds are treated alike, except for one observation removed from the 2007 file
foreach round in 2007 2010 {
	local suffix = substr("`round'", 3, 2)
	use "`maindir'/data/intermediate/tegemeo/`round'/Extract/`round' Data/croplev`suffix'_cleaned.dta", clear

*	2007 only: observation 4286 is removed, provided it belongs to household 368
	if `round' == 2007 drop in 4286 if hhid==368

	replace crop = lower(crop)

*	Keep main-harvest maize, leaving out fodder maize
	keep if strpos(crop, "maize") > 0 & strpos(crop, "fodder") == 0 & harvest == "main"

*	Same crop names as in the other rounds
	replace crop = "dry maize" if crop == "maize-dry"
	replace crop = "green maize" if crop == "maize-green"

	save "`maindir'/data/intermediate/processed/`round'/croplev`suffix'_cleaned.dta", replace
}

*********************
** FERTILIZER DATA **
*********************

***1997***
clear

*Build a data set with one row per fertilizer type
set obs 11
gen ferttype = ""
local row = 0
foreach type in "asn(26:0:0)" "can(26:0:0)" "dap(18:46:0)" "map(11:52:0)" "npk(17:17:17)" "npk(20:20:0)" "npk(23:23:23)" "npk(25:5:+5s)" "ssp(0:21-24:)" "tsp(0:46:0)" "urea(46:0:0)" {
	local ++row
	replace ferttype = "`type'" in `row'
}

*Keep the plain list of types in a temp file
tempfile ferttypes
save "`ferttypes'"

*Pair every fertilizer type with every district row (Cartesian product)
cross using "`distprovzone'"
sort dist ferttype

*Keep the full district-by-type grid in a temp file
tempfile fertgoal
save "`fertgoal'"

*Next, get the fertilizer price data; the dap prices sit in a separate file and are appended
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/fertprc97_cleaned.dta", clear
append using "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/fertprice_dap97_cleaned.dta"

*Give all price columns the common prefix "price" so they can be reshaped together
local pricecols kgdap KG10DAP KG25DAP KG50DAP kgmap KG50MAP KG50TSP kgssp KG20_20 KG502020 KG23_23 KG502323 KG17_17 KG501717 KG25_5 KG50255 kgcan KG50CAN kgasn KG50ASN kgurea KG50UREA
foreach col of varlist `pricecols' {
	rename `col' price`col'
}

*Go long: one row per original row and price column (fertvar holds the old column name)
gen reshapev = _n
reshape long price, i(reshapev) j(fertvar) string
drop reshapev

*Fertilizer type for every price column. Unfortunately, this has to be done manually.
gen ferttype = ""
replace ferttype = "dap(18:46:0)" if inlist(fertvar, "kgdap", "KG10DAP", "KG25DAP", "KG50DAP")
replace ferttype = "map(11:52:0)" if inlist(fertvar, "kgmap", "KG50MAP")
replace ferttype = "tsp(0:46:0)" if fertvar == "KG50TSP"
replace ferttype = "ssp(0:21-24:)" if fertvar == "kgssp"
replace ferttype = "npk(20:20:0)" if inlist(fertvar, "KG20_20", "KG502020")
replace ferttype = "npk(23:23:23)" if inlist(fertvar, "KG23_23", "KG502323")
replace ferttype = "npk(17:17:17)" if inlist(fertvar, "KG17_17", "KG501717")
replace ferttype = "npk(25:5:+5s)" if inlist(fertvar, "KG25_5", "KG50255")
replace ferttype = "can(26:0:0)" if inlist(fertvar, "kgcan", "KG50CAN")
replace ferttype = "asn(26:0:0)" if inlist(fertvar, "kgasn", "KG50ASN")
replace ferttype = "urea(46:0:0)" if inlist(fertvar, "kgurea", "KG50UREA")

*Selling unit for every price column (lists are split because inlist takes few string arguments)
gen fertunit = ""
replace fertunit = "kg" if inlist(fertvar, "kgdap", "kgmap", "kgssp", "KG20_20", "KG23_23")
replace fertunit = "kg" if inlist(fertvar, "KG17_17", "KG25_5", "kgcan", "kgasn", "kgurea")
replace fertunit = "10 kg bag" if fertvar == "KG10DAP"
replace fertunit = "25 kg bag" if fertvar == "KG25DAP"
replace fertunit = "50 kg bag" if inlist(fertvar, "KG50DAP", "KG50MAP", "KG50TSP", "KG502020", "KG502323")
replace fertunit = "50 kg bag" if inlist(fertvar, "KG501717", "KG50255", "KG50CAN", "KG50ASN", "KG50UREA")

*The old column name is no longer needed
drop fertvar

*Tidy the variable and row order, and drop rows without a price
order ferttype, first
order fertunit, after(ferttype)
sort ferttype fertunit dist div
drop if missing(price)

*Household fertilizer use is in kg, so prices are converted to a price per kg.
*Units that are not listed get a factor of 0, which makes their price per kg missing.
gen kgconvert = cond(fertunit == "kg", 1, cond(fertunit == "10 kg bag", 10, cond(fertunit == "25 kg bag", 25, cond(fertunit == "50 kg bag", 50, 0))))
gen pricekg = price/kgconvert

*Remove unwanted variables
drop fertunit price kgconvert

*This district seems to be misnamed. Checked 2004/Extract/2004 Docs/2004_Synthetic_Questionnaire.pdf, pg. 2 to confirm kakkamega does not exist.
replace dist = "kakamega" if dist == "kakkamega"

*Attach province and zone(s) to every price row by district; districts that appear on
*only one side are kept as well (unmatched(both))
joinby dist using "`distprovzone'", unmatched(both)

drop div _merge

*Add the full district-by-fertilizer grid so that combinations without any observed
*price are present too; _merge is 0 for the rows already in memory and 1 for grid rows
append using "`fertgoal'", generate(_merge)

*Grid rows are dropped wherever their district and fertilizer type occur more than once
*NOTE(review): a combination that occurs only as several grid rows is removed completely here - confirm this cannot happen
by ferttype dist, sort: gen dup = cond(_N==1,0,_n)
by ferttype dist:  egen dupmax = max(dup)
drop if (dupmax > 0) & (_merge == 1)
drop dup dupmax _merge

*Generate the number of observed prices by each geographical region and fertilizer type
sort dist zone1 zone2 prov ferttype 
egen distno = count(pricekg), by(dist ferttype)
egen zone1no = count(pricekg), by(zone1 ferttype) 
egen zone2no = count(pricekg), by(zone2 ferttype) 
egen bothzoneno = count(pricekg), by(zone1 zone2 ferttype) 
egen provno = count(pricekg), by(prov ferttype) 

*Generate the district price, zone price, province price, and national price by fertilizer type 
*(each is the median price per kg within the area)
*Since a district can be part of more than one zone, the two zone prices are averaged in that case
egen distpfert = median(pricekg), by(dist ferttype)
egen zone1pfert = median(pricekg), by(zone1 ferttype)
egen zone2pfert = median(pricekg), by(zone2 ferttype)
*Since there are only two zones, the median of the medians is the average of the medians
gen bothzonepfert = (zone1pfert + zone2pfert)/2
egen provpfert = median(pricekg), by(prov ferttype)
egen natpfert = median(pricekg), by(ferttype)

sort dist ferttype 

*Pick the price from the most local level available. The statements run from the widest
*to the narrowest area, so each later replace overrides the earlier ones when its condition holds.
*Zone and district prices require at least 10 observed prices; the province price only has to exist.
gen pfert = natpfert 
replace pfert = provpfert if !missing(provpfert)
replace pfert = bothzonepfert if bothzoneno >= 10 & !missing(zone2)
replace pfert = zone1pfert if (zone1no >= 10) & missing(zone2)
*There is no case where there is a zone2 but not a zone 1
*Additionally, zone2 does not overlap with zone1 (there is no case when zone1_i == zone2_j), so this method works fine 
replace pfert = distpfert if distno >= 10 

*Record which level the chosen price came from (same conditions and order as above)
gen priceloc = "National" 
replace priceloc = "Provincial" if !missing(provpfert)
replace priceloc = "Zonal"  if bothzoneno >= 10 & !missing(zone2)
replace priceloc = "Zonal"  if (zone1no >= 10) & missing(zone2)
replace priceloc = "District" if distno >= 10 

*Keep one row per district and fertilizer type
quietly by dist ferttype, sort: gen dup = cond(_N==1,0,_n)
drop if dup > 1

keep dist ferttype pfert priceloc 

save "`maindir'/data/intermediate/processed/1997/fertprices97.dta", replace

*Load distance from fertilizer seller and add the location of each household
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/hh97_cleaned.dta", clear
keep hhid fertskm
merge m:1 hhid using "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/hhidfinl97_cleaned.dta", keep(match) nogen

*Number of reported distances within each district, zone and province
sort dist zone prov fertskm
foreach area in dist zone prov {
	egen `area'no = count(fertskm), by(`area')
}

*Median reported distance within each district, zone and province
sort dist zone prov fertskm
foreach area in dist zone prov {
	egen `area'med = median(fertskm), by(`area')
}

*Use the reported distance when there is one. Otherwise fall back to the median of the
*smallest area that has at least 10 reported distances: district, then zone, then province.
gen fertskmest = fertskm
foreach area in dist zone prov {
	replace fertskmest = `area'med if `area'no >= 10 & !missing(`area'med) & missing(fertskmest)
}

*The estimate takes over the name of the reported distance
keep hhid fertskmest
rename fertskmest fertskm

save "`maindir'/data/intermediate/processed/1997/fertdist97.dta", replace

*Load field data, which has fertilizer data for 1997
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/field97_cleaned.dta", clear

*Remove the short harvest and the earlier year
drop if harvest == "short" | year == 1996

*Number the two fertilizer slots of each field so they can be reshaped
rename (fertype fertkg frtypeb fertkgb) (ferttype1 fertkg1 ferttype2 fertkg2)

*Go long (one row per field and fertilizer slot) in order to merge price data
reshape long ferttype fertkg, i(hhid field) j(reshapev)

*Same variable name as in the other years
rename fertkg fertotal

*Harmonize ferttype with other years
replace ferttype = "manure" if ferttype == "farm yard manure"
replace ferttype = "compost" if ferttype == "compost mixture"

*Apply locational data (as fertilizer price data is by location, not by household)
merge m:1 hhid using "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/hhidfinl97_cleaned.dta", keep(match) nogenerate

*Apply fertilizer price data; rows without a matching price are kept
merge m:1 dist ferttype using "`maindir'/data/intermediate/processed/1997/fertprices97.dta", keep(master match) nogenerate

sort hhid field

*Expenditure on each fertilizer = price per kg times quantity
gen efert = pfert*fertotal

*Quantity of DAP, MAP and CAN on each field: the largest quantity among the slots whose
*fertilizer type contains the name, 0 when no slot does
foreach name in DAP MAP CAN {
	tempvar slotqty
	gen `slotqty' = cond(strpos(strupper(ferttype),"`name'"),fertotal,0)
	bysort hhid field: egen `name' = max(`slotqty')
	drop `slotqty'
}

*Total fertilizer expenditure on each field
bysort hhid field: egen totfertexp = total(efert)

*Reshape back to wide format (one row per field)
reshape wide ferttype fertotal pfert priceloc efert, i(hhid field dist) j(reshapev)

order hhid field harvest year

*Save
save "`maindir'/data/intermediate/processed/1997/fieldfertloc97_cleaned.dta", replace

***2000***
*Household file, kept in full in a temporary file for the merge at the end
use "`maindir'/data/intermediate/tegemeo/2000/Extract/2000 Data/hh00_cleaned.dta", clear

tempfile temp1fertskm00
save "`temp1fertskm00'"

*Fertilizer records of 2000, without the short harvest
use "`maindir'/data/intermediate/tegemeo/2000/Extract/2000 Data/fert00_cleaned.dta", clear

drop if harvest == "short"

*Running number of the fertilizer record within household, harvest and field
bysort hhid harvest field: gen reshapev = _n

*Largest quantity on the field among the types whose name contains DAP, MAP or CAN
foreach nutrient in DAP MAP CAN {
	gen `nutrient'_t = cond(strpos(strupper(ferttype),"`nutrient'"),fertotal,0)
	bys hhid field: egen `nutrient' = max(`nutrient'_t)
	drop `nutrient'_t
}

*One row per household, harvest and field
reshape wide ferttype fertqty fertunit kgconver fertotal, i(hhid harvest field) j(reshapev)

*Add the household variables, keeping matched households only
merge m:1 hhid using "`temp1fertskm00'", keep(match) nogenerate

save "`maindir'/data/intermediate/processed/2000/fert00_cleaned.dta", replace

***2004***
*Distance to the fertilizer seller for households that report it
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/hh04_cleaned.dta", clear

keep if !missing(fertskm)
keep hhid fertskm

save "`maindir'/data/intermediate/processed/2004/fertskm04.dta", replace

*Fertilizer records of 2004, without the short harvest
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/fert04_cleaned.dta", clear

drop if harvest == "short"

*Location of the household
merge m:1 hhid using "`maindir'/data/intermediate/processed/2004/hhidfinal04_cleaned.dta", keep(match) nogenerate

*Price by fertilizer type, unit and district; rows without a price are kept
merge m:1 ferttype fertunit dist using "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Lookup/pricefert_cleaned.dta", keep(master match) nogenerate

sort hhid field

*Expenditure is price times quantity, and 0 when either of the two is missing
gen efert = cond(missing(pfert, fertqty), 0, pfert*fertqty)

*Largest quantity on the field among the types whose name contains DAP, MAP or CAN
foreach nutrient in DAP MAP CAN {
	gen `nutrient'_t = cond(strpos(strupper(ferttype),"`nutrient'"),fertotal,0)
	bys hhid field: egen `nutrient' = max(`nutrient'_t)
	drop `nutrient'_t
}

*Total fertilizer expenditure of the field
bys hhid field: egen totfertexp = sum(efert)

*Running number of the fertilizer record within the field, then one row per field
bysort hhid field: gen reshapev = _n

reshape wide ferttype fertqty fertunit kgconver fertotal pfert pfertrep efert, i(hhid field dist) j(reshapev)

save "`maindir'/data/intermediate/processed/2004/fert04_cleaned.dta", replace

***2007***

*Distance to the fertilizer seller, kept in a temporary file for the merge at the end
use "`maindir'/data/intermediate/tegemeo/2007/Extract/2007 Data/hh07_cleaned.dta", clear

keep hhid fertskm

tempfile temp1fertskm07
save "`temp1fertskm07'"

*Fertilizer records of 2007, without the short harvest
use "`maindir'/data/intermediate/tegemeo/2007/Extract/2007 Data/fert07_cleaned.dta", clear

drop if harvest == "short"

*A record is unusable when its cost is missing, unless it is manure or compost
*(manure is assumed to be free)
gen todrop = missing(fertcost) & !inlist(ferttype, "manure", "compost")

*One unusable record removes every record of that field
bysort hhid field: egen todropmax = max(todrop)
drop if todropmax == 1
drop todrop todropmax

*The reported cost is the expenditure
rename fertcost efert

*Largest quantity on the field among the types whose name contains DAP, MAP or CAN
foreach nutrient in DAP MAP CAN {
	gen `nutrient'_t = cond(strpos(strupper(ferttype),"`nutrient'"),fertotal,0)
	bys hhid field: egen `nutrient' = max(`nutrient'_t)
	drop `nutrient'_t
}

*Total fertilizer expenditure of the field
bys hhid field: egen totfertexp = sum(efert)

*Running number of the fertilizer record within the field, then one row per field
bysort hhid field: gen reshapev = _n

reshape wide ferttype fertunit fertqty kgconver fertotal pfert pfertrep efert, i(hhid field dist) j(reshapev)

*Add the distance to the fertilizer seller, keeping matched households only
merge m:1 hhid using "`temp1fertskm07'", keep(match) nogenerate

save "`maindir'/data/intermediate/processed/2007/fert07_cleaned.dta", replace

***2010***
*Distance to the fertilizer seller for households that report it, kept in a temporary file
use "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/hh10_cleaned.dta", clear

drop if missing(fertskm)
keep hhid fertskm

tempfile temp1fertskm10
save "`temp1fertskm10'"

*Fertilizer records of 2010, without the short harvest
use "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/fert10_cleaned.dta", clear

drop if harvest == "short"

*A record (a field-fertilizer combination) is unusable when its cost is missing,
*unless it is manure or compost (manure is assumed to be free)
gen todrop = missing(fertcost) & !inlist(ferttype, "compost", "manure")

*One unusable record removes every record of that field
bysort hhid field: egen todropmax = max(todrop)
drop if todropmax == 1
drop todrop todropmax

*The reported cost is the expenditure
rename fertcost efert

*Largest quantity on the field among the types whose name contains DAP, MAP or CAN
foreach nutrient in DAP MAP CAN {
	gen `nutrient'_t = cond(strpos(strupper(ferttype),"`nutrient'"),fertotal,0)
	bys hhid field: egen `nutrient' = max(`nutrient'_t)
	drop `nutrient'_t
}

*Total fertilizer expenditure of the field
bys hhid field: egen totfertexp = sum(efert)

*Running number of the fertilizer record within the field, then one row per field
bysort hhid field: gen reshapev = _n

reshape wide ferttype fertunit fertqty kgconver fertotal pfert pfertrep efert, i(hhid field dist) j(reshapev)

*Add the distance to the fertilizer seller, keeping matched households only
merge m:1  hhid using "`temp1fertskm10'", keep(match) nogenerate

save "`maindir'/data/intermediate/processed/2010/fert10_cleaned.dta", replace

*********************
** MAIZE SEED DATA **
*********************

***1997***
*Data is already with crop data

***2000***
*Data is already with crop data

***2004***
*2004 only needs the short harvest removed
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/maizeseed04_cleaned.dta", clear

drop if harvest == "short"

save "`maindir'/data/intermediate/processed/2004/maizeseed04_cleaned.dta", replace

***2007 and 2010***
*Both years get the same treatment: remove the short harvest, keep maize that is
*not fodder, and use the crop names "dry maize" and "green maize"
foreach seedyr in 2007 2010 {
	local seedyy = substr("`seedyr'", 3, 2)

	use "`maindir'/data/intermediate/tegemeo/`seedyr'/Extract/`seedyr' Data/maizeseed`seedyy'_cleaned.dta", clear

	drop if harvest == "short"

	replace crop=lower(crop)
	keep if strpos(crop,"maize") & !strpos(crop,"fodder")

	replace crop = "green maize" if crop == "maize-green"
	replace crop = "dry maize" if crop == "maize-dry"

	save "`maindir'/data/intermediate/processed/`seedyr'/maizeseed`seedyy'_cleaned.dta", replace
}

****************
** FIELD DATA **
****************

***1997***
*1997: drop the 1996 records as well as the short harvest
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/field97_cleaned.dta", clear

drop if year == 1996 | harvest == "short"

save "`maindir'/data/intermediate/processed/1997/field97_cleaned.dta", replace

***2000, 2004, 2007 and 2010***
*The later years only need the short harvest removed
foreach fldyr in 2000 2004 2007 2010 {
	local fldyy = substr("`fldyr'", 3, 2)

	use "`maindir'/data/intermediate/tegemeo/`fldyr'/Extract/`fldyr' Data/field`fldyy'_cleaned.dta", clear

	drop if harvest == "short"

	save "`maindir'/data/intermediate/processed/`fldyr'/field`fldyy'_cleaned.dta", replace
}

****************
** LABOR DATA **
****************

***1997***

*Dry maize records of the 1997 survey year
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/croplev97_cleaned.dta", clear
keep if year == 1997 & crop == "dry maize"

*Per household: number of records, and whether it has a short and a main harvest record
bys hhid: gen count = _N
bys hhid: egen hasshort = max(harvest == "short")
bys hhid: egen hasmain = max(harvest == "main")

*Province of the household
merge m:1 hhid using "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/hhidfinl97_cleaned.dta", keep(master match) nogen

*For households with records in both harvests, drop the main harvest records in
*five provinces and the short harvest records in eastern
local bothharvests "hasshort == 1 & hasmain == 1 & count != 1"
drop if inlist(prov, "rift valley", "western", "coast", "central", "nyanza") & harvest == "main" & `bothharvests'
drop if prov == "eastern" & harvest == "short" & `bothharvests'

*Note this has a perfect match, so no need to drop households with some missing observation
merge 1:1 hhid harvest field year using "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/field97_cleaned.dta", nogen keep(master match)

*Keep the household's largest field; ties are reduced to a single row
bys hhid: egen maxacres = max(acres)
keep if acres == maxacres

bys hhid acres: keep if _n == 1

keep hhid acres

tempfile fieldsforlabor97
save "`fieldsforlabor97'"

*Load the 1997 labor records; the activity description is stored in the variable labor
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/labor97_cleaned.dta", clear

rename labor activity

*Names of the seven activity classes. The label is not attached to a variable here; it is
*only looked up further down to build the variable labels
label define activityclass 1 "Land preparation" 2 "Planting" 3 "Weeding" 4 "Harvest" 5 "Postharvest activities" 6 "Fertilizer application"  7 "Other"

*Flag descriptions that list several activities (the flag is not used again before the
*collapse below discards it)
gen multiple = 0

*Went through data, multiple activities use these key terms/characters
replace multiple = 1 if strpos(activity, "/")
replace multiple = 1 if strpos(activity, " and ")
replace multiple = 1 if strpos(activity, " & ")
replace multiple = 1 if strpos(activity, ",")

*One 0/1 indicator per activity class
gen activityclass1 = 0
gen activityclass2 = 0
gen activityclass3 = 0
gen activityclass4 = 0
gen activityclass5 = 0
gen activityclass6 = 0
gen activityclass7 = 0

*Split the description into its individual activities; the names of the new variables
*are returned in r(varlist)
split activity, parse(& " and ", /)

local parsed = "`r(varlist)'"

*Classify every piece. Classes 1 and 3 to 7 need an exact match of the trimmed piece
*against the lists below; class 2 is set whenever the full description contains "planting"
qui foreach var in `parsed'{
	replace `var' = trim(`var')
	foreach activityname in "1st ploughing" "2nd ploughing" "furrowing" "grading" "harrow" "harrowing" "land preparation" "ridging" "slashing" "terracing" {
		replace activityclass1 = 1 if `var' == "`activityname'"
	}

	replace activityclass2 = 1 if strpos(activity, "planting")

	foreach activityname in "1st weeding" "2nd weeding" "3rd weeding" "first weeding" "second weeding" "weeding"{
		replace activityclass3 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "harvest" "harvesting"{
		replace activityclass4 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "bag" "bagging" "dry" "drying" "dusting (post harvest)" "haul to storage" "haul to store" "post harvest dusting" "shell" "shelling" "shelling labour" "sort" "stacking" "stooking" "storage" "store" "thresh" "threshing" "total sheller cost" "transport" "weigh" "winnow" "winnowing"{
		replace activityclass5 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "dust" "dusting" "field dusting" "manuring" "spraying" "top dressing" "top-dressing"{
		replace activityclass6 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "desuckering" "irrigation" "other" "other (specify)" "propping" "security" "thinning" "watchman" "wiring"{
		replace activityclass7 = 1 if `var' == "`activityname'"
	}
}

*Number of classes flagged for the record
egen activityclass = rowtotal(activityclass*)

drop `parsed'

*Treat missing labor inputs as zero
foreach var in days people hm hf fm ff fc hhrs fhrs avhrs{
	replace `var' = 0 if missing(`var')
}

*Split a record that covers several classes evenly across them
*NOTE(review): every input is divided by the number of classes, so the products computed
*below, such as (hm+hf) times days, shrink by the square or cube of that number rather
*than by the number itself - confirm this is intended
foreach var in days people hm hf fm ff fc hhrs fhrs avhrs{
	replace `var' = `var'/activityclass if activityclass > 1
}

*One row per record and class, keeping only the flagged classes. Records that matched
*no class have all seven indicators at 0 and are dropped here
gen id = _n
rename activityclass sum_activclass
reshape long activityclass, i(id) j(classnum)
drop if activityclass == 0
drop activityclass
ren classnum activityclass

*Calculate wage labor
gen hiredlabor = (hm+hf)*days

*Generate family labor 
gen familylabor = (fm + ff + fc)*days*fhrs

*Collapse to the sum of hired and family labor per household and activity class
collapse (sum) hiredlabor (sum) familylabor, by(hhid activityclass)

*Store activity type in a macro for use after reshape
levelsof activityclass, local(activityclass_levels)    
foreach val of local activityclass_levels {
	local activityclassv1`val' : label activityclass `val'
}

*Reshape 
reshape wide hiredlabor familylabor, i(hhid) j(activityclass)

*Store list of hiredlabor and familylabor variables in macro
local hlflvars "hiredlabor familylabor"

*Apply the previous activity class to the label
foreach variable of local hlflvars{
	foreach value of local activityclass_levels{
		label variable `variable'`value' "`variable' for `activityclassv1`value''"
	}
}

save "`maindir'/data/intermediate/processed/1997/labour97_cleaned.dta", replace

*Bring in the acreage of the household's selected field
merge 1:1 hhid using "`fieldsforlabor97'", keep(match) nogenerate

*Turn every labor variable into a per-acre figure and say so in its label
foreach value of local activityclass_levels {
	foreach variable of local hlflvars {
		local lab: variable label `variable'`value'
		replace `variable'`value' = (`variable'`value')/acres
		label var `variable'`value' "`lab' per acre"
	}
}

*Collect the family and hired labor variables through their labels
ds, has(varl *familylabor*)
local famlbrvarlist `r(varlist)'
ds, has(varl *hiredlabor*)
local hiredlbrvarlist `r(varlist)'
local laborvars `famlbrvarlist' `hiredlbrvarlist'
di "`laborvars'"

*Totals over all activity classes
egen hiredlabor_S = rowtotal(`hiredlbrvarlist')
label var hiredlabor_S "Total hired labor by Kshs/acre"
egen familylabor_S = rowtotal(`famlbrvarlist')
label var familylabor_S "Total family labor by hours/acre"

*Acreage is no longer needed
drop acres

save "`maindir'/data/intermediate/processed/1997/labour97_hhidfield.dta", replace

*Household-level file: mean of every variable by household
qui ds hhid, not
collapse (mean) `r(varlist)', by(hhid)

save "`maindir'/data/intermediate/processed/1997/labour97_hhid.dta", replace

***2000***
*Data is not detailed here and is left out of Suri (2011)

***2004***

*Acreage of every field outside the short harvest, kept for the per-acre step below
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/field04_cleaned.dta", clear

drop if harvest == "short"

keep hhid field acres

tempfile fieldsforlabor04
save "`fieldsforlabor04'"

*Load the 2004 labor records and drop those with a non-missing crop other than dry maize
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/labour04_cleaned.dta", clear

drop if crop != "dry maize" & !missing(crop)

*Names of the seven activity classes. The label is not attached to a variable here; it is
*only looked up further down to build the variable labels
label define activityclass 1 "Land preparation" 2 "Planting" 3 "Weeding" 4 "Harvest" 5 "Postharvest activities" 6 "Fertilizer application"  7 "Other"

*Flag descriptions that list several activities (the flag is not used again before the
*collapse below discards it)
gen multiple = 0

*Went through data, multiple activities use these key terms/characters
replace multiple = 1 if strpos(activity, "/")
replace multiple = 1 if strpos(activity, " and ")
replace multiple = 1 if strpos(activity, " & ")
replace multiple = 1 if strpos(activity, ",")

*One 0/1 indicator per activity class
gen activityclass1 = 0
gen activityclass2 = 0
gen activityclass3 = 0
gen activityclass4 = 0
gen activityclass5 = 0
gen activityclass6 = 0
gen activityclass7 = 0

*Split the description into its individual activities; the names of the new variables
*are returned in r(varlist)
split activity, parse(& " and ", /)

local parsed = "`r(varlist)'"

*Classify every piece. Classes 1 and 3 to 7 need an exact match of the trimmed piece
*against the lists below; class 2 is set whenever the full description contains "planting"
qui foreach var in `parsed'{
	replace `var' = trim(`var')
	foreach activityname in "1st ploughing" "2nd ploughing" "furrowing" "grading" "harrow" "harrowing" "land preparation" "ridging" "slashing" "terracing" {
		replace activityclass1 = 1 if `var' == "`activityname'"
	}

	replace activityclass2 = 1 if strpos(activity, "planting")

	foreach activityname in "1st weeding" "2nd weeding" "3rd weeding" "first weeding" "second weeding" "weeding"{
		replace activityclass3 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "harvest" "harvesting"{
		replace activityclass4 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "bag" "bagging" "dry" "drying" "dusting (post harvest)" "haul to storage" "haul to store" "post harvest dusting" "shell" "shelling" "shelling labour" "sort" "stacking" "stooking" "storage" "store" "thresh" "threshing" "total sheller cost" "transport" "weigh" "winnow" "winnowing"{
		replace activityclass5 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "dust" "dusting" "field dusting" "manuring" "spraying" "top dressing" "top-dressing"{
		replace activityclass6 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "desuckering" "irrigation" "other" "other (specify)" "propping" "security" "thinning" "watchman" "wiring"{
		replace activityclass7 = 1 if `var' == "`activityname'"
	}
}

*Number of classes flagged for the record
egen activityclass = rowtotal(activityclass*)

drop `parsed'

*Treat missing labor inputs as zero
foreach var in lb01 lb02 lb03 lb04 lb06 lb08 lb10{
	replace `var' = 0 if missing(`var')
}

*Split a record that covers several classes evenly across them
*NOTE(review): every input is divided by the number of classes, so the product of lb01,
*lb02 and lb03 computed below shrinks by the cube of that number rather than by the
*number itself - confirm this is intended
foreach var in lb01 lb02 lb03 lb04 lb06 lb08 lb10{
	replace `var' = `var'/activityclass if activityclass > 1
}

*One row per record and class, keeping only the flagged classes. Records that matched
*no class have all seven indicators at 0 and are dropped here. The class number stays
*in classnum until after the collapse
gen id = _n
rename activityclass sum_activclass
reshape long activityclass, i(id) j(classnum)
drop if activityclass == 0

*Calculate wage labor as the product of lb01, lb02 and lb03
gen hiredlaborwage = lb01*lb02*lb03

*Generate hired labor as contract expenses + hourly expenses
gen hiredlabor = hiredlaborwage + lb04

*Generate family labor as the sum of lb06, lb08 and lb10
gen familylabor = lb06 + lb08 + lb10

*Collapse to the sum of hired and family labor per household, field and activity class
collapse (sum) hiredlabor (sum) familylabor, by(hhid field classnum)

*Store activity type in a macro for use after reshape
levelsof classnum, local(activityclass_levels)    
foreach val of local activityclass_levels {
	local activityclassv1`val' : label activityclass `val'
}

rename classnum activityclass

*Reshape 
reshape wide hiredlabor familylabor, i(hhid field) j(activityclass)

*Store list of hiredlabor and familylabor variables in macro
local hlflvars "hiredlabor familylabor"

*Apply the previous activity class to the label
foreach variable of local hlflvars{
	foreach value of local activityclass_levels{
		label variable `variable'`value' "`variable' for `activityclassv1`value''"
	}
}

save "`maindir'/data/intermediate/processed/2004/labour04_cleaned.dta", replace

*Merge with field (acres) data
merge 1:1 hhid field using "`fieldsforlabor04'", keep(match) nogenerate

*Aggregate to the household: labor and acres are summed over the household's fields
drop field
qui ds hhid, not
collapse (sum) `r(varlist)', by(hhid)

*Store familylabor and hiredlabor variables in macros.
*This has to happen before the totals are generated. The totals used these two lists
*before this section had defined them, so they depended on whatever the 1997 section
*had left in the macros and failed when this section was run on its own.
*The variables are selected by name: at this point only the per-activity variables
*exist, the _S totals are created afterwards and are therefore not part of the lists.
ds familylabor*
local famlbrvarlist `r(varlist)'
ds hiredlabor*
local hiredlbrvarlist `r(varlist)'
local laborvars `famlbrvarlist' `hiredlbrvarlist'
di "`laborvars'"

*Generate totals for hiredlabor and familylabor
egen hiredlabor_S = rowtotal(`hiredlbrvarlist')
egen familylabor_S = rowtotal(`famlbrvarlist')

*Label total variables for hiredlabor and familylabor
label var hiredlabor_S "Total hired labor by Kshs/acre"
label var familylabor_S "Total family labor by hours/acre"

*Change the variable to per acre and label it accordingly.
*collapse replaces the variable labels with "(sum) varname", so the label is rebuilt
*from the stored activity class names instead of being read from the variable; this
*gives the same label format as in the other years.
foreach variable of local hlflvars {
	foreach value of local activityclass_levels{
		label var `variable'`value' "`variable' for `activityclassv1`value'' per acre"
		replace `variable'`value' = (`variable'`value')/acres
	}
}

*The totals were built from the raw sums, so they are divided by acres as well
replace hiredlabor_S = hiredlabor_S/acres
replace familylabor_S = familylabor_S/acres

*Only need data on a household basis
drop acres

save "`maindir'/data/intermediate/processed/2004/labour04_hhid.dta", replace

***2007***

*Acreage of every 2007 field outside the short harvest
use "`maindir'/data/intermediate/tegemeo/2007/Extract/2007 Data/field07_cleaned.dta", clear

drop if harvest == "short"

keep hhid field acres

tempfile fieldsforlabor07
save "`fieldsforlabor07'"

*Fields with dry maize outside the short harvest
use "`maindir'/data/intermediate/tegemeo/2007/Extract/2007 Data/croplev07_cleaned.dta", clear

*Hard-coded observation number: observation 4286 is dropped only if its hhid is 368
drop in 4286 if hhid==368

keep if crop == "maize-dry" & harvest != "short"

*One row per household and field
keep hhid field
bysort hhid field: keep if _n == 1

merge 1:1 hhid field using "`fieldsforlabor07'", keep(match) nogenerate

*Keep the household's largest field; ties (there may be multiple) are reduced to one row
by hhid: egen largestacreage = max(acres)
keep if acres == float(largestacreage)
drop largestacreage

bysort hhid acres: keep if _n == 1

*The selected fields replace the full field list under the same macro name
tempfile fieldsforlabor07
save "`fieldsforlabor07'"

*Load the 2007 labor records
use "`maindir'/data/intermediate/tegemeo/2007/Extract/2007 Data/labour07_cleaned.dta", clear

*Names of the seven activity classes. The label is not attached to a variable here; it is
*only looked up further down to build the variable labels
label define activityclass 1 "Land preparation" 2 "Planting" 3 "Weeding" 4 "Harvest" 5 "Postharvest activities" 6 "Fertilizer application"  7 "Other"

*Flag descriptions that list several activities (the flag is not used again before the
*collapse below discards it)
gen multiple = 0

*Went through data, multiple activities use these key terms/characters
replace multiple = 1 if strpos(activity, "/")
replace multiple = 1 if strpos(activity, " and ")
replace multiple = 1 if strpos(activity, " & ")
replace multiple = 1 if strpos(activity, ",")

*One 0/1 indicator per activity class
gen activityclass1 = 0
gen activityclass2 = 0
gen activityclass3 = 0
gen activityclass4 = 0
gen activityclass5 = 0
gen activityclass6 = 0
gen activityclass7 = 0

*Split the description into its individual activities; the names of the new variables
*are returned in r(varlist)
split activity, parse(& " and ", /)

local parsed = "`r(varlist)'"

*Classify every piece. Classes 1 and 3 to 7 need an exact match of the trimmed piece
*against the lists below; class 2 is set whenever the full description contains "planting"
qui foreach var in `parsed'{
	replace `var' = trim(`var')
	foreach activityname in "1st ploughing" "2nd ploughing" "furrowing" "grading" "harrow" "harrowing" "land preparation" "ridging" "slashing" "terracing" {
		replace activityclass1 = 1 if `var' == "`activityname'"
	}

	replace activityclass2 = 1 if strpos(activity, "planting")

	foreach activityname in "1st weeding" "2nd weeding" "3rd weeding" "first weeding" "second weeding" "weeding"{
		replace activityclass3 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "harvest" "harvesting"{
		replace activityclass4 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "bag" "bagging" "dry" "drying" "dusting (post harvest)" "haul to storage" "haul to store" "post harvest dusting" "shell" "shelling" "shelling labour" "sort" "stacking" "stooking" "storage" "store" "thresh" "threshing" "total sheller cost" "transport" "weigh" "winnow" "winnowing"{
		replace activityclass5 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "dust" "dusting" "field dusting" "manuring" "spraying" "top dressing" "top-dressing"{
		replace activityclass6 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "desuckering" "irrigation" "other" "other (specify)" "propping" "security" "thinning" "watchman" "wiring"{
		replace activityclass7 = 1 if `var' == "`activityname'"
	}
}

*Number of classes flagged for the record
egen activityclass = rowtotal(activityclass*)

drop `parsed'

*Treat missing labor inputs as zero
foreach var in lb01 lb02 lb03 lb04 lb06 lb08 lb10{
	replace `var' = 0 if missing(`var')
}

*Split a record that covers several classes evenly across them
*NOTE(review): every input is divided by the number of classes, so the product of lb01,
*lb02 and lb03 computed below shrinks by the cube of that number rather than by the
*number itself - confirm this is intended
foreach var in lb01 lb02 lb03 lb04 lb06 lb08 lb10{
	replace `var' = `var'/activityclass if activityclass > 1
}

*One row per record and class, keeping only the flagged classes. Records that matched
*no class have all seven indicators at 0 and are dropped here
gen id = _n
rename activityclass sum_activclass
reshape long activityclass, i(id) j(classnum)
drop if activityclass == 0
drop activityclass 
ren classnum activityclass

*Calculate wage labor as the product of lb01, lb02 and lb03; hired labor adds lb04 to
*it, and family labor is the sum of lb06, lb08 and lb10
gen hiredlaborwage = lb01*lb02*lb03
gen hiredlabor = hiredlaborwage + lb04
gen familylabor = lb06 + lb08 + lb10
*Sum of hired and family labor per household and activity class
collapse (sum) hiredlabor (sum) familylabor, by(hhid activityclass)

*Store the class names in macros for use after the reshape
levelsof activityclass, local(activityclass_levels)    
foreach val of local activityclass_levels {
	local activityclassv1`val' : label activityclass `val'
}

*One row per household, one pair of variables per activity class
reshape wide hiredlabor familylabor, i(hhid) j(activityclass)

*Store root of variable names in macro
local hlflvars "hiredlabor familylabor"

*Loop through variables and apply previous activity classes as labels
foreach variable of local hlflvars{
	foreach value of local activityclass_levels{
		label variable `variable'`value' "`variable' for `activityclassv1`value''"
	}
}

save "`maindir'/data/intermediate/processed/2007/labour07_cleaned.dta", replace

*Bring in the selected field of the household and its acreage
merge 1:1 hhid using "`fieldsforlabor07'", keep(match) nogenerate

*Turn every labor variable into a per-acre figure and say so in its label
foreach value of local activityclass_levels {
	foreach variable of local hlflvars {
		local lab: variable label `variable'`value'
		replace `variable'`value' = (`variable'`value')/acres
		label var `variable'`value' "`lab' per acre"
	}
}

*Collect the family and hired labor variables through their labels
ds, has(varl *familylabor*)
local famlbrvarlist `r(varlist)'
ds, has(varl *hiredlabor*)
local hiredlbrvarlist `r(varlist)'
local laborvars `famlbrvarlist' `hiredlbrvarlist'
di "`laborvars'"

*Totals over all activity classes
egen hiredlabor_S = rowtotal(`hiredlbrvarlist')
label var hiredlabor_S "Total hired labor by Kshs/acre"
egen familylabor_S = rowtotal(`famlbrvarlist')
label var familylabor_S "Total family labor by hours/acre"

*Acreage is no longer needed
drop acres

order field, after(hhid)

save "`maindir'/data/intermediate/processed/2007/labour07_hhidfield.dta", replace

*Household-level file: mean of every variable by household and field, without the field
qui ds hhid field, not
collapse (mean) `r(varlist)', by(hhid field)
drop field

save "`maindir'/data/intermediate/processed/2007/labour07_hhid.dta", replace

***2010***

*Acreage of every 2010 field outside the short harvest
use "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/field10_cleaned.dta", clear

drop if harvest == "short"

keep hhid field acres

tempfile fieldsforlabor10
save "`fieldsforlabor10'"

*Fields with dry or green maize outside the short harvest
use "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/croplev10_cleaned.dta", clear
keep if inlist(crop, "maize-dry", "maize-green") & harvest != "short"

*One row per household and field
keep hhid field
bysort hhid field: keep if _n == 1

merge 1:1 hhid field using "`fieldsforlabor10'", keep(match) nogenerate

*Keep the household's largest field; ties (there may be multiple) are reduced to one row
by hhid: egen largestacreage = max(acres)
keep if acres == float(largestacreage)
drop largestacreage

bysort hhid acres: keep if _n == 1

*The selected fields replace the full field list under the same macro name
tempfile fieldsforlabor10
save "`fieldsforlabor10'"

*Load the 2010 labor records
use "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/labour10_cleaned.dta", clear

*Names of the seven activity classes. The label is not attached to a variable here; it is
*only looked up further down to build the variable labels
label define activityclass 1 "Land preparation" 2 "Planting" 3 "Weeding" 4 "Harvest" 5 "Postharvest activities" 6 "Fertilizer application"  7 "Other"

*Flag descriptions that list several activities (the flag is not used again before the
*collapse below discards it)
gen multiple = 0

*Went through data, multiple activities use these key terms/characters
replace multiple = 1 if strpos(activity, "/")
replace multiple = 1 if strpos(activity, " and ")
replace multiple = 1 if strpos(activity, " & ")
replace multiple = 1 if strpos(activity, ",")

*One 0/1 indicator per activity class
gen activityclass1 = 0
gen activityclass2 = 0
gen activityclass3 = 0
gen activityclass4 = 0
gen activityclass5 = 0
gen activityclass6 = 0
gen activityclass7 = 0

*Split the description into its individual activities; the names of the new variables
*are returned in r(varlist)
split activity, parse(& " and ", /)

local parsed = "`r(varlist)'"

*Classify every piece. Classes 1 and 3 to 7 need an exact match of the trimmed piece
*against the lists below; class 2 is set whenever the full description contains "planting"
qui foreach var in `parsed'{
	replace `var' = trim(`var')
	foreach activityname in "1st ploughing" "2nd ploughing" "furrowing" "grading" "harrow" "harrowing" "land preparation" "ridging" "slashing" "terracing" {
		replace activityclass1 = 1 if `var' == "`activityname'"
	}

	replace activityclass2 = 1 if strpos(activity, "planting")

	foreach activityname in "1st weeding" "2nd weeding" "3rd weeding" "first weeding" "second weeding" "weeding"{
		replace activityclass3 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "harvest" "harvesting"{
		replace activityclass4 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "bag" "bagging" "dry" "drying" "dusting (post harvest)" "haul to storage" "haul to store" "post harvest dusting" "shell" "shelling" "shelling labour" "sort" "stacking" "stooking" "storage" "store" "thresh" "threshing" "total sheller cost" "transport" "weigh" "winnow" "winnowing"{
		replace activityclass5 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "dust" "dusting" "field dusting" "manuring" "spraying" "top dressing" "top-dressing"{
		replace activityclass6 = 1 if `var' == "`activityname'"
	}

	foreach activityname in "desuckering" "irrigation" "other" "other (specify)" "propping" "security" "thinning" "watchman" "wiring"{
		replace activityclass7 = 1 if `var' == "`activityname'"
	}
}

*Number of classes flagged for the record
egen sum_activclass = rowtotal(activityclass*)

drop `parsed'

*Treat missing labor inputs as zero
foreach var in lb01 lb02 lb03 lb04 lb06 lb08 lb10{
	replace `var' = 0 if missing(`var')
}

*Split a record that covers several classes evenly across them
*NOTE(review): every input is divided by the number of classes, so the product of lb01,
*lb02 and lb03 computed below shrinks by the cube of that number rather than by the
*number itself - confirm this is intended
foreach var in lb01 lb02 lb03 lb04 lb06 lb08 lb10{
	replace `var' = `var'/sum_activclass if sum_activclass > 1
}

*One row per record and class, keeping only the flagged classes. Records that matched
*no class have all seven indicators at 0 and are dropped here
gen id = _n
reshape long activityclass, i(id) j(classnum)
drop if activityclass == 0
drop activityclass
rename classnum activityclass

*Calculate wage labor as the product of lb01, lb02 and lb03; hired labor adds lb04 to
*it, and family labor is the sum of lb06, lb08 and lb10
gen hiredlaborwage = lb01*lb02*lb03
gen hiredlabor = hiredlaborwage + lb04
gen familylabor = lb06 + lb08 + lb10
*Sum of hired and family labor per household and activity class
collapse (sum) hiredlabor (sum) familylabor, by(hhid activityclass)

*Store the class names in macros for use after the reshape
levelsof activityclass, local(activityclass_levels)    
foreach val of local activityclass_levels {
	local activityclassv1`val' : label activityclass `val'
}

*One row per household, one pair of variables per activity class
reshape wide hiredlabor familylabor, i(hhid) j(activityclass)

*Store root of variable names in macro
local hlflvars "hiredlabor familylabor"

*Apply the stored activity class names as variable labels
foreach variable of local hlflvars{
	foreach value of local activityclass_levels{
		label variable `variable'`value' "`variable' for `activityclassv1`value''"
	}
}

save "`maindir'/data/intermediate/processed/2010/labour10_cleaned.dta", replace

*Merge with field (acres) data
merge 1:1 hhid using "`fieldsforlabor10'", keep(match) nogenerate

*Change description to "per acre" for each variable, change the variable to per acre
foreach variable of local hlflvars{
	foreach value of local activityclass_levels{
		local lab: variable label `variable'`value'
		label var `variable'`value' "`lab' per acre"
		replace `variable'`value' = (`variable'`value')/acres
	}
}

*Store familylabor and hiredlabor variables in macros
ds, has(varl *familylabor*)
local famlbrvarlist `r(varlist)'
ds, has(varl *hiredlabor*)
local hiredlbrvarlist `r(varlist)'
local laborvars `famlbrvarlist' `hiredlbrvarlist'
di "`laborvars'"

*Generate totals for hiredlabor and familylabor
egen hiredlabor_S = rowtotal(`hiredlbrvarlist')
egen familylabor_S = rowtotal(`famlbrvarlist')

*Only need data on a household basis
drop acres

*Label total variables for hiredlabor and familylabor
label var hiredlabor_S "Total hired labor by Kshs/acre"
label var familylabor_S "Total family labor by hours/acre"

order field, after(hhid)

save "`maindir'/data/intermediate/processed/2010/labour10_hhidfield.dta", replace

*Household-level file: mean of every variable by household and field, without the field
qui ds hhid field, not
collapse (mean) `r(varlist)', by(hhid field)
drop field 

*Save the household-level file under the _hhid name used for 1997, 2004 and 2007.
*This file was previously never written for 2010.
save "`maindir'/data/intermediate/processed/2010/labour10_hhid.dta", replace

*NOTE(review): the original script saved the household-level data to labour10_cleaned.dta,
*which overwrites the per-activity file saved earlier in this section under that name.
*The save is kept so that any later code reading labour10_cleaned.dta behaves as before;
*once such code has been pointed at labour10_hhid.dta this save should be removed.
save "`maindir'/data/intermediate/processed/2010/labour10_cleaned.dta", replace

********************
** LAND PREP COSTS **
********************

***1997***
*Land preparation cost per acre for each household, from the 1997 field file
use "`maindir'/data/intermediate/tegemeo/1997/Extract/1997 Data/field97_cleaned.dta", clear

*Retain rows that are neither from 1996 nor from the short harvest
keep if year != 1996
keep if harvest != "short"

*Household totals of cost and area, then cost divided by area
collapse (sum) lpcost acres, by(hhid)
replace lpcost = lpcost / acres

*acres served only as the denominator; hhid and lpcost remain
drop acres

save "`maindir'/data/intermediate/processed/1997/lpcost97_cleaned.dta", replace

***2000***
*Land preparation cost per acre for each household, from the 2000 field file
use "`maindir'/data/intermediate/tegemeo/2000/Extract/2000 Data/field00_cleaned.dta", clear

*Retain everything except the short harvest
keep if harvest != "short"

*Household totals of cost and area, then cost divided by area
collapse (sum) lpcost acres, by(hhid)
replace lpcost = lpcost / acres

*acres served only as the denominator; hhid and lpcost remain
drop acres

save "`maindir'/data/intermediate/processed/2000/lpcost00_cleaned.dta", replace

***2004***
*Land preparation cost per acre for each household, from the 2004 field file
use "`maindir'/data/intermediate/tegemeo/2004/Extract/2004 Data/field04_cleaned.dta", clear

*Retain everything except the short harvest
keep if harvest != "short"

*This file records both lpcost==0 and missing lpcost, so a missing value is
*treated as genuinely unknown and the row is excluded from cost and acreage
keep if !missing(lpcost)

*Household totals of cost and area, then cost divided by area
collapse (sum) lpcost acres, by(hhid)
replace lpcost = lpcost / acres

*acres served only as the denominator; hhid and lpcost remain
drop acres

save "`maindir'/data/intermediate/processed/2004/lpcost04_cleaned.dta", replace

***2007***
*Land preparation cost per acre for each household, from the 2007 field file
use "`maindir'/data/intermediate/tegemeo/2007/Extract/2007 Data/field07_cleaned.dta", clear

*Retain everything except the short harvest
keep if harvest != "short"

*Household totals of cost and area, then cost divided by area
collapse (sum) lpcost acres, by(hhid)
replace lpcost = lpcost / acres

*acres served only as the denominator; hhid and lpcost remain
drop acres

save "`maindir'/data/intermediate/processed/2007/lpcost07_cleaned.dta", replace

***2010***
*Land preparation cost per acre for each household, from the 2010 field file
use "`maindir'/data/intermediate/tegemeo/2010/Extract/2010 Data/field10_cleaned.dta", clear

*Retain everything except the short harvest
keep if harvest != "short"

*Household totals of cost and area, then cost divided by area
collapse (sum) lpcost acres, by(hhid)
replace lpcost = lpcost / acres

*acres served only as the denominator; hhid and lpcost remain
drop acres

save "`maindir'/data/intermediate/processed/2010/lpcost10_cleaned.dta", replace

************************
** CONSUMER PRICE INDEX **
************************

*Rescaled CPI series keyed by four-digit year
use "`maindir'/data/intermediate/tegemeo/General/Extract/General Data/cpi_allyears_cleaned.dta", clear

*Copy the cpi_0607 series, widen it to double, then rescale it.
*NOTE(review): the divisor is presumably the cpi_0607 value in the 2003/04 base period - confirm.
gen cpi_0304 = cpi_0607
recast double cpi_0304
replace cpi_0304 = cpi_0304 / .7379999756813

keep yearCPI cpi_0304
rename cpi_0304 cpi

*Characters 4-5 of yearCPI hold the two-digit year; a leading 9 means the
*1900s, anything else the 2000s
gen year = cond(substr(yearCPI, 4, 1) == "9", "19", "20") + substr(yearCPI, 4, 2)
drop yearCPI
destring year, replace

save "`maindir'/data/cleaned/CPI.dta", replace

*CPI series taken from cpi_0910 as stored, keyed by four-digit year
use "`maindir'/data/intermediate/tegemeo/General/Extract/General Data/cpi_allyears_cleaned.dta", clear

keep yearCPI cpi_0910
recast double cpi_0910
rename cpi_0910 cpi

*Characters 4-5 of yearCPI hold the two-digit year; a leading 9 means the
*1900s, anything else the 2000s
gen year = cond(substr(yearCPI, 4, 1) == "9", "19", "20") + substr(yearCPI, 4, 2)
drop yearCPI
destring year, replace

save "`maindir'/data/cleaned/CPI_modern.dta", replace

********************************************************************************
******************************** FINISH FILE ***********************************
********************************************************************************
