# This code generates the dataset as described in Section 3 and Table 1 of the paper
# "Identifying the sources of consumption variation" by M. Barigozzi and A. Moneta
#
# IMPORTANT INSTRUCTIONS:
#
# Put your original data (that you get from running the code "prepare_data") in the folder "original_data".
# Create a new empty folder called "data". 
# Create new empty sub-folders of the folder "data" and name them according to the time window you want to analyse.
# For example: for the time window 1997-2006 the folder has to be called "1997_2006"
# This code will save the new dataset (according to the procedure described in Table 1 of the paper) in a subfolder of the folder "data".
#
# Also create a new empty folder called "totexp_descriptive" (for the descriptive study of total expenditure over time);
# its name must match the path assigned to "w_totexp" below.
#
#
# To create the time window from year XXXX to year YYYY,
# set startyear <- XXXX and endyear <- YYYY.
# In the paper our time windows are: 1997-2006; 1987-1996; 1977-1986.
#
# fix "minnmf" (see below) and "maxnmf" (see below)
#
#
#
# ---- User settings -------------------------------------------------------
# Folder holding the original data.
wherefrom <- ".../original_data/"       ## <- CHANGE HERE if NEEDED
# Root folder under which the generated dataset is saved.
whereto <- ".../data/"                  ## <- CHANGE HERE if NEEDED
# Folder where the total expenditure extracts are saved.
w_totexp <- ".../totexp_descriptive/"   ## <- CHANGE HERE if NEEDED
# First and last year of the time window.
startyear <- 1997                       ## <- CHANGE HERE if NEEDED
endyear <- 2006                         ## <- CHANGE HERE if NEEDED
# Minimum and maximum number of family members.
minnmf <- 2                             ## <- CHANGE HERE if NEEDED
maxnmf <- 4                             ## <- CHANGE HERE if NEEDED

# The dataset goes into the window-specific sub-folder, e.g. ".../data/1997_2006/".
whereto <- paste0(whereto, startyear, "_", endyear, "/")

# Age bounds of the household head and length of the expenditure grid.
minage <- 0
maxage <- 200
lg <- 100

# Tag used in the output file names: a single size ("2") or a range ("2_4").
nm <- if (minnmf == maxnmf) minnmf else paste0(minnmf, "_", maxnmf)

# ---- Derived vectors -----------------------------------------------------
years <- startyear:endyear
years_1 <- years - 1            # previous year, used in the post-1993 file names

# Per-year placeholders, overwritten later in the script.
nobs <- seq_along(years)
mint <- seq_along(years)
maxt <- seq_along(years)


# Flag the observations of X that are NOT extreme values.
#
# An observation is kept when it lies within k standard deviations of the
# mean of X (both bounds inclusive).
#
# Arguments:
#   X     numeric vector.
#   k     half-width of the acceptance band in standard deviations
#         (default 3, the value used throughout this script).
#   na.rm if TRUE (default), missing values are ignored when computing the
#         mean and standard deviation, so a single NA no longer turns the
#         whole mask into NA.
#
# Returns a logical vector of the same length as X: TRUE for observations to
# keep, FALSE for outliers, NA where X itself is missing.
fun_outliers_remover <- function(X, k = 3, na.rm = TRUE) {
  centre <- mean(X, na.rm = na.rm)
  spread <- sd(X, na.rm = na.rm)
  X >= (centre - k * spread) & X <= (centre + k * spread)   # cleaning extreme values
}

# ---- Load and clean the yearly files ---------------------------------------
# DL collects one cleaned data frame per year of the window.
DL <- as.list(nobs)

for (i in seq_along(years)) {

  # Files for years after 1993 carry the previous and the current year in
  # their name; earlier files carry the current year only.
  data_filename <- if (years[i] > 1993) {
    paste0("dat", years_1[i], "_", years[i], "def.csv")
  } else {
    paste0("dat", years[i], "def.csv")
  }
  DATA <- read.csv(paste0(wherefrom, data_filename))   # load data

  # Keep households within the requested size and head-age bounds.
  DATA <- DATA[
    DATA$nper >= minnmf & DATA$nper <= maxnmf &
      DATA$age >= minage & DATA$age <= maxage,
  ]

  # Columns 6:19 are divided by total expenditure (i.e. turned into shares);
  # columns 1:5 are kept as they are.
  # NOTE(review): assumes column 5 is total_expenditure -- confirm against the input files.
  Dbs <- cbind(DATA[, 1:5], DATA[, 6:19] / DATA$total_expenditure)

  # Column-wise "not an outlier" masks and, per row, the number of columns
  # that pass (SO is only needed by the commented-out alternative below).
  RO <- apply(Dbs[, 5:19], 2, fun_outliers_remover)
  SO <- apply(RO, 1, sum)

  #Dbsc<-Dbs[which(SO==15),] #remove outliers in tot exp and all categories of exp
  #Dbsc<-Dbs
  Dbsc <- Dbs[RO[, 1], ]   # remove outliers only in tot exp

  # Sort by total expenditure and record the yearly range.
  Dbsc <- Dbsc[order(Dbsc$total_expenditure), ]
  Texp <- Dbsc$total_expenditure
  mint[i] <- min(Texp)
  maxt[i] <- max(Texp)

  DL[[i]] <- Dbsc
}
# mean miscellaneous: yearly mean of column 19 (missing values ignored),
# then averaged over the years of the window.
# NOTE(review): column 19 is presumably the miscellaneous share -- confirm.
mmisc <- vapply(
  seq_along(years),
  function(i) mean(DL[[i]][, 19], na.rm = TRUE),
  numeric(1)
)
mmisc <- mean(mmisc)
##### study of the distribution functions over time (library sm needed)
#i=1
#texp<-DL[[i]]$total_expenditure
#dtexp<-sm.density(texp, display="none")
#evp<-dtexp$eval.points
#est<-dtexp$estimate
#if (minnmf==maxnmf){
#  pdf(paste("C:/Documents and Settings/User/Documenti/BM_Engel_curves/prog/r4_revision/figs/pdf_totexp",
#            minnmf, "member.pdf", sep=""), width=6, height=6)
#}else{
#  pdf(paste("C:/Documents and Settings/User/Documenti/BM_Engel_curves/prog/r4_revision/figs/pdf_totexp",
#            minnmf,"_",maxnmf, "member.pdf", sep=""),  width=6, height=6)
#}
#
#matplot(evp,est, xlim=c(0, max(evp)),ylim=c(0, (max(est)+ max(est)/5)),
##        t="l", xlab="total expenditure", ylab="probability density function")
#for(i in 2:length(years)){
#  texp<-DL[[i]]$total_expenditure
#  dtexp<-sm.density(texp, display="none")
#  evp<-dtexp$eval.points
#  est<-dtexp$estimate
#matplot(evp,est, t="l", add=TRUE)
#  if (minnmf==maxnmf){
#    titletxt<-paste("p.d.f. of total expenditure ", startyear,"-",
#                    endyear," (", minnmf, " members)", sep="")
#    title(titletxt)
#  }else{
##  titletxt<-paste("p.d.f. of total expenditure ", startyear,"-",
#                  endyear," (", minnmf, "-", maxnmf,"members)", sep="")
#  title(titletxt)
#}
#}
#dev.off()
##################
## Save, for every year, the household size (nper) and total expenditure of
## the cleaned sample held in DL as a CSV file in the folder w_totexp.
# The file name carries the household size ("2") or the size range ("2_4").
member_tag <- if (minnmf == maxnmf) minnmf else paste0(minnmf, "_", maxnmf)
for (i in seq_along(years)) {
  filetxt <- paste0(w_totexp, "realtotexp_", years[i], "_", member_tag, "member.csv")
  write.csv(DL[[i]][, c("nper", "total_expenditure")], filetxt)
}  # closes the loop: this brace was commented out, leaving the script unparsable
###############################################################
###############################################################
###############################################################

# Common support across years: from the largest yearly minimum to the
# smallest yearly maximum of total expenditure.
MNT <- max(mint)
MXT <- min(maxt)

# Probabilities 1/lg, 2/lg, ..., 1 at which the quantiles are evaluated.
pr <- seq(from = 1 / lg, to = 1, by = 1 / lg)

# TQ[, i] holds the lg quantiles of total expenditure for year i.
TQ <- matrix(nrow = lg, ncol = length(years))

for (i in seq_along(years)) {
  # Restrict the year's sample to the common support ...
  Dbsc <- DL[[i]]
  Dbsc <- Dbsc[Dbsc$total_expenditure >= MNT & Dbsc$total_expenditure <= MXT, ]
  # ... and compute its quantiles on the restricted sample.
  Texp <- Dbsc$total_expenditure
  TQ[, i] <- quantile(Texp, probs = pr)
  DL[[i]] <- Dbsc
}

# Total-expenditure grid shared by all years.
mtq <- apply(TQ, 1, mean)    # average of each quantile across years (upper cell edges)
mtq1 <- c(MNT, mtq[-lg])     # lower cell edges; drops the last quantile (index was hard-coded as 100)
mtq2 <- c(MNT, mtq)          # all lg + 1 cell boundaries
TE <- (mtq + mtq1) / 2       # midpoint of each cell

# ---- Build and save the cell-averaged dataset for every year ---------------
for (i in seq_along(years)) {
  # Output file name: same year pattern as the input files, plus the
  # household-size tag nm.
  data_filename <- if (years[i] > 1993) {
    paste0("dat", years_1[i], "_", years[i], "def_mem", nm, ".csv")
  } else {
    paste0("dat", years[i], "def_mem", nm, ".csv")
  }

  Dbsc <- DL[[i]]

  # One row per grid cell; columns named after columns 5:19 of Dbs.
  # NOTE(review): assumes the first of these names is "total_expenditure",
  # so that the assignment below fills column 1 -- confirm.
  data_new <- as.data.frame(matrix(nrow = lg, ncol = 15))
  colnames(data_new) <- colnames(Dbs)[5:19]
  data_new$total_expenditure <- TE

  TC <- Dbsc$total_expenditure
  nc <- seq_len(lg)

  for (j in seq_len(lg)) {
    # Households whose total expenditure falls in cell j (both edges inclusive).
    wcc <- which(TC >= mtq2[j] & TC <= mtq2[j + 1])
    nc[j] <- length(wcc)
    if (nc[j] == 2) {
      print("WARNING! THE NUMBER OF POINTS IN ONE CELL IS 2")
    }
    if (nc[j] == 1) {
      print("WARNING! THE NUMBER OF POINTS IN ONE CELL IS 1")
    }
    if (nc[j] == 0) {
      print("ERROR! NO DATA POINTS IN ONE CELL")
    }
    # Cell average of columns 6:19 over the selected households.
    data_new[j, 2:15] <- apply(Dbsc[wcc, 6:19], 2, mean)
  }
  print(nc)

  data_new$number_points_cell <- nc   # shows the number of data point in each cell

  write.csv(data_new, paste0(whereto, data_filename))   # write the NEW (artificial data) file
  nobs[i] <- sum(nc)                  # number of families considered each year
}
