#####################################################################

#											RANDOM FORESTS 															#

#####################################################################

rm(list=ls())

library(psych)

library(graphics)



rm(list=ls())



# table matrices

out <- matrix(nrow=4, ncol=9)

sig_two <- matrix(nrow=1,ncol=3)



###############################################################
#												RF LONG-RUN DATA									    	#
###############################################################

#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten  <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)


ca <- grep("ca", names(Daten), value=T)
drops <- names(Daten) %in% c(ca)
Daten <- Daten[!drops]

# drop vars not used
assets <- grep("assets", names(Daten), value=T)
stocks <- grep("stocks", names(Daten), value=T)
narrowm <- grep("narrowm", names(Daten), value=T)
money <- grep("money", names(Daten), value=T)
ltrate <- grep("ltrate", names(Daten), value=T)
stir <- grep("stir", names(Daten), value=T)
loans <- grep("loans", names(Daten), value=T)
debt <- grep("debt", names(Daten), value=T)
er <- grep("er", names(Daten), value=T)
cpi <- grep("cpi", names(Daten), value=T)
gap <- grep("gap", names(Daten), value=T)
glo <- grep("a_", names(Daten), value=T)
gdp <- grep("gdp", names(Daten), value=T)
i <- grep("i_", names(Daten), value=T)
c <- grep("c_", names(Daten), value=T)
ri <- grep("ri", names(Daten), value=T)
rc <- grep("rc", names(Daten), value=T)

drops <- names(Daten) %in% c("year", "ccode", stocks, money, stir,assets,i,ri,glo) # true-false indicator: true at the names in vector
saves <- names(Daten) %in% c(glo)
full <- Daten[!drops] # drops those variables which have true indication in "drops"
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2)/2

# SELECTION SET:
sel.list <- c("b2", "loans1_y_gap", "pdebt_gap", "narrowm_y_gap",  "rltrate", "gr_rgdp", "gr_cpi",  "er_gap", "loans_y", "pdebt", "ltrate")
location <- names(full) %in% c(sel.list) # get location of independent var
name.sel <- names(full[location]) # get names of features
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2)/2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# miscellaneous non-independent
misc.list <- c("b2","b1","b3","rec1","rec2","rec3")

# confidence intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

location <- names(full_om) %in% c(misc.list) # get location of dependent var
name.indep <- names(full_om[!location]) # get names of features
indep <- full_om[name.indep]
dep <- factor(full_om[,"b2"]>0) # dep. var.

# grow trees
set.seed(1)
rf_full= randomForest(indep, y=dep,
 data=full_om,
 ntree=5000,
 replace=T, # bootstrapping (with replacement!)
 mtry=sqrt(ncol(indep)), # all features except dependent variable
 
 cutoff=c(1/2, 1/2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
 sampsize=nrow(full_om), # bootstrapping (comput. more efficient wihtout much loss by using 1/2*train (see Friedman & Hall, 2007))
 nodesize=1 # fully grow trees (experiment to avoid overfitting (see Segal, 2004)); (also see Biau et al., 2012 on consistency)
 ) 
rf_full


# convergence diagnostic
palette("default")
plot(rf_full, type="l", main="")

# OOS-analysis
library(pROC)

pred <- predict(rf_full, type="prob")[,2] # predicted outcome; second column = TRUE probability (votes combined with normvotes=T equals type="prob")

true <- full_om[,"b2"]

roc<-roc(true, pred, ci=T) # ROC analysis
r_1<-roc





###############################################################

#												BOOSTING LONG-RUN DATA							#

###############################################################



#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)

Daten  <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)




ca <- grep("ca", names(Daten), value=T)

drops <- names(Daten) %in% c(ca)

Daten <- Daten[!drops]



# drop vars not used

assets <- grep("assets", names(Daten), value=T)

stocks <- grep("stocks", names(Daten), value=T)

narrowm <- grep("narrowm", names(Daten), value=T)

money <- grep("money", names(Daten), value=T)

ltrate <- grep("ltrate", names(Daten), value=T)

stir <- grep("stir", names(Daten), value=T)

loans <- grep("loans", names(Daten), value=T)

debt <- grep("debt", names(Daten), value=T)

er <- grep("er", names(Daten), value=T)

cpi <- grep("cpi", names(Daten), value=T)

gap <- grep("gap", names(Daten), value=T)

glo <- grep("a_", names(Daten), value=T)

gdp <- grep("gdp", names(Daten), value=T)

i <- grep("i_", names(Daten), value=T)

c <- grep("c_", names(Daten), value=T)

ri <- grep("ri", names(Daten), value=T)

rc <- grep("rc", names(Daten), value=T)



drops <- names(Daten) %in% c("year", "ccode", stocks, money, stir,assets,i,ri,glo) # true-false indicator: true at the names in vector

saves <- names(Daten) %in% c(glo)

full <- Daten[!drops] # drops those variables which have true indication in "drops"

full <- cbind(Daten[glo], full)



# FULL SET: omit observations with missing values

full_om <- na.omit(full)

sum(full_om$b2)/2





###############################################################

#														ANALYSIS   												#

###############################################################



### CLASSIFICATION-TREE ANALYSIS

##############################################################################################################

# variables

var.list <- c( "loans1_y_gap", "pdebt_gap", "narrowm_y_gap",  "rltrate", "gr_rgdp", "gr_cpi",  "er_gap", "loans_y", "pdebt", "ltrate")



# miscellaneous non-independent

misc.list <- c("b2","b1","b3","rec1","rec2","rec3")



# number of cross-validations

runs <- 100



# confidence intervals

ci <- c(0.99, 0.95, 0.9)

##############################################################################################################



library(gbm)



aucs <- matrix(nrow=1, ncol=runs)

ci90 <- matrix(nrow=1, ncol=runs)

ci95_lo <- matrix(nrow=1, ncol=runs)

ci95_up <- matrix(nrow=1, ncol=runs)

ci99 <- matrix(nrow=1, ncol=runs)



N <- matrix(nrow=1, ncol=runs)



##BOOSTING - eclectic

for(j in 1:runs) {

	

	# Bootstrap - training, test sample

	set.seed(j)

	indexes = sample(1:nrow(full_om), size=0.632*nrow(full_om), replace=F)

	test = full_om[-indexes,]

	train = full_om[indexes,]	



	location <- names(train) %in% c(misc.list) # get location of dependent var

	name.indep <- names(train[!location]) # get names of features

	indep <- train[name.indep]

	dep <- train[,"b2"] # dep. var.



	#xnam <- names(full_om_b2[,1:107])

	#(fmla <- as.formula(paste("b2 ~ ", paste(xnam, collapse= "+"))))



	boo1 = gbm.fit(indep, dep,

	 distribution="bernoulli", # Deviance

	# data=train,

	 var.monotone=NULL, # PRIOR: specifies which variables have monotone relation with response

	 n.trees=5000,

	 interaction.depth = 1,

	 #n.minobsinnode=10,

	 shrinkage=0.1,

	 bag.fraction=0.5,

	 #train.fraction=1,

	 nTrain=nrow(train),

 	#cv.folds=10,

 	keep.data=FALSE,

 	verbose=F,

 	#class.stratify.cv=NULL,

 	#n.cores=4

 	)

 	

 	N[,j] <- nrow(full_om)



	#ROC & AUC

	library(pROC)

	pred<-predict(boo1,

					   newdata = test,

					   n.trees=boo1$n.trees,

					   type= "response", # returns probabilities if distribuition is "bernoulli"

					   single.tree=F,

					   )



	gbm.roc.area(test[,"b2"],pred) # special command in gbm environment for auc

	roc<-roc(test[,"b2"], pred)

	

	aucs[,j] <- as.numeric(roc$auc)

		

	ci90[,j] <- as.numeric(ci.auc(roc,conf.level=ci[3]))[1]

	ci95_lo[,j] <- as.numeric(ci.auc(roc,conf.level=ci[2]))[1]

	ci95_up[,j] <- as.numeric(ci.auc(roc,conf.level=ci[2]))[3]

	ci99[,j] <- as.numeric(ci.auc(roc,conf.level=ci[1]))[1]



 	}

	# use colMeans here, as the "as.matrix()" transformation turns the initial row(non)vector into a columnvector.

	# This is necessary, as the *Means commands only apply to matrices, but x[i, ] is not a matrix

	N <- as.numeric(rowMeans(as.matrix(N[ ])))

	auc <- as.numeric(rowMeans(as.matrix(aucs[])))

	

	ci90<-as.numeric(rowMeans(as.matrix(ci90[])))

	ci95_lo<-as.numeric(rowMeans(as.matrix(ci95_lo[ ])))

	ci95_up<-as.numeric(rowMeans(as.matrix(ci95_up[])))

	ci99<-as.numeric(rowMeans(as.matrix(ci99[ ])))



	out[2,2] <- round(auc,2)





# confidence intervals

cis<-paste(round(ci95_lo,2), round(ci95_up,2), sep=",")

cis<-paste("[", cis, sep="")

cis<-paste(cis, "]", sep="")



out[2,3] <- cis

out[2,4]<-round(N,2)



out[2,5]<-boo1$n.trees

out[2,6]<-ncol(indep)



out[2,7]<-boo1$bag.fraction

out[2,8]<-boo1$shrinkage

out[2,9]<-floor(sum(full_om$b2)/2)





#AUC comparison
testobj <- roc.test(roc,r_1,method="delong",alternative="two.sided")
options("scipen"=10)
options()$scipen

sig_two[1,1]<-testobj$p.value[1]








###############################################################
#												2-year horizon Post-1970 yearly DATA				#
###############################################################

#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten  <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv", sep=",", dec=".", header=TRUE)


fliab <- grep("fliab", names(Daten), value=T)
drops <- names(Daten) %in% c(fliab)
Daten <- Daten[!drops]

# drop vars not used
stocks <- grep("stocks", names(Daten), value=T)
ltrate <- grep("ltrate", names(Daten), value=T)
stir <- grep("stir", names(Daten), value=T)
loans <- grep("loans", names(Daten), value=T)
pdebt <- grep("pdebt", names(Daten), value=T)
er <- grep("er", names(Daten), value=T)
cpi <- grep("cpi", names(Daten), value=T)
gap <- grep("gap", names(Daten), value=T)
glo <- grep("a_", names(Daten), value=T)

drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate,stir,glo) # true-false indicator: true at the names in vector
saves <- names(Daten) %in% c(glo) # keep global variables as they have few missings
full <- Daten[!drops] # drops those variables which have true indication in "drops"
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2)/2

# SELECTION SET:
sel.list <-c("b2", "loans_y_gap", "loans_y", "rer_gap", "gdp_r_gap", "gr_cpi", "nx_y_gap")
location <- names(full) %in% c(sel.list) # get location of independent var
name.sel <- names(full[location]) # get names of features
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2)/2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# miscellaneous non-independent
misc.list <- c("b2","b1","b3","rec1","rec2","rec3")

# confidence intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

location <- names(full_om) %in% c(misc.list) # get location of dependent var
name.indep <- names(full_om[!location]) # get names of features
indep <- full_om[name.indep]
dep <- factor(full_om[,"b2"]>0) # dep. var.

# grow trees
set.seed(1)
rf_full= randomForest(indep, y=dep,
 data=full_om,
 ntree=5000,
 replace=T, # bootstrapping (with replacement!)
 mtry=sqrt(ncol(indep)), # all features except dependent variable
 
 cutoff=c(1/2, 1/2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
 sampsize=nrow(full_om), # bootstrapping (comput. more efficient wihtout much loss by using 1/2*train (see Friedman & Hall, 2007))
 nodesize=1 # fully grow trees (experiment to avoid overfitting (see Segal, 2004)); (also see Biau et al., 2012 on consistency)
 ) 
rf_full


# convergence diagnostic
palette("default")
plot(rf_full, type="l", main="")

# OOS-analysis
library(pROC)

pred <- predict(rf_full, type="prob")[,2] # predicted outcome; second column = TRUE probability (votes combined with normvotes=T equals type="prob")

true <- full_om[,"b2"]

roc<-roc(true, pred, ci=T) # ROC analysis
r_2<-roc









###############################################################

#												BOOSTING POST-1970 	YEARLY													#

###############################################################



#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv", sep=",", dec=".", header=TRUE)

Daten  <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv", sep=",", dec=".", header=TRUE)


fliab <- grep("fliab", names(Daten), value=T)

drops <- names(Daten) %in% c(fliab)

Daten <- Daten[!drops]



# drop vars not used

stocks <- grep("stocks", names(Daten), value=T)

ltrate <- grep("ltrate", names(Daten), value=T)

stir <- grep("stir", names(Daten), value=T)

loans <- grep("loans", names(Daten), value=T)

pdebt <- grep("pdebt", names(Daten), value=T)

er <- grep("er", names(Daten), value=T)

cpi <- grep("cpi", names(Daten), value=T)

gap <- grep("gap", names(Daten), value=T)

glo <- grep("a_", names(Daten), value=T)



drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate,stir,glo) # true-false indicator: true at the names in vector

saves <- names(Daten) %in% c(glo) # keep global variables as they have few missings

full <- Daten[!drops] # drops those variables which have true indication in "drops"

full <- cbind(Daten[glo], full)



# FULL SET: omit observations with missing values

full_om <- na.omit(full)

sum(full_om$b2)/2





###############################################################

#														ANALYSIS   												#

###############################################################



### CLASSIFICATION-TREE ANALYSIS

##############################################################################################################

# miscellaneous non-independent

misc.list <- c("b2","b1","b3","rec1","rec2","rec3")


# Monte-Carlo Cross-Validation runs

runs <- 100



# confidence intervals

ci <- c(0.99, 0.95, 0.9)

##############################################################################################################



library(gbm)



aucs <- matrix(nrow=1, ncol=runs)

ci90 <- matrix(nrow=1, ncol=runs)

ci95_lo <- matrix(nrow=1, ncol=runs)

ci95_up <- matrix(nrow=1, ncol=runs)

ci99 <- matrix(nrow=1, ncol=runs)



N <- matrix(nrow=1, ncol=runs)



##BOOSTING - eclectic

for(j in 1:runs) {

	

	# Bootstrap - training, test sample

	set.seed(j)

	indexes = sample(1:nrow(full_om), size=0.632*nrow(full_om), replace=F)

	test = full_om[-indexes,]

	train = full_om[indexes,]	



	location <- names(train) %in% c(misc.list) # get location of dependent var

	name.indep <- names(train[!location]) # get names of features

	indep <- train[name.indep]

	dep <- train[,"b2"] # dep. var.



	#xnam <- names(full_om_b2[,1:107])

	#(fmla <- as.formula(paste("b2 ~ ", paste(xnam, collapse= "+"))))



	boo2 = gbm.fit(indep, dep,

	 distribution="bernoulli", # Deviance

	# data=train,

	 var.monotone=NULL, # PRIOR: specifies which variables have monotone relation with response

	 n.trees=5000,

	 interaction.depth = 1,

	 #n.minobsinnode=10,

	 shrinkage=0.1,

	 bag.fraction=0.5,

	 #train.fraction=1,

	 nTrain=nrow(train),

 	#cv.folds=10,

 	keep.data=F,

 	verbose=F,

 	#class.stratify.cv=NULL,

 	#n.cores=4

 	)

 	

 	N[,j] <- nrow(full_om)



	#ROC & AUC

	library(pROC)

	pred<-predict(boo2,

					   newdata = test,

					   n.trees=boo2$n.trees,

					   type= "response", # returns probabilities if distribuition is "bernoulli"

					   single.tree=F,

					   )



	gbm.roc.area(test[,"b2"],pred) # special command in gbm environment for auc

	roc<-roc(test[,"b2"], pred)

	

	aucs[,j] <- as.numeric(roc$auc)

		

	ci90[,j] <- as.numeric(ci.auc(roc,conf.level=ci[3]))[1]

	ci95_lo[,j] <- as.numeric(ci.auc(roc,conf.level=ci[2]))[1]

	ci95_up[,j] <- as.numeric(ci.auc(roc,conf.level=ci[2]))[3]

	ci99[,j] <- as.numeric(ci.auc(roc,conf.level=ci[1]))[1]



 	}

	# use colMeans here, as the "as.matrix()" transformation turns the initial row(non)vector into a columnvector.

	# This is necessary, as the *Means commands only apply to matrices, but x[i, ] is not a matrix

	N <- as.numeric(rowMeans(as.matrix(N[ ])))

	auc <- as.numeric(rowMeans(as.matrix(aucs[])))

	

	ci90<-as.numeric(rowMeans(as.matrix(ci90[])))

	ci95_lo<-as.numeric(rowMeans(as.matrix(ci95_lo[ ])))

	ci95_up<-as.numeric(rowMeans(as.matrix(ci95_up[])))

	ci99<-as.numeric(rowMeans(as.matrix(ci99[ ])))



	out[3,2] <- round(auc,2)



	



# confidence intervals

cis<-paste(round(ci95_lo,2), round(ci95_up,2), sep=",")

cis<-paste("[", cis, sep="")

cis<-paste(cis, "]", sep="")



out[3,3] <- cis

out[3,4]<-round(N,2)



out[3,5]<-boo2$n.trees

out[3,6]<-ncol(indep)



out[3,7]<-boo2$bag.fraction

out[3,8]<-boo2$shrinkage

out[3,9]<-floor(sum(full_om$b2)/2)



#AUC comparison
testobj <- roc.test(roc,r_2,method="delong",alternative="two.sided")
options("scipen"=10)
options()$scipen

sig_two[1,2]<-testobj$p.value[1]









###############################################################
#												2-year horizon Post-1970 quarterly DATA													#
###############################################################

#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten  <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv", sep=",", dec=".", header=TRUE)



# drop vars not used
hopr <- grep("hopr", names(Daten), value=T)
gap <- grep("gap", names(Daten), value=T)
gdp <- grep("gdp", names(Daten), value=T)
y <- grep("_y", names(Daten), value=T)
stocks <- grep("stocks", names(Daten), value=T)
stir <- grep("stir", names(Daten), value=T)
ltrate <- grep("ltrate", names(Daten), value=T)
glo <- grep("a_", names(Daten), value=T)
fliab <- grep("fliab", names(Daten), value=T)
er <- grep("er", names(Daten), value=T)
res <- grep("res", names(Daten), value=T)
tloans <- grep("tloans", names(Daten), value=T)
cpi <- grep("cpi", names(Daten), value=T)


drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate,glo, fliab) # true-false indicator: true at the names in vector
saves <- names(Daten) %in% c(glo)
full <- Daten[!drops] # drops those variables which have true indication in "drops"
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2)/8

# SELECTION SET:
sel.list <-c("b2", "tloans_r_gap", "tloans_r_gr", "res_r_gap",  "er_gap",  "a_ltrate_r_gap",  "a_gdp_r_gap", "a_gdp_r_gr", "cpi_gr")
location <- names(full) %in% c(sel.list) # get location of independent var
name.sel <- names(full[location]) # get names of features
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2)/8

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# miscellaneous non-independent
misc.list <- c("b2","b1","b3","rec1","rec2","rec3")

# confidence intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

location <- names(full_om) %in% c(misc.list) # get location of dependent var
name.indep <- names(full_om[!location]) # get names of features
indep <- full_om[name.indep]
dep <- factor(full_om[,"b2"]>0) # dep. var.

# grow trees
set.seed(1)
rf_full= randomForest(indep, y=dep,
 data=full_om,
 ntree=5000,
 replace=T, # bootstrapping (with replacement!)
 mtry=sqrt(ncol(indep)), # all features except dependent variable
 
 cutoff=c(1/2, 1/2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
 sampsize=nrow(full_om), # bootstrapping (comput. more efficient wihtout much loss by using 1/2*train (see Friedman & Hall, 2007))
 nodesize=1 # fully grow trees (experiment to avoid overfitting (see Segal, 2004)); (also see Biau et al., 2012 on consistency)
 ) 
rf_full


# convergence diagnostic
palette("default")
plot(rf_full, type="l", main="")

# OOS-analysis
library(pROC)

pred <- predict(rf_full, type="prob")[,2] # predicted outcome; second column = TRUE probability (votes combined with normvotes=T equals type="prob")

true <- full_om[,"b2"]

roc<-roc(true, pred, ci=T) # ROC analysis
r_3<-roc








###############################################################

#												BOOSTING POST-1970 QUARTERLY													#

###############################################################



#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv", sep=",", dec=".", header=TRUE)

Daten  <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv", sep=",", dec=".", header=TRUE)


# drop vars not used

hopr <- grep("hopr", names(Daten), value=T)

gap <- grep("gap", names(Daten), value=T)

gdp <- grep("gdp", names(Daten), value=T)

y <- grep("_y", names(Daten), value=T)

stocks <- grep("stocks", names(Daten), value=T)

stir <- grep("stir", names(Daten), value=T)

ltrate <- grep("ltrate", names(Daten), value=T)

glo <- grep("a_", names(Daten), value=T)

fliab <- grep("fliab", names(Daten), value=T)

er <- grep("er", names(Daten), value=T)

res <- grep("res", names(Daten), value=T)

tloans <- grep("tloans", names(Daten), value=T)

cpi <- grep("cpi", names(Daten), value=T)





drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate,glo, fliab) # true-false indicator: true at the names in vector

saves <- names(Daten) %in% c(glo)

full <- Daten[!drops] # drops those variables which have true indication in "drops"

full <- cbind(Daten[glo], full)



# FULL SET: omit observations with missing values

full_om <- na.omit(full)

sum(full_om$b2)/8





###############################################################

#														ANALYSIS   												#

###############################################################



### CLASSIFICATION-TREE ANALYSIS

##############################################################################################################

# miscellaneous non-independent

misc.list <- c("b2","b1","b3","rec1","rec2","rec3")


# Monte-Carlo Cross-Validation runs

runs <- 100



# confidence intervals

ci <- c(0.99, 0.95, 0.9)

##############################################################################################################



library(gbm)



aucs <- matrix(nrow=1, ncol=runs)

ci90 <- matrix(nrow=1, ncol=runs)

ci95_lo <- matrix(nrow=1, ncol=runs)

ci95_up <- matrix(nrow=1, ncol=runs)

ci99 <- matrix(nrow=1, ncol=runs)



N <- matrix(nrow=1, ncol=runs)



##BOOSTING - eclectic

for(j in 1:runs) {

	

	# Bootstrap - training, test sample

	set.seed(j)

	indexes = sample(1:nrow(full_om), size=0.632*nrow(full_om), replace=F)

	test = full_om[-indexes,]

	train = full_om[indexes,]	



	location <- names(train) %in% c(misc.list) # get location of dependent var

	name.indep <- names(train[!location]) # get names of features

	indep <- train[name.indep]

	dep <- train[,"b2"] # dep. var.



	#xnam <- names(full_om_b2[,1:107])

	#(fmla <- as.formula(paste("b2 ~ ", paste(xnam, collapse= "+"))))



	boo3 = gbm.fit(indep, dep,

	 distribution="bernoulli", # Deviance

	# data=train,

	 var.monotone=NULL, # PRIOR: specifies which variables have monotone relation with response

	 n.trees=5000,

	 interaction.depth = 1,

	 #n.minobsinnode=10,

	 shrinkage=0.1,

	 bag.fraction=0.5,

	 #train.fraction=1,

	 nTrain=nrow(train),

 	#cv.folds=10,

 	keep.data=F,

 	verbose=F,

 	#class.stratify.cv=NULL,

 	#n.cores=4

 	)

 	

 	N[,j] <- nrow(full_om)



	#ROC & AUC

	library(pROC)

	pred<-predict(boo3,

					   newdata = test,

					   n.trees=boo3$n.trees,

					   type= "response", # returns probabilities if distribuition is "bernoulli"

					   single.tree=F,

					   )



	gbm.roc.area(test[,"b2"],pred) # special command in gbm environment for auc

	roc<-roc(test[,"b2"], pred)

	

	aucs[,j] <- as.numeric(roc$auc)

		

	ci90[,j] <- as.numeric(ci.auc(roc,conf.level=ci[3]))[1]

	ci95_lo[,j] <- as.numeric(ci.auc(roc,conf.level=ci[2]))[1]

	ci95_up[,j] <- as.numeric(ci.auc(roc,conf.level=ci[2]))[3]

	ci99[,j] <- as.numeric(ci.auc(roc,conf.level=ci[1]))[1]



 	}

	# use colMeans here, as the "as.matrix()" transformation turns the initial row(non)vector into a columnvector.

	# This is necessary, as the *Means commands only apply to matrices, but x[i, ] is not a matrix

	N <- as.numeric(rowMeans(as.matrix(N[ ])))

	auc <- as.numeric(rowMeans(as.matrix(aucs[])))

	

	ci90<-as.numeric(rowMeans(as.matrix(ci90[])))

	ci95_lo<-as.numeric(rowMeans(as.matrix(ci95_lo[ ])))

	ci95_up<-as.numeric(rowMeans(as.matrix(ci95_up[])))

	ci99<-as.numeric(rowMeans(as.matrix(ci99[ ])))



	out[4,2] <- round(auc,2)





# confidence intervals

cis<-paste(round(ci95_lo,2), round(ci95_up,2), sep=",")

cis<-paste("[", cis, sep="")

cis<-paste(cis, "]", sep="")



out[4,3] <- cis

out[4,4]<-round(N,2)



out[4,5]<-boo3$n.trees

out[4,6]<-ncol(indep)



out[4,7]<-boo3$bag.fraction

out[4,8]<-boo3$shrinkage

out[4,9]<-floor(sum(full_om$b2)/8)

#AUC comparison
testobj <- roc.test(roc,r_3,method="delong",alternative="two.sided")
options("scipen"=10)
options()$scipen

sig_two[1,3]<-testobj$p.value[1]





out
sig_two



save.image("/Users/felixward/Dropbox/CrisisPrediction/DoFiles/CT_boost") 





###############################################################

#															TABLES													#

###############################################################

load("/Users/felixward/Dropbox/CrisisPrediction/DoFiles/CT_boost")



library(xtable)

model.list <- c("\\vtop{\\hbox{\\strut Long-run sample}\\hbox{\\strut yearly}}", "\\vtop{\\hbox{\\strut Post-1970 sample}\\hbox{\\strut yearly}}", "\\vtop{\\hbox{\\strut Post-1970 sample}\\hbox{\\strut quarterly}}")

for (i in 1:ncol(sig_two)){
	if(sig_two[1,i]<=0.05) {
		out[i+1,2] <- paste(out[i+1,2],"$^{\\mathsection}$",collapse="")
	}	
}


out[2:4,1]<-model.list

param.list <- c("", "AUC", "95\\%-CI", "N","B", "$ J $", "$\\eta$", "$\\nu$", "\\# of crises")

out[1,] <- param.list



tout<-t(out)

mat3<-xtable(tout, align="llccc", caption="Robustness Checks", label="tab:CT_rob") # for whatever reason need one column more than i actually want (added "l" to left)



print(mat3, type="latex", caption.placement="top", hline.after=c(), sanitize.text.function = function(x){x}, file="/Users/felixward/Dropbox/CrisisPrediction/Written/CT_boost.txt", replace=T, floating=F, booktabs=T, include.colnames=F, include.rownames=F, add.to.row=list(pos=list(0,1,4,9), 

command=c(" \\\\ \\cmidrule(){1-4} \\\\",

" \\\\ \\cdashline{1-4} \\\\",

" \\\\ \\cdashline{1-4} \\\\",

" \\\\ \\cmidrule(){1-4} \\\\")))