#####################################################################
#											RANDOM FORESTS 															#
#####################################################################
rm(list=ls())
library(psych)
library(graphics)
library(sandwich)
library(bbmle)
library(pROC)
library(randomForest)
library(reshape)

# Result containers. Every model specification writes into the row given by
# `row`: `out` collects the table cells, `sig_base` and `sig_many` collect the
# p-values of the ROC comparison tests.
out <- matrix(NA, nrow = 15, ncol = 12)
sig_base <- matrix(NA, nrow = 15, ncol = 2)
sig_many <- matrix(NA, nrow = 15, ncol = 1)

# Row labels of the output table (looked up as model.list[row]).
model.list <- c(
  "1-year horizon", "2-year horizon", "3-year horizon", "RR dummy",
  "1-year horizon", "2-year horizon", "3-year horizon",
  "Emerging markets", "Quality data",
  "1-year horizon", "2-year horizon", "3-year horizon", "Q4 only",
  "Emerging markets", "Quality data"
)

# Column headers of the output table (LaTeX-escaped).
out.list <- c(
  "\\textbf{Model}", "AUC", "95\\%-CI", "N [crises]",
  "",
  "AUC", "95\\%-CI", "N [crises]"
)

# Number of random train/test splits drawn for the logit benchmark.
runs <- 100

# Number of trees grown per random forest.
trees <- 5000

###############################################################
#									1. 1-year horizon LONG-RUN DATA								#
###############################################################

##############################################################################################################

# Settings for the 1-year-horizon specification.

# Predictors entering the logit benchmark.
var.logit <- c(
  "loans1_y_gap", "pdebt_gap", "narrowm_y_gap", "rltrate",
  "gr_rgdp", "gr_cpi", "er_gap"
)

# Predictors of the restricted ("selection") random forest: the logit
# predictors followed by three further series.
var.list <- c(var.logit, "loans1_y", "pdebt", "ltrate")

# Interaction terms that enter the logit model only.
ia.logit <- c("ia_pub", "ia_prb", "ia_jb", "ia_lygr", "ia_pygr", "ia_lyer")

# Columns excluded from the feature set of the full random forest.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels: ci[1] = 99%, ci[2] = 95%, ci[3] = 90%.
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the output tables filled by this section.
row <- 1
##############################################################################################################



# Load the long-run panel (the commented-out call reads the same file from
# another machine).
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv",
  header = TRUE, sep = ",", dec = "."
)

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Column-name groups, matched by substring on the remaining column names.
# NOTE(review): `c` and `i` shadow common base names; calls to c() still
# resolve to the function because R skips non-function bindings in calls.
assets  <- grep("assets",  names(Daten), value = TRUE)
stocks  <- grep("stocks",  names(Daten), value = TRUE)
narrowm <- grep("narrowm", names(Daten), value = TRUE)
money   <- grep("money",   names(Daten), value = TRUE)
ltrate  <- grep("ltrate",  names(Daten), value = TRUE)
stir    <- grep("stir",    names(Daten), value = TRUE)
loans   <- grep("loans",   names(Daten), value = TRUE)
debt    <- grep("debt",    names(Daten), value = TRUE)
er      <- grep("er",      names(Daten), value = TRUE)
cpi     <- grep("cpi",     names(Daten), value = TRUE)
gap     <- grep("gap",     names(Daten), value = TRUE)
glo     <- grep("a_",      names(Daten), value = TRUE)
gdp     <- grep("gdp",     names(Daten), value = TRUE)
i       <- grep("i_",      names(Daten), value = TRUE)
c       <- grep("c_",      names(Daten), value = TRUE)
ri      <- grep("ri",      names(Daten), value = TRUE)
rc      <- grep("rc",      names(Daten), value = TRUE)

# Full feature set: drop identifiers and the listed groups, then put the
# "a_" columns (removed by the mask as well) back in front.
drops <- names(Daten) %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: complete cases only; the bare expression prints a diagnostic.
full_om <- na.omit(full)
sum(full_om$b2) / 2

# SELECTION SET: outcome b1 plus the var.list predictors, complete cases only.
sel.list <- c("b1", var.list)
location <- names(full) %in% sel.list
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b1) / 1


# DATA for logit model
## Interaction terms for the logit model: products of existing columns,
## stored both as free-standing vectors and as new columns of Daten.
ia_pub <- Daten$pdebt_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans1_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans1_y_gap * Daten$ltrate * Daten$pdebt_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans1_y * Daten$gr_rgdp
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt * Daten$gr_rgdp
Daten$ia_pygr <- ia_pygr

ia_lyer <- Daten$loans1_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## Country factor (enters the logit formula as "country.factor").
Daten$country.factor <- as.factor(Daten$ccode)

# Throw out variables not used by the logit model.
# FIX: index with `drops.logit`. The previous code indexed with `drops`, a mask
# built before the seven columns above were appended; being shorter than
# ncol(Daten) it was recycled onto the new columns, so which interaction terms
# survived depended on column order rather than on intent.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]



#														ANALYSIS   												#

# LOGIT benchmark: `runs` random train/test splits, out-of-sample AUC each.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Model formula: b1 on every logit predictor, interaction term and the country
# factor that is actually present in full.logit.
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location])
indep <- paste(name, collapse = "+")
dep <- paste("b1~")
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {

  # Training / test sample: 63.2% of the rows are drawn without replacement.
  # The size is taken from full.logit itself (previously nrow(full)), matching
  # the representative split further down.
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom reported by
  # glm, presumably kept as a sample-size proxy -- confirm.
  N[1, j] <- logit$df.null

  # Out-of-sample prediction and realised outcome
  pred <- predict(logit, newdata = test, type = "response")

  location <- names(test) %in% c("b1")
  name <- names(test[location])
  true <- test[, name]

  # ROC analysis; the 95% interval is computed once and reused.
  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  auc_ci <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- auc_ci[1]
  ci95_up[1, j] <- auc_ci[3]

}

# Average over the splits (as.matrix turns the row into a column for colMeans).
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one fixed split (seed 9). According to the
# original note, this split reproduces the average AUC of the repeated splits.

# Training / test sample
set.seed(9)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Regression on the training rows
logit <- glm(fmla, family = "binomial", data = train)

# Out-of-sample prediction and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[["b1"]]

library(pROC)
# ROC curve of the benchmark; kept for the comparison tests below and printed.
r_h1_log <- roc(true, pred, ci = FALSE)
r_h1_log



## RF-selection: random forest restricted to the var.list predictors
library(randomForest)

location <- names(sel_om) %in% c(var.list) # columns that are features
name.indep <- names(sel_om[location])
location <- names(sel_om) %in% c("b1") # column that is the outcome
name.dep <- names(sel_om[location])
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # levels FALSE / TRUE

# Grow trees
set.seed(1)
rf_selection <- randomForest(indep, y = dep,
  data = sel_om,
  ntree = trees,
  replace = TRUE,              # bootstrap rows with replacement
  mtry = sqrt(ncol(indep)),    # features tried per split: sqrt of feature count
  cutoff = c(1/2, 1/2),        # majority vote between the two classes
  sampsize = nrow(sel_om),     # each tree draws as many rows as the data has
  nodesize = 1                 # grow trees fully
)
rf_selection

# Convergence diagnostic: error rate against number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Out-of-sample analysis
library(pROC)

# No newdata is passed; second column = probability of the TRUE class.
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
auc_ci <- as.numeric(ci.auc(r, conf.level = 0.95)) # computed once, used twice

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(auc_ci[1], 2)
out[row, 4] <- round(auc_ci[3], 2)
out[row, 5] <- nrow(sel_om)
# FIX: count crises in sel_om, the sample this model and the N above use
# (previously counted in full_om, a different complete-case sample).
out[row, 6] <- floor(sum(sel_om$b1))

# Compare ROCs: restricted forest against the logit benchmark (one-sided)
r_h1_sel <- r
testobj <- roc.test(r_h1_sel, r_h1_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set
library(randomForest)

location <- names(full_om) %in% c(misc.list) # columns that must not be features
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
dep <- factor(full_om[, "b1"] > 0) # levels FALSE / TRUE

# Grow trees
set.seed(1)
rf_full <- randomForest(indep, y = dep,
  data = full_om,
  ntree = trees,
  replace = TRUE,              # bootstrap rows with replacement
  mtry = sqrt(ncol(indep)),    # features tried per split: sqrt of feature count
  cutoff = c(1/2, 1/2),        # majority vote between the two classes
  sampsize = nrow(full_om),    # each tree draws as many rows as the data has
  nodesize = 1                 # grow trees fully
)
rf_full

# Convergence diagnostic: error rate against number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Out-of-sample analysis
library(pROC)

# No newdata is passed; second column = probability of the TRUE class.
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b1"]

# NOTE(review): the object `roc` shadows pROC::roc by name; later roc(...)
# calls still work because R skips non-function bindings when calling.
roc <- roc(true, pred, ci = TRUE)
r_h1_full <- roc

N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Interval bounds; the 95% interval is computed once and reused. The former
# as.numeric(as.matrix(x[])) lines were no-ops on scalars and are gone.
ci95 <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci95_lo <- ci95[1]
ci95_up <- ci95[3]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

out[row, 8] <- round(auc, 2)

# Confidence interval formatted as "[lo,up]"
cis <- paste0("[", paste(round(ci95_lo, 2), round(ci95_up, 2), sep = ","), "]")

out[row, 9] <- round(ci95_lo, 2)
out[row, 10] <- round(ci95_up, 2)
out[row, 11] <- round(N, 2)
# FIX: this section models b1, so crises are counted from b1 (previously
# sum(b2)/2, carried over from the 2-year-horizon section).
out[row, 12] <- floor(sum(full_om$b1))

# AUC comparison: full forest vs. logit (two-sided) ...
testobj <- roc.test(r_h1_full, r_h1_log, method = "delong", alternative = "two.sided")
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]

# ... and full forest vs. restricted forest (one-sided)
testobj <- roc.test(r_h1_full, r_h1_sel, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]









###############################################################
#									1. 2-year horizon LONG-RUN DATA								#
###############################################################

##############################################################################################################

# Settings for the 2-year-horizon specification.

# Predictors entering the logit benchmark.
var.logit <- c(
  "loans1_y_gap", "pdebt_gap", "narrowm_y_gap", "rltrate",
  "gr_rgdp", "gr_cpi", "er_gap"
)

# Predictors of the restricted ("selection") random forest: the logit
# predictors followed by three further series.
var.list <- c(var.logit, "loans1_y", "pdebt", "ltrate")

# Interaction terms that enter the logit model only.
ia.logit <- c("ia_pub", "ia_prb", "ia_jb", "ia_lygr", "ia_pygr", "ia_lyer")

# Columns excluded from the feature set of the full random forest.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels: ci[1] = 99%, ci[2] = 95%, ci[3] = 90%.
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the output tables filled by this section.
row <- 2
##############################################################################################################



# Load the long-run panel (the commented-out call reads the same file from
# another machine).
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv",
  header = TRUE, sep = ",", dec = "."
)

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Column-name groups, matched by substring on the remaining column names.
# NOTE(review): `c` and `i` shadow common base names; calls to c() still
# resolve to the function because R skips non-function bindings in calls.
assets  <- grep("assets",  names(Daten), value = TRUE)
stocks  <- grep("stocks",  names(Daten), value = TRUE)
narrowm <- grep("narrowm", names(Daten), value = TRUE)
money   <- grep("money",   names(Daten), value = TRUE)
ltrate  <- grep("ltrate",  names(Daten), value = TRUE)
stir    <- grep("stir",    names(Daten), value = TRUE)
loans   <- grep("loans",   names(Daten), value = TRUE)
debt    <- grep("debt",    names(Daten), value = TRUE)
er      <- grep("er",      names(Daten), value = TRUE)
cpi     <- grep("cpi",     names(Daten), value = TRUE)
gap     <- grep("gap",     names(Daten), value = TRUE)
glo     <- grep("a_",      names(Daten), value = TRUE)
gdp     <- grep("gdp",     names(Daten), value = TRUE)
i       <- grep("i_",      names(Daten), value = TRUE)
c       <- grep("c_",      names(Daten), value = TRUE)
ri      <- grep("ri",      names(Daten), value = TRUE)
rc      <- grep("rc",      names(Daten), value = TRUE)

# Full feature set: drop identifiers and the listed groups, then put the
# "a_" columns (removed by the mask as well) back in front.
drops <- names(Daten) %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: complete cases only; the bare expression prints a diagnostic.
full_om <- na.omit(full)
sum(full_om$b2) / 2

# SELECTION SET: outcome b2 plus the var.list predictors, complete cases only.
sel.list <- c("b2", var.list)
location <- names(full) %in% sel.list
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2


# DATA for logit model
## Interaction terms for the logit model: products of existing columns,
## stored both as free-standing vectors and as new columns of Daten.
ia_pub <- Daten$pdebt_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans1_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans1_y_gap * Daten$ltrate * Daten$pdebt_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans1_y * Daten$gr_rgdp
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt * Daten$gr_rgdp
Daten$ia_pygr <- ia_pygr

ia_lyer <- Daten$loans1_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## Country factor (enters the logit formula as "country.factor").
Daten$country.factor <- as.factor(Daten$ccode)

# Throw out variables not used by the logit model.
# FIX: index with `drops.logit`. The previous code indexed with `drops`, a mask
# built before the seven columns above were appended; being shorter than
# ncol(Daten) it was recycled onto the new columns, so which interaction terms
# survived depended on column order rather than on intent.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]



#														ANALYSIS   												#

# LOGIT benchmark: `runs` random train/test splits, out-of-sample AUC each.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Model formula: b2 on every logit predictor, interaction term and the country
# factor that is actually present in full.logit.
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location])
indep <- paste(name, collapse = "+")
dep <- paste("b2~")
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {

  # Training / test sample: 63.2% of the rows are drawn without replacement.
  # The size is taken from full.logit itself (previously nrow(full)), matching
  # the representative split further down.
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom reported by
  # glm, presumably kept as a sample-size proxy -- confirm.
  N[1, j] <- logit$df.null

  # Out-of-sample prediction and realised outcome
  pred <- predict(logit, newdata = test, type = "response")

  location <- names(test) %in% c("b2")
  name <- names(test[location])
  true <- test[, name]

  # ROC analysis; the 95% interval is computed once and reused.
  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  auc_ci <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- auc_ci[1]
  ci95_up[1, j] <- auc_ci[3]

}

# Average over the splits (as.matrix turns the row into a column for colMeans).
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one fixed split (seed 2). According to the
# original note, this split reproduces the average AUC of the repeated splits.

# Training / test sample
set.seed(2)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Regression on the training rows
logit <- glm(fmla, family = "binomial", data = train)

# Out-of-sample prediction and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[["b2"]]

library(pROC)
# ROC curve of the benchmark; kept for the comparison tests below and printed.
r_h2_log <- roc(true, pred, ci = FALSE)
r_h2_log



## RF-selection: random forest restricted to the var.list predictors
library(randomForest)

location <- names(sel_om) %in% c(var.list) # columns that are features
name.indep <- names(sel_om[location])
location <- names(sel_om) %in% c("b2") # column that is the outcome
name.dep <- names(sel_om[location])
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # levels FALSE / TRUE

# Grow trees
set.seed(1)
rf_selection <- randomForest(indep, y = dep,
  data = sel_om,
  ntree = trees,
  replace = TRUE,              # bootstrap rows with replacement
  mtry = sqrt(ncol(indep)),    # features tried per split: sqrt of feature count
  cutoff = c(1/2, 1/2),        # majority vote between the two classes
  sampsize = nrow(sel_om),     # each tree draws as many rows as the data has
  nodesize = 1                 # grow trees fully
)
rf_selection

# Convergence diagnostic: error rate against number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Out-of-sample analysis
library(pROC)

# No newdata is passed; second column = probability of the TRUE class.
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
auc_ci <- as.numeric(ci.auc(r, conf.level = 0.95)) # computed once, used twice

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(auc_ci[1], 2)
out[row, 4] <- round(auc_ci[3], 2)
out[row, 5] <- nrow(sel_om)
# FIX: count crises in sel_om, the sample this model and the N above use
# (previously counted in full_om, a different complete-case sample).
out[row, 6] <- floor(sum(sel_om$b2) / 2)

# Compare ROCs: restricted forest against the logit benchmark (one-sided)
r_h2_sel <- r
testobj <- roc.test(r_h2_sel, r_h2_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set
library(randomForest)

location <- names(full_om) %in% c(misc.list) # columns that must not be features
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # levels FALSE / TRUE

# Grow trees
set.seed(1)
rf_full <- randomForest(indep, y = dep,
  data = full_om,
  ntree = trees,
  replace = TRUE,              # bootstrap rows with replacement
  mtry = sqrt(ncol(indep)),    # features tried per split: sqrt of feature count
  cutoff = c(1/2, 1/2),        # majority vote between the two classes
  sampsize = nrow(full_om),    # each tree draws as many rows as the data has
  nodesize = 1                 # grow trees fully
)
rf_full

# Convergence diagnostic: error rate against number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Out-of-sample analysis
library(pROC)

# No newdata is passed; second column = probability of the TRUE class.
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]

# NOTE(review): the object `roc` shadows pROC::roc by name; later roc(...)
# calls still work because R skips non-function bindings when calling.
roc <- roc(true, pred, ci = TRUE)
r_h2_full <- roc

N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Interval bounds; the 95% interval is computed once and reused.
ci95 <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci95_lo <- ci95[1]
ci95_up <- ci95[3]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

out[row, 8] <- round(auc, 2)

# Confidence interval formatted as "[lo,up]"
cis <- paste0("[", paste(round(ci95_lo, 2), round(ci95_up, 2), sep = ","), "]")

out[row, 9] <- round(ci95_lo, 2)
out[row, 10] <- round(ci95_up, 2)
out[row, 11] <- round(N, 2)
out[row, 12] <- floor(sum(full_om$b2) / 2)

# AUC comparison: full forest vs. logit (two-sided) ...
testobj <- roc.test(r_h2_full, r_h2_log, method = "delong", alternative = "two.sided")
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]

# ... and full forest vs. restricted forest (one-sided)
testobj <- roc.test(r_h2_full, r_h2_sel, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]







###############################################################
#												3-year horizon LONG-RUN DATA													#
###############################################################
##############################################################################################################

# Settings for the 3-year-horizon specification.

# Predictors entering the logit benchmark.
var.logit <- c(
  "loans1_y_gap", "pdebt_gap", "narrowm_y_gap", "rltrate",
  "gr_rgdp", "gr_cpi", "er_gap"
)

# Predictors of the restricted ("selection") random forest: the logit
# predictors followed by three further series.
var.list <- c(var.logit, "loans1_y", "pdebt", "ltrate")

# Interaction terms that enter the logit model only.
ia.logit <- c("ia_pub", "ia_prb", "ia_jb", "ia_lygr", "ia_pygr", "ia_lyer")

# Columns excluded from the feature set of the full random forest.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels: ci[1] = 99%, ci[2] = 95%, ci[3] = 90%.
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the output tables filled by this section.
row <- 3
##############################################################################################################



# Load the long-run panel (the commented-out call reads the same file from
# another machine).
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv",
  header = TRUE, sep = ",", dec = "."
)

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Column-name groups, matched by substring on the remaining column names.
# NOTE(review): `c` and `i` shadow common base names; calls to c() still
# resolve to the function because R skips non-function bindings in calls.
assets  <- grep("assets",  names(Daten), value = TRUE)
stocks  <- grep("stocks",  names(Daten), value = TRUE)
narrowm <- grep("narrowm", names(Daten), value = TRUE)
money   <- grep("money",   names(Daten), value = TRUE)
ltrate  <- grep("ltrate",  names(Daten), value = TRUE)
stir    <- grep("stir",    names(Daten), value = TRUE)
loans   <- grep("loans",   names(Daten), value = TRUE)
debt    <- grep("debt",    names(Daten), value = TRUE)
er      <- grep("er",      names(Daten), value = TRUE)
cpi     <- grep("cpi",     names(Daten), value = TRUE)
gap     <- grep("gap",     names(Daten), value = TRUE)
glo     <- grep("a_",      names(Daten), value = TRUE)
gdp     <- grep("gdp",     names(Daten), value = TRUE)
i       <- grep("i_",      names(Daten), value = TRUE)
c       <- grep("c_",      names(Daten), value = TRUE)
ri      <- grep("ri",      names(Daten), value = TRUE)
rc      <- grep("rc",      names(Daten), value = TRUE)

# Full feature set: drop identifiers and the listed groups, then put the
# "a_" columns (removed by the mask as well) back in front.
drops <- names(Daten) %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: complete cases only; the bare expression prints a diagnostic.
full_om <- na.omit(full)
sum(full_om$b3) / 3

# SELECTION SET: outcome b3 plus the var.list predictors, complete cases only.
sel.list <- c("b3", var.list)
location <- names(full) %in% sel.list
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b3) / 3


# DATA for logit model
## Interaction terms for the logit model: products of existing columns,
## stored both as free-standing vectors and as new columns of Daten.
ia_pub <- Daten$pdebt_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans1_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans1_y_gap * Daten$ltrate * Daten$pdebt_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans1_y * Daten$gr_rgdp
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt * Daten$gr_rgdp
Daten$ia_pygr <- ia_pygr

ia_lyer <- Daten$loans1_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## Country factor (enters the logit formula as "country.factor").
Daten$country.factor <- as.factor(Daten$ccode)

# Throw out variables not used by the logit model.
# FIX: index with `drops.logit`. The previous code indexed with `drops`, a mask
# built before the seven columns above were appended; being shorter than
# ncol(Daten) it was recycled onto the new columns, so which interaction terms
# survived depended on column order rather than on intent.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]



#														ANALYSIS   												#

# LOGIT benchmark: `runs` random train/test splits, out-of-sample AUC each.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Model formula: b3 on every logit predictor, interaction term and the country
# factor that is actually present in full.logit.
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location])
indep <- paste(name, collapse = "+")
dep <- paste("b3~")
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {

  # Training / test sample: 63.2% of the rows are drawn without replacement.
  # The size is taken from full.logit itself (previously nrow(full)), matching
  # the representative split further down.
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom reported by
  # glm, presumably kept as a sample-size proxy -- confirm.
  N[1, j] <- logit$df.null

  # Out-of-sample prediction and realised outcome
  pred <- predict(logit, newdata = test, type = "response")

  location <- names(test) %in% c("b3")
  name <- names(test[location])
  true <- test[, name]

  # ROC analysis; the 95% interval is computed once and reused.
  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  auc_ci <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- auc_ci[1]
  ci95_up[1, j] <- auc_ci[3]

}

# Average over the splits (as.matrix turns the row into a column for colMeans).
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one fixed split (seed 3). According to the
# original note, this split reproduces the average AUC of the repeated splits.

# Training / test sample
set.seed(3)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Regression on the training rows
logit <- glm(fmla, family = "binomial", data = train)

# Out-of-sample prediction and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[["b3"]]

library(pROC)
# ROC curve of the benchmark; kept for the comparison tests below and printed.
r_h3_log <- roc(true, pred, ci = FALSE)
r_h3_log



## RF-selection: random forest restricted to the var.list predictors
library(randomForest)

location <- names(sel_om) %in% c(var.list) # columns that are features
name.indep <- names(sel_om[location])
location <- names(sel_om) %in% c("b3") # column that is the outcome
name.dep <- names(sel_om[location])
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # levels FALSE / TRUE

# Grow trees
set.seed(1)
rf_selection <- randomForest(indep, y = dep,
  data = sel_om,
  ntree = trees,
  replace = TRUE,              # bootstrap rows with replacement
  mtry = sqrt(ncol(indep)),    # features tried per split: sqrt of feature count
  cutoff = c(1/2, 1/2),        # majority vote between the two classes
  sampsize = nrow(sel_om),     # each tree draws as many rows as the data has
  nodesize = 1                 # grow trees fully
)
rf_selection

# Convergence diagnostic: error rate against number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Out-of-sample analysis
library(pROC)

# No newdata is passed; second column = probability of the TRUE class.
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
auc_ci <- as.numeric(ci.auc(r, conf.level = 0.95)) # computed once, used twice

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(auc_ci[1], 2)
out[row, 4] <- round(auc_ci[3], 2)
out[row, 5] <- nrow(sel_om)
# FIX: count crises in sel_om, the sample this model and the N above use
# (previously counted in full_om, a different complete-case sample).
out[row, 6] <- floor(sum(sel_om$b3) / 3)

# Compare ROCs: restricted forest against the logit benchmark (one-sided)
r_h3_sel <- r
testobj <- roc.test(r_h3_sel, r_h3_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set
library(randomForest)

location <- names(full_om) %in% c(misc.list) # columns that must not be features
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
dep <- factor(full_om[, "b3"] > 0) # levels FALSE / TRUE

# Grow trees
set.seed(1)
rf_full <- randomForest(indep, y = dep,
  data = full_om,
  ntree = trees,
  replace = TRUE,              # bootstrap rows with replacement
  mtry = sqrt(ncol(indep)),    # features tried per split: sqrt of feature count
  cutoff = c(1/2, 1/2),        # majority vote between the two classes
  sampsize = nrow(full_om),    # each tree draws as many rows as the data has
  nodesize = 1                 # grow trees fully
)
rf_full

# Convergence diagnostic: error rate against number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Out-of-sample analysis
library(pROC)

# No newdata is passed; second column = probability of the TRUE class.
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b3"]

# NOTE(review): the object `roc` shadows pROC::roc by name; later roc(...)
# calls still work because R skips non-function bindings when calling.
roc <- roc(true, pred, ci = TRUE)
r_h3_full <- roc

N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Interval bounds; the 95% interval is computed once and reused.
ci95 <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci95_lo <- ci95[1]
ci95_up <- ci95[3]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

out[row, 8] <- round(auc, 2)

# Confidence interval formatted as "[lo,up]"
cis <- paste0("[", paste(round(ci95_lo, 2), round(ci95_up, 2), sep = ","), "]")

out[row, 9] <- round(ci95_lo, 2)
out[row, 10] <- round(ci95_up, 2)
out[row, 11] <- round(N, 2)
out[row, 12] <- floor(sum(full_om$b3) / 3)

# AUC comparison: full forest vs. logit (two-sided) ...
testobj <- roc.test(r_h3_full, r_h3_log, method = "delong", alternative = "two.sided")
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]

# ... and full forest vs. restricted forest (one-sided)
testobj <- roc.test(r_h3_full, r_h3_sel, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]











###############################################################
#													REINHART & ROGOFF CRISIS DUMMY		#
###############################################################

##############################################################################################################

# Settings for the Reinhart & Rogoff crisis-dummy specification.

# Predictors entering the logit benchmark.
var.logit <- c(
  "loans1_y_gap", "pdebt_gap", "narrowm_y_gap", "rltrate",
  "gr_rgdp", "gr_cpi", "er_gap"
)

# Predictors of the restricted ("selection") random forest: the logit
# predictors followed by three further series.
var.list <- c(var.logit, "loans1_y", "pdebt", "ltrate")

# Interaction terms that enter the logit model only.
ia.logit <- c("ia_pub", "ia_prb", "ia_jb", "ia_lygr", "ia_pygr", "ia_lyer")

# Outcome-type column names; the earlier sections exclude these from the
# feature set of the full random forest.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels: ci[1] = 99%, ci[2] = 95%, ci[3] = 90%.
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the output tables filled by this section.
row <- 4
##############################################################################################################



# Load the Reinhart & Rogoff variant of the panel (the commented-out call
# reads the same file name from another folder).
#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class_RR.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_RR.csv",
  header = TRUE, sep = ",", dec = "."
)
# The column RRcrisis2 takes the name "b2", so the code below can address the
# outcome by the same name as in the 2-year-horizon section.
Daten <- rename(Daten, c(RRcrisis2 = "b2"))

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Column-name groups, matched by substring on the remaining column names.
# NOTE(review): `c` and `i` shadow common base names; calls to c() still
# resolve to the function because R skips non-function bindings in calls.
assets  <- grep("assets",  names(Daten), value = TRUE)
stocks  <- grep("stocks",  names(Daten), value = TRUE)
narrowm <- grep("narrowm", names(Daten), value = TRUE)
money   <- grep("money",   names(Daten), value = TRUE)
ltrate  <- grep("ltrate",  names(Daten), value = TRUE)
stir    <- grep("stir",    names(Daten), value = TRUE)
loans   <- grep("loans",   names(Daten), value = TRUE)
debt    <- grep("debt",    names(Daten), value = TRUE)
er      <- grep("er",      names(Daten), value = TRUE)
cpi     <- grep("cpi",     names(Daten), value = TRUE)
gap     <- grep("gap",     names(Daten), value = TRUE)
glo     <- grep("a_",      names(Daten), value = TRUE)
gdp     <- grep("gdp",     names(Daten), value = TRUE)
i       <- grep("i_",      names(Daten), value = TRUE)
c       <- grep("c_",      names(Daten), value = TRUE)
ri      <- grep("ri",      names(Daten), value = TRUE)
rc      <- grep("rc",      names(Daten), value = TRUE)

# Full feature set: drop identifiers and the listed groups, then put the
# "a_" columns (removed by the mask as well) back in front.
drops <- names(Daten) %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: complete cases only; the bare expression prints a diagnostic.
full_om <- na.omit(full)
sum(full_om$b2) / 2

# SELECTION SET: outcome b2 plus the var.list predictors, complete cases only.
sel.list <- c("b2", var.list)
location <- names(full) %in% sel.list
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2


# DATA for logit model
## interaction terms for the logit model: element-wise products of columns of
## Daten, kept both as free-standing vectors and as new columns of Daten
ia_pub <- Daten$pdebt_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans1_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans1_y_gap * Daten$ltrate * Daten$pdebt_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans1_y * Daten$gr_rgdp
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt * Daten$gr_rgdp
Daten$ia_pygr <- ia_pygr

ia_lyer <- Daten$loans1_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# throw out vars not used
# FIX: the mask applied here is drops.logit, which is computed on the current
# columns of Daten. The previous code applied `drops`, a mask built for the
# feature sets before the seven columns above were appended; being shorter than
# the column count it was silently recycled, so whether an appended column
# survived depended on the drop status of the first columns of Daten.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to remove
full.logit <- Daten[!drops.logit]


#														ANALYSIS   												#

# LOGIT
# Monte Carlo cross-validation of the logit benchmark: `runs` random splits,
# each fitted on 63.2% of the rows and evaluated on the remaining rows.
aucs <- matrix(nrow = 1, ncol = runs) # out-of-sample AUC per run
ci95_lo <- matrix(nrow = 1, ncol = runs) # lower bound of the AUC interval per run
ci95_up <- matrix(nrow = 1, ncol = runs) # upper bound of the AUC interval per run

N <- matrix(nrow = 1, ncol = runs) # null degrees of freedom of each fit


# get formula: b2 on every logit regressor that is present in full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor") # get location of vars
name <- names(full.logit[location]) # get names
indep <- paste(name, collapse = "+") # indep. variables
dep <- paste("b2~") # dep. variable
fmla <- as.formula(paste(dep, indep)) # get formula


for (j in seq_len(runs)) {

  # training, test sample; the size is now derived from full.logit, the frame
  # that is actually sampled (it was taken from `full`), matching the
  # representative model further down
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  N[1, j] <- logit$df.null

  # OOS-analysis
  pred <- predict(logit, newdata = test, type = "response") # predicted outcome

  location <- names(test) %in% c("b2")
  name <- names(test[location]) # get names
  true <- test[, name] # real outcome

  r <- roc(true, pred, ci = TRUE) # ROC analysis
  aucs[1, j] <- as.numeric(r$auc)

  # interval at level ci[2], computed once per run instead of once per bound
  ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci.bounds[1]
  ci95_up[1, j] <- ci.bounds[3]

}

# averages over all runs (each matrix is replaced by its mean)
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one hold-out split with a fixed seed
# (author's note: its AUC equals the MCCV average)

# training, test sample: 63.2% of the rows, drawn without replacement
set.seed(5)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# regression on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# OOS-analysis: predicted probability and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[, "b2"]

library(pROC)
# ci = F is left as written so that the call stored in the roc object is unchanged
r_RR_log <- roc(true, pred, ci = F)
r_RR_log



## RF-selection: random forest on the predictors listed in var.list
library(randomForest)

# predictors (columns of sel_om named in var.list) and outcome (b2 > 0 as a factor)
location <- names(sel_om) %in% c(var.list) # get location of independent var
name.indep <- names(sel_om[location]) # get names of features
location <- names(sel_om) %in% c("b2") # get location of dependent var
name.dep <- names(sel_om[location]) # get name of dep. var.
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0)

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_selection <- randomForest(indep,
  y = dep,
  data = sel_om,
  ntree = trees,
  replace = T, # bootstrapping (with replacement!)
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(sel_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grow trees (experiment to avoid overfitting (see Segal, 2004)); (also see Biau et al., 2012 on consistency)
)
rf_selection

# convergence diagnostic
palette("default")
plot(rf_selection, type = "l", main = "")

# OOS-analysis
library(pROC)

# no newdata is passed to predict(); second column = TRUE probability
pred <- predict(rf_selection, type = "prob")[, 2]

true <- sel_om[, name.dep]

r <- roc(true, pred, ci = T) # ROC analysis

# columns 1-6 of the results table describe the selection model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(ci.bounds[1], 2)
out[row, 4] <- round(ci.bounds[3], 2)
out[row, 5] <- nrow(sel_om)
# FIX: the crisis count is taken from sel_om, the sample this model was fitted
# on (it was taken from full_om, which belongs to columns 8-12)
out[row, 6] <- floor(sum(sel_om$b2) / 2)


# compare ROCs: one-sided DeLong test, forest against the representative logit
r_RR <- r
testobj <- roc.test(r_RR, r_RR_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 1] <- testobj$p.value[1]





## RANDOM FOREST on the full feature set
library(randomForest)

# features: every column of full_om that is not named in misc.list
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # outcome: b2 > 0 as a factor

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  data = full_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(full_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_full

# convergence diagnostic
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]
r <- roc(true, pred, ci = T)

# columns 8-12 of the results table describe the full model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(ci.bounds[1], 2)
out[row, 10] <- round(ci.bounds[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 2)

# compare ROCs: one-sided DeLong test, full forest against the representative logit
r_RR_full <- r
testobj <- roc.test(r_RR_full, r_RR_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 2] <- testobj$p.value[1]

# compare ROCs: full forest against the selection forest
testobj <- roc.test(r_RR_full, r_RR, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_many[row, 1] <- testobj$p.value[1]





###############################################################
#												1-year horizon post 1970 annual dataset			#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv",
  sep = ",", dec = ".", header = TRUE
)

# helper: names of all columns of Daten whose name contains `pattern`
# (the pattern is a regular expression and may match anywhere in the name)
cols.matching <- function(pattern) {
  grep(pattern, names(Daten), value = TRUE)
}

# remove every column whose name contains "fliab"
fliab <- cols.matching("fliab")
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# column-name groups; only some of them enter the drop list below
stocks <- cols.matching("stocks")
ltrate <- cols.matching("ltrate")
stir <- cols.matching("stir")
loans <- cols.matching("loans")
pdebt <- cols.matching("pdebt")
er <- cols.matching("er")
cpi <- cols.matching("cpi")
gap <- cols.matching("gap")
glo <- cols.matching("a_")

# FULL feature set: year, ccode and the stocks/ltrate/stir/a_ groups are removed;
# the a_ columns are then re-attached in front of what is left
# (author's note: the global variables are kept as they have few missings)
drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate, stir, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep complete rows only
full_om <- na.omit(full)
sum(full_om$b1) / 1

# SELECTION SET: outcome b1 plus a fixed list of predictors, complete rows only
sel.list <- c(
  "b1", "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap",
  "loans_y", "pdebt_y", "nx_y_gap"
)
location <- names(full) %in% sel.list # TRUE where a column of full is on the list
name.sel <- names(full)[location] # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b1) / 1



# DATA for logit model
## interaction terms for the logit model: element-wise products of columns of
## Daten, kept both as free-standing vectors and as new columns of Daten
ia_pub <- Daten$pdebt_y_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans_y_gap * Daten$ltrate * Daten$pdebt_y_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt_y * Daten$gdp_r_gr
Daten$ia_pygr <- ia_pygr

# NOTE(review): this term uses er_gap while the selection set of this section
# lists rer_gap -- confirm which column is intended
ia_lyer <- Daten$loans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# throw out vars not used
# FIX: the mask applied here is drops.logit, which is computed on the current
# columns of Daten. The previous code applied `drops`, a mask built for the
# feature sets before the seven columns above were appended; being shorter than
# the column count it was silently recycled, so whether an appended column
# survived depended on the drop status of the first columns of Daten.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to remove
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# predictors of the selection random forest
var.list <- c(
  "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi",
  "rer_gap", "loans_y", "pdebt_y", "nx_y_gap"
)

# main-effect regressors of the logit model
var.logit <- c(
  "loans_y_gap", "pdebt_y_gap", "gdp_r_gap",
  "gr_cpi", "rer_gap", "nx_y_gap"
)
# interaction-term regressors of the logit model
ia.logit <- c(
  "ia_pub", "ia_prb", "ia_jb",
  "ia_lygr", "ia_pygr", "ia_lyer"
)

# columns that are never used as features (miscellaneous non-independent)
misc.list <- c(
  "b2", "b1", "b3",
  "rec1", "rec2", "rec3"
)

# confidence levels; ci[2] is the one used for the logit AUC intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of the results table filled by this section
row <- 5
##############################################################################################################


# LOGIT
# Monte Carlo cross-validation of the logit benchmark: `runs` random splits,
# each fitted on 63.2% of the rows and evaluated on the remaining rows.
aucs <- matrix(nrow = 1, ncol = runs) # out-of-sample AUC per run
ci95_lo <- matrix(nrow = 1, ncol = runs) # lower bound of the AUC interval per run
ci95_up <- matrix(nrow = 1, ncol = runs) # upper bound of the AUC interval per run

N <- matrix(nrow = 1, ncol = runs) # null degrees of freedom of each fit


# get formula: b1 on every logit regressor that is present in full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor") # get location of vars
name <- names(full.logit[location]) # get names
indep <- paste(name, collapse = "+") # indep. variables
dep <- paste("b1~") # dep. variable
fmla <- as.formula(paste(dep, indep)) # get formula


for (j in seq_len(runs)) {

  # training, test sample; the size is now derived from full.logit, the frame
  # that is actually sampled (it was taken from `full`), matching the
  # representative model further down
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  N[1, j] <- logit$df.null

  # OOS-analysis
  pred <- predict(logit, newdata = test, type = "response") # predicted outcome

  location <- names(test) %in% c("b1")
  name <- names(test[location]) # get names
  true <- test[, name] # real outcome

  r <- roc(true, pred, ci = TRUE) # ROC analysis
  aucs[1, j] <- as.numeric(r$auc)

  # interval at level ci[2], computed once per run instead of once per bound
  ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci.bounds[1]
  ci95_up[1, j] <- ci.bounds[3]

}

# averages over all runs (each matrix is replaced by its mean)
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one hold-out split with a fixed seed
# (author's note: its AUC equals the MCCV average)

# training, test sample: 63.2% of the rows, drawn without replacement
set.seed(3)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# regression on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# OOS-analysis: predicted probability and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[, "b1"]

library(pROC)
# ci = F is left as written so that the call stored in the roc object is unchanged
r_h1a_log <- roc(true, pred, ci = F)
r_h1a_log




## RF-selection: random forest on the predictors listed in var.list
library(randomForest)

# predictors (columns of sel_om named in var.list) and outcome (b1 > 0 as a factor)
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b1"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0)

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  data = sel_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(sel_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_selection

# convergence diagnostic
palette("default")
plot(rf_selection, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]
r <- roc(true, pred, ci = T)

# columns 1-6 of the results table describe the selection model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(ci.bounds[1], 2)
out[row, 4] <- round(ci.bounds[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b1) / 1)

# compare ROCs: one-sided DeLong test, forest against the representative logit
r_h1a <- r
testobj <- roc.test(r_h1a, r_h1a_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 1] <- testobj$p.value[1]



## RANDOM FOREST on the full feature set
library(randomForest)

# features: every column of full_om that is not named in misc.list
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[, "b1"] > 0) # outcome: b1 > 0 as a factor

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  data = full_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(full_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_full

# convergence diagnostic
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b1"]
r <- roc(true, pred, ci = T)

# columns 8-12 of the results table describe the full model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(ci.bounds[1], 2)
out[row, 10] <- round(ci.bounds[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b1) / 1)

# compare ROCs: one-sided DeLong test, full forest against the representative logit
r_h1a_full <- r
testobj <- roc.test(r_h1a_full, r_h1a_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 2] <- testobj$p.value[1]

# compare ROCs: full forest against the selection forest
testobj <- roc.test(r_h1a_full, r_h1a, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_many[row, 1] <- testobj$p.value[1]









###############################################################
#												2-year horizon post 1970 annual dataset			#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv",
  sep = ",", dec = ".", header = TRUE
)

# helper: names of all columns of Daten whose name contains `pattern`
# (the pattern is a regular expression and may match anywhere in the name)
cols.matching <- function(pattern) {
  grep(pattern, names(Daten), value = TRUE)
}

# remove every column whose name contains "fliab"
fliab <- cols.matching("fliab")
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# column-name groups; only some of them enter the drop list below
stocks <- cols.matching("stocks")
ltrate <- cols.matching("ltrate")
stir <- cols.matching("stir")
loans <- cols.matching("loans")
pdebt <- cols.matching("pdebt")
er <- cols.matching("er")
cpi <- cols.matching("cpi")
gap <- cols.matching("gap")
glo <- cols.matching("a_")

# FULL feature set: year, ccode and the stocks/ltrate/stir/a_ groups are removed;
# the a_ columns are then re-attached in front of what is left
# (author's note: the global variables are kept as they have few missings)
drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate, stir, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep complete rows only
full_om <- na.omit(full)
sum(full_om$b2) / 2

# SELECTION SET: outcome b2 plus a fixed list of predictors, complete rows only
sel.list <- c(
  "b2", "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap",
  "loans_y", "pdebt_y", "nx_y_gap"
)
location <- names(full) %in% sel.list # TRUE where a column of full is on the list
name.sel <- names(full)[location] # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2



# DATA for logit model
## interaction terms for the logit model: element-wise products of columns of
## Daten, kept both as free-standing vectors and as new columns of Daten
ia_pub <- Daten$pdebt_y_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans_y_gap * Daten$ltrate * Daten$pdebt_y_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt_y * Daten$gdp_r_gr
Daten$ia_pygr <- ia_pygr

# NOTE(review): this term uses er_gap while the selection set of this section
# lists rer_gap -- confirm which column is intended
ia_lyer <- Daten$loans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# throw out vars not used
# FIX: the mask applied here is drops.logit, which is computed on the current
# columns of Daten. The previous code applied `drops`, a mask built for the
# feature sets before the seven columns above were appended; being shorter than
# the column count it was silently recycled, so whether an appended column
# survived depended on the drop status of the first columns of Daten.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to remove
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# predictors of the selection random forest
var.list <- c(
  "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi",
  "rer_gap", "loans_y", "pdebt_y", "nx_y_gap"
)

# main-effect regressors of the logit model
var.logit <- c(
  "loans_y_gap", "pdebt_y_gap", "gdp_r_gap",
  "gr_cpi", "rer_gap", "nx_y_gap"
)
# interaction-term regressors of the logit model
ia.logit <- c(
  "ia_pub", "ia_prb", "ia_jb",
  "ia_lygr", "ia_pygr", "ia_lyer"
)

# columns that are never used as features (miscellaneous non-independent)
misc.list <- c(
  "b2", "b1", "b3",
  "rec1", "rec2", "rec3"
)

# confidence levels; ci[2] is the one used for the logit AUC intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of the results table filled by this section
row <- 6
##############################################################################################################


# LOGIT
# Monte Carlo cross-validation of the logit benchmark: `runs` random splits,
# each fitted on 63.2% of the rows and evaluated on the remaining rows.
aucs <- matrix(nrow = 1, ncol = runs) # out-of-sample AUC per run
ci95_lo <- matrix(nrow = 1, ncol = runs) # lower bound of the AUC interval per run
ci95_up <- matrix(nrow = 1, ncol = runs) # upper bound of the AUC interval per run

N <- matrix(nrow = 1, ncol = runs) # null degrees of freedom of each fit


# get formula: b2 on every logit regressor that is present in full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor") # get location of vars
name <- names(full.logit[location]) # get names
indep <- paste(name, collapse = "+") # indep. variables
dep <- paste("b2~") # dep. variable
fmla <- as.formula(paste(dep, indep)) # get formula


for (j in seq_len(runs)) {

  # training, test sample; the size is now derived from full.logit, the frame
  # that is actually sampled (it was taken from `full`), matching the
  # representative model further down
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  N[1, j] <- logit$df.null

  # OOS-analysis
  pred <- predict(logit, newdata = test, type = "response") # predicted outcome

  location <- names(test) %in% c("b2")
  name <- names(test[location]) # get names
  true <- test[, name] # real outcome

  r <- roc(true, pred, ci = TRUE) # ROC analysis
  aucs[1, j] <- as.numeric(r$auc)

  # interval at level ci[2], computed once per run instead of once per bound
  ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci.bounds[1]
  ci95_up[1, j] <- ci.bounds[3]

}

# averages over all runs (each matrix is replaced by its mean)
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one hold-out split with a fixed seed
# (author's note: its AUC equals the MCCV average)

# training, test sample: 63.2% of the rows, drawn without replacement
set.seed(7)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# regression on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# OOS-analysis: predicted probability and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[, "b2"]

library(pROC)
# ci = F is left as written so that the call stored in the roc object is unchanged
r_h2a_log <- roc(true, pred, ci = F)
r_h2a_log




## RF-selection: random forest on the predictors listed in var.list
library(randomForest)

# predictors (columns of sel_om named in var.list) and outcome (b2 > 0 as a factor)
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b2"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0)

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  data = sel_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(sel_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_selection

# convergence diagnostic
palette("default")
plot(rf_selection, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]
r <- roc(true, pred, ci = T)

# columns 1-6 of the results table describe the selection model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(ci.bounds[1], 2)
out[row, 4] <- round(ci.bounds[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b2) / 2)

# compare ROCs: one-sided DeLong test, forest against the representative logit
r_h2a <- r
testobj <- roc.test(r_h2a, r_h2a_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 1] <- testobj$p.value[1]



## RANDOM FOREST on the full feature set
library(randomForest)

# features: every column of full_om that is not named in misc.list
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # outcome: b2 > 0 as a factor

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  data = full_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(full_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_full

# convergence diagnostic
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]
r <- roc(true, pred, ci = T)

# columns 8-12 of the results table describe the full model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(ci.bounds[1], 2)
out[row, 10] <- round(ci.bounds[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 2)

# compare ROCs: one-sided DeLong test, full forest against the representative logit
r_h2a_full <- r
testobj <- roc.test(r_h2a_full, r_h2a_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 2] <- testobj$p.value[1]

# compare ROCs: full forest against the selection forest
testobj <- roc.test(r_h2a_full, r_h2a, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_many[row, 1] <- testobj$p.value[1]











###############################################################
#												3-year horizon post 1970 annual dataset			#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv",
  sep = ",", dec = ".", header = TRUE
)

# helper: names of all columns of Daten whose name contains `pattern`
# (the pattern is a regular expression and may match anywhere in the name)
cols.matching <- function(pattern) {
  grep(pattern, names(Daten), value = TRUE)
}

# remove every column whose name contains "fliab"
fliab <- cols.matching("fliab")
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# column-name groups; only some of them enter the drop list below
stocks <- cols.matching("stocks")
ltrate <- cols.matching("ltrate")
stir <- cols.matching("stir")
loans <- cols.matching("loans")
pdebt <- cols.matching("pdebt")
er <- cols.matching("er")
cpi <- cols.matching("cpi")
gap <- cols.matching("gap")
glo <- cols.matching("a_")

# FULL feature set: year, ccode and the stocks/ltrate/stir/a_ groups are removed;
# the a_ columns are then re-attached in front of what is left
# (author's note: the global variables are kept as they have few missings)
drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate, stir, glo)
saves <- names(Daten) %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep complete rows only
full_om <- na.omit(full)
sum(full_om$b3) / 3

# SELECTION SET: outcome b3 plus a fixed list of predictors, complete rows only
sel.list <- c(
  "b3", "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap",
  "loans_y", "pdebt_y", "nx_y_gap"
)
location <- names(full) %in% sel.list # TRUE where a column of full is on the list
name.sel <- names(full)[location] # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b3) / 3



# DATA for logit model
## interaction terms for the logit model: element-wise products of columns of
## Daten, kept both as free-standing vectors and as new columns of Daten
ia_pub <- Daten$pdebt_y_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans_y_gap * Daten$ltrate * Daten$pdebt_y_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt_y * Daten$gdp_r_gr
Daten$ia_pygr <- ia_pygr

# NOTE(review): this term uses er_gap while the selection set of this section
# lists rer_gap -- confirm which column is intended
ia_lyer <- Daten$loans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# throw out vars not used
# FIX: the mask applied here is drops.logit, which is computed on the current
# columns of Daten. The previous code applied `drops`, a mask built for the
# feature sets before the seven columns above were appended; being shorter than
# the column count it was silently recycled, so whether an appended column
# survived depended on the drop status of the first columns of Daten.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to remove
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# predictors of the selection random forest
var.list <- c(
  "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi",
  "rer_gap", "loans_y", "pdebt_y", "nx_y_gap"
)

# main-effect regressors of the logit model
var.logit <- c(
  "loans_y_gap", "pdebt_y_gap", "gdp_r_gap",
  "gr_cpi", "rer_gap", "nx_y_gap"
)
# interaction-term regressors of the logit model
ia.logit <- c(
  "ia_pub", "ia_prb", "ia_jb",
  "ia_lygr", "ia_pygr", "ia_lyer"
)

# columns that are never used as features (miscellaneous non-independent)
misc.list <- c(
  "b2", "b1", "b3",
  "rec1", "rec2", "rec3"
)

# confidence levels; ci[2] is the one used for the logit AUC intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of the results table filled by this section
row <- 7
##############################################################################################################


# LOGIT
# Monte Carlo cross-validation of the logit benchmark: `runs` random splits,
# each fitted on 63.2% of the rows and evaluated on the remaining rows.
aucs <- matrix(nrow = 1, ncol = runs) # out-of-sample AUC per run
ci95_lo <- matrix(nrow = 1, ncol = runs) # lower bound of the AUC interval per run
ci95_up <- matrix(nrow = 1, ncol = runs) # upper bound of the AUC interval per run

N <- matrix(nrow = 1, ncol = runs) # null degrees of freedom of each fit


# get formula: b3 on every logit regressor that is present in full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor") # get location of vars
name <- names(full.logit[location]) # get names
indep <- paste(name, collapse = "+") # indep. variables
dep <- paste("b3~") # dep. variable
fmla <- as.formula(paste(dep, indep)) # get formula


for (j in seq_len(runs)) {

  # training, test sample; the size is now derived from full.logit, the frame
  # that is actually sampled (it was taken from `full`), matching the
  # representative model further down
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  N[1, j] <- logit$df.null

  # OOS-analysis
  pred <- predict(logit, newdata = test, type = "response") # predicted outcome

  location <- names(test) %in% c("b3")
  name <- names(test[location]) # get names
  true <- test[, name] # real outcome

  r <- roc(true, pred, ci = TRUE) # ROC analysis
  aucs[1, j] <- as.numeric(r$auc)

  # interval at level ci[2], computed once per run instead of once per bound
  ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci.bounds[1]
  ci95_up[1, j] <- ci.bounds[3]

}

# averages over all runs (each matrix is replaced by its mean)
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one hold-out split with a fixed seed
# (author's note: its AUC equals the MCCV average)

# training, test sample: 63.2% of the rows, drawn without replacement
set.seed(2)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# regression on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# OOS-analysis: predicted probability and realised outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[, "b3"]

library(pROC)
# ci = F is left as written so that the call stored in the roc object is unchanged
r_h3a_log <- roc(true, pred, ci = F)
r_h3a_log




## RF-selection: random forest on the predictors listed in var.list
library(randomForest)

# predictors (columns of sel_om named in var.list) and outcome (b3 > 0 as a factor)
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b3"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0)

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  data = sel_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(sel_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_selection

# convergence diagnostic
palette("default")
plot(rf_selection, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]
r <- roc(true, pred, ci = T)

# columns 1-6 of the results table describe the selection model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(ci.bounds[1], 2)
out[row, 4] <- round(ci.bounds[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b3) / 3)

# compare ROCs: one-sided DeLong test, forest against the representative logit
r_h3a <- r
testobj <- roc.test(r_h3a, r_h3a_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 1] <- testobj$p.value[1]



## RANDOM FOREST on the full feature set
library(randomForest)

# features: every column of full_om that is not named in misc.list
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[, "b3"] > 0) # outcome: b3 > 0 as a factor

# grow trees
# T inside the randomForest() and roc() calls is left as written so that the
# calls stored in the fitted objects are unchanged
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  data = full_om,
  ntree = trees,
  replace = T, # bootstrap draws with replacement
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the number of features
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(full_om), # bootstrap sample as large as the data (half-samples are cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grown trees (see Segal, 2004, on overfitting; Biau et al., 2012, on consistency)
)
rf_full

# convergence diagnostic
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis: no newdata is passed to predict(); column 2 = TRUE probability
library(pROC)
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b3"]
r <- roc(true, pred, ci = T)

# columns 8-12 of the results table describe the full model
ci.bounds <- as.numeric(ci.auc(r, conf.level = 0.95))
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(ci.bounds[1], 2)
out[row, 10] <- round(ci.bounds[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b3) / 3)

# compare ROCs: one-sided DeLong test, full forest against the representative logit
r_h3a_full <- r
testobj <- roc.test(r_h3a_full, r_h3a_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_base[row, 2] <- testobj$p.value[1]

# compare ROCs: full forest against the selection forest
testobj <- roc.test(r_h3a_full, r_h3a, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")
sig_many[row, 1] <- testobj$p.value[1]










###############################################################
#                 DEVELOPING ECONOMIES ONLY                   #
###############################################################

# Load the post-1970 annual data set.
# NOTE(review): absolute, machine-specific path.
#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y_AEDE.csv", sep = ",", dec = ".", header = TRUE)

# Keep only the rows flagged DE == 1
Daten <- Daten[which(Daten$DE == 1), ]

# Remove every column whose name contains "fliab"
fliab <- grep("fliab", names(Daten), value = TRUE)
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# Column-name groups, matched by substring
AE <- grep("AE", names(Daten), value = TRUE)
DE <- grep("DE", names(Daten), value = TRUE)
big <- grep("big", names(Daten), value = TRUE)
QU <- grep("QU", names(Daten), value = TRUE)
ME <- grep("ME", names(Daten), value = TRUE)

stocks <- grep("stocks", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
loans <- grep("loans", names(Daten), value = TRUE) # not used below in this section
pdebt <- grep("pdebt", names(Daten), value = TRUE) # not used below in this section
er <- grep("er", names(Daten), value = TRUE) # not used below in this section
cpi <- grep("cpi", names(Daten), value = TRUE) # not used below in this section
gap <- grep("gap", names(Daten), value = TRUE) # not used below in this section
glo <- grep("a_", names(Daten), value = TRUE)

# Mask of columns excluded from the random-forest feature set.
drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate, stir, glo, AE, DE, big, QU, ME)
saves <- names(Daten) %in% glo # not used below in this section
full <- Daten[!drops]
# The "a_" (global) columns are removed by the mask and then put back in front.
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2) / 2

# SELECTION SET: outcome b2 plus the hand-picked predictors
sel.list <- c("b2", "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap", "loans_y", "pdebt_y", "nx_y_gap")
location <- names(full) %in% sel.list
name.sel <- names(full[location])
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2



# DATA for logit model
## interaction terms for logit model (kept both as globals and as columns)
ia_pub <- Daten$pdebt_y_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans_y_gap * Daten$ltrate * Daten$pdebt_y_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt_y * Daten$gdp_r_gr
Daten$ia_pygr <- ia_pygr

ia_lyer <- Daten$loans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# Logit data set: drop only "year".
# FIX: the original indexed with `drops`, the random-forest mask built before
# the interaction/country columns were added. That mask is shorter than
# ncol(Daten) (so it was recycled over the new columns) and it also removes
# columns the logit formula is meant to pick up. `drops.logit` is the mask
# defined for this purpose.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]

###############################################################
#                          ANALYSIS                           #
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# variables
var.list <- c("loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap", "loans_y", "pdebt_y", "nx_y_gap")

# variables (logit)
var.logit <- c("loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap", "nx_y_gap")
# interaction terms (logit)
ia.logit <- c("ia_pub", "ia_prb", "ia_jb", "ia_lygr", "ia_pygr", "ia_lyer")

# miscellaneous non-independent
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# confidence levels
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of table
row <- 8
##############################################################################################################


# LOGIT: Monte Carlo cross-validation over `runs` random train/test splits
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)


# Build the formula from whichever of the requested regressors exist in
# full.logit; names that are absent are silently left out.
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location])
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b2~") # left-hand side
fmla <- as.formula(paste(dep, indep))


for (j in seq_len(runs)) {

	# training, test sample: 63.2% of the rows, drawn without replacement.
	# The size is taken from full.logit, the data actually being split.
	set.seed(j)
	indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
	test <- full.logit[-indexes, ]
	train <- full.logit[indexes, ]

	# Regression
	logit <- glm(fmla, data = train, family = "binomial")
	N[1, j] <- logit$df.null

	# Prediction on the held-out rows
	pred <- predict(logit, newdata = test, type = "response")

	location <- names(test) %in% c("b2")
	name <- names(test[location])
	true <- test[, name] # observed outcome

	r <- roc(true, pred, ci = TRUE)
	aucs[1, j] <- as.numeric(r$auc)

	# Compute the interval once; elements are lower bound, AUC, upper bound.
	ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
	ci95_lo[1, j] <- ci.bounds[1]
	ci95_up[1, j] <- ci.bounds[3]

}

# Averages over all runs. They are stored in N, auc, ci95_lo and ci95_up;
# nothing is written to `out` here.
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one fixed split (seed 4) which, per the
# original note, gives an AUC equal to the MCCV average.

# Fixed 63.2% / 36.8% split without replacement
set.seed(4)
indexes <- sample(1:nrow(full.logit), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Fit on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# Predicted probabilities and observed outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")

true <- test[["b2"]]

library(pROC)
r_DE_log <- roc(true, pred, ci = FALSE) # benchmark ROC used by the tests below
r_DE_log




## RF-selection: forest on the hand-picked predictors in var.list, outcome b2
library(randomForest)

# Split sel_om into feature columns and the outcome column.
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b2"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # class label: b2 > 0

# Grow the forest with a fixed seed so the run is repeatable.
set.seed(1)
rf_selection <- randomForest(
  x = indep,
  y = dep,
  data = sel_om, # NOTE(review): looks unused by the x/y interface - confirm
  ntree = trees,
  replace = TRUE, # draw each tree's sample with replacement
  mtry = sqrt(ncol(indep)), # square root of the number of features
  cutoff = c(1/2, 1/2), # equal cutoffs for both classes (majority vote)
  sampsize = nrow(sel_om), # per-tree sample as large as the data set
  nodesize = 1 # terminal nodes may hold a single observation
)
rf_selection

# Convergence diagnostic: error rates against number of trees.
palette("default")
plot(rf_selection, type = "l", main = "")

# ROC analysis of the forest's class probabilities.
library(pROC)

# predict() is called without new data; the second column is the
# probability of class TRUE.
pred <- predict(rf_selection, type = "prob")[, 2]

true <- sel_om[[name.dep]]

r <- roc(true, pred, ci = TRUE)
ci.bounds <- round(as.numeric(ci.auc(r, conf.level = 0.95)), 2) # lower, AUC, upper

# Left half of the results table: label, AUC, CI bounds, N, crisis count.
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- ci.bounds[1]
out[row, 4] <- ci.bounds[3]
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b2) / 2)


# One-sided DeLong test: selection forest vs. the logit benchmark.
r_DE <- r
testobj <- roc.test(r_DE, r_DE_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST: forest on all remaining predictors, outcome b2
library(randomForest)

# Every column of full_om that is not named in misc.list is used as a feature.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[["b2"]] > 0) # class label: b2 > 0

# Grow the forest with a fixed seed so the run is repeatable.
set.seed(1)
rf_full <- randomForest(
  x = indep,
  y = dep,
  data = full_om, # NOTE(review): looks unused by the x/y interface - confirm
  ntree = trees,
  replace = TRUE, # draw each tree's sample with replacement
  mtry = sqrt(ncol(indep)), # square root of the number of features
  cutoff = c(1/2, 1/2), # equal cutoffs for both classes (majority vote)
  sampsize = nrow(full_om), # per-tree sample as large as the data set
  nodesize = 1 # terminal nodes may hold a single observation
)
rf_full


# Convergence diagnostic: error rates against number of trees.
palette("default")
plot(rf_full, type = "l", main = "")

# ROC analysis of the forest's class probabilities.
library(pROC)

# predict() is called without new data; the second column is the
# probability of class TRUE.
pred <- predict(rf_full, type = "prob")[, 2]

true <- full_om[["b2"]]

r <- roc(true, pred, ci = TRUE)
ci.bounds <- round(as.numeric(ci.auc(r, conf.level = 0.95)), 2) # lower, AUC, upper

# Right half of the results table: AUC, CI bounds, N, crisis count.
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- ci.bounds[1]
out[row, 10] <- ci.bounds[3]
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 2)


# One-sided DeLong test: full forest vs. the logit benchmark.
r_DE_full <- r
testobj <- roc.test(r_DE_full, r_DE_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]

# One-sided DeLong test: full forest vs. the selection forest.
testobj <- roc.test(r_DE_full, r_DE, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]









###############################################################
#                        QUALITY DATA                         #
###############################################################

# Load the post-1970 annual data set.
# NOTE(review): absolute, machine-specific path.
#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y_AEDE.csv", sep = ",", dec = ".", header = TRUE)

# Drop low quality countries: keep only the rows flagged QU == 1
Daten <- Daten[which(Daten$QU == 1), ]

# Remove every column whose name contains "fliab"
fliab <- grep("fliab", names(Daten), value = TRUE)
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# Column-name groups, matched by substring
AE <- grep("AE", names(Daten), value = TRUE)
DE <- grep("DE", names(Daten), value = TRUE)
big <- grep("big", names(Daten), value = TRUE)
QU <- grep("QU", names(Daten), value = TRUE)
ME <- grep("ME", names(Daten), value = TRUE)

stocks <- grep("stocks", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
loans <- grep("loans", names(Daten), value = TRUE) # not used below in this section
pdebt <- grep("pdebt", names(Daten), value = TRUE) # not used below in this section
er <- grep("er", names(Daten), value = TRUE) # not used below in this section
cpi <- grep("cpi", names(Daten), value = TRUE) # not used below in this section
gap <- grep("gap", names(Daten), value = TRUE) # not used below in this section
glo <- grep("a_", names(Daten), value = TRUE)

# Mask of columns excluded from the random-forest feature set.
drops <- names(Daten) %in% c("year", "ccode", stocks, ltrate, stir, glo, AE, DE, big, QU, ME)
saves <- names(Daten) %in% glo # not used below in this section
full <- Daten[!drops]
# The "a_" (global) columns are removed by the mask and then put back in front.
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2) / 2

# SELECTION SET: outcome b2 plus the hand-picked predictors
sel.list <- c("b2", "loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap", "loans_y", "pdebt_y", "nx_y_gap")
location <- names(full) %in% sel.list
name.sel <- names(full[location])
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2



# DATA for logit model
## interaction terms for logit model (kept both as globals and as columns)
ia_pub <- Daten$pdebt_y_gap * Daten$ltrate
Daten$ia_pub <- ia_pub

ia_prb <- Daten$loans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_jb <- Daten$loans_y_gap * Daten$ltrate * Daten$pdebt_y_gap
Daten$ia_jb <- ia_jb

ia_lygr <- Daten$loans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_pygr <- Daten$pdebt_y * Daten$gdp_r_gr
Daten$ia_pygr <- ia_pygr

ia_lyer <- Daten$loans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# Logit data set: drop only "year".
# FIX: the original indexed with `drops`, the random-forest mask built before
# the interaction/country columns were added. That mask is shorter than
# ncol(Daten) (so it was recycled over the new columns) and it also removes
# columns the logit formula is meant to pick up. `drops.logit` is the mask
# defined for this purpose.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]

#														ANALYSIS   												#

##############################################################################################################
# variables
var.list <- c("loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap", "loans_y", "pdebt_y", "nx_y_gap")

# variables (logit)
var.logit <- c("loans_y_gap", "pdebt_y_gap", "gdp_r_gap", "gr_cpi", "rer_gap", "nx_y_gap")
# interaction terms (logit)
ia.logit <- c("ia_pub", "ia_prb", "ia_jb", "ia_lygr", "ia_pygr", "ia_lyer")

# miscellaneous non-independent
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# confidence levels
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of table
row <- 9
##############################################################################################################


# LOGIT: Monte Carlo cross-validation over `runs` random train/test splits
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)


# Build the formula from whichever of the requested regressors exist in
# full.logit; names that are absent are silently left out.
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location])
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b2~") # left-hand side
fmla <- as.formula(paste(dep, indep))


for (j in seq_len(runs)) {

	# training, test sample: 63.2% of the rows, drawn without replacement.
	# The size is taken from full.logit, the data actually being split.
	set.seed(j)
	indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
	test <- full.logit[-indexes, ]
	train <- full.logit[indexes, ]

	# Regression
	logit <- glm(fmla, data = train, family = "binomial")
	N[1, j] <- logit$df.null

	# Prediction on the held-out rows
	pred <- predict(logit, newdata = test, type = "response")

	location <- names(test) %in% c("b2")
	name <- names(test[location])
	true <- test[, name] # observed outcome

	r <- roc(true, pred, ci = TRUE)
	aucs[1, j] <- as.numeric(r$auc)

	# Compute the interval once; elements are lower bound, AUC, upper bound.
	ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
	ci95_lo[1, j] <- ci.bounds[1]
	ci95_up[1, j] <- ci.bounds[3]

}

# Averages over all runs. They are stored in N, auc, ci95_lo and ci95_up;
# nothing is written to `out` here.
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one fixed split (seed 11) which, per the
# original note, gives an AUC equal to the MCCV average.

# Fixed 63.2% / 36.8% split without replacement
set.seed(11)
indexes <- sample(1:nrow(full.logit), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Fit on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# Predicted probabilities and observed outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")

true <- test[["b2"]]

library(pROC)
r_Q_log <- roc(true, pred, ci = FALSE) # benchmark ROC used by the tests below
r_Q_log





## RF-selection: forest on the hand-picked predictors in var.list, outcome b2
library(randomForest)

# Split sel_om into feature columns and the outcome column.
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b2"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # class label: b2 > 0

# Grow the forest with a fixed seed so the run is repeatable.
set.seed(1)
rf_selection <- randomForest(
  x = indep,
  y = dep,
  data = sel_om, # NOTE(review): looks unused by the x/y interface - confirm
  ntree = trees,
  replace = TRUE, # draw each tree's sample with replacement
  mtry = sqrt(ncol(indep)), # square root of the number of features
  cutoff = c(1/2, 1/2), # equal cutoffs for both classes (majority vote)
  sampsize = nrow(sel_om), # per-tree sample as large as the data set
  nodesize = 1 # terminal nodes may hold a single observation
)
rf_selection

# Convergence diagnostic: error rates against number of trees.
palette("default")
plot(rf_selection, type = "l", main = "")

# ROC analysis of the forest's class probabilities.
library(pROC)

# predict() is called without new data; the second column is the
# probability of class TRUE.
pred <- predict(rf_selection, type = "prob")[, 2]

true <- sel_om[[name.dep]]

r <- roc(true, pred, ci = TRUE)
ci.bounds <- round(as.numeric(ci.auc(r, conf.level = 0.95)), 2) # lower, AUC, upper

# Left half of the results table: label, AUC, CI bounds, N, crisis count.
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- ci.bounds[1]
out[row, 4] <- ci.bounds[3]
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b2) / 2)

# One-sided DeLong test: selection forest vs. the logit benchmark.
r_Q <- r
testobj <- roc.test(r_Q, r_Q_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST: forest on all remaining predictors, outcome b2
library(randomForest)

# Every column of full_om that is not named in misc.list is used as a feature.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[["b2"]] > 0) # class label: b2 > 0

# Grow the forest with a fixed seed so the run is repeatable.
set.seed(1)
rf_full <- randomForest(
  x = indep,
  y = dep,
  data = full_om, # NOTE(review): looks unused by the x/y interface - confirm
  ntree = trees,
  replace = TRUE, # draw each tree's sample with replacement
  mtry = sqrt(ncol(indep)), # square root of the number of features
  cutoff = c(1/2, 1/2), # equal cutoffs for both classes (majority vote)
  sampsize = nrow(full_om), # per-tree sample as large as the data set
  nodesize = 1 # terminal nodes may hold a single observation
)
rf_full


# Convergence diagnostic: error rates against number of trees.
palette("default")
plot(rf_full, type = "l", main = "")

# ROC analysis of the forest's class probabilities.
library(pROC)

# predict() is called without new data; the second column is the
# probability of class TRUE.
pred <- predict(rf_full, type = "prob")[, 2]

true <- full_om[["b2"]]

r <- roc(true, pred, ci = TRUE)
ci.bounds <- round(as.numeric(ci.auc(r, conf.level = 0.95)), 2) # lower, AUC, upper

# Right half of the results table: AUC, CI bounds, N, crisis count.
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- ci.bounds[1]
out[row, 10] <- ci.bounds[3]
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 2)


# One-sided DeLong test: full forest vs. the logit benchmark.
r_Q_full <- r
testobj <- roc.test(r_Q_full, r_Q_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]


# One-sided DeLong test: full forest vs. the selection forest.
testobj <- roc.test(r_Q_full, r_Q, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]






###############################################################
#         1-YEAR HORIZON post-1970 quarterly dataset          #
###############################################################

# Load the post-1970 quarterly data set.
# NOTE(review): absolute, machine-specific path.
#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv", sep = ",", dec = ".", header = TRUE)


# Column-name groups, matched by substring
hopr <- grep("hopr", names(Daten), value = TRUE)
gap <- grep("gap", names(Daten), value = TRUE) # not used below in this section
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE) # not used below in this section
res <- grep("res", names(Daten), value = TRUE) # not used below in this section
tloans <- grep("tloans", names(Daten), value = TRUE) # not used below in this section
cpi <- grep("cpi", names(Daten), value = TRUE) # not used below in this section


# Mask of columns excluded from the random-forest feature set.
drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate, glo, fliab)
saves <- names(Daten) %in% glo # not used below in this section
full <- Daten[!drops]
# The "a_" (global) columns are removed by the mask and then put back in front.
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
# NOTE(review): this diagnostic uses b2/8 although the section's outcome is
# b1 (the results table below uses b1/4) - confirm which count is wanted.
sum(full_om$b2) / 8

# SELECTION SET: outcome b1 plus the hand-picked predictors
sel.list <- c("b1", "tloans_r_gap",  "tloans_r_gr", "a_fliab_r_gap",  "a_ltrate_r",  "a_gdp_r_gap", "cpi_gr",  "er_gap", "res_r_gap", "a_gdp_r_gr")
location <- names(full) %in% sel.list
name.sel <- names(full[location])
sel <- full[name.sel]
sel_om <- na.omit(sel)
# FIX: sel_om holds b1, not b2, so the original sum(sel_om$b2)/8 always
# printed 0. Use the same b1/4 count that is written to the results table.
sum(sel_om$b1) / 4



# DATA for logit model
## interaction terms for logit model (kept both as globals and as columns)

ia_prb <- Daten$tloans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_lygr <- Daten$tloans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_lyer <- Daten$tloans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# Logit data set: drop only "year".
# FIX: the original indexed with `drops`, the random-forest mask built before
# the interaction/country columns were added. That mask is shorter than
# ncol(Daten) (so it was recycled over the new columns) and it also removes
# columns (e.g. the "a_" ones) that the logit formula is meant to pick up.
# `drops.logit` is the mask defined for this purpose.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]

###############################################################
#                          ANALYSIS                           #
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# variables
var.list <- c( "tloans_r_gap",  "tloans_r_gr", "a_fliab_r_gap",  "a_ltrate_r",  "a_gdp_r_gap", "cpi_gr",  "er_gap", "res_r_gap", "a_gdp_r_gr")

# variables (logit)
var.logit <- c( "tloans_r_gap",  "tloans_r_gr", "a_fliab_r_gap",  "a_ltrate_r",  "a_gdp_r_gap", "cpi_gr",  "er_gap", "res_r_gap", "a_gdp_r_gr")
# interaction terms (logit)
ia.logit <- c("ia_prb", "ia_lygr", "ia_lyer")

# miscellaneous non-independent
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# confidence levels
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of table
row <- 10
##############################################################################################################


# LOGIT: Monte Carlo cross-validation over `runs` random train/test splits
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)


# Build the formula from whichever of the requested regressors exist in
# full.logit; names that are absent are silently left out.
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location])
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b1~") # left-hand side
fmla <- as.formula(paste(dep, indep))


for (j in seq_len(runs)) {

	# training, test sample: 63.2% of the rows, drawn without replacement.
	# The size is taken from full.logit, the data actually being split.
	set.seed(j)
	indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
	test <- full.logit[-indexes, ]
	train <- full.logit[indexes, ]

	# Regression
	logit <- glm(fmla, data = train, family = "binomial")
	N[1, j] <- logit$df.null

	# Prediction on the held-out rows
	pred <- predict(logit, newdata = test, type = "response")

	location <- names(test) %in% c("b1")
	name <- names(test[location])
	true <- test[, name] # observed outcome

	r <- roc(true, pred, ci = TRUE)
	aucs[1, j] <- as.numeric(r$auc)

	# Compute the interval once; elements are lower bound, AUC, upper bound.
	ci.bounds <- as.numeric(ci.auc(r, conf.level = ci[2]))
	ci95_lo[1, j] <- ci.bounds[1]
	ci95_up[1, j] <- ci.bounds[3]

}

# Averages over all runs. They are stored in N, auc, ci95_lo and ci95_up;
# nothing is written to `out` here.
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one fixed split (seed 14) which, per the
# original note, gives an AUC equal to the MCCV average.

# Fixed 63.2% / 36.8% split without replacement
set.seed(14)
indexes <- sample(1:nrow(full.logit), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Fit on the training rows
logit <- glm(fmla, data = train, family = "binomial")

# Predicted probabilities and observed outcome on the held-out rows
pred <- predict(logit, newdata = test, type = "response")

true <- test[["b1"]]

library(pROC)
r_h1q_log <- roc(true, pred, ci = FALSE) # benchmark ROC used by the tests below
r_h1q_log




## RF-selection: forest on the hand-picked predictors in var.list, outcome b1
library(randomForest)

# Split sel_om into feature columns and the outcome column.
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b1"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # class label: b1 > 0

# Grow the forest with a fixed seed so the run is repeatable.
set.seed(1)
rf_selection <- randomForest(
  x = indep,
  y = dep,
  data = sel_om, # NOTE(review): looks unused by the x/y interface - confirm
  ntree = trees,
  replace = TRUE, # draw each tree's sample with replacement
  mtry = sqrt(ncol(indep)), # square root of the number of features
  cutoff = c(1/2, 1/2), # equal cutoffs for both classes (majority vote)
  sampsize = nrow(sel_om), # per-tree sample as large as the data set
  nodesize = 1 # terminal nodes may hold a single observation
)
rf_selection

# Convergence diagnostic: error rates against number of trees.
palette("default")
plot(rf_selection, type = "l", main = "")

# ROC analysis of the forest's class probabilities.
library(pROC)

# predict() is called without new data; the second column is the
# probability of class TRUE.
pred <- predict(rf_selection, type = "prob")[, 2]

true <- sel_om[[name.dep]]

r <- roc(true, pred, ci = TRUE)
ci.bounds <- round(as.numeric(ci.auc(r, conf.level = 0.95)), 2) # lower, AUC, upper

# Left half of the results table: label, AUC, CI bounds, N, crisis count.
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- ci.bounds[1]
out[row, 4] <- ci.bounds[3]
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b1) / 4)

# One-sided DeLong test: selection forest vs. the logit benchmark.
r_h1q <- r
testobj <- roc.test(r_h1q, r_h1q_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST: forest on all remaining predictors, outcome b1
library(randomForest)

# Every column of full_om that is not named in misc.list is used as a feature.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[["b1"]] > 0) # class label: b1 > 0

# Grow the forest with a fixed seed so the run is repeatable.
set.seed(1)
rf_full <- randomForest(
  x = indep,
  y = dep,
  data = full_om, # NOTE(review): looks unused by the x/y interface - confirm
  ntree = trees,
  replace = TRUE, # draw each tree's sample with replacement
  mtry = sqrt(ncol(indep)), # square root of the number of features
  cutoff = c(1/2, 1/2), # equal cutoffs for both classes (majority vote)
  sampsize = nrow(full_om), # per-tree sample as large as the data set
  nodesize = 1 # terminal nodes may hold a single observation
)
rf_full


# Convergence diagnostic: error rates against number of trees.
palette("default")
plot(rf_full, type = "l", main = "")

# ROC analysis of the forest's class probabilities.
library(pROC)

# predict() is called without new data; the second column is the
# probability of class TRUE.
pred <- predict(rf_full, type = "prob")[, 2]

true <- full_om[["b1"]]

r <- roc(true, pred, ci = TRUE)
ci.bounds <- round(as.numeric(ci.auc(r, conf.level = 0.95)), 2) # lower, AUC, upper

# Right half of the results table: AUC, CI bounds, N, crisis count.
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- ci.bounds[1]
out[row, 10] <- ci.bounds[3]
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b1) / 4)


# One-sided DeLong test: full forest vs. the logit benchmark.
r_h1q_full <- r
testobj <- roc.test(r_h1q_full, r_h1q_log, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]


# One-sided DeLong test: full forest vs. the selection forest.
testobj <- roc.test(r_h1q_full, r_h1q, method = "delong", alternative = "greater")
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]








###############################################################
#         2-YEAR HORIZON post-1970 quarterly dataset          #
###############################################################

# Load the post-1970 quarterly data set.
# NOTE(review): absolute, machine-specific path.
#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv", sep = ",", dec = ".", header = TRUE)


# Column-name groups, matched by substring
hopr <- grep("hopr", names(Daten), value = TRUE)
gap <- grep("gap", names(Daten), value = TRUE) # not used below in this section
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE) # not used below in this section
res <- grep("res", names(Daten), value = TRUE) # not used below in this section
tloans <- grep("tloans", names(Daten), value = TRUE) # not used below in this section
cpi <- grep("cpi", names(Daten), value = TRUE) # not used below in this section


# Mask of columns excluded from the random-forest feature set.
drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate, glo, fliab)
saves <- names(Daten) %in% glo # not used below in this section
full <- Daten[!drops]
# The "a_" (global) columns are removed by the mask and then put back in front.
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2) / 8

# SELECTION SET: outcome b2 plus the hand-picked predictors
sel.list <- c("b2", "tloans_r_gap",  "tloans_r_gr", "a_fliab_r_gap",  "a_ltrate_r",  "a_gdp_r_gap", "cpi_gr",  "er_gap", "res_r_gap", "a_gdp_r_gr")
location <- names(full) %in% sel.list
name.sel <- names(full[location])
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 8



# DATA for logit model
## interaction terms for logit model (kept both as globals and as columns)

ia_prb <- Daten$tloans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_lygr <- Daten$tloans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_lyer <- Daten$tloans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# Logit data set: drop only "year".
# FIX: the original indexed with `drops`, the random-forest mask built before
# the interaction/country columns were added. That mask is shorter than
# ncol(Daten) (so it was recycled over the new columns) and it also removes
# columns (e.g. the "a_" ones) that the logit formula is meant to pick up.
# `drops.logit` is the mask defined for this purpose.
drops.logit <- names(Daten) %in% c("year")
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# variables
var.list <- c( "tloans_r_gap",  "tloans_r_gr", "a_fliab_r_gap",  "a_ltrate_r",  "a_gdp_r_gap", "cpi_gr",  "er_gap", "res_r_gap", "a_gdp_r_gr")

# variables (logit)
var.logit <- c( "tloans_r_gap",  "tloans_r_gr", "a_fliab_r_gap",  "a_ltrate_r",  "a_gdp_r_gap", "cpi_gr",  "er_gap", "res_r_gap", "a_gdp_r_gr")
# interaction terms (logit)
ia.logit <- c("ia_prb", "ia_lygr", "ia_lyer")

# miscellaneous non-independent
misc.list <- c("b2","b1","b3","rec1","rec2","rec3")

# confidence intervals
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of table
row<-11
##############################################################################################################


# LOGIT baseline: repeated random train/test splits (`runs` repetitions).
# Each run fits a binomial glm on the training rows, scores the held-out rows,
# and stores the AUC with its 95% confidence bounds. Run averages end up in
# N, auc, ci95_lo and ci95_up.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Formula: b2 ~ every requested regressor that is actually a column of full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit)[location]
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b2~") # left-hand side
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {
  # Training / test split: 63.2% of the rows, drawn without replacement.
  # The seed is the run index, so every run is reproducible.
  # The size now refers to full.logit itself (it used to use nrow(full)).
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom, i.e. one less
  # than the number of observations used when the model has an intercept.
  N[1, j] <- logit$df.null

  # Evaluation on the held-out rows
  pred <- predict(logit, newdata = test, type = "response") # predicted probability

  location <- names(test) %in% c("b2")
  name <- names(test)[location]
  true <- test[, name] # observed outcome

  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  # Confidence interval computed once; elements are lower bound, AUC, upper bound
  ci95_j <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci95_j[1]
  ci95_up[1, j] <- ci95_j[3]
}

# Collapse the per-run results to their averages
N <- mean(N[1, ])

auc <- mean(aucs[1, ])
ci95_lo <- mean(ci95_lo[1, ])
ci95_up <- mean(ci95_up[1, ])



# Representative logit model: one fixed split (seed 7), described by the author
# as the split whose AUC equals the MCCV average. Its ROC object is kept as
# r_h2q_log and serves as the baseline in the roc.test comparisons further down.

set.seed(7)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ] # rows used for fitting
test <- full.logit[-indexes, ] # held-out rows

# Fit on the training rows, score the held-out rows
logit <- glm(fmla, data = train, family = "binomial")
pred <- predict(logit, newdata = test, type = "response") # predicted probability
true <- test[, "b2"] # observed outcome

library(pROC)
r_h2q_log <- roc(true, pred, ci = FALSE)
r_h2q_log




## RF-selection: random forest on the hand-picked predictors (var.list) of sel_om.
## Writes AUC, 95% bounds, sample size and crisis count into out[row, 1:6] and
## the p-value of the one-sided DeLong test against the logit into sig_base[row, 1].
library(randomForest)

location <- names(sel_om) %in% var.list # predictor columns
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% c("b2") # dependent variable
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # two-level factor: outcome > 0 or not

# Grow trees.
# The former `data = sel_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(sel_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_selection

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(r.ci95[1], 2)
out[row, 4] <- round(r.ci95[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b2) / 8)

# Compare ROCs: is the forest's AUC greater than the representative logit's?
r_h2q <- r
testobj <- roc.test(r_h2q, r_h2q_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set: every column of full_om that is not
## listed in misc.list is a predictor. Writes out[row, 8:12], sig_base[row, 2]
## (forest vs. logit) and sig_many[row, 1] (full forest vs. selection forest).
library(randomForest)

location <- names(full_om) %in% misc.list # outcome / indicator columns
name.indep <- names(full_om)[!location] # everything else is a feature
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # dependent variable

# Grow trees.
# The former `data = full_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(full_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_full

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(r.ci95[1], 2)
out[row, 10] <- round(r.ci95[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 8)

# Compare ROCs: full forest vs. representative logit
r_h2q_full <- r
testobj <- roc.test(r_h2q_full, r_h2q_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 2] <- testobj$p.value[1]

# Compare ROCs: full forest vs. selection forest
testobj <- roc.test(r_h2q_full, r_h2q, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_many[row, 1] <- testobj$p.value[1]







###############################################################
#									3-YEAR HORIZON post-1970 quarterly dataset				#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
# Load the post-1970 quarterly data set
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv", sep = ",", dec = ".", header = TRUE)

# Column-name groups, collected by pattern match on the column names.
# Not all of them enter `drops` below (gap, er, res, tloans, cpi are only collected).
hopr <- grep("hopr", names(Daten), value = TRUE)
gap <- grep("gap", names(Daten), value = TRUE)
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE)
res <- grep("res", names(Daten), value = TRUE)
tloans <- grep("tloans", names(Daten), value = TRUE)
cpi <- grep("cpi", names(Daten), value = TRUE)

# Drop the listed columns, then put the `glo` columns back in front
drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate, glo, fliab) # TRUE at the columns to drop
saves <- names(Daten) %in% c(glo)
full <- Daten[!drops]
full <- cbind(Daten[glo], full)

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
# Diagnostic on this section's outcome. FIX: was sum(full_om$b2)/8; the tables
# of this 3-year section count crises as sum(b3)/12.
sum(full_om$b3) / 12

# SELECTION SET: dependent variable b3 plus the hand-picked predictors
sel.list <- c(
  "b3", "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r",
  "a_gdp_r_gap", "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)
location <- names(full) %in% sel.list # TRUE where a column of `full` is in the selection set
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
# FIX: was sum(sel_om$b2)/8, but sel_om has no b2 column (sel.list holds b3),
# so the diagnostic always evaluated to 0.
sum(sel_om$b3) / 12



# DATA for logit model
## Interaction terms for the logit model: element-wise products of Daten columns,
## kept both as stand-alone vectors and as new columns of Daten.

ia_prb <- Daten$tloans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_lygr <- Daten$tloans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_lyer <- Daten$tloans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## Country factor (one level per ccode value)
Daten$country.factor <- as.factor(Daten$ccode)

# Throw out variables not used by the logit: only "year" is removed.
# FIX: the subset used to be taken with `drops`, the random-forest column mask.
# That mask was built before the four columns above were appended (so it is
# shorter than ncol(Daten) and gets recycled), and it removes ccode and the
# a_*/gdp/ltrate/fliab columns that the logit formula asks for. The mask
# computed here, `drops.logit`, is the one that belongs to this step.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to drop
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# Predictors of the RF "selection" model
var.list <- c(
  "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r", "a_gdp_r_gap",
  "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)

# The logit uses the same main effects ...
var.logit <- var.list
# ... plus these interaction terms
ia.logit <- c("ia_prb", "ia_lygr", "ia_lyer")

# Outcome / indicator columns that are never used as features
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (ci[2] = 0.95 is the one used for the AUC intervals)
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the result tables (out, sig_base, sig_many) written by this section
row <- 12
##############################################################################################################


# LOGIT baseline: repeated random train/test splits (`runs` repetitions).
# Each run fits a binomial glm on the training rows, scores the held-out rows,
# and stores the AUC with its 95% confidence bounds. Run averages end up in
# N, auc, ci95_lo and ci95_up.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Formula: b3 ~ every requested regressor that is actually a column of full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit)[location]
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b3~") # left-hand side
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {
  # Training / test split: 63.2% of the rows, drawn without replacement.
  # The seed is the run index, so every run is reproducible.
  # The size now refers to full.logit itself (it used to use nrow(full)).
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom, i.e. one less
  # than the number of observations used when the model has an intercept.
  N[1, j] <- logit$df.null

  # Evaluation on the held-out rows
  pred <- predict(logit, newdata = test, type = "response") # predicted probability

  location <- names(test) %in% c("b3")
  name <- names(test)[location]
  true <- test[, name] # observed outcome

  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  # Confidence interval computed once; elements are lower bound, AUC, upper bound
  ci95_j <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci95_j[1]
  ci95_up[1, j] <- ci95_j[3]
}

# Collapse the per-run results to their averages
N <- mean(N[1, ])

auc <- mean(aucs[1, ])
ci95_lo <- mean(ci95_lo[1, ])
ci95_up <- mean(ci95_up[1, ])



# Representative logit model: one fixed split (seed 2), described by the author
# as the split whose AUC equals the MCCV average. Its ROC object is kept as
# r_h3q_log and serves as the baseline in the roc.test comparisons further down.

set.seed(2)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ] # rows used for fitting
test <- full.logit[-indexes, ] # held-out rows

# Fit on the training rows, score the held-out rows
logit <- glm(fmla, data = train, family = "binomial")
pred <- predict(logit, newdata = test, type = "response") # predicted probability
true <- test[, "b3"] # observed outcome

library(pROC)
r_h3q_log <- roc(true, pred, ci = FALSE)
r_h3q_log




## RF-selection: random forest on the hand-picked predictors (var.list) of sel_om.
## Writes AUC, 95% bounds, sample size and crisis count into out[row, 1:6] and
## the p-value of the one-sided DeLong test against the logit into sig_base[row, 1].
library(randomForest)

location <- names(sel_om) %in% var.list # predictor columns
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% c("b3") # dependent variable
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # two-level factor: outcome > 0 or not

# Grow trees.
# The former `data = sel_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(sel_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_selection

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(r.ci95[1], 2)
out[row, 4] <- round(r.ci95[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b3) / 12)

# Compare ROCs: is the forest's AUC greater than the representative logit's?
r_h3q <- r
testobj <- roc.test(r_h3q, r_h3q_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set: every column of full_om that is not
## listed in misc.list is a predictor. Writes out[row, 8:12], sig_base[row, 2]
## (forest vs. logit) and sig_many[row, 1] (full forest vs. selection forest).
library(randomForest)

location <- names(full_om) %in% misc.list # outcome / indicator columns
name.indep <- names(full_om)[!location] # everything else is a feature
indep <- full_om[name.indep]
dep <- factor(full_om[, "b3"] > 0) # dependent variable

# Grow trees.
# The former `data = full_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(full_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_full

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b3"]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(r.ci95[1], 2)
out[row, 10] <- round(r.ci95[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b3) / 12)

# Compare ROCs: full forest vs. representative logit
r_h3q_full <- r
testobj <- roc.test(r_h3q_full, r_h3q_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 2] <- testobj$p.value[1]

# Compare ROCs: full forest vs. selection forest
testobj <- roc.test(r_h3q_full, r_h3q, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_many[row, 1] <- testobj$p.value[1]






###############################################################
#									Q4 only, post-1970 quarterly dataset								#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
# Load the Q4-only post-1970 data set
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q4.csv", sep = ",", dec = ".", header = TRUE)

# Column-name groups, collected by pattern match on the column names.
# Not all of them enter `drops` below (gap, er, res, tloans, cpi are only collected).
hopr <- grep("hopr", names(Daten), value = TRUE)
gap <- grep("gap", names(Daten), value = TRUE)
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE)
res <- grep("res", names(Daten), value = TRUE)
tloans <- grep("tloans", names(Daten), value = TRUE)
cpi <- grep("cpi", names(Daten), value = TRUE)

# Drop the listed columns, then put the `glo` columns back in front
drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate, glo, fliab) # TRUE at the columns to drop
saves <- names(Daten) %in% c(glo)
full <- Daten[!drops]
full <- cbind(Daten[glo], full)

# FULL SET: complete cases only
full_om <- na.omit(full)
sum(full_om$b2) / 2 # diagnostic; presumably 2 flagged observations per crisis -- TODO confirm

# SELECTION SET: dependent variable b2 plus the hand-picked predictors
sel.list <- c(
  "b2", "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r",
  "a_gdp_r_gap", "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)
location <- names(full) %in% sel.list # TRUE where a column of `full` is in the selection set
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2



# DATA for logit model
## Interaction terms for the logit model: element-wise products of Daten columns,
## kept both as stand-alone vectors and as new columns of Daten.

ia_prb <- Daten$tloans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_lygr <- Daten$tloans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_lyer <- Daten$tloans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## Country factor (one level per ccode value)
Daten$country.factor <- as.factor(Daten$ccode)

# Throw out variables not used by the logit: only "year" is removed.
# FIX: the subset used to be taken with `drops`, the random-forest column mask.
# That mask was built before the four columns above were appended (so it is
# shorter than ncol(Daten) and gets recycled), and it removes ccode and the
# a_*/gdp/ltrate/fliab columns that the logit formula asks for. The mask
# computed here, `drops.logit`, is the one that belongs to this step.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to drop
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# Predictors of the RF "selection" model
var.list <- c(
  "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r", "a_gdp_r_gap",
  "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)

# The logit uses the same main effects ...
var.logit <- var.list
# ... plus these interaction terms
ia.logit <- c("ia_prb", "ia_lygr", "ia_lyer")

# Outcome / indicator columns that are never used as features
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (ci[2] = 0.95 is the one used for the AUC intervals)
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the result tables (out, sig_base, sig_many) written by this section
row <- 13
##############################################################################################################


# LOGIT baseline: repeated random train/test splits (`runs` repetitions).
# Each run fits a binomial glm on the training rows, scores the held-out rows,
# and stores the AUC with its 95% confidence bounds. Run averages end up in
# N, auc, ci95_lo and ci95_up.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Formula: b2 ~ every requested regressor that is actually a column of full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit)[location]
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b2~") # left-hand side
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {
  # Training / test split: 63.2% of the rows, drawn without replacement.
  # The seed is the run index, so every run is reproducible.
  # The size now refers to full.logit itself (it used to use nrow(full)).
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom, i.e. one less
  # than the number of observations used when the model has an intercept.
  N[1, j] <- logit$df.null

  # Evaluation on the held-out rows
  pred <- predict(logit, newdata = test, type = "response") # predicted probability

  location <- names(test) %in% c("b2")
  name <- names(test)[location]
  true <- test[, name] # observed outcome

  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  # Confidence interval computed once; elements are lower bound, AUC, upper bound
  ci95_j <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci95_j[1]
  ci95_up[1, j] <- ci95_j[3]
}

# Collapse the per-run results to their averages
N <- mean(N[1, ])

auc <- mean(aucs[1, ])
ci95_lo <- mean(ci95_lo[1, ])
ci95_up <- mean(ci95_up[1, ])



# Representative logit model: one fixed split (seed 11), described by the author
# as the split whose AUC equals the MCCV average. Its ROC object is kept as
# r_h2q4_log.

set.seed(11)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ] # rows used for fitting
test <- full.logit[-indexes, ] # held-out rows

# Fit on the training rows, score the held-out rows
logit <- glm(fmla, data = train, family = "binomial")
pred <- predict(logit, newdata = test, type = "response") # predicted probability
true <- test[, "b2"] # observed outcome

library(pROC)
r_h2q4_log <- roc(true, pred, ci = FALSE)
r_h2q4_log




## RF-selection: random forest on the hand-picked predictors (var.list) of sel_om.
## Writes AUC, 95% bounds, sample size and crisis count into out[row, 1:6] and
## the p-value of the one-sided DeLong test against the logit into sig_base[row, 1].
library(randomForest)

location <- names(sel_om) %in% var.list # predictor columns
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% c("b2") # dependent variable
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # two-level factor: outcome > 0 or not

# Grow trees.
# The former `data = sel_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(sel_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_selection

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(r.ci95[1], 2)
out[row, 4] <- round(r.ci95[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b2) / 2)

# Compare ROCs: is the forest's AUC greater than the representative logit's?
# FIX: the baseline is r_h2q4_log, the logit fitted on this Q4-only data set.
# The test used to be run against r_h2q_log, which is the logit of the 2-year
# quarterly section and was estimated on a different data set.
# NOTE(review): r_h2q reuses (and overwrites) the name assigned in the 2-year
# quarterly section; the name is kept so that code relying on it is unaffected.
r_h2q <- r
testobj <- roc.test(r_h2q, r_h2q4_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set: every column of full_om that is not
## listed in misc.list is a predictor. Writes out[row, 8:12], sig_base[row, 2]
## (forest vs. logit) and sig_many[row, 1] (full forest vs. selection forest).
library(randomForest)

location <- names(full_om) %in% misc.list # outcome / indicator columns
name.indep <- names(full_om)[!location] # everything else is a feature
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # dependent variable

# Grow trees.
# The former `data = full_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(full_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_full

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(r.ci95[1], 2)
out[row, 10] <- round(r.ci95[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 2)

# Compare ROCs: full forest vs. representative logit.
# FIX: the baseline is r_h2q4_log, the logit fitted on this Q4-only data set.
# The test used to be run against r_h2q_log, which is the logit of the 2-year
# quarterly section and was estimated on a different data set.
# NOTE(review): r_h2q_full reuses (and overwrites) the name assigned in the
# 2-year quarterly section; the name is kept so that code relying on it is
# unaffected.
r_h2q_full <- r
testobj <- roc.test(r_h2q_full, r_h2q4_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 2] <- testobj$p.value[1]

# Compare ROCs: full forest vs. selection forest (r_h2q as last assigned above
# in this section's RF-selection step)
testobj <- roc.test(r_h2q_full, r_h2q, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_many[row, 1] <- testobj$p.value[1]






###############################################################
#									EMERGING MARKETS, quarterly dataset					#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
# Load the quarterly data set that carries the country-group indicator columns
Daten <- read.table("/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q_AEDE.csv", sep = ",", dec = ".", header = TRUE)

# Keep only the rows flagged DE == 1 (this is the emerging-markets section)
Daten <- Daten[which(Daten$DE == 1), ]

# Indicator columns, collected by pattern match so they can be dropped below
AE <- grep("AE", names(Daten), value = TRUE)
DE <- grep("DE", names(Daten), value = TRUE)
big <- grep("big", names(Daten), value = TRUE)
QU <- grep("QU", names(Daten), value = TRUE)
ME <- grep("ME", names(Daten), value = TRUE)

# Column-name groups, collected by pattern match on the column names.
# Not all of them enter `drops` below (gap, er, res, tloans, cpi are only collected).
hopr <- grep("hopr", names(Daten), value = TRUE)
gap <- grep("gap", names(Daten), value = TRUE)
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE)
res <- grep("res", names(Daten), value = TRUE)
tloans <- grep("tloans", names(Daten), value = TRUE)
cpi <- grep("cpi", names(Daten), value = TRUE)

# Drop the listed columns, then put the `glo` columns back in front
drops <- names(Daten) %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate, glo, fliab, AE, DE, big, QU, ME) # TRUE at the columns to drop
saves <- names(Daten) %in% c(glo)
full <- Daten[!drops]
full <- cbind(Daten[glo], full)

# FULL SET: complete cases only
full_om <- na.omit(full)
sum(full_om$b2) / 8 # diagnostic; presumably 8 flagged quarters per crisis -- TODO confirm

# SELECTION SET: dependent variable b2 plus the hand-picked predictors
sel.list <- c(
  "b2", "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r",
  "a_gdp_r_gap", "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)
location <- names(full) %in% sel.list # TRUE where a column of `full` is in the selection set
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 8



# DATA for logit model
## Interaction terms for the logit model: element-wise products of Daten columns,
## kept both as stand-alone vectors and as new columns of Daten.

ia_prb <- Daten$tloans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_lygr <- Daten$tloans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_lyer <- Daten$tloans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## Country factor (one level per ccode value)
Daten$country.factor <- as.factor(Daten$ccode)

# Throw out variables not used by the logit: only "year" is removed.
# FIX: the subset used to be taken with `drops`, the random-forest column mask.
# That mask was built before the four columns above were appended (so it is
# shorter than ncol(Daten) and gets recycled), and it removes ccode and the
# a_*/gdp/ltrate/fliab columns that the logit formula asks for. The mask
# computed here, `drops.logit`, is the one that belongs to this step.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to drop
full.logit <- Daten[!drops.logit]

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# Predictors of the RF "selection" model
var.list <- c(
  "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r", "a_gdp_r_gap",
  "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)

# The logit uses the same main effects ...
var.logit <- var.list
# ... plus these interaction terms
ia.logit <- c("ia_prb", "ia_lygr", "ia_lyer")

# Outcome / indicator columns that are never used as features
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (ci[2] = 0.95 is the one used for the AUC intervals)
ci <- c(0.99, 0.95, 0.9)
n.ci <- 3

# Row of the result tables (out, sig_base, sig_many) written by this section
row <- 14
##############################################################################################################


# LOGIT baseline: repeated random train/test splits (`runs` repetitions).
# Each run fits a binomial glm on the training rows, scores the held-out rows,
# and stores the AUC with its 95% confidence bounds. Run averages end up in
# N, auc, ci95_lo and ci95_up.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)

# Formula: b2 ~ every requested regressor that is actually a column of full.logit
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit)[location]
indep <- paste(name, collapse = "+") # right-hand side
dep <- paste("b2~") # left-hand side
fmla <- as.formula(paste(dep, indep))

for (j in seq_len(runs)) {
  # Training / test split: 63.2% of the rows, drawn without replacement.
  # The seed is the run index, so every run is reproducible.
  # The size now refers to full.logit itself (it used to use nrow(full)).
  set.seed(j)
  indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  # NOTE(review): df.null is the null-model degrees of freedom, i.e. one less
  # than the number of observations used when the model has an intercept.
  N[1, j] <- logit$df.null

  # Evaluation on the held-out rows
  pred <- predict(logit, newdata = test, type = "response") # predicted probability

  location <- names(test) %in% c("b2")
  name <- names(test)[location]
  true <- test[, name] # observed outcome

  r <- roc(true, pred, ci = TRUE)
  aucs[1, j] <- as.numeric(r$auc)

  # Confidence interval computed once; elements are lower bound, AUC, upper bound
  ci95_j <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- ci95_j[1]
  ci95_up[1, j] <- ci95_j[3]
}

# Collapse the per-run results to their averages
N <- mean(N[1, ])

auc <- mean(aucs[1, ])
ci95_lo <- mean(ci95_lo[1, ])
ci95_up <- mean(ci95_up[1, ])



# Representative logit model: one fixed split (seed 4), described by the author
# as the split whose AUC equals the MCCV average. Its ROC object is kept as
# r_DEq_log and serves as the baseline in the roc.test comparisons further down.

set.seed(4)
indexes <- sample(seq_len(nrow(full.logit)), size = 0.632 * nrow(full.logit), replace = FALSE)
train <- full.logit[indexes, ] # rows used for fitting
test <- full.logit[-indexes, ] # held-out rows

# Fit on the training rows, score the held-out rows
logit <- glm(fmla, data = train, family = "binomial")
pred <- predict(logit, newdata = test, type = "response") # predicted probability
true <- test[, "b2"] # observed outcome

library(pROC)
r_DEq_log <- roc(true, pred, ci = FALSE)
r_DEq_log




## RF-selection: random forest on the hand-picked predictors (var.list) of sel_om.
## Writes AUC, 95% bounds, sample size and crisis count into out[row, 1:6] and
## the p-value of the one-sided DeLong test against the logit into sig_base[row, 1].
library(randomForest)

location <- names(sel_om) %in% var.list # predictor columns
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% c("b2") # dependent variable
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # two-level factor: outcome > 0 or not

# Grow trees.
# The former `data = sel_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_selection <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(sel_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_selection

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3] <- round(r.ci95[1], 2)
out[row, 4] <- round(r.ci95[3], 2)
out[row, 5] <- nrow(sel_om)
out[row, 6] <- floor(sum(sel_om$b2) / 8)

# Compare ROCs: is the forest's AUC greater than the representative logit's?
r_DEq <- r
testobj <- roc.test(r_DEq, r_DEq_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST on the full feature set: every column of full_om that is not
## listed in misc.list is a predictor. Writes out[row, 8:12], sig_base[row, 2]
## (forest vs. logit) and sig_many[row, 1] (full forest vs. selection forest).
library(randomForest)

location <- names(full_om) %in% misc.list # outcome / indicator columns
name.indep <- names(full_om)[!location] # everything else is a feature
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # dependent variable

# Grow trees.
# The former `data = full_om` argument was dropped: randomForest's x/y interface
# has no `data` parameter, so it was silently absorbed by `...`.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = trees,
  replace = TRUE, # bootstrap samples (with replacement)
  mtry = sqrt(ncol(indep)), # number of candidate features tried at each split
  cutoff = c(1 / 2, 1 / 2), # majority vote between the two classes
  sampsize = nrow(full_om), # each bootstrap sample is as large as the data set
  nodesize = 1 # fully grown trees
)
rf_full

# Convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# Evaluation. predict() without newdata returns the out-of-bag predictions;
# column 2 is the vote share of the TRUE class.
library(pROC)

pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]

r <- roc(true, pred, ci = TRUE)
r.ci95 <- as.numeric(ci.auc(r, conf.level = 0.95)) # lower bound, AUC, upper bound (computed once)

out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9] <- round(r.ci95[1], 2)
out[row, 10] <- round(r.ci95[3], 2)
out[row, 11] <- nrow(full_om)
out[row, 12] <- floor(sum(full_om$b2) / 8)

# Compare ROCs: full forest vs. representative logit
r_DEq_full <- r
testobj <- roc.test(r_DEq_full, r_DEq_log, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_base[row, 2] <- testobj$p.value[1]

# Compare ROCs: full forest vs. selection forest
testobj <- roc.test(r_DEq_full, r_DEq, method = "delong", alternative = "greater")
options("scipen" = 10)
options()$scipen

sig_many[row, 1] <- testobj$p.value[1]







###############################################################
#									QUALITY DATA, quarterly dataset					#
###############################################################

#Daten  <- read.table("/Users/felixward/Documents/Studium/Bonn/Research/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q_AEDE.csv",
  sep = ",",
  dec = ".",
  header = TRUE
)


# Drop low quality countries: keep rows with QU == 1 (which() also removes
# rows where QU is missing).
Daten <- Daten[which(Daten$QU == 1), ]

# Column-name groups, collected by substring match on names(Daten).
# Sample/group indicator columns:
AE <- grep("AE", names(Daten), value = TRUE)
DE <- grep("DE", names(Daten), value = TRUE)
big <- grep("big", names(Daten), value = TRUE)
QU <- grep("QU", names(Daten), value = TRUE)
ME <- grep("ME", names(Daten), value = TRUE)

# Variable families. Note that gap, er, res, tloans and cpi are collected here
# but are not part of the drop list below.
hopr <- grep("hopr", names(Daten), value = TRUE)
gap <- grep("gap", names(Daten), value = TRUE)
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE)
res <- grep("res", names(Daten), value = TRUE)
tloans <- grep("tloans", names(Daten), value = TRUE)
cpi <- grep("cpi", names(Daten), value = TRUE)

# TRUE for every column that is removed from the feature set
drops <- names(Daten) %in% c(
  "quarter", "year", "ccode",
  hopr, gdp, y, stocks, stir, ltrate, glo, fliab,
  AE, DE, big, QU, ME
)
saves <- names(Daten) %in% glo

# The "a_" columns are part of `drops` but are put back in front, so they stay
# in `full` even when their names also match another dropped family.
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: omit observations with missing values
full_om <- na.omit(full)
sum(full_om$b2) / 8

# SELECTION SET: b2 plus a fixed list of predictors, complete cases only
sel.list <- c(
  "b2",
  "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r",
  "a_gdp_r_gap", "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)
location <- names(full) %in% sel.list
name.sel <- names(full)[location]
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 8



#DATA for logit model
## interaction terms for the logit model (products of columns of Daten)

ia_prb <- Daten$tloans_y_gap * Daten$ltrate
Daten$ia_prb <- ia_prb

ia_lygr <- Daten$tloans_y * Daten$gdp_r_gr
Daten$ia_lygr <- ia_lygr

ia_lyer <- Daten$tloans_y_gap * Daten$er_gap
Daten$ia_lyer <- ia_lyer

## country factor
Daten$country.factor <- as.factor(Daten$ccode)

# throw out vars not used
# The mask must be the one built here. The earlier `drops` mask was computed
# before the four columns above were appended (so it is shorter than
# names(Daten) and would be recycled), and it also removes the "a_" predictors
# that var.logit refers to.
drops.logit <- names(Daten) %in% c("year") # TRUE at the columns to remove
full.logit <- Daten[!drops.logit] # keep every column except those flagged above

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################
# predictors of the restricted-selection random forest
var.list <- c(
  "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r",
  "a_gdp_r_gap", "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)

# main-effect predictors of the logit model
var.logit <- c(
  "tloans_r_gap", "tloans_r_gr", "a_fliab_r_gap", "a_ltrate_r",
  "a_gdp_r_gap", "cpi_gr", "er_gap", "res_r_gap", "a_gdp_r_gr"
)
# interaction terms of the logit model
ia.logit <- c("ia_prb", "ia_lygr", "ia_lyer")

# columns that are never used as features in the many-predictor forest
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# confidence levels (ci[2] is the 95% level used for the AUC intervals)
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)

# row of the output tables filled by this section
row <- 15
##############################################################################################################


#LOGIT
# Repeated random-split validation of the logit model: in each of `runs`
# repetitions the model is fitted on a random 63.2% of the rows of full.logit
# and its ROC/AUC is evaluated on the remaining rows. Per-run results are
# stored column-wise and averaged after the loop.
aucs <- matrix(nrow = 1, ncol = runs)
ci95_lo <- matrix(nrow = 1, ncol = runs)
ci95_up <- matrix(nrow = 1, ncol = runs)

N <- matrix(nrow = 1, ncol = runs)


# get formula: b2 ~ every column of full.logit that is a logit predictor,
# an interaction term or the country factor
location <- names(full.logit) %in% c(var.logit, ia.logit, "country.factor")
name <- names(full.logit[location]) # names of the right-hand-side variables
indep <- paste(name, collapse = "+") # indep. variables
dep <- paste("b2~") # dep. variable
fmla <- as.formula(paste(dep, indep)) # get formula


for (j in seq_len(runs)) {

  # training, test sample (seeded by the run index so each run is reproducible)
  # The sample size is taken from the frame that is actually being sampled.
  set.seed(j)
  indexes <- sample(
    seq_len(nrow(full.logit)),
    size = 0.632 * nrow(full.logit),
    replace = FALSE
  )
  test <- full.logit[-indexes, ]
  train <- full.logit[indexes, ]

  # Regression
  logit <- glm(fmla, data = train, family = "binomial")
  N[1, j] <- logit$df.null # residual degrees of freedom of the null model

  # OOS-analysis
  pred <- predict(logit, newdata = test, type = "response") # predicted outcome
  true <- test[, "b2"] # real outcome

  r <- roc(true, pred, ci = TRUE) # ROC analysis
  aucs[1, j] <- as.numeric(r$auc)

  # Compute the 95% interval once; element 1 is the lower, element 3 the
  # upper bound.
  auc_ci <- as.numeric(ci.auc(r, conf.level = ci[2]))
  ci95_lo[1, j] <- auc_ci[1]
  ci95_up[1, j] <- auc_ci[3]

}

# averages over all runs
N <- as.numeric(colMeans(as.matrix(N[1, ])))

auc <- as.numeric(colMeans(as.matrix(aucs[1, ])))
ci95_lo <- as.numeric(colMeans(as.matrix(ci95_lo[1, ])))
ci95_up <- as.numeric(colMeans(as.matrix(ci95_up[1, ])))



# Representative logit model: one train/test split drawn with seed 1. Its ROC
# curve (r_Qq_log) is the logit benchmark in the ROC comparisons further down.
# NOTE(review): the original header says its AUC equals the MCCV average;
# nothing in this block enforces that -- confirm.

# training, test sample
set.seed(1)
indexes <- sample(
  1:nrow(full.logit),
  size = 0.632 * nrow(full.logit),
  replace = FALSE
)
train <- full.logit[indexes, ]
test <- full.logit[-indexes, ]

# Regression
logit <- glm(fmla, data = train, family = "binomial")

# OOS-analysis: predicted probabilities and observed outcome on the test rows
pred <- predict(logit, newdata = test, type = "response")
true <- test[, "b2"]

library(pROC)
# `F` is kept exactly as written because the roc object records its call.
r_Qq_log <- roc(true, pred, ci=F) # ROC analysis
r_Qq_log




## RF-selection: random forest on the restricted predictor set (sel_om)
library(randomForest)

# Split sel_om into the features named in var.list and the response b2.
location <- names(sel_om) %in% var.list
name.indep <- names(sel_om)[location]
location <- names(sel_om) %in% "b2"
name.dep <- names(sel_om)[location]
indep <- sel_om[name.indep]
dep <- factor(sel_om[name.dep] > 0) # response: is b2 positive?

# grow trees
set.seed(1)
# `T` is kept exactly as written because the fitted object records its call.
rf_selection <- randomForest(
  indep,
  y = dep,
  data = sel_om, # NOTE(review): x/y are passed directly, so `data` looks redundant -- confirm
  ntree = trees,
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the feature count
  replace = T, # per-tree sample is drawn with replacement
  sampsize = nrow(sel_om), # per-tree sample has as many rows as sel_om
  nodesize = 1, # terminal nodes may hold a single observation
  cutoff = c(1/2, 1/2) # equal vote cutoffs for the two classes
)
rf_selection

# convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_selection, type = "l", main = "")

# OOS-analysis
library(pROC)

# predict() is called without newdata; column 2 of the class-probability
# matrix (the TRUE level of `dep`) is the score handed to the ROC analysis.
pred <- predict(rf_selection, type = "prob")[, 2]
true <- sel_om[, name.dep]

r <- roc(true, pred, ci=T) # ROC analysis (`T` kept: the roc object stores its call)

# Columns 1-6 of `out`: model label, AUC, lower/upper 95% CI bound, N,
# crisis count.
out[row, 1] <- model.list[row]
out[row, 2] <- round(as.numeric(r$auc), 2)
out[row, 3:4] <- round(as.numeric(ci.auc(r, conf.level = 0.95))[c(1, 3)], 2)
out[row, 5] <- nrow(sel_om)
# presumably every crisis is flagged in 8 quarterly observations -- TODO confirm
out[row, 6] <- floor(sum(sel_om$b2) / 8)

# compare ROCs: restricted-selection forest vs. logit (one-sided DeLong test)
r_Qq <- r
testobj <- roc.test(
  r_Qq, r_Qq_log,
  method = "delong",
  alternative = "greater"
)
options(scipen = 10)
getOption("scipen")

sig_base[row, 1] <- testobj$p.value[1]




## RANDOM FOREST: many-predictor forest grown on the complete-case full set
library(randomForest)

# Every column of full_om that is not named in misc.list is used as a feature.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location]
indep <- full_om[name.indep]
dep <- factor(full_om[, "b2"] > 0) # response: is b2 positive?

# grow trees
set.seed(1)
# `T` is kept exactly as written because the fitted object records its call.
rf_full <- randomForest(
  indep,
  y = dep,
  data = full_om, # NOTE(review): x/y are passed directly, so `data` looks redundant -- confirm
  ntree = trees,
  mtry = sqrt(ncol(indep)), # features tried per split: square root of the feature count
  replace = T, # per-tree sample is drawn with replacement
  sampsize = nrow(full_om), # per-tree sample has as many rows as full_om
  nodesize = 1, # terminal nodes may hold a single observation
  cutoff = c(1/2, 1/2) # equal vote cutoffs for the two classes
)
rf_full


# convergence diagnostic: error rates against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# predict() is called without newdata; column 2 of the class-probability
# matrix (the TRUE level of `dep`) is the score handed to the ROC analysis.
pred <- predict(rf_full, type = "prob")[, 2]
true <- full_om[, "b2"]

r <- roc(true, pred, ci=T) # ROC analysis (`T` kept: the roc object stores its call)

# Columns 8-12 of `out`: AUC, lower/upper 95% CI bound, N, crisis count.
out[row, 8] <- round(as.numeric(r$auc), 2)
out[row, 9:10] <- round(as.numeric(ci.auc(r, conf.level = 0.95))[c(1, 3)], 2)
out[row, 11] <- nrow(full_om)
# presumably every crisis is flagged in 8 quarterly observations -- TODO confirm
out[row, 12] <- floor(sum(full_om$b2) / 8)


# compare ROCs
r_Qq_full <- r

# many-predictor forest vs. logit (one-sided DeLong test)
testobj <- roc.test(
  r_Qq_full, r_Qq_log,
  method = "delong",
  alternative = "greater"
)
options(scipen = 10)
getOption("scipen")

sig_base[row, 2] <- testobj$p.value[1]


# many-predictor forest vs. restricted-selection forest
testobj <- roc.test(
  r_Qq_full, r_Qq,
  method = "delong",
  alternative = "greater"
)
options(scipen = 10)
getOption("scipen")

sig_many[row, 1] <- testobj$p.value[1]














# AUCs of five logit ROC objects that were created in other sections
logits <- c(
  r_h1_log$auc,
  r_h3_log$auc,
  r_RR_log$auc,
  r_DE_log$auc,
  r_Q_log$auc
)

# show the result table and the two p-value matrices
out

sig_base
sig_many

# store the whole workspace so the TABLES part can be rerun on its own
save.image("/Users/felixward/Dropbox/CrisisPrediction/DoFiles/CT_robustness")




###############################################################
#															TABLES													#
###############################################################
load("/Users/felixward/Dropbox/CrisisPrediction//DoFiles/CT_robustness")

library(xtable)
#OUTPUT TABLE (always use double the amount of backslashes needed in latex)
# add symbols for significance
out2 <- out

# Bold the AUC of a forest that beats the logit at the 5% level:
# sig_base[, 1] marks the restricted-selection AUC (column 2),
# sig_base[, 2] marks the many-predictor AUC (column 8).
# isTRUE() treats a missing p-value as "not significant" instead of aborting
# the loop, and paste0() keeps stray blanks out of the LaTeX markup.
for (i in seq_len(nrow(sig_base))) {
  if (isTRUE(sig_base[i, 1] <= 0.05)) {
    out2[i, 2] <- paste0("\\textbf{", out2[i, 2], "}")
  }
  if (isTRUE(sig_base[i, 2] <= 0.05)) {
    out2[i, 8] <- paste0("\\textbf{", out2[i, 8], "}")
  }
}

# # for (i in 1:nrow(sig_pre)){
	# if(sig_pre[i,1]<=0.05) {
		# out2[i+1,2] <- paste(out2[i+1,2],"$^{\\ddagger}$",collapse="")
	# }
	# if(sig_pre[i,2]<=0.05) {
		# out2[i+1,6] <- paste(out2[i+1,6],"$^{\\ddagger}$",collapse="")
	# }	
# }

# Append a section-sign superscript where the many-predictor forest beats the
# restricted-selection forest at the 5% level.
for (i in seq_len(nrow(sig_many))) {
  if (isTRUE(sig_many[i, 1] <= 0.05)) {
    out2[i, 8] <- paste0(out2[i, 8], "$^{\\mathsection}$")
  }
}


# confidence intervals, formatted as "[lower,upper]"
cis <- paste0("[", out2[, 3], ",", out2[, 4], "]")
cis2 <- paste0("[", out2[, 9], ",", out2[, 10], "]")

# observations (crises), formatted as "N [crises]"
obs <- paste0(out2[, 5], " [", out2[, 6], "]")
obs2 <- paste0(out2[, 11], " [", out2[, 12], "]")


# Final layout: model, AUC, CI, N | column 7 of out2 | AUC, CI, N
out3 <- out2[, c(1, 2)]
outF <- cbind(out3, cis, obs, out2[, 7:8], cis2, obs2)

# get rid of row and column names
x <- data.frame(outF)
outF <- as.matrix(x)
rownames(outF) <- rep("", nrow(outF))
colnames(outF) <- rep("", ncol(outF))

# Build the xtable object. `align` needs one entry more than outF has columns:
# the leading "l" belongs to the row-names column, which is suppressed when
# printing (include.rownames=F).
mat3<-xtable(outF, align="llcccm{1.5cm}ccc", caption="CT-Robustness", label="tab:CT_robustness") # one alignment entry per column plus one for the row names (the extra "l" on the left)

# Write the table as LaTeX to CT_Robustness.txt. The text is passed through
# unescaped (identity sanitize function) so the markup inside the cells is kept.
# add.to.row inserts the five header lines before the first data row (pos 0)
# and the dataset sub-headings after rows 4 and 9.
# NOTE(review): `replace=T` does not look like a print.xtable argument and is
# presumably ignored -- confirm.
print(mat3, type="latex", caption.placement="top", hline.after=c(-1,nrow(mat3)), sanitize.text.function = function(x){x}, file="/Users/felixward/Dropbox/CrisisPrediction/Written/CT_Robustness.txt", replace=T, floating=F, booktabs=T, include.colnames=F, include.rownames=F, add.to.row=list(pos=list(0,0,0,0,0,4,9), 
command=c(" \\multicolumn{1}{c}{} & \\multicolumn{3}{c}{\\textbf{Restricted selection}} & & \\multicolumn{3}{c}{\\textbf{Many predictors}} \\\\",
"  \\cmidrule(l r){2-4} \\cmidrule(l r){6-8} \\\\",
" \\multicolumn{1}{l}{\\textbf{Model}} & AUC & 95\\%-CI & N [crises] & & AUC & 95\\%-CI & N [crises]  \\\\",
"  \\cmidrule(l r){1-8}  \\\\",
" & \\multicolumn{7}{c}{\\textit{Long-run 1870-2011 dataset}} \\\\ \\vspace{0.0cm} \\\\",
"  \\vspace{0.00cm} \\\\ & \\multicolumn{7}{c}{\\textit{Post-1970 annual dataset}} \\\\ \\vspace{0.0cm}  \\\\",
"  \\vspace{0.00cm} \\\\ & \\multicolumn{7}{c}{\\textit{Post-1970 quarterly dataset}} \\\\ \\vspace{0.0cm}  \\\\" )))
