#####################################################################
#											RANDOM FORESTS 															#
#####################################################################
rm(list=ls())
library(psych)
library(graphics)

# Result containers, filled further down the script.
# (A second workspace wipe used to sit here; it repeated the one at the top
# of the file with nothing assigned in between, so it has been removed.)
#
# out:     7 x 8 table with one row per model. Columns 2-8 receive, in order:
#          AUC, 95% confidence interval of the AUC, number of observations,
#          number of trees, mtry, number of features, floor(sum(b2)/2).
# sig_two: 1 x 6 row of p-values from the AUC comparison tests (roc.test).
# Both start out filled with NA.
out <- matrix(nrow = 7, ncol = 8)
sig_two <- matrix(nrow = 1, ncol = 6)


###############################################################
#												2-year horizon LONG-RUN DATA													#
###############################################################

# Alternative data location, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv",
  sep = ",", dec = ".", header = TRUE
)

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Groups of column names, each collected by substring match on the remaining
# columns. Only stocks, money, stir, assets, i, ri and glo enter the drop
# list below; the other groups are not referenced again in this section.
col_names <- names(Daten)
assets <- grep("assets", col_names, value = TRUE)
stocks <- grep("stocks", col_names, value = TRUE)
narrowm <- grep("narrowm", col_names, value = TRUE)
money <- grep("money", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
loans <- grep("loans", col_names, value = TRUE)
debt <- grep("debt", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)
gdp <- grep("gdp", col_names, value = TRUE)
i <- grep("i_", col_names, value = TRUE)
c <- grep("c_", col_names, value = TRUE)
ri <- grep("ri", col_names, value = TRUE)
rc <- grep("rc", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`, so they
# survive even when their names also match another dropped pattern.
drops <- col_names %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# Half the sum of b2 over the complete rows (presumably the number of crisis
# episodes, each flagged in two years -- TODO confirm).
sum(full_om$b2) / 2

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "loans1_y_gap", "pdebt_gap", "narrowm_y_gap",  "rltrate", "gr_rgdp", "gr_cpi",  "er_gap", "loans_y", "pdebt", "ltrate")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b2 > 0.
dep <- factor(full_om[, "b2"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Class probabilities from the forest; no new data is supplied to predict().
# The second column is the TRUE probability (votes combined with
# normvotes=T equals type="prob").
class_prob <- predict(rf_full, type = "prob")
pred <- class_prob[, 2]

# Observed outcome column.
true <- full_om[, "b2"]

# ROC analysis with a confidence interval. The result is kept twice: as
# `roc`, and as `r_1`, the reference curve for the AUC comparisons further
# down the script.
r_1 <- roc <- roc(true, pred, ci = TRUE)




###############################################################
#												1-year horizon LONG-RUN DATA													#
###############################################################

# Alternative data location, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv",
  sep = ",", dec = ".", header = TRUE
)

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Groups of column names, each collected by substring match on the remaining
# columns. Only stocks, money, stir, assets, i, ri and glo enter the drop
# list below; the other groups are not referenced again in this section.
col_names <- names(Daten)
assets <- grep("assets", col_names, value = TRUE)
stocks <- grep("stocks", col_names, value = TRUE)
narrowm <- grep("narrowm", col_names, value = TRUE)
money <- grep("money", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
loans <- grep("loans", col_names, value = TRUE)
debt <- grep("debt", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)
gdp <- grep("gdp", col_names, value = TRUE)
i <- grep("i_", col_names, value = TRUE)
c <- grep("c_", col_names, value = TRUE)
ri <- grep("ri", col_names, value = TRUE)
rc <- grep("rc", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`, so they
# survive even when their names also match another dropped pattern.
drops <- col_names %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# Half the sum of b2 over the complete rows (presumably the number of crisis
# episodes, each flagged in two years -- TODO confirm).
sum(full_om$b2) / 2

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "loans1_y_gap", "pdebt_gap", "narrowm_y_gap",  "rltrate", "gr_rgdp", "gr_cpi",  "er_gap", "loans_y", "pdebt", "ltrate")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b1 > 0.
dep <- factor(full_om[, "b1"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Second column of the type="prob" matrix is the TRUE probability
# (votes combined with normvotes=T equals type="prob").
pred <- predict(rf_full, type = "prob")[, 2]

# Observed outcome column.
true <- full_om[, "b1"]

# ROC analysis. The result is stored under the name `roc`; calls to roc()
# still reach the pROC function because R skips non-function objects when it
# resolves a call.
roc <- roc(true, pred, ci = TRUE)

# Sample size and AUC, both as plain doubles.
N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Bounds of the AUC confidence interval at the levels held in `ci`.
# Element 1 of a ci.auc() result is the lower bound, element 3 the upper
# bound. The 95% interval is computed once and both bounds are read from it.
ci95_bounds <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci95_lo <- ci95_bounds[1]
ci95_up <- ci95_bounds[3]
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

# Row 2 of the results table.
out[2, 2] <- round(auc, 2)

# 95% confidence interval formatted as "[lower,upper]".
cis <- paste0("[", round(ci95_lo, 2), ",", round(ci95_up, 2), "]")

out[2, 3] <- cis
out[2, 4] <- round(N, 2)
out[2, 5] <- rf_full$ntree
out[2, 6] <- rf_full$mtry
out[2, 7] <- ncol(indep)
# NOTE(review): this uses b2 although the outcome modelled in this section
# is b1 -- presumably a horizon-independent crisis count; confirm.
out[2, 8] <- floor(sum(full_om$b2) / 2)


# AUC comparison against the reference curve r_1 (DeLong test, two-sided).
testobj <- roc.test(roc, r_1, method = "delong", alternative = "two.sided")
options("scipen" = 10) # penalise scientific notation when printing numbers
options()$scipen

sig_two[1, 1] <- testobj$p.value[1]





###############################################################
#												3-year horizon LONG-RUN DATA													#
###############################################################

# Alternative data location, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class.csv",
  sep = ",", dec = ".", header = TRUE
)

# Remove every column whose name contains "ca".
ca <- grep("ca", names(Daten), value = TRUE)
drops <- names(Daten) %in% ca
Daten <- Daten[!drops]

# Groups of column names, each collected by substring match on the remaining
# columns. Only stocks, money, stir, assets, i, ri and glo enter the drop
# list below; the other groups are not referenced again in this section.
col_names <- names(Daten)
assets <- grep("assets", col_names, value = TRUE)
stocks <- grep("stocks", col_names, value = TRUE)
narrowm <- grep("narrowm", col_names, value = TRUE)
money <- grep("money", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
loans <- grep("loans", col_names, value = TRUE)
debt <- grep("debt", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)
gdp <- grep("gdp", col_names, value = TRUE)
i <- grep("i_", col_names, value = TRUE)
c <- grep("c_", col_names, value = TRUE)
ri <- grep("ri", col_names, value = TRUE)
rc <- grep("rc", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`, so they
# survive even when their names also match another dropped pattern.
drops <- col_names %in% c("year", "ccode", stocks, money, stir, assets, i, ri, glo)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# Half the sum of b2 over the complete rows (presumably the number of crisis
# episodes, each flagged in two years -- TODO confirm).
sum(full_om$b2) / 2

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "loans1_y_gap", "pdebt_gap", "narrowm_y_gap",  "rltrate", "gr_rgdp", "gr_cpi",  "er_gap", "loans_y", "pdebt", "ltrate")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b3 > 0.
dep <- factor(full_om[, "b3"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Second column of the type="prob" matrix is the TRUE probability
# (votes combined with normvotes=T equals type="prob").
pred <- predict(rf_full, type = "prob")[, 2]

# Observed outcome column.
true <- full_om[, "b3"]

# ROC analysis. The result is stored under the name `roc`; calls to roc()
# still reach the pROC function because R skips non-function objects when it
# resolves a call.
roc <- roc(true, pred, ci = TRUE)

# Sample size and AUC, both as plain doubles.
N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Bounds of the AUC confidence interval at the levels held in `ci`.
# Element 1 of a ci.auc() result is the lower bound, element 3 the upper
# bound. The 95% interval is computed once and both bounds are read from it.
ci95_bounds <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci95_lo <- ci95_bounds[1]
ci95_up <- ci95_bounds[3]
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

# Row 3 of the results table.
out[3, 2] <- round(auc, 2)

# 95% confidence interval formatted as "[lower,upper]".
cis <- paste0("[", round(ci95_lo, 2), ",", round(ci95_up, 2), "]")

out[3, 3] <- cis
out[3, 4] <- round(N, 2)
out[3, 5] <- rf_full$ntree
out[3, 6] <- rf_full$mtry
out[3, 7] <- ncol(indep)
# NOTE(review): this uses b2 although the outcome modelled in this section
# is b3 -- presumably a horizon-independent crisis count; confirm.
out[3, 8] <- floor(sum(full_om$b2) / 2)

# AUC comparison against the reference curve r_1 (DeLong test, two-sided).
testobj <- roc.test(roc, r_1, method = "delong", alternative = "two.sided")
options("scipen" = 10) # penalise scientific notation when printing numbers
options()$scipen

sig_two[1, 2] <- testobj$p.value[1]






###############################################################
#												2-year horizon Post-1970 yearly DATA				#
###############################################################

# Older read of a different file, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv",
  sep = ",", dec = ".", header = TRUE
)

# Remove every column whose name contains "fliab".
fliab <- grep("fliab", names(Daten), value = TRUE)
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# Groups of column names, each collected by substring match on the remaining
# columns. Only stocks, ltrate, stir and glo enter the drop list below; the
# other groups are not referenced again in this section.
col_names <- names(Daten)
stocks <- grep("stocks", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
loans <- grep("loans", col_names, value = TRUE)
pdebt <- grep("pdebt", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`: the
# global variables are kept because they have few missings.
drops <- col_names %in% c("year", "ccode", stocks, ltrate, stir, glo)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# Half the sum of b2 over the complete rows (presumably the number of crisis
# episodes, each flagged in two years -- TODO confirm).
sum(full_om$b2) / 2

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "loans_y_gap", "loans_y", "rer_gap", "gdp_r_gap", "gr_cpi", "nx_y_gap")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b2 > 0.
dep <- factor(full_om[, "b2"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Class probabilities from the forest; no new data is supplied to predict().
# The second column is the TRUE probability (votes combined with
# normvotes=T equals type="prob").
class_prob <- predict(rf_full, type = "prob")
pred <- class_prob[, 2]

# Observed outcome column.
true <- full_om[, "b2"]

# ROC analysis with a confidence interval. The result is kept twice: as
# `roc`, and as `r_2`, the reference curve for the AUC comparisons further
# down the script.
r_2 <- roc <- roc(true, pred, ci = TRUE)





###############################################################
#												1-year horizon Post-1970 yearly DATA				#
###############################################################

# Older read of a different file, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv",
  sep = ",", dec = ".", header = TRUE
)

# Remove every column whose name contains "fliab".
fliab <- grep("fliab", names(Daten), value = TRUE)
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# Groups of column names, each collected by substring match on the remaining
# columns. Only stocks, ltrate, stir and glo enter the drop list below; the
# other groups are not referenced again in this section.
col_names <- names(Daten)
stocks <- grep("stocks", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
loans <- grep("loans", col_names, value = TRUE)
pdebt <- grep("pdebt", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`: the
# global variables are kept because they have few missings.
drops <- col_names %in% c("year", "ccode", stocks, ltrate, stir, glo)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# Half the sum of b2 over the complete rows (presumably the number of crisis
# episodes, each flagged in two years -- TODO confirm).
sum(full_om$b2) / 2

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "loans_y_gap", "loans_y", "rer_gap", "gdp_r_gap", "gr_cpi", "nx_y_gap")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b1 > 0.
dep <- factor(full_om[, "b1"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Second column of the type="prob" matrix is the TRUE probability
# (votes combined with normvotes=T equals type="prob").
pred <- predict(rf_full, type = "prob")[, 2]

# Observed outcome column.
true <- full_om[, "b1"]

# ROC analysis. The result is stored under the name `roc`; calls to roc()
# still reach the pROC function because R skips non-function objects when it
# resolves a call.
roc <- roc(true, pred, ci = TRUE)

# Sample size and AUC, both as plain doubles.
N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Bounds of the AUC confidence interval at the levels held in `ci`.
# Element 1 of a ci.auc() result is the lower bound, element 3 the upper
# bound. The 95% interval is computed once and both bounds are read from it.
ci95_bounds <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci95_lo <- ci95_bounds[1]
ci95_up <- ci95_bounds[3]
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

# Row 4 of the results table.
out[4, 2] <- round(auc, 2)

# 95% confidence interval formatted as "[lower,upper]".
cis <- paste0("[", round(ci95_lo, 2), ",", round(ci95_up, 2), "]")

out[4, 3] <- cis
out[4, 4] <- round(N, 2)
out[4, 5] <- rf_full$ntree
out[4, 6] <- rf_full$mtry
out[4, 7] <- ncol(indep)
# NOTE(review): this uses b2 although the outcome modelled in this section
# is b1 -- presumably a horizon-independent crisis count; confirm.
out[4, 8] <- floor(sum(full_om$b2) / 2)

# AUC comparison against the reference curve r_2 (DeLong test, two-sided).
testobj <- roc.test(roc, r_2, method = "delong", alternative = "two.sided")
options("scipen" = 10) # penalise scientific notation when printing numbers
options()$scipen

sig_two[1, 3] <- testobj$p.value[1]






###############################################################
#												3-year horizon Post-1970 yearly DATA													#
###############################################################

# Older read of a different file, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_y.csv",
  sep = ",", dec = ".", header = TRUE
)

# Remove every column whose name contains "fliab".
fliab <- grep("fliab", names(Daten), value = TRUE)
drops <- names(Daten) %in% fliab
Daten <- Daten[!drops]

# Groups of column names, each collected by substring match on the remaining
# columns. Only stocks, ltrate, stir and glo enter the drop list below; the
# other groups are not referenced again in this section.
col_names <- names(Daten)
stocks <- grep("stocks", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
loans <- grep("loans", col_names, value = TRUE)
pdebt <- grep("pdebt", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`: the
# global variables are kept because they have few missings.
drops <- col_names %in% c("year", "ccode", stocks, ltrate, stir, glo)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# Half the sum of b2 over the complete rows (presumably the number of crisis
# episodes, each flagged in two years -- TODO confirm).
sum(full_om$b2) / 2

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "loans_y_gap", "loans_y", "rer_gap", "gdp_r_gap", "gr_cpi", "nx_y_gap")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 2

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b3 > 0.
dep <- factor(full_om[, "b3"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Second column of the type="prob" matrix is the TRUE probability
# (votes combined with normvotes=T equals type="prob").
pred <- predict(rf_full, type = "prob")[, 2]

# Observed outcome column.
true <- full_om[, "b3"]

# ROC analysis. The result is stored under the name `roc`; calls to roc()
# still reach the pROC function because R skips non-function objects when it
# resolves a call.
roc <- roc(true, pred, ci = TRUE)

# Sample size and AUC, both as plain doubles.
N <- as.numeric(nrow(full_om))
aucs <- as.numeric(roc$auc)
auc <- aucs

# Bounds of the AUC confidence interval at the levels held in `ci`.
# Element 1 of a ci.auc() result is the lower bound, element 3 the upper
# bound. The 95% interval is computed once and both bounds are read from it.
ci95_bounds <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci95_lo <- ci95_bounds[1]
ci95_up <- ci95_bounds[3]
ci90 <- as.numeric(ci.auc(roc, conf.level = ci[3]))[1]
ci99 <- as.numeric(ci.auc(roc, conf.level = ci[1]))[1]

# Row 5 of the results table.
out[5, 2] <- round(auc, 2)

# 95% confidence interval formatted as "[lower,upper]".
cis <- paste0("[", round(ci95_lo, 2), ",", round(ci95_up, 2), "]")

out[5, 3] <- cis
out[5, 4] <- round(N, 2)
out[5, 5] <- rf_full$ntree
out[5, 6] <- rf_full$mtry
out[5, 7] <- ncol(indep)
# NOTE(review): this uses b2 although the outcome modelled in this section
# is b3 -- presumably a horizon-independent crisis count; confirm.
out[5, 8] <- floor(sum(full_om$b2) / 2)

# AUC comparison against the reference curve r_2 (DeLong test, two-sided).
testobj <- roc.test(roc, r_2, method = "delong", alternative = "two.sided")
options("scipen" = 10) # penalise scientific notation when printing numbers
options()$scipen

sig_two[1, 4] <- testobj$p.value[1]






###############################################################
#												2-year horizon Post-1970 quarterly DATA													#
###############################################################

# Older read of a different file, kept for reference:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv",
  sep = ",", dec = ".", header = TRUE
)

# Groups of column names, each collected by substring match. Only hopr, gdp,
# y, stocks, stir, ltrate, glo and fliab enter the drop list below; the
# other groups are not referenced again in this section.
col_names <- names(Daten)
hopr <- grep("hopr", col_names, value = TRUE)
gap <- grep("gap", col_names, value = TRUE)
gdp <- grep("gdp", col_names, value = TRUE)
y <- grep("_y", col_names, value = TRUE)
stocks <- grep("stocks", col_names, value = TRUE)
stir <- grep("stir", col_names, value = TRUE)
ltrate <- grep("ltrate", col_names, value = TRUE)
glo <- grep("a_", col_names, value = TRUE)
fliab <- grep("fliab", col_names, value = TRUE)
er <- grep("er", col_names, value = TRUE)
res <- grep("res", col_names, value = TRUE)
tloans <- grep("tloans", col_names, value = TRUE)
cpi <- grep("cpi", col_names, value = TRUE)

# TRUE marks a column to be removed. The "a_" columns (glo) are part of the
# drop list but are bound back on as the leading columns of `full`, so they
# survive even when their names also match another dropped pattern.
drops <- col_names %in% c("quarter", "year", "ccode", hopr, gdp, y, stocks, stir, ltrate, glo, fliab)
saves <- col_names %in% glo
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: keep only rows without missing values.
full_om <- na.omit(full)
# One eighth of the sum of b2 over the complete rows (presumably the number
# of crisis episodes, each flagged in eight quarters -- TODO confirm).
sum(full_om$b2) / 8

# SELECTION SET: b2 plus a hand-picked list of features.
sel.list <- c("b2", "tloans_r_gap", "tloans_r_gr", "res_r_gap",  "er_gap",  "a_ltrate_r_gap",  "a_gdp_r_gap", "a_gdp_r_gr", "cpi_gr")
location <- names(full) %in% sel.list # TRUE where a column of `full` is in sel.list
name.sel <- names(full[location])     # names of the selected columns
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 8

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome/indicator columns that must not be used as features.
misc.list <- c("b2", "b1", "b3", "rec1", "rec2", "rec3")

# Confidence levels (99%, 95%, 90%) and how many there are.
n.ci <- 3
ci <- c(0.99, 0.95, 0.9)
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Features: every column of full_om that is not listed in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om[!location])
indep <- full_om[name.indep]
# Dependent variable: two-level factor, TRUE where b2 > 0.
dep <- factor(full_om[, "b2"] > 0)

# Grow the forest. The x/y interface of randomForest() has no `data`
# argument (only the formula interface does), so none is passed here.
set.seed(1)
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  # Number of candidate features tried at each split: square root of the
  # feature count. The value is generally non-integer; presumably
  # randomForest rounds it internally -- TODO confirm.
  mtry = sqrt(ncol(indep)),
  # Majority vote: the class with the largest ratio of
  # (proportion of votes / cutoff (= 1/k)) wins.
  cutoff = c(1 / 2, 1 / 2),
  # Bootstrap sample as large as the training set (half that size would be
  # computationally cheaper without much loss; see Friedman & Hall, 2007).
  sampsize = nrow(full_om),
  # Fully grown trees (larger node sizes are an option to experiment with
  # against overfitting, see Segal, 2004; also see Biau et al., 2012 on
  # consistency).
  nodesize = 1
)
rf_full


# Convergence diagnostic: plot the fitted forest as line curves.
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# Class probabilities from the forest; no new data is supplied to predict().
# The second column is the TRUE probability (votes combined with
# normvotes=T equals type="prob").
class_prob <- predict(rf_full, type = "prob")
pred <- class_prob[, 2]

# Observed outcome column.
true <- full_om[, "b2"]

# ROC analysis with a confidence interval. The result is kept twice: as
# `roc`, and as a second copy under the name `r_3`.
r_3 <- roc <- roc(true, pred, ci = TRUE)





###############################################################
#												1-year horizon Post-1970 quarterly DATA													#
###############################################################

# Alternative (Windows) data location:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv",
  header = TRUE, sep = ",", dec = "."
)

# Column-name groups, collected by substring match on the column names.
# Groups that enter the drop list below:
hopr <- grep("hopr", names(Daten), value = TRUE)
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
# Groups that are collected but not referenced by the drop list:
gap <- grep("gap", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE)
res <- grep("res", names(Daten), value = TRUE)
tloans <- grep("tloans", names(Daten), value = TRUE)
cpi <- grep("cpi", names(Daten), value = TRUE)

# Logical mask over the columns of Daten: TRUE for every column to discard.
drops <- names(Daten) %in% c(
  "quarter", "year", "ccode",
  hopr, gdp, y, stocks, stir, ltrate, glo, fliab
)
saves <- names(Daten) %in% glo
# The "a_" columns are part of the drop mask, so they are put back in front of
# the remaining columns.
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: complete cases only
full_om <- na.omit(full)
# Printed diagnostic: sum of b2 divided by 8.
# NOTE(review): presumably the number of crises (eight flagged quarters per
# crisis) -- confirm.
sum(full_om$b2) / 8

# SELECTION SET: the outcome plus a hand-picked list of predictors
sel.list <- c(
  "b2", "tloans_r_gap", "tloans_r_gr", "res_r_gap", "er_gap",
  "a_ltrate_r_gap", "a_gdp_r_gap", "a_gdp_r_gr", "cpi_gr"
)
location <- names(full) %in% sel.list # which columns of full are selected
name.sel <- names(full)[location] # their names, in the column order of full
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 8

###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome / indicator columns that must never be used as features.
misc.list <- c(paste0("b", c(2, 1, 3)), paste0("rec", 1:3))

# Confidence levels for the AUC intervals (99%, 95%, 90%) and their count.
ci <- c(0.99, 0.95, 0.90)
n.ci <- 3
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Feature set: every column of full_om that is not one of the outcome /
# indicator columns named in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location] # names of features
indep <- full_om[name.indep]
# Binary outcome: TRUE wherever b1 is positive.
dep <- factor(full_om[, "b1"] > 0)

# grow trees
set.seed(1) # fixed seed so the bootstrap samples are reproducible
# The x/y interface is used, so no `data` argument is passed: it is not a
# parameter of that interface and had no effect on the fit.
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  mtry = sqrt(ncol(indep)), # square root of the number of features (randomForest rounds it to a whole number)
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(full_om), # bootstrap samples as large as the data (1/2*train would be computationally cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grow trees (experiment to avoid overfitting (see Segal, 2004)); (also see Biau et al., 2012 on consistency)
)
rf_full


# convergence diagnostic: error rates plotted against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# predict() is called without new data; column 2 of the probability matrix is
# the share of votes for the TRUE class (votes with norm.votes=TRUE equal
# type="prob").
pred <- predict(rf_full, type = "prob")[, 2]

# Observed outcome indicator used as the ROC response.
true <- full_om[, "b1"]

roc <- roc(true, pred, ci = TRUE) # ROC analysis

# Scalars reported in the table. All of them are plain length-one values, so
# no matrix coercion is needed before rounding.
N <- nrow(full_om)
auc <- as.numeric(roc$auc)

# 95% confidence interval of the AUC, computed once; elements 1 and 3 of the
# ci.auc result are the lower and upper bound.
ci95 <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci95_lo <- ci95[1]
ci95_up <- ci95[3]

# Row 6 of the results matrix
out[6, 2] <- round(auc, 2)

# confidence interval formatted as "[lower,upper]"
cis <- paste0("[", round(ci95_lo, 2), ",", round(ci95_up, 2), "]")

out[6, 3] <- cis
out[6, 4] <- N

out[6, 5] <- rf_full$ntree

out[6, 6] <- rf_full$mtry
out[6, 7] <- ncol(indep)

# Crisis count is derived from b2 although the outcome of this section is b1.
# NOTE(review): presumably b2 flags eight quarters per crisis -- confirm.
out[6, 8] <- floor(sum(full_om$b2) / 8)

# AUC comparison: two-sided DeLong test of this ROC curve against the
# reference curve r_3 created earlier in the script
testobj <- roc.test(roc, r_3, method = "delong", alternative = "two.sided")
options(scipen = 10) # prefer fixed over scientific notation for small p-values
options()$scipen

sig_two[1, 5] <- testobj$p.value[1]






###############################################################
#												3-year horizon Post-1970 quarterly DATA													#
###############################################################

# Alternative (Windows) data location:
#Daten  <- read.table("D:/Dropbox/CrisisPrediction/Data/R_class.csv", sep=",", dec=".", header=TRUE)
Daten <- read.table(
  "/Users/felixward/Dropbox/CrisisPrediction/Data/R_class_post70_q.csv",
  header = TRUE, sep = ",", dec = "."
)

# Column-name groups, collected by substring match on the column names.
# Groups that enter the drop list below:
hopr <- grep("hopr", names(Daten), value = TRUE)
gdp <- grep("gdp", names(Daten), value = TRUE)
y <- grep("_y", names(Daten), value = TRUE)
stocks <- grep("stocks", names(Daten), value = TRUE)
stir <- grep("stir", names(Daten), value = TRUE)
ltrate <- grep("ltrate", names(Daten), value = TRUE)
glo <- grep("a_", names(Daten), value = TRUE)
fliab <- grep("fliab", names(Daten), value = TRUE)
# Groups that are collected but not referenced by the drop list:
gap <- grep("gap", names(Daten), value = TRUE)
er <- grep("er", names(Daten), value = TRUE)
res <- grep("res", names(Daten), value = TRUE)
tloans <- grep("tloans", names(Daten), value = TRUE)
cpi <- grep("cpi", names(Daten), value = TRUE)

# Logical mask over the columns of Daten: TRUE for every column to discard.
drops <- names(Daten) %in% c(
  "quarter", "year", "ccode",
  hopr, gdp, y, stocks, stir, ltrate, glo, fliab
)
saves <- names(Daten) %in% glo
# The "a_" columns are part of the drop mask, so they are put back in front of
# the remaining columns.
full <- cbind(Daten[glo], Daten[!drops])

# FULL SET: complete cases only
full_om <- na.omit(full)
# Printed diagnostic: sum of b2 divided by 8.
# NOTE(review): presumably the number of crises (eight flagged quarters per
# crisis) -- confirm.
sum(full_om$b2) / 8

# SELECTION SET: the outcome plus a hand-picked list of predictors
sel.list <- c(
  "b2", "tloans_r_gap", "tloans_r_gr", "res_r_gap", "er_gap",
  "a_ltrate_r_gap", "a_gdp_r_gap", "a_gdp_r_gr", "cpi_gr"
)
location <- names(full) %in% sel.list # which columns of full are selected
name.sel <- names(full)[location] # their names, in the column order of full
sel <- full[name.sel]
sel_om <- na.omit(sel)
sum(sel_om$b2) / 8
###############################################################
#														ANALYSIS   												#
###############################################################

### CLASSIFICATION-TREE ANALYSIS
##############################################################################################################

# Outcome / indicator columns that must never be used as features.
misc.list <- c(paste0("b", c(2, 1, 3)), paste0("rec", 1:3))

# Confidence levels for the AUC intervals (99%, 95%, 90%) and their count.
ci <- c(0.99, 0.95, 0.90)
n.ci <- 3
##############################################################################################################



## RANDOM FOREST
library(randomForest)

# Feature set: every column of full_om that is not one of the outcome /
# indicator columns named in misc.list.
location <- names(full_om) %in% misc.list
name.indep <- names(full_om)[!location] # names of features
indep <- full_om[name.indep]
# Binary outcome: TRUE wherever b3 is positive.
dep <- factor(full_om[, "b3"] > 0)

# grow trees
set.seed(1) # fixed seed so the bootstrap samples are reproducible
# The x/y interface is used, so no `data` argument is passed: it is not a
# parameter of that interface and had no effect on the fit.
rf_full <- randomForest(
  indep,
  y = dep,
  ntree = 5000,
  replace = TRUE, # bootstrapping (with replacement!)
  mtry = sqrt(ncol(indep)), # square root of the number of features (randomForest rounds it to a whole number)
  cutoff = c(1 / 2, 1 / 2), # majority vote: class with maximum ratio of (prop. of votes/cutoff(=1/k)) wins
  sampsize = nrow(full_om), # bootstrap samples as large as the data (1/2*train would be computationally cheaper without much loss, see Friedman & Hall, 2007)
  nodesize = 1 # fully grow trees (experiment to avoid overfitting (see Segal, 2004)); (also see Biau et al., 2012 on consistency)
)
rf_full


# convergence diagnostic: error rates plotted against the number of trees
palette("default")
plot(rf_full, type = "l", main = "")

# OOS-analysis
library(pROC)

# predict() is called without new data; column 2 of the probability matrix is
# the share of votes for the TRUE class (votes with norm.votes=TRUE equal
# type="prob").
pred <- predict(rf_full, type = "prob")[, 2]

# Observed outcome indicator used as the ROC response.
true <- full_om[, "b3"]

roc <- roc(true, pred, ci = TRUE) # ROC analysis

# Scalars reported in the table. All of them are plain length-one values, so
# no matrix coercion is needed before rounding.
N <- nrow(full_om)
auc <- as.numeric(roc$auc)

# 95% confidence interval of the AUC, computed once; elements 1 and 3 of the
# ci.auc result are the lower and upper bound.
ci95 <- as.numeric(ci.auc(roc, conf.level = ci[2]))
ci95_lo <- ci95[1]
ci95_up <- ci95[3]

# Row 7 of the results matrix
out[7, 2] <- round(auc, 2)

# confidence interval formatted as "[lower,upper]"
cis <- paste0("[", round(ci95_lo, 2), ",", round(ci95_up, 2), "]")

out[7, 3] <- cis
out[7, 4] <- N

out[7, 5] <- rf_full$ntree

out[7, 6] <- rf_full$mtry
out[7, 7] <- ncol(indep)

# Crisis count is derived from b2 although the outcome of this section is b3.
# NOTE(review): presumably b2 flags eight quarters per crisis -- confirm.
out[7, 8] <- floor(sum(full_om$b2) / 8)

# AUC comparison: two-sided DeLong test of this ROC curve against the
# reference curve r_3 created earlier in the script
testobj <- roc.test(roc, r_3, method = "delong", alternative = "two.sided")
options(scipen = 10) # prefer fixed over scientific notation for small p-values
options()$scipen

sig_two[1, 6] <- testobj$p.value[1]


# Show the assembled results matrix and the matrix of AUC-comparison p-values
# (auto-printed when the script is run with echo / interactively).
out
sig_two

# Persist the complete workspace so the TABLES section below can be re-run
# without refitting the forests.
save.image("/Users/felixward/Dropbox/CrisisPrediction/DoFiles/CT_horizons") 


###############################################################
#															TABLES													#
###############################################################
# Restore the workspace written by save.image() above, so this section can be
# run on its own.
load("/Users/felixward/Dropbox/CrisisPrediction/DoFiles/CT_horizons")

library(xtable)

# LaTeX labels of the six robustness specifications (rows 2-7 of `out`).
model.list <- c(
  "\\vtop{\\hbox{\\strut 1-year horizon}\\hbox{\\strut Long-run sample}\\hbox{\\strut yearly}}",
  "\\vtop{\\hbox{\\strut 3-year horizon}\\hbox{\\strut Long-run sample}\\hbox{\\strut yearly}}",
  "\\vtop{\\hbox{\\strut 1-year horizon}\\hbox{\\strut Post-1970 sample}\\hbox{\\strut yearly}}",
  "\\vtop{\\hbox{\\strut 3-year horizon}\\hbox{\\strut Post-1970 sample}\\hbox{\\strut yearly}}",
  "\\vtop{\\hbox{\\strut 1-year horizon}\\hbox{\\strut Post-1970 sample}\\hbox{\\strut quarterly}}",
  "\\vtop{\\hbox{\\strut 3-year horizon}\\hbox{\\strut Post-1970 sample}\\hbox{\\strut quarterly}}"
)

# Mark every AUC whose comparison p-value is at most 0.05. Column i of sig_two
# belongs to row i + 1 of `out`. isTRUE() skips p-values that are still NA
# instead of aborting with "missing value where TRUE/FALSE needed".
for (i in seq_len(ncol(sig_two))) {
  if (isTRUE(sig_two[1, i] <= 0.05)) {
    # paste0: the marker is attached directly to the number (no separating space)
    out[i + 1, 2] <- paste0(out[i + 1, 2], "$^{\\mathsection}$")
  }
}

# First column: model labels; first row: parameter headers.
out[2:7, 1] <- model.list
param.list <- c("", "AUC", "95\\%-CI", "N", "B", "$ J_{try} $", "$ J $", "\\# of crises")
out[1, ] <- param.list

# Transpose so that models are columns and parameters are rows.
tout <- t(out)
# `align` needs one entry more than the number of columns because xtable
# reserves the first entry for the row-name column (hence the leading "l").
mat3 <- xtable(tout, align = "llcccccc", caption = "Robustness Checks", label = "tab:CT_horizons")

# Write the bare tabular to disk. Text is passed through unsanitized because
# the cells already contain LaTeX; rules are inserted manually via add.to.row.
print(
  mat3,
  type = "latex",
  file = "/Users/felixward/Dropbox/CrisisPrediction/Written/CT_horizons.txt",
  append = FALSE, # overwrite an existing file
  floating = FALSE,
  booktabs = TRUE,
  caption.placement = "top",
  hline.after = NULL,
  include.colnames = FALSE,
  include.rownames = FALSE,
  sanitize.text.function = function(x) x,
  add.to.row = list(
    pos = list(0, 1, 4, 8),
    command = c(
      " \\\\ \\cmidrule(){1-7} \\\\",
      " \\\\ \\cdashline{1-7} \\\\",
      " \\\\ \\cdashline{1-7} \\\\",
      " \\\\ \\cmidrule(){1-7} \\\\"
    )
  )
)