# Comparison: Prediction
# Dec 18, 2015
# Ref: ~/Projects/DP/baseball_compare

 
source("data_select2012.R")

source("profl_fun.R")
source("uvTweedie_WGLVmix.R") # Bayes rule (Tweedie formula) for u or v (marginal out v or u)
source("uvTweedie_WTLVmix.R") # Bayes rule for u or v (independence prior)
source("uTweedie_Bmix.R") # Bayes rule (Tweedie formula) for u or v (marginal out v or u)

load("joint_np.Rda")

fo = fuv2 #KW WGLVmix mixture with no covariates
fi = WTLVmix(C$HA,C$id,4*C$AB, rtol = 1e-10) # KW WTLVmix mixture (indepedence prior)
fb <- Bmix(bballt$Ht, bballt$ABt, m = 3000, rtol = 1e-10) # Binomial mixture (aggregated individual)


load("joint_with_age_lag.Rda")

whichmax <- arrayInd(which.max(pl2),dim(pl2))
b1l <- bs2[,1][whichmax[1]]
b2l <- bs2[,2][whichmax[2]]
rho <- rhos[whichmax[3]]
lag = C$lag
y_lag = ha - rho * lag - b1l * (age - 30)/100 - b2l*(age-30)^2/10000
f_age_lag <- WTLVmix(y_lag,id,w,rtol = 1e-10)

load("joint_with_age.Rda")

whichmax <- arrayInd(which.max(pl2),dim(pl2))
b1 <- bs2[,1][whichmax[1]]
b2 <- bs2[,2][whichmax[2]]
load("joint_with_age_lag.Rda")  # to get rid of 2002 data so that length of ha agrees with joint_with_age_lag
yage = ha - b1 * (age - 30)/100 - b2*(age-30)^2/10000
fage <- WTLVmix(yage,id,w,rtol = 1e-10)   #with age covariates, independece prior

# another model (see "joint_with_lag_AB.Rda")
y_lag2 = ha - 0.9 * lag - 0.25* C$AB/1000
f_age_lag2 <- WTLVmix(y_lag2, id, w, rtol = 1e-10)

bballt2012$HA <- asin(sqrt((bballt2012$H+0.25)/(bballt2012$AB + 0.5)))   #2012 realized transformed batting average
sub1 <- bballt$id %in% bballt2012$id  # excludes players not in 2012
sub2 <- bballt2012$id %in% bballt$id  # excludes players only appear in 2012



# WGLVmix model without covariates
NLVp <- uvTweedie_WGLVmix(fo, ha, id, w)
NLVpu <- NLVp$u
NLVpv <- NLVp$v


# WTLVmix model without covariates
TLVp <- uvTweedie_WTLVmix(fi,ha,id,w)
TLVpu <- TLVp$u
TLVpv <- TLVp$v

# WTLVmix model with age
TLVpage <- uvTweedie_WTLVmix(fage,yage,id,w)
TLVpageu <- TLVpage$u
TLVpagev <- TLVpage$v

# WTLVmix model with age and lag
TLVplag <- uvTweedie_WTLVmix(f_age_lag,y_lag,id,w)
TLVplagu <- TLVplag$u
TLVplagv <- TLVplag$v

# WTLVmix model with  lag and AB
TLVpAB <- uvTweedie_WTLVmix(f_age_lag2,y_lag2,id,w)
TLVpABu <- TLVpAB$u
TLVpABv <- TLVpAB$v
# Binomial mixture 
Bmixp <- uTweedie_Bmix(fb$x, fb$y,bballt$Htotal, bballt$ABtotal)$u


# DP mixture 
load("DP0.Rda")
load("DP1.Rda")
load("DP2.Rda")
#Prediction from DP (take predicted density as mixing and use Bayes rule for Bmix) 
DP0p <- uTweedie_Bmix(DP0$grid,DP0$densp.m,bballt$Htotal,bballt$ABtotal)$u
DP1p <- uTweedie_Bmix(DP1$grid,DP1$densp.m,bballt$Htotal,bballt$ABtotal)$u
DP2p <- uTweedie_Bmix(DP2$grid,DP2$densp.m,bballt$Htotal,bballt$ABtotal)$u

# GL without theta heterogeneity, try w/o covariates, age , age + lag
source("uTweedie_GLmix.R")
load("GL_with_age.Rda")

# without covariates
 wsum = tapply(w,id,"sum")
       t = tapply(w * ha, id, "sum")/wsum
       fGL <- GLmix(t, sigma = sqrt(1/wsum), rtol = 1e-10)
fGLu = uTweedie_GLmix(fGL,t,sig= sqrt(1/wsum))$u

# with age covariates 
whichmax <- arrayInd(which.max(pl2),dim(pl2))
b1GLa <- bs2[,1][whichmax[1]]
b2GLa <- bs2[,2][whichmax[2]]
load("GL_with_age_lag.Rda")  # to get rid of 2002 data so that length of ha agrees with GL_with_age_lag
yGL = ha - b1GLa * (age - 30)/100 - b2GLa *(age - 30)^2/10000
tGLa = tapply(w * yGL, id, "sum")/wsum
        fGLa <- GLmix(tGLa, sigma = sqrt(1/wsum), rtol = 1e-10)
fGLau = uTweedie_GLmix(fGLa, tGLa, sig = sqrt(1/wsum))$u

# with age + Lag
load("GL_with_age_lag.Rda")
whichmax <- arrayInd(which.max(pl2),dim(pl2))
b1GLal <- bs2[,1][whichmax[1]]
b2GLal <- bs2[,2][whichmax[2]]
rhoGL <- rhos[whichmax[3]]
lag = C$lag
y_GLal = ha - rhoGL * lag - b1GLal * (age - 30)/100 - b2GLal*(age-30)^2/10000
tGLal = tapply(w * y_GLal, id, "sum")/wsum

fGLal <- GLmix(tGLal,sigma = sqrt(1/wsum), rtol = 1e-10)
fGLalu = uTweedie_GLmix(fGLal, tGLal, sig = sqrt(1/wsum))$u

# no mixture 


# Merge 2012 player set with those that are involved in estimation


T <- as.data.frame(cbind(bballt$name, bballt$id, NLVpu, NLVpv, TLVpu, TLVpv, Bmixp,DP0p,DP1p,DP2p, TLVpageu, TLVpagev, TLVplagu, TLVplagv, TLVpABu, TLVpABv, fGLu, fGLau, fGLalu ))
names(T) <- c("name","id", "NLVpu","NLVpv","TLVpu","TLVpv","Bmixp","DP0p","DP1p","DP2p","TLVxpu", "TLVxpv", "TLVlagpu","TLVlagpv", "TLVpABu","TLVpABv", "GL","GLa","GLal")

C11 = C[C$year==2011,]
C2011 <- as.data.frame(cbind(unique(C11$id), tapply(C11$HA, C11$id,"mean"),tapply(C11$AB,C11$id,"sum"),tapply(C11$H,C11$id,"sum")))
names(C2011) <- c("id", "HA2011","AB2011","H2011")
C2011$name = unique(C11$name)
T1 = merge(C2011, T, by.x = "id", by.y = "id")
T1 = subset(T1, select = -c(name.y))
names(T1) <- c("id","HA2011","AB2011","H2011","name","NLVpu","NLVpv","TLVpu","TLVpv","Bmixp","DP0p","DP1p","DP2p","TLVxpu", "TLVxpv", "TLVlagpu","TLVlagpv", "TLVpABu","TLVpABv","GL","GLa","GLal")


final <- merge(bballt2012,T1,by.x = "id", by.y = "id")
final <- subset(final, select = -c(name.y))
names(final) <- c("id","AB2012","H2012","walks2012","pitcher","age","name","HA","HA2011","AB2011","H2011","NLVpu","NLVpv","TLVpu",
"TLVpv","Bmixp","DP0p","DP1p","DP2p","TLVxpu", "TLVxpv", "TLVlagpu","TLVlagpv", "TLVpABu","TLVpABv","GL","GLa","GLal")



# compute the prediction (back to binomial p scale) w or w/o covariates effect accounted for (!! NLVpu is a factor, so be careful when converting it back to numerics)
final$NLVpu <- as.numeric(paste(final$NLVpu))  # WGLVmix w/o x

final$NLVpub <- sin(final$NLVpu)^2  # WGLVmix w/o x in p scale 
final$Bmixp <- as.numeric(paste(final$Bmixp))   # Bmix w/o x
final$p2012 <- final$H2012/final$AB2012  #realization
final$HA2012 <- asin(sqrt((final$H2012+0.25)/(final$AB2012+0.5)))
final$TLVpu <- as.numeric(paste(final$TLVpu))   # WTLVmix w/o x

final$TLVpub <- sin(final$TLVpu)^2   # WTLVmix w/o x in p scale
final$DP0p <- as.numeric(paste(final$DP0p))  # DP alpha = 10
final$DP1p <- as.numeric(paste(final$DP1p))	 # DP alpha = 1
final$DP2p <- as.numeric(paste(final$DP2p))  # DP alpha = 0.01


final$TLVxpu <- as.numeric(paste(final$TLVxpu)) + b1 * (final$age - 30)/100 + b2*(final$age-30)^2/10000 # WTLVmix w x 
final$TLVxpub <- sin(final$TLVxpu)^2   # WTLVmix w x in p scale



final$GL<- as.numeric(paste(final$GL))
final$GLp <- sin(final$GL)^2
final$GLa <- as.numeric(paste(final$GLa))  + b1GLa * (final$age - 30)/100 + b2GLa * (final$age - 30)^2/10000
final$GLap <- sin(final$GLa)^2


lag2011 = mean(C[C$year==2011,]$HA)
final$TLVlagpu <- as.numeric(paste(final$TLVlagpu)) + rho * lag2011 + b1l * (final$age - 30)/100 + b2l*(final$age-30)^2/10000
final$TLVlagpub <- sin(final$TLVlagpu)^2

final$TLVpABu <- as.numeric(paste(final$TLVpABu)) + 0.9 * lag2011 + 0.25 * final$AB2012/1000
final$TLVpABub <- sin(final$TLVpABu)^2

final$GLal <- as.numeric(paste(final$GLal))  + rhoGL * lag2011 + b1GLal * (final$age - 30)/100 + b2GLal * (final$age - 30)^2/10000
final$GLalp <- sin(final$GLal)^2

load("GL0_age_lag.Rda")
agemodel = coefficients(lsmodel1)
agelagmodel = coefficients(lsmodel2)

final$GL0a <- agemodel[1] + agemodel[2] * (final$age - 30)/100 + agemodel[3] * (final$age - 30)^2/10000
final$GL0al <- agelagmodel[1] + agelagmodel[2] * lag2011 + agelagmodel[3] * (final$age - 30)/100 + agelagmodel[4] * (final$age - 30)^2/10000
final$GL0pa <- sin(final$GL0a)^2
final$GL0pal <- sin(final$GL0al)^2

#pre <- merge(bballt, bballt2012, by.x = "id", by.y = "id")
#final$naive0 <- pre$Htotal/pre$ABtotal  #Naive: historical average (not so naive actually!)

SSE = function(a,b,AB){
	sum((a-b)^2*(4*AB))}

TSE <- function(a,b, AB){
	sum((a - b)^2 - 1/(4 * AB))
	}

LSModel_TSE = c(TSE(final$TLVpu, final$HA2012,final$AB2012), TSE(final$NLVpu, final$HA2012,final$AB2012), TSE(final$TLVxpu, final$HA2012,final$AB2012), TSE(final$TLVlagpu,final$HA2012,final$AB2012), TSE(final$TLVpABu,final$HA2012,final$AB2012), TSE(final$HA2011,final$HA2012,final$AB2012), TSE(lag2011,final$HA2012,final$AB2012))
names(LSModel_TSE) = c("WTLV", "WGLV", "WTLVage", "WTLVlag", "WTLVAB", "HA", "Lag")

LModel_TSE = c( TSE(final$GL, final$HA2012,final$AB2012), TSE(final$GLa, final$HA2012,final$AB2012), TSE(final$GLal, final$HA2012, final$AB2012),TSE(final$GL0a, final$HA2012, final$AB2012),TSE(final$GL0al, final$HA2012, final$AB2012))
names(LModel_TSE) = c( "GL", "GLa","GLal", "GL0a", "GL0al")
print(LSModel_TSE)
print(LModel_TSE)

TSEp <- function(a,b,n) sum((a-b)^2 - b*(1-b)/n)

lagp2011 = sum(C[C$year==2011,]$H)/sum(C[C$year==2011,]$AB)
BModel_TSEp = c(TSEp(final$TLVpub,final$p2012,final$AB2012),TSEp(final$NLVpub,final$p2012,final$AB2012),TSEp(final$TLVxpub, final$p2012,final$AB2012), TSEp(final$TLVlagpub,final$p2012,final$AB2012),TSEp((final$H2011/final$AB2011),final$p2012,final$AB2012),TSEp(lagp2011,final$p2012,final$AB2012),TSEp(final$GLp,final$p2012,final$AB2012),TSEp(final$GLap,final$p2012,final$AB2012),TSEp(final$GLalp,final$p2012,final$AB2012),TSEp(final$GL0pa,final$p2012,final$AB2012),TSEp(final$GL0pal,final$p2012,final$AB2012),TSEp(final$Bmixp, final$p2012,final$AB2012), TSEp(final$DP0p,final$p2012,final$AB2012), TSEp(final$DP1p,final$p2012,final$AB2012), TSEp(final$DP2p,final$p2012,final$AB2012))
names(BModel_TSEp) = c("WTLV","WGLV","WTLVage","WTLVlag","Naive","Lag","GL","GLa","GLal","GL0a", "GL0al","Bmix","DP0","DP1","DP2")
print(BModel_TSEp)


LSModel_SSE = c(SSE(final$TLVpu, final$HA2012,final$AB2012), SSE(final$NLVpu, final$HA2012,final$AB2012), SSE(final$TLVxpu, final$HA2012,final$AB2012), SSE(final$TLVlagpu,final$HA2012,final$AB2012), SSE(final$TLVpABu,final$HA2012,final$AB2012), SSE(final$HA2011,final$HA2012,final$AB2012), SSE(lag2011,final$HA2012,final$AB2012))
names(LSModel_SSE) = c("WTLV", "WGLV", "WTLVage", "WTLVlag", "WTLVAB", "HA", "Lag")

LModel_SSE = c( SSE(final$GL, final$HA2012,final$AB2012), SSE(final$GLa, final$HA2012,final$AB2012), SSE(final$GLal, final$HA2012, final$AB2012),SSE(final$GL0a, final$HA2012, final$AB2012),SSE(final$GL0al, final$HA2012, final$AB2012))
names(LModel_SSE) = c( "GL", "GLa","GLal", "GL0a", "GL0al")
print(LSModel_SSE)
print(LModel_SSE)




