#2002-2012 data collection from espn webpage
#No team info
# Short team codes, in the same order as `team` below.
string <- c(
  "ari", "atl", "bal", "bos", "chc", "chw", "cin", "cle", "col", "det",
  "hou", "kc", "laa", "lad", "mia", "mil", "min", "nym", "nyy", "oak",
  "phi", "pit", "sd", "sf", "sea", "stl", "tb", "tex", "tor", "wsh"
)

# Long team labels, one per entry of `string`.
# NOTE(review): "detriot-tigers" and "oakland-atheletics" are misspelled, but
# the "oakland-atheletics" spelling is matched verbatim further down in this
# script, so the labels are deliberately left untouched here.
team <- c(
  "arizona-diamondbacks", "atlanta-braves", "baltimore-orioles",
  "boston-red-sox", "chicago-cubs", "chicago-white-sox",
  "cincinnati-reds", "cleveland-indians", "colorado-rockies",
  "detriot-tigers", "houston-astros", "kansas-city-royals",
  "los-angeles-angels", "los-angeles-dodgers", "miami-marlins",
  "milwaukee-brewers", "minnesota-twins", "new-york-mets",
  "new-york-yankees", "oakland-atheletics", "philadelphia-phillies",
  "pittsburgh-pirates", "san-diego-padres", "san-francisco-giants",
  "seattle-mariners", "st-louis-cardinals", "tampa-bay-rays",
  "texas-rangers", "toronto-blue-jays", "washington-nationals"
)

# Seasons covered by the yearly input files (kept as doubles).
year <- as.numeric(2002:2012)


# B is a data.frame holding every player row from the yearly files, stacked
# one year after another (per the original note: 30 teams over 11 years).
B <- NULL
for (season_idx in seq_along(year)) {
  # Each ~/A_<year>.Rda file restores an object named A.
  yearly_file <- paste0("~/A_", year[season_idx], ".Rda")
  load(yearly_file)
  # Stack year by year: the row names produced here are relied on below.
  B <- rbind(B, A)
}

# 28 columns: name, then AB/H/BB/OBP for each of the splits labelled 4..9,
# then pitcher, year and team.
stat_labels <- paste0(
  rep(c("AB", "H", "BB", "OBP"), times = 6),
  "(", rep(4:9, each = 4), ")"
)
colnames(B) <- c("name", stat_labels, "pitcher", "year", "team")

# Remove daggers from the player names and remember which rows carried one.
# The dagger character is not typed literally: it is lifted out of the data.
# The first row whose row name contains "13" is known (per the original
# author) to hold a name whose 13th character is the dagger.
# NOTE(review): grep("13", ...) is a substring match, so this relies on the
# intended row being the first hit -- confirm if the input files change.
probe_name <- as.character(B[grep("13", rownames(B)), 1][1])
dagger <- unlist(strsplit(probe_name, NULL))[13] # 13th character = dagger

name <- B$name
# traded: 1 where the name contained a dagger, 0 otherwise.
# NOTE(review): presumably the dagger marks traded players -- inferred from
# the variable name only.
traded <- rep(0, nrow(B))
traded[grep(dagger, name)] <- 1
name <- sub(dagger, "", name)

# Rebuild B with the cleaned names first and the traded flag appended.
B <- cbind(name, B[2:28], traded)

# Deal with distinct players who share a name: append the year of birth (YOB)
# to the name so that each physical player gets a unique label. Players are
# told apart by the team they appear with (or, for Chris Young, by the
# pitcher flag; for Henry Rodriguez, by team and year).
dB <- B
dB$name <- as.character(dB$name)

# Chris Young: the pitcher (1979) vs. the position player (1983).
dB$name[dB$name == "Chris Young" & dB$pitcher == 1] <- "Chris Young 1979"
dB$name[dB$name == "Chris Young" & dB$pitcher == 0] <- "Chris Young 1983"

# player -> YOB -> teams on which that player appears in the data.
# Team labels must match the data verbatim (including "oakland-atheletics").
yob_by_team <- list(
  "Alex Gonzalez" = list(
    "1977" = c("miami-marlins", "boston-red-sox", "cincinnati-reds",
               "toronto-blue-jays", "atlanta-braves"),
    "1973" = c("chicago-cubs", "san-diego-padres", "tampa-bay-rays",
               "philadelphia-phillies", "washington-nationals")
  ),
  "Abraham Nunez" = list(
    "1976" = c("pittsburgh-pirates", "st-louis-cardinals",
               "philadelphia-phillies", "new-york-mets"),
    "1977" = c("miami-marlins")
  ),
  "Luis Gonzalez" = list(
    "1979" = c("colorado-rockies"),
    "1967" = c("arizona-diamondbacks", "los-angeles-dodgers", "miami-marlins")
  ),
  "Ramon Castro" = list(
    "1979" = c("oakland-atheletics"),
    "1976" = c("miami-marlins", "new-york-mets", "chicago-white-sox")
  ),
  "Luis Lopez" = list(
    "1973" = c("washington-nationals"),
    "1970" = c("baltimore-orioles", "cincinnati-reds", "milwaukee-brewers")
  ),
  "Brian Anderson" = list(
    "1972" = c("kansas-city-royals"),
    "1982" = c("chicago-white-sox", "boston-red-sox")
  ),
  "Tony Pena" = list(
    "1981" = c("atlanta-braves", "kansas-city-royals"),
    "1982" = c("arizona-diamondbacks", "chicago-white-sox")
  ),
  "Jose Reyes" = list(
    "1983" = c("new-york-mets"),
    "1984" = c("chicago-cubs")
  ),
  "Ryan Braun" = list(
    "1983" = c("milwaukee-brewers"),
    "1980" = c("kansas-city-royals")
  ),
  "Edgar Gonzalez" = list(
    "1978" = c("san-diego-padres"),
    "1983" = c("arizona-diamondbacks", "oakland-atheletics",
               "colorado-rockies")
  ),
  "Chris Carter" = list(
    "1986" = c("oakland-atheletics"),
    "1982" = c("boston-red-sox", "new-york-mets")
  ),
  "Chris Carpenter" = list(
    "1985" = c("chicago-cubs"),
    "1975" = c("st-louis-cardinals")
  )
)

for (player in names(yob_by_team)) {
  for (yob in names(yob_by_team[[player]])) {
    # Rows already renamed no longer equal `player`, and the team sets of one
    # player are disjoint, so the order of the renames does not matter.
    hit <- which(
      dB$name == player & dB$team %in% yob_by_team[[player]][[yob]]
    )
    dB$name[hit] <- paste(player, yob)
  }
}

# Henry Rodriguez needs the year as well as the team: two different players
# appear under "washington-nationals" (2002 vs. after 2005).
# Fix: the Oakland rows used to be tagged 1990 (and the statement was
# duplicated). The Oakland player is the same pitcher who later shows up with
# Washington (born 1987), not the Cincinnati infielder born 1990, so those
# rows are now tagged 1987 to keep one player under one label.
is_henry <- dB$name == "Henry Rodriguez"
dB$name[is_henry & dB$team == "cincinnati-reds"] <- "Henry Rodriguez 1990"
dB$name[is_henry & dB$team == "oakland-atheletics"] <- "Henry Rodriguez 1987"
dB$name[is_henry & dB$team == "washington-nationals" & dB$year > 2005] <-
  "Henry Rodriguez 1987"
dB$name[is_henry & dB$team == "washington-nationals" & dB$year == 2002] <-
  "Henry Rodriguez 1967"

# The player changed his listed name in 2015; keep the earlier spelling so
# all of his rows share one label.
dB$name[dB$name == "Melvin Upton Jr."] <- "B.J. Upton"



# Distinct player labels, in order of first appearance in dB.
ulist <- unique(dB$name)

# Reorder dB so that all rows of a player are contiguous (players in order of
# first appearance, rows within a player in their original order) and add a
# numeric id = position of the player's name in ulist.
# Note: no player is dropped here, whatever the number of appearances.
# Vectorised with match()/order() instead of growing the data frame with
# rbind() inside a loop; order() is stable, so the row order is the same as
# the loop produced, and it also works when dB has no rows.
player_idx <- match(dB$name, ulist)
row_order <- order(player_idx)
baseball <- dB[row_order, ]
baseball$id <- as.numeric(player_idx[row_order]) # id for tracking purposes



# Persist the stacked data as a plain-text table and as an .Rda file.
panel_txt <- "~/baseball_panel_by_year.txt"
write.table(baseball, file = panel_txt)
save(baseball, file = "~/baseball_panel_by_year.Rda")

###
# Construct the panel: start again from the text file written above.
baseball <- read.table(panel_txt, header = TRUE)

# Give the 30 columns syntactic names ("AB.4" instead of "AB(4)", ...).
stat_names <- paste0(
  rep(c("AB", "H", "BB", "OBP"), times = 6),
  ".", rep(4:9, each = 4)
)
colnames(baseball) <- c(
  "name", stat_names, "pitcher", "year", "team", "traded", "id"
)


# Aggregate the six splits into two halves: columns .4/.5/.6 make up the
# first half ("s1"), columns .7/.8/.9 the second half ("s2").
baseball$AB_s1 <- with(baseball, AB.4 + AB.5 + AB.6) # first-half at bats
baseball$AB_s2 <- with(baseball, AB.7 + AB.8 + AB.9) # second-half at bats
baseball$H_s1 <- with(baseball, H.4 + H.5 + H.6)     # first-half hits
baseball$H_s2 <- with(baseball, H.7 + H.8 + H.9)     # second-half hits
baseball$BB_s1 <- with(baseball, BB.4 + BB.5 + BB.6) # first-half walks
baseball$BB_s2 <- with(baseball, BB.7 + BB.8 + BB.9) # second-half walks

# Reduced data set: name, id, year, the half-season aggregates, the pitcher
# indicator and the traded flag.
kept_cols <- c(
  "name", "id", "year",
  "AB_s1", "H_s1", "BB_s1",
  "AB_s2", "H_s2", "BB_s2",
  "pitcher", "traded"
)
baseball_r <- baseball[, kept_cols, drop = FALSE]


# doBy provides summaryBy(), used to collapse the data by year: a player can
# show up more than once in the same year (on different teams).
# library() rather than require(): a missing package stops the script here
# instead of surfacing later as "could not find function".
library(doBy)

# Sum the half-season counts within each name/id/year/pitcher combination.
bball <- summaryBy(
  AB_s1 + H_s1 + BB_s1 + AB_s2 + H_s2 + BB_s2 ~ name + id + year + pitcher,
  FUN = sum,
  data = baseball_r
)
# Restore the plain column names for the summed columns.
colnames(bball) <- c(
  "name", "id", "year", "pitcher",
  "AB_s1", "H_s1", "BB_s1", "AB_s2", "H_s2", "BB_s2"
)
bball <- bball[order(bball$id), ]

#baseball_r[grep("Chris Young",baseball_r$name),]


# Transform and trim by criteria.
# Transformed hitting average for each half: asin(sqrt((H + 0.25) / (AB + 0.5))).
bball$y1 <- with(bball, asin(sqrt((H_s1 + 0.25) / (AB_s1 + 0.5)))) # 1st half
bball$y2 <- with(bball, asin(sqrt((H_s2 + 0.25) / (AB_s2 + 0.5)))) # 2nd half

# Fewer than 10 at bats is too small for the normality assumption to hold,
# so those half-season values are blanked out.
too_few_s1 <- which(bball$AB_s1 < 10)
too_few_s2 <- which(bball$AB_s2 < 10)
bball$y1[too_few_s1] <- NA
bball$y2[too_few_s2] <- NA

# reshape provides melt(). library() rather than require(): a missing package
# stops the script here instead of failing later inside melt().
library(reshape)

# Reshape to long format: one row per player-year-half. melt() stacks all
# first-half rows, then all second-half rows; the (stable) ordering by id and
# year then interleaves the halves of each player-year.

# Transformed hitting average (y1, y2).
ydata1 <- melt(
  bball[, c("name", "id", "year", "pitcher", "y1", "y2")],
  id = c("name", "id", "year", "pitcher")
)
ydata1 <- ydata1[order(ydata1$id, ydata1$year), ]
# Which half each row came from, read off melt's `variable` column. The old
# rep(c(1, 2), ...) labelling assumed exactly one row per (id, year); it
# mislabelled rows whenever an id/year pair occurred more than once (e.g.
# with differing pitcher flags).
half_season <- ifelse(ydata1$variable == "y1", 1, 2)
ydata1 <- ydata1[, c(1, 2, 3, 4, 6)]
colnames(ydata1) <- c("name", "id", "year", "pitcher", "HA")

# At bats (AB_s1, AB_s2).
ydata2 <- melt(
  bball[, c("name", "id", "year", "AB_s1", "AB_s2")],
  id = c("name", "id", "year")
)
ydata2 <- ydata2[order(ydata2$id, ydata2$year), ][, c(1, 2, 3, 5)]
colnames(ydata2) <- c("name", "id", "year", "AB")

# Hits (H_s1, H_s2).
ydata3 <- melt(
  bball[, c("name", "id", "year", "H_s1", "H_s2")],
  id = c("name", "id", "year")
)
ydata3 <- ydata3[order(ydata3$id, ydata3$year), ][, c(1, 2, 3, 5)]
colnames(ydata3) <- c("name", "id", "year", "H")

# Walks (BB_s1, BB_s2).
ydata4 <- melt(
  bball[, c("name", "id", "year", "BB_s1", "BB_s2")],
  id = c("name", "id", "year")
)
ydata4 <- ydata4[order(ydata4$id, ydata4$year), ][, c(1, 2, 3, 5)]
colnames(ydata4) <- c("name", "id", "year", "BB")

# All four long tables were melted from the same rows and sorted with the
# same keys, so their rows line up one to one.
ydata <- ydata1
ydata$AB <- ydata2$AB
ydata$H <- ydata3$H
ydata$BB <- ydata4$BB

# 1 = first half, 2 = second half. Every player-year contributes both halves
# at this point (small-sample halves carry HA = NA).
ydata$season <- half_season
#ydata$year.season<-ydata$year+ydata$season/10 #2002.2 means 2002 2nd half season.
#ydata$year.season<-ydata$year+ydata$season/10 #2002.2 means 2002 2nd half season.

# Drop every half-season whose HA is NA, i.e. those with AB < 10.
ydata <- ydata[!is.na(ydata$HA), ]

# Sort by player and year, renumber the rows and (re)state the column names.
bball <- ydata[order(ydata$name, ydata$id, ydata$year), ]
rownames(bball) <- seq_len(nrow(bball))
colnames(bball) <- c(
  "name", "id", "year", "pitcher", "HA", "AB", "H", "BB", "season"
)

# NOTE(review): absolute, machine-specific output path; every other output of
# this script goes to "~/". Left unchanged so existing consumers keep working.
write.csv(
  bball,
  file = "/Users/jiayinggu/Documents/Research/mac/Research_Empirical_Bayes/baseball_data_panelv2/1 R routines/byYear.csv"
)

# Snapshot of the panel before trimming.
save(bball, file = "~/byYear_b4Trim.Rda")

#####
# Trim the data: keep only players that have more than 3 half-season records,
# so that a variance can be calculated for every remaining player.
load("~/byYear_b4Trim.Rda")

# Players with `min_obs` or fewer rows are removed; change the threshold here.
min_obs <- 3
obs_per_id <- table(bball$id)
sparse_ids <- as.numeric(names(obs_per_id)[as.vector(obs_per_id) <= min_obs])

# Logical filtering instead of negative indexing: when no id is sparse, the
# old bball[-arrow, ] (with an empty index vector) silently dropped every
# row. This form keeps all rows in that case. It also avoids shadowing
# base::list and growing an index vector inside a loop.
bball <- bball[!(bball$id %in% sparse_ids), ]

# Players that survived the trimming, in order of first appearance.
namelist <- unique(bball$name)

# Rename id: renumber the players 1..length(namelist) for tracking purposes.
# Rows with a missing name keep id 0.
fresh_id <- match(bball$name, namelist)
fresh_id[is.na(bball$name)] <- 0
bball$id <- as.numeric(fresh_id)

# Restate the column names, then put `season` right after `year`.
colnames(bball) <- c(
  "name", "id", "year", "pitcher", "HA", "AB", "H", "BB", "season"
)
final_cols <- c(
  "name", "id", "year", "season", "pitcher", "HA", "AB", "H", "BB"
)
bball <- bball[, final_cols]

# Write the trimmed panel in every format used downstream.
write.csv(bball, file = "~/bball>3_ByYear.csv")
save(bball, file = "~/bball>3.Rda")
write.table(bball, file = "~/bball>3.txt")

# namelist (the list of players) is unchanged by the renumbering above.

save(bball, file = "~/bball.Rda")
