############################################################################################################
# SCRIPT TO IMPLEMENT THE CHOW-LIN (1971) PROCEDURE TO INTERPOLATE, DISTRIBUTE AND EXTRAPOLATE TIME SERIES #
# by Christian Schoder                                                                                     #
# Department of Economics, The New School for Social Research, New York                                    #
# Version 12/24/2011                                                                                       #
# Please send comments to schoc152@newschool.edu                                                           #
############################################################################################################

############################################################################################################
# STEP 0: Make sure the following preconditions are met                                                    #
############################################################################################################
## Make sure that the R working folder is the folder of this script
## Make sure you have read and understood the article by Chow-Lin (1971)

############################################################################################################
# STEP 1: Important definitions and functions for later reference                                          #
############################################################################################################
## Interpolation ("i") or distribution ("d")? Assign the appropriate value. Default is "i". 
problem <- "i" 
## Do you want to add a time trend to the matrix of explanatory variables? "y" for yes, "n" for no. Default is "y".
timetrend <- "y"
## If you have to extrapolate a series, specify here how many observations the extrapolated part of the estimated series (with the high frequency) will include, otherwise assign 0. Default is 0.
m<-14
## If you have to extrapolate a series, specify here if the extrapolated part is at the beginning ("b") or end ("e") of the series. Default is "b". 
extrapolate<-"e"
## Function to solve for solution of polynomial
findroot<-function(x) {
X<-rep(1,(n*f)^2)
dim(X)<-c(n*f,n*f)
for (j in 1:(n*f)) {for (i in 1:(n*f)) {X[j,i]<-x^abs(i-j)}}
sum(X[1:f,(f+1):(2*f)])/sum(X[1:f,1:f])-q
}
## The following is needed for the loop over multiple series
y.complete.new<-seq(1,n*f+m)

for (k in 2:16) {
############################################################################################################
# STEP 2: Importing the data series                                                                        #
############################################################################################################
## Note that there must not be missing values in the series. 
## We use the notation of Chow and Lin's article. Hence, a dot indicates the series with low frequency. 
## Obviously, the references to the imported files have to be updated.
## The data files should not include constants which will be added later nor a time trend which may be added later.
## It is assumed that the data files include the observation index in the first column which will be excluded.
## If stock data are used, they should be end-of-the-year values.
## Note that the low-frequency explanatory variables must match the number of observations of the low-frequency dependent variable even in the case of extrapolation. 
dat_y. <- read.table("dat_y_dot.csv", header=TRUE, sep=",")[,k] #importing the low-frequency dependent variable
dat_x <- read.table("dat_x.csv", header=TRUE, sep=",")[,k] #importing the high-frequency explanatory variable(s) (for the period to interpolate/distribute)
dat_x. <- read.table("dat_x_dot.csv", header=TRUE, sep=",")[,k] #importing the low-frequency explanatory variable(s) (for the period to interpolate/distribute)
if (m>0) {dat_e.x <- read.table("dat_e_x.csv", header=TRUE, sep=",")[,k]} else {} #importing the high-frequency explanatory variable(s) (for the period to extrapolate)

############################################################################################################
# STEP 3: Some more definitions                                                                            #
############################################################################################################
## Number of observations of the low-frequency series
n<-length(dat_y.)
## Frequency of the high-frequency series assuming that frequency of the low-frequency series is 1
f<-dim(read.table("dat_x.csv", header=TRUE, sep=","))[1]/length(dat_y.) 
## Create the matrix transforming the high-frequency data into low-freqency data. Note that the form of the matrix is different for the interpolation and distribution problem.
c<-rep(0,n*n*f); dim(c)<-c(n,n*f)
if (problem=="d") {for (j in 0:(f-1)) {for (i in 1:n) {c[i,f*i-j]=1/f}}} else {for (i in 1:n) {c[i,f*i-(f-1)]=1}}
## Create an empty y-vector to be filled later with the estimates, an empty matrix which will be the variance-covariance matrix for interpolation/distribution as well as an empty matrix which will be the variance-covariance matrix for extrapolation.
y<-rep(0,(n*f+m)); dim(y)<-c(n*f+m,1)
A<-rep(1,(n*f)^2); dim(A)<-c(n*f,n*f)
if (m>0) {B<-rep(1,m*n*f); dim(B)<-c(m,n*f)} else {}
## Create the known data matrices/vectors for the imported data and add a constant to the matrix/vector of exogenous variables.
x <- cbind(rep(1, times=n*f), dat_x)
x. <- cbind(rep(1, times=n), dat_x.)
y. <- dat_y.; dim(y.)=c(n,1)
if (m>0) {e.x <- cbind(rep(1, times=m), dat_e.x)} else {}

############################################################################################################
# STEP 4: Adds a time trend, but only if you indicated that in Step 1.                                     #
############################################################################################################
if (timetrend=="n") {} else {
  if (m>0) {
    if (extrapolate=="e") {
      x <- cbind(x,seq(1,n*f))
      x. <- cbind(x.,seq(1,n)*f)
      e.x <- cbind(e.x,n*f+seq(1,m))
    } else {
      x <- cbind(x,m+seq(1,n*f))
      x. <- cbind(x.,m+seq(1,n)*f)
      e.x <- cbind(e.x,seq(1,m))
    }
  } else {
    x <- cbind(x,seq(1,n*f))
    x. <- cbind(x.,seq(1,n)*f)
  }
}

############################################################################################################
# STEP 5: Estimation                                                                                       #
############################################################################################################
## It is assumed that the disturbance terms follow a first-order autoregressive process.
## Note that the lower and upper bound of the uniroot function may have to be adjusted such that the polynomial at these bounds has values of opposite sign.
q0<- 0.4 #define start value
q<- 0.5 #define start value
while (abs(q-q0)>0.0001) {
q0<-q
if (problem=="d") {
  a<-uniroot(findroot, lower = -1.2, upper = 1.2)$root # only for distribution
} else {
  a<-abs(q)^(1/f) # only for interpolations
}
for (j in 1:(n*f)) {for (i in 1:(n*f)) {A[j,i]<-a^abs(i-j)}} #we can use A instead of v_q because the constant term of the matrices drop out!!! 
beta<- solve( t(x.) %*% solve( c %*% A %*% t(c) ) %*% t(t(x.)) ) %*% (t(x.) %*% solve( c %*% A %*% t(c)  ) %*% y.)
u. <- y. - t(t(x.)) %*% beta 
q<-acf(u.,plot=FALSE)$acf[2,1,1]
}
## Using the estimated a, we can compute B for extrapolation.
if (m>0) {for (j in 0:m-1) {for (i in 0:(n*f-1)) {B[j+1,i+1]<-a^abs(n*f-i+j)}}} else {}

############################################################################################################
# STEP 6: Computing the interpolated/distributed/extrapolated series and data export                       #
############################################################################################################
## Note that the resulting series has to be devided by 4 if the dependent variable is a flow and the explanatory variables are stocks. 
y <- t(t(x)) %*% beta + A %*% t(c) %*% solve(c %*% A %*% t(c) ) %*% u.
if (m>0) {
  if (extrapolate=="e") {
    e.y <- t(t(e.x)) %*% beta + B %*% t(c) %*% solve(c %*% A %*% t(c) ) %*% u.
    y.complete <- rbind(y,e.y)
  } else {
    e.y <- t(t(e.x)) %*% beta + B %*% t(c) %*% solve(c %*% A %*% t(c) ) %*% u.
    y.complete <- rbind(e.y,y)
  }
} else {
  y.complete <- y
}
y.complete.new<-cbind(y.complete.new,y.complete)
write.csv(y.complete.new, file="dat_y.csv")
}

############################################################################################################
# STEP 7: Plot low-frequency and estimated high-frequency series to check for errors                       #
############################################################################################################
plot(ts(y.complete,start=1,freq=1),col="red",ylab="")
if (extrapolate=="e") {lines(ts(y.,start=f,freq=1/f),col="blue",ylab="")} else {lines(ts(y.,start=m+f,freq=1/f),col="blue",ylab="")}
legend("topleft",bty="n", legend=c("low-frequency series","estimated high-frequency series"), lty=c(1,1), col=c("blue", "red"))


