## Accompanying materials to the Online Appendix to
## "Private returns to R\&D in the presence of spillovers, revisited"
## Forthcoming, Journal of Applied Econometrics
## Giovanni Millo, September 14th 2018

## Exponent of cross-sectional dependence

## this Version 5: done with unbalancedness in the main body,
## further address the unbalancedness problem in spatial errors
## (principal components) by choosing whether to fail (default),
## balance by discarding, or filling in through the DINEOF method

## hyp: missing data only affect T, no individuals are lost

cdexp <- function(x, a.size=0.05, persistent=FALSE, p=NULL,
                  spat.err=FALSE, vf.method=c("ar", "nw"),
                  pcomps=4, balance=c("fail", "reduce",
                                "interpolate")) {

    n <- pdim(x)$nT$n
    t. <- pdim(x)$nT$T

    if(is.null(p)) p <- ceiling(t.^(1/3))

    ## make x in matrix form, time in rows:
    datx <- cbind(as.numeric(x), attr(x, "index"))
    dimnames(datx)[[2]] <- c("x", "id", "time")
    meltx <- melt(datx, id=c("id","time"))
    mx <- cast(meltx, time ~ id)
    mx <- as.matrix(mx)

    xbar.t <- between(x, effect="time", na.rm=TRUE)

    ## calc. (uncorrected) alfa.hat
    a. <- 1 + (1/2)*(log(var(xbar.t, na.rm=TRUE))/log(n))         # this is alfa.hat in (14)


    ## now for the correction leading to alfa.tilde:

    xbar <- xbar.t/sd(xbar.t, na.rm=TRUE)
    xbar.std <- (xbar.t - mean(xbar.t, na.rm=TRUE))/sd(xbar.t, na.rm=TRUE)

    ## est. standardized factor loadings by regressing each x_i
    ## on xbar

    #c.avg <- tapply(x, attr(x, "index")[[1]],
    #                FUN=function(y) coef(lm(y~xbar-1)))

    c.avg <- apply(mx, 2, FUN=function(y) coef(lm(y~xbar-1)))

    ## NB estimates on x.t/sd(x.t), not demeaned, and without intercept

    ## Estimate s.hat in either way, depending on whether there is spatial correlation in u
    ## case 1: spherical remainder errors

    ## est. non-standardized factor loadings by regressing each x_i
    ## on xbar.t

    #c.avg.2 <- tapply(x, attr(x, "index")[[1]],
    #                  FUN=function(y) coef(lm(y~xbar.t-1)))

    c.avg.2 <- apply(mx, 2, FUN=function(y) coef(lm(y~xbar.t-1)))

    ## calc. residuals from xs regression on stdz. "factors"
    ## (u in paper)
    #e <- tapply(x, attr(x, "index")[[1]],
    #            FUN=function(y) resid(lm(y~xbar-1)))

    e <- apply(mx, 2, FUN=function(y) resid(lm(y~xbar-1))) # NB e is a list

    ## ...and variances sigma2_j...

    if(is.matrix(e)) {
      v.e <- apply(e, 2, var, na.rm=TRUE)
    } else {
      v.e <- lapply(e, var, na.rm=TRUE)
    }

    ## ...and mean variance sigma2bar_N
    s.hat <- mean(unlist(v.e), na.rm=TRUE)

    ## now use either s.hat or s.hat.pc as an estimator of the errors variance
    if(spat.err) {

        ## case 2: spatially correlated errors

        ## check balancednmess and in case unbal. choose what to do
        if(pdim(x)$balanced == FALSE) {
            switch(match.arg(balance),
                   fail={stop("Spatial errors correction unavailable for unbalanced panels unless selecting a balancing method ('reduce' or 'interpolate')")},
                   reduce={mx0 <- mx[ , !is.na(apply(mx, 2, sum))]},
                   interpolate={
                       print("Interpolating missing data by DINEOF method")
                       mx0 <- dineof(mx)$Xa})
        } else {
            mx0 <- mx
        }

        ## extract the first pcomps (default: 4) PCs
        ## regress x on them to estimate stdzd. cross-sectional coefs (loadings)
        pc <- prcomp(t(mx0), center=F, scale=F)$rotation[,1:pcomps]
        c.avg.2.pc <- solve(crossprod(pc), crossprod(pc, mx0))

        ## calc. residuals from xs regression (u in paper), here: on PCs
        e.pc <- mx0 - pc %*% c.avg.2.pc

        ## estimator \tilde{c_N} as in Eq. (30-31), BKP JAE paper:
        ## N * the variance of the cross-sectional averages of residuals
        ## from the regression of x on PC1, PC2, ... , PC_pcomps
        e.bar.pc <- apply(e.pc, 1, mean)
        s.hat.pc <- n * mean((e.bar.pc - mean(e.bar.pc))^2)

        s.hat <- s.hat.pc
    }

    a.tilde <- a. - (1/2) * (s.hat / (n * log(n) *
                                      var(xbar.t, na.rm=TRUE)))
                                        # this is alfa.tilde in (15)

    ## now for the calculation of alfa.thrtilde, correcting for significant loadings:
    ## estimate significance of factor loadings

    ## regress each x_i on xs averages xbar.t, with intercept
    ## (TODO: do regressions once, then extract useful terms)

    #coefx <- apply(mx, 2, FUN=function(y) coef(lm(y~xbar.t)))
    #residx <- apply(mx, 2, FUN=function(y) resid(lm(y~xbar.t)))
                                        # unneeded!
    t.test <- apply(mx, 2, FUN=function(y) coef(summary(lm(y~xbar.t)))[2,3])

    ## find (decreasing) order of significance for each loading
                                        #load.order <- order(unlist(abs(t.test)), decreasing=TRUE)
    ## multiple-test for significance in order of t-test
    load.test <- cbind(abs(t.test), 1:n,
                       rep(0,n))[order(abs(t.test), decreasing=TRUE),]
    dimnames(load.test)[[2]] <- c("ttest","orig.i","size")
    for(j in 1:n) {
        p.n <- a.size/(n-j+1)
        theta <- qnorm(1-p.n/2)
        if(load.test[j,"ttest"] > theta) {
            load.test[j,"size"] <- 1
        }
    }

    ## if none is significant:
    if(sum(load.test[,"size"])==0) {
        musqr.thr <- 1
    } else {
        ## in original data, discard rows where size=0:

        ## make dataframe from pseries
        pdx <- cbind(as.numeric(x), attr(x, "index"))
        ## select set of indices to 'keep' (order doesn't count)
        keep.i <- load.test[load.test[,"size"]==1, "orig.i"]
        ## prune
        x.str <- pdx[pdx[,2] %in% unique(pdx[,2])[keep.i],]

        xstr.bar <- tapply(x.str[,1], x.str[,3], mean,
                           na.rm=TRUE)
        ## deviance of factors for dataset restricted to
        ## individuals with significant loadings only:
        musqr.thr <- mean((xstr.bar-mean(xstr.bar, na.rm=TRUE))^2,
                          na.rm=TRUE)
    }

    a.thrtilde <- a.tilde - (1/2)*(log(musqr.thr)/log(n))

    ## now make variance omega (Theorem 3)

    ## TODO in final version, make either ggg.o or ggg.t depending on choice of alfa

    ggg.o <- floor(n^a.tilde) # this is N^alfa
    ## reduce to admissible range
    if(ggg.o > n) ggg.o <- n
    if(ggg.o < 1) ggg.o <- 1
    ## order individual coefs according to abs value
    c.avg.sel <- cbind(matrix(c.avg, ncol=1),
                       matrix(abs(c.avg), ncol=1))[order(abs(c.avg),
                                                         decreasing=TRUE), ]
    ## ...and select the first N^alfa ones
    c.avg.sel <- c.avg.sel[1:ggg.o, 1]
    m.c.avg.sel <- mean(c.avg.sel)

    frasel <- (c.avg.sel - m.c.avg.sel)^2
    s.frasel <- sum(frasel)

    ## same for a.thrtilde
    ggg.t <- floor(n^a.thrtilde) # this is N^alfa
    ## reduce to admissible range
    if(ggg.t > n) ggg.t <- n
    if(ggg.t < 1) ggg.t <- 1
    ## order individual coefs according to abs value
    c.avg.selt <- cbind(matrix(c.avg, ncol=1),  # same as before, reuse!
                        matrix(abs(c.avg), ncol=1))[order(abs(c.avg),
                                                          decreasing=TRUE), ]
    ## ...and select the first N^alfa ones
    c.avg.selt <- c.avg.selt[1:ggg.t, 1]
    m.c.avg.selt <- mean(c.avg.selt)

    ## this is now the deviance of first N^alfa loadings,
    ## the denominator of (21)
    fraselt <- (c.avg.selt - m.c.avg.selt)^2
    s.fraselt <- sum(fraselt)

    ## estimate factor variance Vf2 under either uncorrelated or persistent factors

    if(persistent) {

        ## estimate Vf2.q as in (38), p.18
        x.tilde <- xbar   # xs averages divided by their SD, already calc. above
        z.t <- (x.tilde - mean(x.tilde, na.rm=TRUE))^2 # squares of centered x.tilde
        z.tilde <- z.t - mean(z.t, na.rm=TRUE)

        ## matrix of lagged, squared and standardized XS averages
        z.tilde.lag <- matrix(0, nrow=length(z.tilde), ncol=p) # is TxP
        for(i in 1:p) {
            z.tilde.lag[,i] <- c(rep(0, i), # NB uses zeros for NA
                                 z.tilde[1:(length(z.tilde)-i)])
        }
        v.1 <- cbind(z.tilde, z.tilde.lag)[(p+1):t., ]
        ## prune NAs
        v.1 <- v.1[!is.na(apply(v.1, 1, sum)), ]
        dv <- v.1[,1,drop=FALSE]
        rhs <- v.1[,2:(p+1), drop=FALSE]
        ## regress z.tilde (squared "factors") on lags 1:p
        b <- solve(crossprod(rhs), crossprod(rhs, dv))
        ## calc residuals and use them in variance calc.
        e.nw <- dv - rhs %*% b  # w/o intercept (makes very little diff, as standardized)
        sse.nw <- crossprod(e.nw)
        sig2.nw <- sse.nw/(length(e.nw) - dim(rhs)[[2]])
        Vf2.q <- sig2.nw/(1 - sum(b))^2

        ## alt.: estimate Vf2 as in (22), Newey-West-like method
        ## notice: the s_t and s_(t-j) terms in (22) are equal to z_t etc. in the
        ## eqs. after (38) on page 18

        Vf2.q.alt <- mean(z.tilde^2, na.rm=TRUE)
        for(j in 1:p) {
            Vf2.q.alt <- Vf2.q.alt + mean(v.1[,1] * v.1[,j+1])

        }

        ## lastly, set Vf2
        switch(match.arg(vf.method),
               ar={Vf2 <- Vf2.q},
               nw={Vf2 <- Vf2.q.alt})

    } else {

        ## estimate Vf2 as in (38), p.18
        Vf2 <- mean(xbar.std^4, na.rm=TRUE) -1

    }

    ## confidence intervals for alfa.
    ## here: a.thirtilde

    alfa.star <- a.thrtilde
    minNaT <- min(n^(alfa.star), t.)
    deviance.sign.loadings <- s.fraselt
    omega <- (minNaT/t.) * Vf2 +
        4*(minNaT/(n^alfa.star))*
        (deviance.sign.loadings/(n^(alfa.star) - 1))
    crit.val <- qnorm(a.size/2, lower.tail=FALSE)
    confb <- crit.val * 1/sqrt(t.) * (1/(2*log(n))) * sqrt(omega)

    ## return:
    return(list(
        a.tilde=a.tilde,
        a.thrtilde=a.thrtilde,
        Vf2=Vf2,
        ggg.o=ggg.o,
        s.frasel=s.frasel,
        ggg.t=ggg.t,
        s.fraselt=s.fraselt,
        musqr.thr=musqr.thr,
        cb.a.thrtilde=c(alfa.star - confb, alfa.star + confb)))
}
