################################################################################
# Chou, Cheng and Ruoyao Shi, "What Time Use Surveys Can (And Cannot) Tell Us
# About Labor Supply", Journal of Applied Econometrics, forthcoming.
# Produce Table 3.
# Written by Cheng Chou (https://chengchou.github.io), Jan 29, 2021.
################################################################################

library(ipumsr)
library(dplyr)
library(purrr)
library(xtable)
library(magrittr)
library(tidyr)
library(AER)
library(Matrix)

# Load data clean codes
atus <- readRDS("atus.rds")
source("estimators.R")

################################################################################
# Descriptive statistics #
################################################################################
# DESCRIPTIVE STATISTICS
# Mean of hours and pay
descstat.mean <-
  atus %>%
  group_by(SEX, MARRIED) %>%
  summarize(
    mCPS = mean(HRSATRATE),
    mATUSDay = mean(Hour),
    mPay = mean(HOURWAGE),
    "n of obs" = n()
  ) %>%
  ungroup()
mean.mat <- descstat.mean %>% select(starts_with('m'), - MARRIED) %>% data.matrix()

# Std of hours
descstat.sd <- 
  atus %>% 
  group_by(SEX, MARRIED) %>%
  summarize(
    sdCPS = sd(HRSATRATE),
    sdATUSDay = sd(Hour),
    sdPay = sd(HOURWAGE)
  ) %>% 
  ungroup()

# Lower bounder of std dev of weekly hours from ATUS
lb.sd <- 
  atus %>% 
  group_by(SEX, MARRIED, DAY) %>% 
  summarize(VAR = var(Hour)) %>% 
  ungroup() %>%
  group_by(SEX, MARRIED) %>% 
  summarize('Lower Bound of Std Dev' = sqrt(sum(VAR))) %>%
  ungroup()

# Merge mean, sd and lower bound of sd together
# Names of the useful columns.
name1 <- 
  unlist(
    map(c('CPS', 'ATUSDay', 'Pay'), ~ paste0(c('m', 'sd'), .x))
  ) %>% 
  append(c('Lower Bound of Std Dev', 'n of obs'))
desc1 <- 
  descstat.mean %>% 
  left_join(descstat.sd, by = c('SEX', 'MARRIED')) %>% 
  left_join(lb.sd, by = c('SEX', 'MARRIED')) %>%
  select(name1) %>%
  data.matrix() %>% 
  t() %>%
  .[-6, ] # Exclude the sd of pay
# Add column and row names of desc1 for table.
colnames(desc1) <- 
  c(
    'Unmarried Men',
    'Married Men',
    'Unmarried Women',
    'Married Women'
  )
rownames(desc1) <- 
  c(
    'Mean of CPS Usual Weekly Hours Worked',
    'Std Dev of CPS Usual Weekly Hours Worked',
    'Mean of ATUS Hours Worked on Interview Day',
    'Std Dev of ATUS Hours Worked on Interview Day',
    'Mean of Hourly Pay (2017 USD)',
    'Lower Bound of Std Dev of Weekly Hours Worked from ATUS',
    '$n$ of Obs'
  )

################################################################################
# CREATE IV #
################################################################################
# Decile of hour wage and spouse wage
decileValueWAGE <- 
  atus %$% quantile(HOURWAGE, probs = seq(0, 1, by = 0.1))
decileValueSpWage <- 
  c(-1/2,
    atus %>% 
    filter(SPWAGE > 0) %$% 
    quantile(SPWAGE, probs = seq(0, 1, by = 0.1), type = 9)
  )
# Labels of decile for HOURWAGE
decilesLab <- 
  paste0(
    paste0(seq(0, 90, by  = 10), '%'), 
    '-',
    paste0(seq(10, 100, by = 10), '%')
  )
# Labels of decile for SPWAGE
decilesLab2 <- c('0') %>% append(decilesLab)
# Add deciles of wages to the original data
atus <- 
  atus %>%
  # Create IV for 'HOURWAGE' and 'SPWAGE' using decile
  mutate(
    decileWAGE = cut(HOURWAGE, decileValueWAGE, include.lowest = TRUE, labels = decilesLab),
    decileSPWAGE = cut(SPWAGE, decileValueSpWage, labels = decilesLab2)
  )

################################################################################
# Independent variables and IV in regression analysis #
################################################################################
# Control variables: own age and age squared, educ, asset, number of children
# residing in the household, Census region dummies, a metropolitan area
# dummy, race, year, occ, ind.
# x variables for unmarried workers
y <- "HRSATRATE | Hour ~ "
x.unmry <- "log(HOURWAGE) + EDUC + NCHLT5 + NCHLT18 + AGE + I(AGE ^ 2) + RACE + OCC + IND + REGION + YEAR"
x.mry <- paste0(' | ', x.unmry, ' + SPWAGE')
z.unmry <- paste0(' | ', x.unmry, ' - log(HOURWAGE) + decileWAGE')
z.mry <- paste0(' | ', x.unmry, ' - log(HOURWAGE) + decileWAGE + decileSPWAGE')
# Formula for estimation routine
fm <- Formula::Formula(
  as.formula(paste0(y, x.unmry, z.unmry, x.mry, z.mry))
  )

# Coefficients to be reported for married men and married women
x.report <- list(); x.names <- list(); elsty.type <- list()
x.report[['Married']] <- c("log(HOURWAGE)", "SPWAGE", 'NCHLT5', 'NCHLT18')
x.report[['Unmarried']] <- c("log(HOURWAGE)", 'NCHLT5', 'NCHLT18')
x.names[['Married']] <- c("HOURWAGE", "SPWAGE", 'NCHLT5', 'NCHLT18')
x.names[['Unmarried']] <- c("HOURWAGE", 'NCHLT5', 'NCHLT18')
elsty.type[['Married']] <- c(1, 2, 1, 1)
elsty.type[['Unmarried']] <- c(1, 1, 1)

################################################################################
# Estimation using different methods #
################################################################################
# VARIOUS SUBGROUPS OF SAMPLE #
# Split sample according to sex, marital status
sm <- 
  matrix(
    c('Male', 'Male', 'Female', 'Female', 'Married', 'Unmarried', 'Married', 'Unmarried'),
    nc = 2, 
    dimnames = list(c(), c('Sex', 'Married'))
  )
nsm <- nrow(sm)

elsty <- list() # elasticity estimation results
param <- list() # parameter estimation results
r.squared <- list() # r.squared
hman <- list() # p values of Hausman test

for (i in 1 : nsm) {
  g <- sm[i, ] %>% paste0(collapse = ' ')
  s <- sm[i, 1] # marital status
  m <- sm[i, 2] # marital status
  rhs = if (m == 'Unmarried') 1 : 2 else 3 : 4 # which formula to use
  rhs.ols = if (m == 'Unmarried') 1 else 3 # which formula to use
  dat.i <- atus %>% filter(SEX == s, MARRIED == m)
  cat(g, '\n')
  # IV and OLS using CPS and ATUS data
  cps.i <- cpsreg(formula(fm, lhs = 1, rhs = rhs), dat.i, x.report[[m]], x.names[[m]], elsty.type = elsty.type[[m]])
  cps.ols.i <- cpsreg(formula(fm, lhs = 1, rhs = rhs.ols), dat.i, x.report[[m]], x.names[[m]], elsty.type = elsty.type[[m]])
  tus.i <- tusreg(formula(fm, lhs = 2, rhs = rhs), dat.i, x.report[[m]], x.names[[m]], elsty.type = elsty.type[[m]])
  tus.ols.i <- tusreg(formula(fm, lhs = 2, rhs = rhs.ols), dat.i, x.report[[m]], x.names[[m]], elsty.type = elsty.type[[m]])
  # Save elasticities
  elsty[[g]] <- 
    bind_cols2(
    cps.i$elsty %>% appendname('IV.'),
    cps.ols.i$elsty %>% appendname('OLS.'),
    tus.i$elsty %>% appendname('IV.'),
    tus.ols.i$elsty %>% appendname('OLS.')
    )
  # Save parameters
  param[[g]] <- 
    bind_cols2(
    cps.i$param %>% appendname('IV.'),
    cps.ols.i$param %>% appendname('OLS.'),
    tus.i$param %>% appendname('IV.'),
    tus.ols.i$param %>% appendname('OLS.')
    )
  # Save R squared
  r.squared[[g]] <- bind_cols2(
    cps.i$r.squared %>% appendname('IV.'),
    cps.ols.i$r.squared %>% appendname('OLS.'),
    tus.i$r.squared %>% appendname('IV.'),
    tus.ols.i$r.squared %>% appendname('OLS.')
    )
  # Hausman p values
  hman[[g]] <- bind_cols2(
    hausman2(cps.i, tus.i) %>% appendname('IV.'),
    hausman2(cps.ols.i, tus.ols.i) %>% appendname('OLS.')
    )
}

# Create BIG table for IV results.
picker <- function(colname, varname) {
  # pick column and variable
  col.lst <- elsty %>% map(~ select(., Variables, colname))
  pos <- col.lst %>% map(~ c(0, 1) + which(.$Variables == varname))
  lst <- list()
  for (i in 1 : 4) {
    if (length(pos[[i]]) > 0) {
      lst[[i]] <- slice(col.lst[[i]], pos[[i]])
    } else {
      lst[[i]] <- tibble(
        Variables = c(varname, NA), 
        !!varname := rep(NA, 2)
      )
    }
  }
  tb <- bind_cols2(lst) %>% `colnames<-` (colnames(desc2))
  tb[['Variables']][1] <- paste0(colname, ':', varname)
  tb
}
picker2 <- function(colname) {
  tb <- r.squared %>% map(~ select(., Variables, colname)) %>% bind_cols2() %>% 
    `colnames<-` (colnames(desc2)) %>% 
    mutate_if(is.numeric, ~ as.character(round(., 3)))
  tb[['Variables']][1] <- paste0(colname, ':', "R squared")
  tb
}
picker3 <- function(colname) {
  tb <- hman %>% map(~ select(., Variables, colname)) %>% bind_cols2() %>% 
    `colnames<-` (colnames(desc2)) %>% 
    mutate_if(is.numeric, ~ as.character(round(., 3)))
  tb[['Variables']][1] <- paste0(colname, ':', "p values")
  tb
}
desc2 <- as_tibble(desc1, rownames = 'Variables') %>%
  select(Variables, "Married Men", "Unmarried Men", "Married Women", "Unmarried Women") %>% 
  # change numeric to characters
  mutate_if(is.numeric, ~ as.character(round(., 3)))
temp <- desc2 %>% slice(-7) %>% 
  bind_rows(
    x.report$Married %>% 
      map(~
        bind_rows(
          picker('IV.cps', .),
          picker('IV.im', .)
        )
      ),
    picker2('IV.cps'),
    picker2('IV.im'),
    picker3('IV.pValue'),
    desc2 %>% slice(7)
  )
# Remove rows that are all NA
temp <- temp %>% filter(! apply(temp, 1, function(r) all(is.na(r))))
# The following line is to remove estimates of spouse earnigs, # of kids for
# the unmarried groups.
rnames_to_remove <- c("IV.cps:SPWAGE", "IV.cps:R squared")
a <- which(pull(temp, Variables) == rnames_to_remove[1]) # first row to change
b <- which(pull(temp, Variables) == rnames_to_remove[2]) - 1 # last row to change
temp[a : b, c("Unmarried Men", "Unmarried Women")] <- NA

# Change variable names to be identical to latex table
old.name <- temp %>% pull(Variables)
new.name <- c(
  old.name[1:6],
  "Wage (CPS)",
  NA,
  "Wage (ATUS)",
  NA,
  "Spouse weekly earnings (CPS)",
  NA,
  "Spouse weekly earnings (ATUS)",
  NA,
  "Num. of kids age $<$ 5 (CPS)",
  NA,
  "Num. of kids age $<$ 5 (ATUS)",
  NA,
  "Num. of kids ages 5--18",
  NA,
  "Num. of kids ages 5--18 (ATUS)",
  NA,
  "$R$ squared (CPS)",
  "$R$ squared (ATUS)",
  "$p$ value of Hausman test",
  "$n$ of obs."
)
temp <- temp %>% mutate(Variables = new.name)
# TABLE 3 IN PAPER #
print(
  xtable(
    temp, 
    caption = "Weekly Labor Supply Elasticity Estimates: the CPS and the ATUS"
    ), 
  file = "atusElasticity.tex",
  sanitize.text.function = function(x) {x}
)
