## ======================================
## Figure 2: Expanding the observation period for hospitalization rates to 1983-2009.
## Lead test for pre-treatment hospitalizations
## ======================================

  # Start from an empty global environment.
  # NOTE(review): this wipes every object in the caller's workspace when the
  # script is source()'d; kept because the rest of the script was written
  # against a clean environment.
  rm(list=ls())

  # First attempt: directory of this file as recorded in the source reference.
  # This is only populated when the file is run through source() with
  # keep.source enabled; otherwise the result is "" or a zero-length vector.
  my.wd <- getSrcDirectory(function(x){x})

  # Helper: TRUE when `path` is a single, non-missing, non-empty string.
  is.usable.dir <- function(path) {
    is.character(path) && length(path) == 1 && !is.na(path) && nzchar(path)
  }

  # Second attempt: ask the RStudio API for the path of the active document.
  # Any failure (package missing, not running inside RStudio) is captured as a
  # condition object so that it can be reported below.
  if (!is.usable.dir(my.wd)) {
    my.wd <- tryCatch(dirname(rstudioapi::getActiveDocumentContext()$path),
                      error = function(e) e)
  }

  # Fail with an explicit message rather than handing setwd() an error object
  # or an empty string.
  if (!is.usable.dir(my.wd)) {
    stop("Could not determine the directory of this script",
         if (inherits(my.wd, "error")) paste0(" (", conditionMessage(my.wd), ")"),
         ". Run it with source() or from RStudio, or set the working ",
         "directory to the script's folder manually.",
         call. = FALSE)
  }

  #Set working directory
  setwd(my.wd)
  
## ======================================
## Load packages and functions
## ======================================

  # Read in the project helper used to load packages. The path is relative to
  # the working directory set above.
  # NOTE(review): installPackageNotFound() is defined in the sourced file and is
  # not visible here; presumably it installs the package when missing and then
  # attaches it -- confirm against Functions/installPackageNotFound.R.
  source("Functions/installPackageNotFound.R")

  # Libraries for modeling. Later code calls plm(), vcovHC() and coeftest()
  # without a namespace prefix, so these packages must end up attached.
  installPackageNotFound("plm")
  installPackageNotFound("lmtest")

## ==========================
## Read in data
## ==========================  

  # All three inputs are CSV files one directory above the working directory.
  # Character columns are kept as character (no factor conversion).

  # County population by year
  county.pop.1964.2015 <- read.csv(
    file = "../Data/county_population_1964_2015.csv",
    stringsAsFactors = FALSE
  )

  # Hospitalizations at county-year level (used for the trend plots)
  hosp.cty.yr.1983.2009 <- read.csv(
    file = "../Data/hosp_county_year_1983_2009.csv",
    stringsAsFactors = FALSE
  )

  # Hospitalizations at zip3-quarter level (used for the lead test)
  hosp.zip3.qtr.1983.2009 <- read.csv(
    file = "../Data/hosp_zip3_quarter_1983_2009.csv",
    stringsAsFactors = FALSE
  )

## ==========================
## Aggregate population
## ==========================  

  # Southern California county names (Los Angeles is not part of this list)
  socal <- c("Imperial", "Kern", "Orange", "San Bernardino", "San Diego",
             "San Luis Obispo", "Santa Barbara", "Riverside", "Ventura")

  # County name of every population row, used to build the row masks below
  county.names <- county.pop.1964.2015[["Name"]]

  # Sum `pop` within each `year` over the rows where `keep` is TRUE.
  # which() drops NA entries of the mask, matching how subset() treats them.
  sum.pop.by.year <- function(keep) {
    aggregate(pop ~ year,
              data = county.pop.1964.2015[which(keep), ],
              FUN = sum)
  }

  # California excluding Los Angeles
  ca0.pop <- sum.pop.by.year(!(county.names == "Los Angeles"))

  # California excluding Los Angeles and the Southern California counties
  ca.pop <- sum.pop.by.year(!(county.names %in% c(socal, "Los Angeles")))

  # Los Angeles only
  la.pop <- sum.pop.by.year(county.names == "Los Angeles")

  # Southern California counties only
  socal.pop <- sum.pop.by.year(county.names %in% socal)
  
## ==========================
## Aggregate hospitalizations
## ==========================  
  
  ## Flag every county-year row by county grouping (1 = member, 0 = not)
  hosp.cty.yr.1983.2009$la <-
    as.numeric(hosp.cty.yr.1983.2009$county.name == "Los Angeles")
  hosp.cty.yr.1983.2009$socal <-
    as.numeric(hosp.cty.yr.1983.2009$county.name %in% socal)

  # Column holding the hospitalization counts to aggregate
  fbi.name <- c("jl.codes.mdc6")

  # Formula "cbind(<count column>)~year", built once and reused for every group
  count.by.year <- formula(paste0("cbind(", paste0(fbi.name, collapse = ","), ")~year"))

  # For one group of rows: sum the counts per year, attach that group's yearly
  # population, and add the rate per 100,000 population.
  rate.by.year <- function(rows, pop) {
    yearly <- aggregate(count.by.year, data = rows, FUN = sum)
    yearly <- merge(yearly, pop, by = "year")
    yearly$rate <- 100000 * yearly[, fbi.name] / yearly$pop
    yearly
  }

  # California excluding Los Angeles
  year.ca0.agg <- rate.by.year(subset(hosp.cty.yr.1983.2009, la == 0), ca0.pop)

  # California excluding Los Angeles and Southern California
  year.ca.agg <- rate.by.year(subset(hosp.cty.yr.1983.2009, la == 0 & socal == 0), ca.pop)

  # Los Angeles only
  year.la.agg <- rate.by.year(subset(hosp.cty.yr.1983.2009, la == 1), la.pop)

  # Southern California only
  year.socal.agg <- rate.by.year(subset(hosp.cty.yr.1983.2009, socal == 1), socal.pop)
  
## ==========================
## Plot trends
## ==========================  

  # Semi-transparent series colors: red for Los Angeles, blue for California
  la.color <- rgb(1, 0, 0, 0.8)
  ca.color <- rgb(0, 0, 1, 0.8)

  # Shade the 1995-1999 window in light grey and draw a white vertical line at
  # 1997.5 on the current panel. Called right after each plot() so that the
  # series drawn afterwards sit on top of the shading.
  shade.jl.window <- function() {
    rect(xleft = 1995, xright = 1999, ytop = 8, ybottom = 0,
         border = NA, col = rgb(0, 0, 0, 0.1))
    abline(v = 1997.5, col = "white")
  }

  # One row of three panels; the first panel is much narrower than the others
  layout(matrix(1:3, 1, 3), widths = c(0.12, 0.4, 0.4))
  par(mar = c(3, 3, 2, 1), mgp = c(1.5, 0.5, 0), tcl = -0.3)

  ## Panel 1: J&L time window (1995-1999), LA vs. CA excluding LA
  plot(year.la.agg$year, year.la.agg$rate, type = "n", col = ca.color,
       ylab = "Rate", xlab = "Year", main = "J&L Data",
       ylim = c(2, 6.6), xlim = c(1995, 1999),
       axes = FALSE, cex.main = 1, xpd = TRUE)
  axis(1)
  axis(2)
  shade.jl.window()
  ca0.in.window <- year.ca0.agg$year %in% 1995:1999
  la.in.window <- year.la.agg$year %in% 1995:1999
  lines(year.ca0.agg$year[ca0.in.window], year.ca0.agg$rate[ca0.in.window],
        col = ca.color, lty = 5)
  lines(year.la.agg$year[la.in.window], year.la.agg$rate[la.in.window],
        col = la.color, lwd = 1.5)
  text(1996, 5.4, "LA", col = la.color)
  text(1996, 2.6, "CA", col = ca.color)

  ## Panel 2: full series, LA vs. CA excluding LA
  plot(year.la.agg$year, year.la.agg$rate, type = "n", col = ca.color,
       ylab = "Rate", xlab = "Year", main = "Hospitalizations, 1983-2009",
       ylim = c(2, 6.6))
  shade.jl.window()
  lines(year.ca0.agg$year, year.ca0.agg$rate, col = ca.color, lty = 5, lwd = 1)
  lines(year.la.agg$year, year.la.agg$rate, col = la.color, lwd = 1.5)
  text(x = 1986, y = 4.1, label = "CA", col = ca.color)
  text(x = 1983, y = 5.3, label = "LA", col = la.color)
  text(1997, 6.3, "J&L\nwindow\n1995-99", col = rgb(0, 0, 0, 0.75), cex = 0.85)

  ## Panel 3: LA vs. the Southern California counties
  plot(year.la.agg$year, year.la.agg$rate, type = "n", col = ca.color,
       ylab = "Rate", xlab = "Year", main = "Southern California",
       ylim = c(2, 6.6))
  shade.jl.window()
  lines(year.la.agg$year, year.la.agg$rate, col = la.color, lwd = 1.5)
  lines(year.socal.agg$year, year.socal.agg$rate, lty = 5, col = rgb(1, 0, 1, 1), lwd = 1)
  # NOTE(review): no CA series is drawn in this panel and y = 1.7 lies below
  # the lower ylim of 2, so this label looks like a leftover -- confirm.
  text(x = 1983, y = 1.7, label = "CA", col = ca.color)
  # NOTE(review): the label color "purple" differs from the line color
  # rgb(1,0,1,1) used for the Southern California series -- confirm intent.
  text(x = 1986, y = 4.1, label = "Southern CA", col = "purple")
  text(x = 1983, y = 5.3, label = "LA", col = la.color)

## ==========================
## Lead test of parallel trends
## ==========================  

  # Time index for the panel: one label per year-quarter, e.g. "1994-3"
  hosp.zip3.qtr.1983.2009$year.qtr <- paste0(hosp.zip3.qtr.1983.2009$year, "-", hosp.zip3.qtr.1983.2009$quarter)

  # Outcome: log of the hospitalization count; the +1 keeps zero counts finite
  hosp.zip3.qtr.1983.2009$logCount <- log(hosp.zip3.qtr.1983.2009$jl.codes.mdc6 + 1)

  # Model of LA vs CA with 1994 salmonella outbreak as treatment year.
  # Two-way within (fixed effects) model on the zip x year-quarter panel,
  # restricted to 1990-1997. The single regressor is prop.la multiplied by an
  # indicator for year >= 1994.
  lead.model <- plm(logCount ~ I(prop.la*as.numeric(year>=1994)) ,
                    index = c("zip", "year.qtr"),
                    model="within", effect="twoways",
                    data= subset(hosp.zip3.qtr.1983.2009, year%in%1990:1997))

  # Coefficient table with cluster-robust SE (HC0, clustered on the panel
  # group index). Printed explicitly: a bare top-level expression is not
  # auto-printed when the script is run through source() without echo, so the
  # result would otherwise be silently discarded.
  print(coeftest(lead.model, vcov=vcovHC(lead.model, type="HC0", cluster="group")))
  