##Setup

#Clear memory
rm(list = ls(all=T))


#Load packages and install if not installed
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  
  tidyverse,    # Grammar for data + ggplot2
  did,          # Callaway and Sant'Anna
  did2s,        # Gardner
  data.table,   # Easy to make data tables
  knitr,        # Print pretty tables
  ggtext,       # Fancy ggplot text (e.g. italics)
  RColorBrewer, # Colour palettes
  fastDummies   # Easy to make dummy variables
)

#Set some output options
knitr::opts_chunk$set(include = TRUE, warning = FALSE, message = FALSE, 
                      fig.width = 10, fig.height = 7)
## Event study and plot functions

# Event study funciton to calculate just our three estimators
my_event_study <- function(data, xformla = NULL, true_effect) {
  rbind(
    event_study(
      yname = "revenue",  
      tname = "year",       
      idname = "id",        
      gname = "g",
      estimator = "TWFE",
      xformla = xformla,
      data = data
      ),
    event_study(
      yname = "revenue",  
      tname = "year",       
      idname = "id",        
      gname = "g",
      estimator = "did2s",
      xformla = xformla,
      data =data
      ),
    event_study(
      yname = "revenue",  
      tname = "year",       
      idname = "id",        
      gname = "g",
      estimator = "did",
      xformla = xformla,
      data = data
      )
  ) %>%
    left_join(true_effect, by = "term")
}


#Save a color pallette for my plots
myPal <-  brewer.pal(n=4, name = "Set1")

# My own event study plot function
my_event_plot <-  function(data, plot_title = "") {
  data %>%
    mutate(conf_low = estimate - 1.96 * std.error, #Confidence bands
           conf_high = estimate + 1.96 * std.error,
           estimator = case_when( #Pub years are not correct in package
             estimator == "Callaway and Sant'Anna (2020)" ~ "Callaway and Sant'Anna (2021)",
             estimator == "Gardner (2021)" ~ "Gardner (2022)", 
             TRUE ~ estimator
             ),
           pre_post = case_when(term < 0 ~ "pre", 
                                TRUE ~ "post")
           ) %>%
    ggplot(aes(x=term, y=estimate, color = estimator, shape = estimator)) + 
      facet_wrap(~estimator, scales = "free") + #separate panels for each estimator
      geom_line(aes(x = term, y=true_effect, group=pre_post, linetype = "solid"), #Line for true effect
                color = myPal[1], linewidth = 0.75) +    
      geom_point() + #Point estimates
      geom_errorbar(aes(ymin=conf_low, ymax=conf_high)) +  #Confidence bands
      geom_vline(xintercept = -0.5, linetype = "dashed") + 
      geom_hline(yintercept = 0, linetype = "dashed") + 
      scale_y_continuous(labels = scales::comma,
                         name = "Point Estimates With 95% Confidence Intervals") +
      scale_linetype_manual(values = "solid", name = "", labels = "True Effect") +
      scale_color_manual(values=c(myPal[2], myPal[3], myPal[4]), 
                         name = "") +
      scale_shape_manual(values = c(15,16,17),
                         name = "") +
      xlab("Event Time") +
      ggtitle(plot_title, subtitle = "\n") + #make room for legend in top-left 
      theme_classic() + 
      theme(
            axis.line = element_line(color="grey"),
            axis.ticks = element_line(color = "grey"),
            axis.title.y = element_text(margin = margin(0,12,0,0),
                                        size = 12, 
                                        color = "grey30"),
            axis.title.x = element_text(margin = margin(12,0,6,0),
                                        size = 12, 
                                        color = "grey30"),
            axis.text = element_text(color = "grey30"),
            legend.position = c(0.18,1.15),
            legend.direction = "horizontal",
            legend.margin = margin(0),
            legend.text = element_text(size = 12),
            plot.title.position = "plot",
            strip.text = element_text(margin = margin(12,0,12,0),
                                      hjust = 0.01,
                                      size = 12,
                                      color = "grey30"),
            strip.background = element_blank(),
            plot.title = element_textbox_simple(margin = margin(6,12,18,0),
                                      size = 16
              ),
            plot.caption = element_textbox_simple(margin = margin(6,0,6,0),
                                                  size = 9,
                                                  color = "grey30"),
            plot.caption.position = "plot"
            )
}

# my_event_plot(output,
#               plot_title = "Testing Testing Testing 123")

Introduction

In my day job, I do a fair bit of work in the realm of causal inference. Typically, this involves looking at whether a particular policy resulted in improved outcomes for individuals who experienced the policy. There are many different ways of doing this. One of the most common methods, or groups of methods, is difference-in-differences (DiD). DiD methods have been advancing rapidly in recent years following the discovery of a number of issues with two-way fixed effects estimators (see Borusyak and Jaravel (2018), De Chaisemartin and d’Haultfoeuille (2020), Goodman-Bacon (2021), and Sun and Abraham (2021). See Roth et al. (2023) for a synthesis of the literature).

Recently, I have been exploring how different variations of DiD estimators that allow for staggered adoption behave under different circumstances. The {did2s} and {did} packages in R make this relatively easy to do with Monte Carlo simulations. In this piece, I focus on the implementation of two estimators in particular. These two are the estimators proposed in Gardner (2022), and Callaway and Sant’Anna (2021). I will not be providing any analytical proofs or getting into the math behind how these two methods work. Rather, I will be illustrating with Monte Carlo simulations how these estimators behave (or misbehave) given different types of treatment effects and covariates and how the aforementioned R package allow us to implement the methods. I will also be comparing them to the classic two-way fixed effects estimator. I explain the data generating process (DGP) for each scenario in plain English rather than with equations. The DGP can also be deciphered by reading the R code.

Simulations

#Set number of units, periods and simulations.
  #Applies to all scenarios
n_units <- 2000 
n_periods <- 5
n_sim <- 100

I find simulation-based explanations easier to follow when they are explained in terms of hypothetical real world scenarios and the variables are given realistic values and names. For this simulation, let us imagine that a large company with many locations wants to measure whether, and to what degree, the adoption of a particular technology improves sales. Some of this company’s locations have adopted the technology, while others have not. Furthermore, the locations that have adopted the technology have adopted it at different times and have chosen to adopt it of their own volition, which means that the treatment is not randomly assigned. We shall explore how we might measure the effects of the technology on sales using the methods of Callaway and Sant’Anna (2021), Gardner (2022), and Two-Way Fixed Effects, hereafter referred to as TWFE.

Homogeneous Treatment and Covariates with Fixed Effects

To begin, we have a balanced panel consisting of 5 periods (years) and 2000 units (store locations). 1/2 of stores do not adopt the technology (i.e., they are untreated), while the other 1/2 have adopted the technology (i.e., they are treated at some point). Among these stores, 1/4 adopt the technology in each of the last four years of the panel. In the first year, none of the stores have adopted the technology yet. For now, we will say that stores are treated in the first year that they adopt the technology and remain treated in all subsequent years. Additionally, there is one covariate, which is a dummy variable that indicates whether a store is located in an urban area or not. Approximately half of the stores are in urban areas. Stores located in urban areas are twice as likely to adopt the technology as stores in non-urban areas; there is a two in three chance that store in an urban area adopts the technology, while there is a one in three chance that a store in a non-urban area adopts the technology.

Revenue, the dependent variable, is determined as follows. For each store, revenue in each year is equal to 1 million (the intercept), plus the effect of being located in an urban area, the effect of adopting the technology (treatment effect) and an error term drawn from a random normal distribution with a mean of zero and a standard deviation of 75 thousand. The ceteris paribus effect of being located in an urban area is an additional 50 thousand per year of revenue, while the ceteris paribus effect of the treatment is an additional 100 thousand per year of revenue. This effect is completely homogeneous.

#Create data for simulation 1
create_data_1 <- function(n.units= n_units, n.periods=n_periods) { 
  
  #First column
  myData <- as.data.frame(rep(1:n.units, n.periods))
  colnames(myData) <- "id"

  myData %>%
  arrange(by=id) %>% 
  cbind(year = rep(2019:(2019+n.periods-1),n.units)) %>% #Add periods
  group_by(id) %>%
  mutate(g = case_when(        #Assign to treatment groups
    id %in% (1:n.units/2) ~ 0, #50% untreated
    id %in% ((n.units/2+1):(n.units/2+n.units/(n.periods-1)/2)) ~ 2020,
    id %in% ((n.units/2+n.units/(n.periods-1)/2+1):(n.units/2+n.units*2/(n.periods-1)/2)) ~ 2021,
    id %in% ((n.units/2+n.units*2/(n.periods-1)/2+1):(n.units/2+n.units*3/(n.periods-1)/2)) ~ 2022,
    TRUE ~ 2023
  ), 
         urban = case_when(g==0 ~ sample(c(0,0,1),1),
                           TRUE ~ sample(c(0,1,1),1)) # Urban locations twice as likely to be treated
  ) %>%
  rowwise() %>%
  mutate(treated = case_when(g<=year & g!=0 ~ 1,
                             TRUE ~ 0),
         revenue = 1000000 + # 1M intercept plus:
           50000*urban + # additional 50k if in urban area
           100000*treated + # treatment effect: 75k 
           rnorm(n=1, mean =0, sd = 75000) # mean 0 error w/ sd= 75k
         ) 
}


#For reproducibility
set.seed(97)
#Create example of one run of simulated data
myData <- create_data_1()

Now, to ensure that the reader has a good comprehension of the data being simulated, I provide a few tables that illustrate data created by one run of the data simulator. The first table shows what the data set looks like for one untreated store in a non-urban area and one treated store in an urban area.

## A glimpse of the data ##
#Filter down to the five observations for one treated individual
myData %>% 
  ungroup() %>%
  filter(g==0, urban == 0)%>%
  slice_head(n=5) %>%
  #Stack on top of five obs for one untreated individual
  rbind(
    #Filter down to the five observations for one individual treated in 2017
    myData %>%
      ungroup() %>%
      filter(g==2021, urban==1)%>%
      slice_head(n=5)
  ) %>%
  mutate(revenue = scales::dollar(revenue)) %>%
  #Print as a table
  kable(
    align = "l",
    caption = "Data for One Store That Adopts the Technology and One that Does Not",
    col.names = c("ID","Year","Treatment Group","Urban Dummy","Treated Yet", "Revenue")
  )
Data for One Store That Adopts the Technology and One that Does Not
ID Year Treatment Group Urban Dummy Treated Yet Revenue
1 2019 0 0 0 $1,054,478
1 2020 0 0 0 $929,467
1 2021 0 0 0 $877,219
1 2022 0 0 0 $893,955
1 2023 0 0 0 $1,023,172
1253 2019 2021 1 0 $1,141,808
1253 2020 2021 1 0 $1,051,653
1253 2021 2021 1 1 $1,109,497
1253 2022 2021 1 1 $1,098,604
1253 2023 2021 1 1 $1,153,314

The table below shows mean revenue by treatment status and whether a store is located in an urban area.

myData %>%
  group_by(treated, urban) %>%
  summarise(Revenue = mean(revenue) %>% scales::dollar()) %>%
  kable(
    align = "l",
    caption = "Mean Revenue by Treatment Status and Whether Stores are Urban",
    col.names = c("Treated","Urban Dummy", "Mean Revenue")
  )
Mean Revenue by Treatment Status and Whether Stores are Urban
Treated Urban Dummy Mean Revenue
0 0 $1,001,324
0 1 $1,049,908
1 0 $1,096,311
1 1 $1,150,939

Finally, this table shows the number of units with each treatment group / urban dummy combination.

myData %>%
  group_by(g, urban) %>%
  summarise(Count = n()) %>%
  kable(
    align = "l",
    caption = "Unit-Period Observations in Each Treatment Group by Urban Dummy ",
    col.names = c("Treatment Group", "Urban Dummy", "Count")
  )
Unit-Period Observations in Each Treatment Group by Urban Dummy
Treatment Group Urban Dummy Count
0 0 3375
0 1 1625
2020 0 400
2020 1 850
2021 0 380
2021 1 870
2022 0 430
2022 1 820
2023 0 435
2023 1 815

Now that the housekeeping about the data is out of the way, let’s look at how the different estimators perform in this scenario. The {did2s} package makes it very easy for us to compare the estimators. I will be presenting the point estimates produced by the different estimators, along with 95% confidence intervals, in plot form. Along with the estimates, the plots show the true effect, which is represented by a red line. If an estimator produces unbiased estimates, the points will fall on the red line. If an estimator produces biased estimates, the points will not fall on the red line.

#Save true effect
true_effect <- data.table(
  term = -4:3,
  true_effect = c(rep(0,4),rep(100000,4))
)

#A function to run a simulation 1 time and run estimators
get_est_1 <- function(...) { # '...' must be included for map_dfr to work
  myData <- create_data_1()
  #Run our three estimators
  output <- my_event_study(myData, true_effect = true_effect)
  output
}


#For reproducibility
set.seed(97)
#Run simulation multiple times & save mean output.
output <- map_dfr(1:n_sim, get_est_1) %>% #Run sim n_sim times & rbind() outputs
  group_by(term, estimator) %>%
  summarise(across(everything(), mean))

#Plot the output with my plotting function
my_event_plot(output,
              plot_title = "Homogeneous Treatment and Covariates with Fixed Effects")

As can be seen above, the three methods all provide similar, unbiased estimates of the effect of the treatment. Note that we have not explicitly controlled for, or conditioned on, the urban covaraite within any of the three methods. The estimates remain unbiased because although the covariate effects the level of the dependent variable, it does not effect the dynamics of the dependent variable - it has the same value and same effect in all periods.

Heterogeneous Treatment Effects

#Effect that treatment has in each year
e_20 <- 50000
e_21 <- 100000
e_22 <- 150000
e_23 <- 200000

Now, let us consider a case in which the effects of the treatment are heterogeneous. That is, the effects are not the same for all groups and periods. We will make it so that the treatment effect is +$50,000 in revenue in 2020, +$100,000 in 2021, +$150,000 in 2022 and +$200,000 in 2023. Perhaps this steady increase in the benefit from the technology arises from some sort of network effect - as more people/firms/locations adopt the technology, it becomes more effective. Note that it does not become more effective because units have been in the ‘treated’ state for more years. It becomes more effective as more calendar years pass, regardless of how long a given unit has been treated.

#Create data for simulation 2
create_data_2 <- function(n.units= n_units, n.periods=n_periods) { 
  
  #First column
  myData <- as.data.frame(rep(1:n.units, n.periods))
  colnames(myData) <- "id"

  myData %>%
  arrange(by=id) %>% 
  cbind(year = rep(2019:(2019+n.periods-1),n.units)) %>% #Add periods
  group_by(id) %>%
  mutate(g = case_when(        #Assign to treatment groups
    id %in% (1:n.units/2) ~ 0, #50% untreated
    id %in% ((n.units/2+1):(n.units/2+n.units/(n.periods-1)/2)) ~ 2020,
    id %in% ((n.units/2+n.units/(n.periods-1)/2+1):(n.units/2+n.units*2/(n.periods-1)/2)) ~ 2021,
    id %in% ((n.units/2+n.units*2/(n.periods-1)/2+1):(n.units/2+n.units*3/(n.periods-1)/2)) ~ 2022,
    TRUE ~ 2023
  ), 
         urban = case_when(g==0 ~ sample(c(0,0,1),1),
                           TRUE ~ sample(c(0,1,1),1)) # Urban locations twice as likely to be treated
  ) %>%
  rowwise() %>%
  mutate(treated = case_when(g<=year & g!=0 ~ 1,
                             TRUE ~ 0),
         revenue = 1000000 + #1M intercept
           50000*urban +
           #treatment effect dependent on year
           case_when(
             year == 2019 ~ 0,
             year == 2020 ~ treated*e_20,
             year == 2021 ~ treated*e_21,
             year == 2022 ~ treated*e_22,
             year == 2023 ~ treated*e_23
           ) +
           rnorm(n=1, mean =0, sd = 75000) # error w/ mean 0 & sd= 75k
         ) 
}


#Calculate true effect and save
true_effect <- data.table(
  term = -4:3,
  true_effect = c(rep(0,4),
               (e_20+e_21+e_22+e_23)/4,
               (e_21+e_22+e_23)/3,
               (e_22+e_23)/2,
               e_23)
)

#A function to run a simulation 1 time and run estimators
get_est_2 <- function(...) { # '...' must be included for map_dfr to work
  myData <- create_data_2()
  #Run our three estimators
  output <- my_event_study(myData, true_effect = true_effect)
  output
}

#For reproducibility
set.seed(97)
#Run simulation multiple times & save mean output.
output <- map_dfr(1:n_sim, get_est_2) %>% #Run sim n_sim times & rbind() outputs
  group_by(term, estimator) %>%
  summarise(across(everything(), mean))

#Plot the results
my_event_plot(output,
              plot_title = "Estimating the Effects When Treatment Effects are Heterogeneous")

Above, we see the well-established result that TWFE yields biased estimates when treatment effects are not homogeneous. This issue is solved by the methods of Callaway and Sant’Anna (2021) and Gardner (2022). It is also interesting to note that the estimates in the periods before treatment should be centered on zero in this case because we know that parallel trends does hold. However, TWFE yields non-zero esitmates, incorrectly suggesting that parallel trends does not hold. This is established in Sun and Abraham (2021). TWFE failing in the presence of heterogeneous treatment effects is a significant issue because in real applications treatment effects are very unlikely to be homogeneous. For that reason, one should use methods that allow for heterogeneous effects, such as those of Callaway and Sant’Anna (2021) and Gardner (2022) whenever possible.

Covariates with Dynamic Effects

Let us return to a situation in which the effects of treatment are not heterogeneous. We will use homogeneous treatment effects for the remainder of this demonstration so that we are only considering one complication at a time.

As in the first simulation, we will say that adopting the technology has the effect of increasing revenues by $100,000 per year in all years. However, we will introduce a covariate that does not have the same effect in all years. Let us imagine that this is a Canadian company with approximately half of its locations in Canada and half in the USA. Initially, in 2019, being located in the USA has a negative effect, perhaps due to inferior brand awareness. In 2019, being located in the USA has the effect of reducing revenues by $50,000, all else being equal. However, as time goes on, the brand grows in popularity in the USA. Perhaps competitors in the USA start closing down, or the brand just becomes more of a fad in the USA than in Canada. As a result, in 2020, being located in the USA has neither a negative effect nor positive effect, while in 2021 onwards, being located in the United States has the effect of boosting revenues by $50,000, all else being equal. Furthermore, Canadian locations are less likely to adopt the technology than American locations, meaning that treatment is correlated with this variable. We will say that American locations are three times as likely to adopt the treatment as Canadian locations.

Therefore, although the value of this covariate is constant over time, its effect is not constant. Rather than merely affecting the level of the outcome variable, this covariate affects the dynamics of the outcome variable. First, let us compute the three different estimators that we have been discussing, without explicitly controlling for or conditioning on this new covariate in any of them.

#Create data for simulation 3
create_data_3 <- function(n.units= n_units, n.periods=n_periods) { 
  
  #First column
  myData <- as.data.frame(rep(1:n.units, n.periods))
  colnames(myData) <- "id"

  myData %>%
  arrange(by=id) %>% 
  cbind(year = rep(2019:(2019+n.periods-1),n.units)) %>% #Add periods
  group_by(id) %>%
  mutate(g = case_when(        #Assign to treatment groups
    id %in% (1:n.units/2) ~ 0, #50% untreated
    id %in% ((n.units/2+1):(n.units/2+n.units/(n.periods-1)/2)) ~ 2020,
    id %in% ((n.units/2+n.units/(n.periods-1)/2+1):(n.units/2+n.units*2/(n.periods-1)/2)) ~ 2021,
    id %in% ((n.units/2+n.units*2/(n.periods-1)/2+1):(n.units/2+n.units*3/(n.periods-1)/2)) ~ 2022,
    TRUE ~ 2023
    ), 
         urban = case_when(g==0 ~ sample(c(0,0,1),1),
                           TRUE ~ sample(c(0,1,1),1)
                           ), # Urban locations twice as likely to be treated
         usa = case_when(g == 0 ~ sample(c(0,0,0,1),1),
                         TRUE ~ sample(c(0,1,1,1),1)
                         ) #American locations thrice as likely to be treated
  ) %>%
  rowwise() %>%
  mutate(treated = case_when(g<=year & g!=0 ~ 1,
                             TRUE ~ 0),
         revenue = 1000000 +
           50000*urban +
           #Effect of canada covariate dependent on year
           case_when(
             year == 2019 ~ usa*-50000,
             year == 2020 ~ usa*0,
             TRUE ~  usa*50000
           ) +
           100000*treated + 
           rnorm(n=1, mean =0, sd = 75000), # error w/ mean 0 & sd= 75k
         usa_year = factor(usa*year)
         )
}

#Save true effect
true_effect <- data.table(
  term = -4:3,
  true_effect = c(rep(0,4),rep(100000,4))
)

#A function to run a simulation 1 time and run estimators
get_est_3 <- function(...) { # '...' must be included for map_dfr to work
  myData <- create_data_3()
  #Run our three estimators
  output <- my_event_study(myData, true_effect = true_effect)
  output
}

#For reproducibility
set.seed(97)
#Run simulation multiple times & save mean output.
output <- map_dfr(1:n_sim, get_est_3) %>% #Run sim n_sim times & rbind() outputs
  group_by(term, estimator) %>%
  summarise(across(everything(), mean))

my_event_plot(output,
              plot_title = "Estimating the Effects <i>Without</i> Conditioning on Covariates")

Alas, all three estimators produce biased estimates for the effect of the technology. In the language of difference-in-differences, this is because the parallel trends assumption does not hold. The ‘gap’ in revenues between the treatment group and the control group would not have stayed constant in the ‘post’ periods if the treated group had remained untreated. Because the stores that adopt the technology are disproportionately located in the USA, the treated group would have had lower revenues in 2019, which would then have caught up in 2020 before surpassing those of the untreated group in 2021 and remaining higher in 2022 and 2023. Knowing with certainty that this is the case is the beauty of the Monte Carlo simulation.

If this were a practical application, the estimates in the ‘pre’ periods not being centered on zero would give a hint that something is amiss, because this indicates that the pre-trends are not parallel. Of course, there are more formal tests of pre-trends, such as that proposed in Callaway and Sant’Anna (2021), but I do not want to get too far into the weeds. It is important to note, however, the distinction between parallel pre-trends and the parallel trends assumption. Parallel pre-trends are neither sufficient nor necessary for difference-in-differences or TWFE estimates to be valid. However, the parallel trends assumption is necessary for these estimators to be valid. Parallel trends refers to the assumption that the paths of the dependent variable for the treated and untreated group would have remained parallel in the post-treatment period had the treated group remained untreated. Parallel pre-trends give us evidence that the trends between the two groups likely would have remained parallel in the post-treatment periods, had the treated group remained untreated. However, this is not necessarily the case. One could imagine a scenario in which the trends between the two groups were parallel and then something changed post-treatment to effect the dynamics of the outcome variable differently for the treated group versus the untreated group. The reverse is also possible; even if pre-trends are not parallel, it is possible that the outcome paths would have been parallel in the ‘post’ periods, absent treatment.

Callaway and Santa’Anna (2021) provides a method for conditioning on covariates that affect outcome dynamics. Specifically, the method is the doubly robust method proposed in Sant’Anna and Zhao (2020). One aspect of this method is to use inverse probability weighting to re-weight the observations in the treatment group according their covariates. Essentially, if a unit in the control group has covariate values that are more prevalent in the treatment group, they will receive a high weight, while an observation with covaraite values similar to most other control group units, it will receive a low weight.

In the cases of the both TWFE and Gardner (2022), time-constant covariates that effect the dynamics of the outcome cannot be controlled for simply by introducing those covariates into the regression(s). The reason that unit or group fixed effects do not result in unbiased estimates when such covariates are present is that they ‘assume’ that the unit level effects are, well, fixed (over time). Therefore, we will run the Callaway and Sant’Anna estimator with conditioning on the country covariate, but we will need to do something a little more creative for the other two methods.

For Garnder and TWFE, we can control for time-constant covariates that effect the dynamics of the outcome by interacting the time-constant covariates with period dummies. The results below will show that this works, but I am not convinced that it is terribly practical. In the case of Callaway and Sant’Anna, we condition time-constant covariates that we think may be associated with the dynamics of the outcome in order to improve the viability of the parallel trends assumption. However, introducing more covariates weakens the overlap condition, so it is a double edged sword. Introducing period*covariate interaction terms in Gardner (2022) and TWFE may have even more negative reprocussions because doing so can use up a lot of degrees of freedom, especially when a panel has many periods and few units.

#A function to simulate data 1 time and run CS estimator
get_est_cs_3 <- function(...) { # '...' must be included for map_dfr to work
  myData <- create_data_3()
  #Run our Callaway and Sant'Anna estimator
  output <- event_study(
      yname = "revenue",  
      tname = "year",       
      idname = "id",        
      gname = "g",
      estimator = "did",
      xformla = ~ usa,
      data = myData
      ) %>%
    rbind(
        event_study(
        yname = "revenue",  
        tname = "year",       
        idname = "id",        
        gname = "g",
        estimator = "did2s",
        xformla = ~ factor(usa_year),
        data = myData
      ),
      event_study(
        yname = "revenue",  
        tname = "year",       
        idname = "id",        
        gname = "g",
        estimator = "TWFE",
        xformla = ~ factor(usa_year),
        data = myData
      )
      
    )
  output
}

#For reproducibility
set.seed(97)
#Run simulation multiple times & save mean output.
output <- map_dfr(1:n_sim, get_est_cs_3) %>% #Run sim n_sim times & rbind() outputs
  group_by(term, estimator) %>%
  summarise(across(everything(), mean))

#Add column for true effect to DF of estimates
output <- left_join(output, true_effect, by = "term")

my_event_plot(output,
              plot_title = "Estimating the Effects <i>With</i> Conditioning on Covariates*") + 
  labs(caption = "*For Callaway and Sant'Anna, we condition on the time-constant covariate itself. For the other two, we condition on interactions between the time-constant covariate and period dummies.")

In this case, the covariate conditioning has resulted in unbiased estimates.

Time-Varying Covariates

Something that people are often concerned about with difference-in-differences methods, such as that of Callaway and Santa’Anna, is that the methods cannot control for time-varying covariates. In some cases there are ways around this. For example, a person’s age changes over time, but we can still condition on it by conditioning on age at a specific date. In other cases, taking the value of a variable at a specific point in time does not work.

It is important to point out that this is somewhat of a contentious topic. To paraphrase something Pedro Sant’Anna said in response to a question about this issue, if a variable changes values in a period after the unit is treated, that variable is not a covariate, it is an outcome. In many, if not most cases, I think that this is true. If the variable truly is an outcome, or some intermediate step in the causal path between the treatment and the dependent variable, we certainly do not want to control for it. This would constitute a ‘bad control.’

Speaking within our example, say we wanted to control for the aggregate education level of the employees at the different stores because we think employee education might affect revenues and be correlated with technology adoption. If adopting the new technology attracts more educated employees, which in turn increases revenues, then that increase in revenues is a result of the technology adoption. Controlling for education would ‘wipe out’ this effect from our estimates. For reasons such as this, the {did} package that Callaway and Sant’Anna created for the implementation of their estimator in R, will only use the last value before treatment of a time varying covariate if one tries to condition on such a covariate, rather than allowing it to change after treatment. The authors acknowledge that one could ‘hack’ the software to make it allow covariates’ values to change after treatment, but strongly advise against it. For this reason, I will allow the software to run using the default method of only using the last value before treatment of time-varying covariates in Callaway and Sant’Anna (2021). Simulating results for bad controls is beyond the scope of this document, but perhaps I will do a write up about them in the future.

As a counter-example, imagine that there is some sort of new policy that is being rolled out by some municipalities but not others. The policy hurts revenues at stores in these municipalities (maybe it is some sort of a ‘sin’ tax on our products, or a ban on a particularly effective form of marketing). Moreover, stores that adopt the treatment are twice as likely to be located in municipalities that implement the policy. However, the policy is not implemented because stores there have adopted the technology. Rather, we will imagine that there is some unobserved factor that causes an increase in the likelihoods of both technology adoption on the part of the stores and implementation of the policy on the part of municipalities. Last, let’s assume that the stores are all (or nearly all) located in different municipalities. If municipalities were sufficiently large and locations were sufficiently plentiful that there were treated and untreated units in every municipality, one could actually arrive at an unbiased estimate by conditioning on municipality using Callaway and Sant’Anna (2021).

In this scenario, revenue is determined as follows. For each store, revenue in each year is equal to one million, minus $250,000 if a store is located in a municipality with the policy discussed above, plus the $100,000 treatment effect, plus an error term which is drawn from random normal distribution with mean zero and a standard deviation of $75,000. The policy is introduced randomly in any of the five years or not at all. The odds that the municipality of a store that adopts the technology introduced the policy in any of the five years is 1/10, while the odds for any given year for stores that do not adopt the policy is 1/20 .

#Create data for simulation 3
create_data_4 <- function(n.units= n_units, n.periods=n_periods) { 
  
  #First column
  myData <- as.data.frame(rep(1:n.units, n.periods))
  colnames(myData) <- "id"

  myData %>%
  arrange(by=id) %>% 
  cbind(year = rep(2019:(2019+n.periods-1),n.units)) %>% #Add periods
  group_by(id) %>%
  mutate(g = case_when(        #Assign to treatment groups
    id %in% (1:n.units/2) ~ 0, #50% untreated
    id %in% ((n.units/2+1):(n.units/2+n.units/(n.periods-1)/2)) ~ 2020,
    id %in% ((n.units/2+n.units/(n.periods-1)/2+1):(n.units/2+n.units*2/(n.periods-1)/2)) ~ 2021,
    id %in% ((n.units/2+n.units*2/(n.periods-1)/2+1):(n.units/2+n.units*3/(n.periods-1)/2)) ~ 2022,
    TRUE ~ 2023
    ), 
         policy_year = case_when(
                                 g == 0 ~ sample(c(2019,2020,2021,2022,2023, rep(0,15)),1), #each year 1/20
                                 TRUE ~ sample(c(2019,2020,2021,2022,2023, rep(0,5)),1) #each year 1/10
                                 )
  ) %>%
  rowwise() %>%
  mutate(treated = case_when(g<=year & g!=0 ~ 1,
                             TRUE ~ 0),
         policy = case_when(policy_year<=year & policy_year !=0 ~ 1,
                            TRUE ~ 0),
         revenue = 1000000
                   + policy * -250000
                   + treated * 100000
                   + rnorm(n=1, mean =0, sd = 75000) # error w/ mean 0 & sd= 75k
         ) 
}

#A function to run a simulation 1 time and run estimators
get_est_4 <- function(...) { # '...' must be included for map_dfr to work
  myData <- create_data_4()
  #Run our three estimators
  output_conditional <- my_event_study(myData, xformla = ~policy, true_effect = true_effect) %>%
    mutate(conditioning = 1)
  output_unconditional <- my_event_study(myData, true_effect = true_effect) %>%
    mutate(conditioning = 0)
  rbind(output_conditional, output_unconditional)
}

output <- get_est_4()

#For reproducibility
set.seed(97)
#Run simulation multiple times & save mean output.
output <- map_dfr(1:n_sim, get_est_4) %>% #Run sim n_sim times & rbind() outputs
  group_by(term, estimator, conditioning) %>%
  summarise(across(everything(), mean))

#Plot results w/o conditioning in policy variable
my_event_plot(filter(output, conditioning == 0),
              plot_title = "Estimating Effects <i>Without</i> Conidtioning on Time-Varying Covariates")

#Plot results w/ conditioning in policy variable
my_event_plot(filter(output, conditioning == 1),
              plot_title = "Estimating Effects <i>With</i> Conidtioning Time-Varying Covariates*")+ 
  labs(caption = "*The software used to implement Callaway and Sant'Anna does not allow covariates' values to change after treatment. See discussion for more.") 

Of course, when we do not condition on / control for the policy covariate we get biased estimates from all three methods. TWFE and Gardner (2022) both allow us to control for this sort of covaraite, while the software does not allow for such controls within the methods of Callaway and Sant’Anna (2021). For the same reasons that Callaway and Sant’Anna (2021) advise against using time varying covariates, one must use a great degree of caution when controlling for such covariates in Gardner (2022) or TWFE as well.

Conclusion

In this document, we have explored how the TWFE, Gardner (2022), and Callaway and Sant’Anna (2021) estimators behave (or misbehave) in a variety of circumstances. First, we showed that when treatment effects are heterogeneous, TWFE will produce biased estimates, while the other two estimators can produce unbiased estimates. Next, we illustrated that if covariates exist that have constant values but still effect the dynamics of the dependent variable (ie, their effects change over time) the Callaway and Sant’Anna (2021) estimator can produce unbiased estimates by conditioning on those covariates, while we need to introduce period*covariate interaction terms to control for such covariates in Gardner (2022) and TWFE. Doing so may often be impractical. Last, we saw that Gardner (2022) and TWFE can control for covariates whose values change over time, while the software created by the authors of Callaway and Sant’Anna (2021) prevents one from doing so within their method. Of course, TWFE still requires homogeneous effects in this scenario, as well as all others described here, and truly homogeneous effects are unlikely to exist in the real world.

Clearly, I have brushed over multiple assumptions and potential pitfalls with each of these estimators. The purpose of this was not to give a comprehensive overview of these methods, but rather to give a glimpse of some of the practical differences between them and some basic examples of scenarios in which one or two of the methods might be more appropriate than the other(s). For a more comprehensive understanding of these methods I suggest not only reading the Gardner (2022) and Callaway and Sant’Anna (2021) papers, but also looking at some of the documentation for the {did} and {did2s} R packages. The vignette https://cran.r-project.org/web/packages/did/vignettes/TWFE.html, for example, gives a very approachable overview of some issues with TWFE estimators and how Callaway and Sant’Anna remedy such problems with their estimator. Butts and Gardner (2021), provides additional insight into the {did2s} R package and the estimator proposed in Gardner (2022). Professor Taylor Wright provides recordings of some great DiD-related webinars on his youtube channel (https://www.youtube.com/playlist?list=PLVObvb_htcuBt8mV9yNagt7hK9FL5KXeE), including webinars by Dr. Pedro Sant’Anna and Dr. John Gardner. Finally, I recommend Professor Scott Cunningham’s Substack (https://causalinf.substack.com/) for discussions about all things causal inference.

I created this document independently out of my own interest. If you think it is useful, feel free to share it with others. If you think that there are flaws, feel free to reach out to me at g.vandervinne1@gmail.com.

References

Borusyak, Kirill, and Xavier Jaravel. 2018. Revisiting Event Study Designs. SSRN.
Butts, Kyle, and John Gardner. 2021. \(\{\)Did2s\(\}\): Two-Stage Difference-in-Differences.” arXiv Preprint arXiv:2109.05913.
Callaway, Brantly, and Pedro H. C. Sant’Anna. 2021. “Difference-in-Differences with Multiple Time Periods.” Journal of Econometrics 225 (2): 200–230. https://doi.org/https://doi.org/10.1016/j.jeconom.2020.12.001.
De Chaisemartin, Clément, and Xavier d’Haultfoeuille. 2020. “Two-Way Fixed Effects Estimators with Heterogeneous Treatment Effects.” American Economic Review 110 (9): 2964–96.
Gardner, John. 2022. “Two-Stage Differences in Differences.” arXiv Preprint arXiv:2207.05943.
Goodman-Bacon, Andrew. 2021. “Difference-in-Differences with Variation in Treatment Timing.” Journal of Econometrics 225 (2): 254–77.
Roth, Jonathan, Pedro HC Sant’Anna, Alyssa Bilinski, and John Poe. 2023. “What’s Trending in Difference-in-Differences? A Synthesis of the Recent Econometrics Literature.” Journal of Econometrics.
Sun, Liyang, and Sarah Abraham. 2021. “Estimating Dynamic Treatment Effects in Event Studies with Heterogeneous Treatment Effects.” Journal of Econometrics 225 (2): 175–99.