library(data.table)
library(fixest)
library(kableExtra)

setDTthreads(100)
setFixest_nthreads(4)

nsim <- 1000
n <- c(10, 50, 500) # individuals per cluster
J <- c(5, 50, 500) # number of clusters

Individual RCT inference

\(y_{ij}=\beta W_{ij}+v_{j}+e_{ij}\)

cl_rct <- function(n, J, alpha = 0.05) {
  while (TRUE) {
    W <- rbinom(n*J, 1, 0.5)
    if (!all(W == 1) & !all(W == 0)) break
  }
  e <- rnorm(n*J, sd = 0.2)
  cl <- rep(seq_len(J), each = n)
  dt <- data.table(W, e, cl = factor(cl))
  dt[, by = cl, `:=`(v = rnorm(1))]
  dt[, `:=`(y = 0.1*W + v + e,
            y0 = v + e)]
  #'
  reg <- feols(y~W, data = dt)
  reg0 <- feols(y0~W, data = dt)
  pvals <- c(y_sd = pvalue(reg)["W"],
             y_cl = pvalue(reg, cluster = "cl")["W"],
             y0_sd = pvalue(reg0)["W"],
             y0_cl = pvalue(reg0, cluster = "cl")["W"])
  lapply(pvals, function(x) x < alpha)
}
sim_dt <- CJ(n, J, sim = seq_len(nsim))
set.seed(123)
sim_dt[, rej := purrr::map2(n, J, cl_rct)]
sim_dt[, by = .(n, J, sim), 
       c("Pwr Std", "Pwr CRVE", "Size Std", "Size CRVE") := rej[[1]]]
sim_dt[, rej := NULL]

size <- sim_dt[, by = .(n, J), lapply(.SD, mean),
                .SDcols = c("Size Std", "Size CRVE")] |> 
  dcast(n~J, value.var = c("Size Std", "Size CRVE"))
power <- sim_dt[, by = .(n, J), lapply(.SD, mean),
                .SDcols = c("Pwr Std", "Pwr CRVE")] |> 
  dcast(n~J, value.var = c("Pwr Std", "Pwr CRVE"))
kbl(size, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))
Standard
CRVE
n 5 50 500 5 50 500
10 0.044 0.056 0.041 0.039 0.042 0.039
50 0.066 0.045 0.049 0.037 0.044 0.049
500 0.046 0.055 0.060 0.041 0.052 0.062

The test size is adequate using regular inference even in the presence of unobserved cluster-specific characteristics.

kbl(power, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))
Standard
CRVE
n 5 50 500 5 50 500
10 0.063 0.194 0.933 0.067 0.204 0.937
50 0.161 0.694 1.000 0.179 0.691 1.000
500 0.790 1.000 1.000 0.733 1.000 1.000

We have more power using the regular inference method.

Treatment designation by cluster

\(y_{ij}=\beta W_{j}+v_{j}+e_{ij}\)

cl_grp <- function(n, J, alpha = 0.05) {
  e <- rnorm(n*J, sd = 0.2)
  cl <- rep(seq_len(J), each = n)
  while (TRUE) {
    W <- rep(rbinom(J, 1, 0.5), each = n)
    if (!all(W == 1) & !all(W == 0)) break
  }
  
  dt <- data.table(W, e, cl = factor(cl))
  dt[, by = cl, `:=`(v = rnorm(1)) ]
  dt[, `:=`(y = 0.4*W + v + e,
            y0 = v + e)]
  #'
  reg <- feols(y~W, data = dt)
  reg0 <- feols(y0~W, data = dt)
  pvals <- c(y_sd = pvalue(reg)["W"],
             y_cl = pvalue(reg, cluster = "cl")["W"],
             y0_sd = pvalue(reg0)["W"],
             y0_cl = pvalue(reg0, cluster = "cl")["W"])
  lapply(pvals, function(x) x < alpha)
}
sim1_dt <- CJ(n, J, sim = seq_len(nsim))
set.seed(123)
sim1_dt[, rej := purrr::map2(n, J, cl_grp)]
sim1_dt[, by = .(n, J, sim), 
       c("Pwr Std", "Pwr CRVE", "Size Std", "Size CRVE") := rej[[1]]]
sim1_dt[, rej := NULL]

size1 <- sim1_dt[, by = .(n, J), lapply(.SD, mean),
               .SDcols = c("Size Std", "Size CRVE")] |> 
  dcast(n~J, value.var = c("Size Std", "Size CRVE"))
power1 <- sim1_dt[, by = .(n, J), lapply(.SD, mean),
                .SDcols = c("Pwr Std", "Pwr CRVE")] |> 
  dcast(n~J, value.var = c("Pwr Std", "Pwr CRVE"))
kbl(size1, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))
Standard
CRVE
n 5 50 500 5 50 500
10 0.614 0.498 0.518 0.162 0.058 0.043
50 0.825 0.789 0.774 0.183 0.061 0.040
500 0.944 0.928 0.941 0.163 0.046 0.036

The test size is completely inadequate (oversized) when using regular inference with cluster randomization. The researcher is better off using CRVE inference in this case.

kbl(power1, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))
Standard
CRVE
n 5 50 500 5 50 500
10 0.652 0.830 1 0.191 0.281 0.995
50 0.844 0.935 1 0.196 0.310 0.996
500 0.944 0.975 1 0.194 0.279 0.996