library(data.table)
library(fixest)
library(kableExtra)

setDTthreads(100)
setFixest_nthreads(4)

nsim <- 1000
n <- c(10, 50, 500) # individuals per cluster
J <- c(5, 50, 500) # number of clusters

Individual RCT inference

\(y_{ij}=\beta W_{ij}+v_{j}+e_{ij}\)

cl_rct <- function(n, J, alpha = 0.05) {
  while (TRUE) {
    W <- rbinom(n*J, 1, 0.5)
    if (!all(W == 1) & !all(W == 0)) break
  }
  e <- rnorm(n*J, sd = 0.2)
  cl <- rep(seq_len(J), each = n)
  dt <- data.table(W, e, cl = factor(cl))
  dt[, by = cl, `:=`(v = rnorm(1))]
  dt[, `:=`(y = 0.1*W + v + e,
            y0 = v + e)]
  #'
  reg <- feols(y~W, data = dt)
  reg0 <- feols(y0~W, data = dt)
  pvals <- c(y_sd = pvalue(reg)["W"],
             y_cl = pvalue(reg, cluster = "cl")["W"],
             y0_sd = pvalue(reg0)["W"],
             y0_cl = pvalue(reg0, cluster = "cl")["W"])
  lapply(pvals, function(x) x < alpha)
}

sim_dt <- CJ(n, J, sim = seq_len(nsim))
set.seed(123)
sim_dt[, rej := purrr::map2(n, J, cl_rct)]
sim_dt[, by = .(n, J, sim), 
       c("Pwr Std", "Pwr CRVE", "Size Std", "Size CRVE") := rej[[1]]]
sim_dt[, rej := NULL]

size <- sim_dt[, by = .(n, J), lapply(.SD, mean),
                .SDcols = c("Size Std", "Size CRVE")] |> 
  dcast(n~J, value.var = c("Size Std", "Size CRVE"))
power <- sim_dt[, by = .(n, J), lapply(.SD, mean),
                .SDcols = c("Pwr Std", "Pwr CRVE")] |> 
  dcast(n~J, value.var = c("Pwr Std", "Pwr CRVE"))

kbl(size, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))

	Standard			CRVE
n	5	50	500	5	50	500
10	0.044	0.056	0.041	0.039	0.042	0.039
50	0.066	0.045	0.049	0.037	0.044	0.049
500	0.046	0.055	0.060	0.041	0.052	0.062

The test size is adequate using regular inference even in the presence of unobserved cluster-specific characteristics.

kbl(power, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))

	Standard			CRVE
n	5	50	500	5	50	500
10	0.063	0.194	0.933	0.067	0.204	0.937
50	0.161	0.694	1.000	0.179	0.691	1.000
500	0.790	1.000	1.000	0.733	1.000	1.000

We have more power using the regular inference method.

Treatment designation by cluster

\(y_{ij}=\beta W_{j}+v_{j}+e_{ij}\)

cl_grp <- function(n, J, alpha = 0.05) {
  e <- rnorm(n*J, sd = 0.2)
  cl <- rep(seq_len(J), each = n)
  while (TRUE) {
    W <- rep(rbinom(J, 1, 0.5), each = n)
    if (!all(W == 1) & !all(W == 0)) break
  }
  
  dt <- data.table(W, e, cl = factor(cl))
  dt[, by = cl, `:=`(v = rnorm(1)) ]
  dt[, `:=`(y = 0.4*W + v + e,
            y0 = v + e)]
  #'
  reg <- feols(y~W, data = dt)
  reg0 <- feols(y0~W, data = dt)
  pvals <- c(y_sd = pvalue(reg)["W"],
             y_cl = pvalue(reg, cluster = "cl")["W"],
             y0_sd = pvalue(reg0)["W"],
             y0_cl = pvalue(reg0, cluster = "cl")["W"])
  lapply(pvals, function(x) x < alpha)
}

sim1_dt <- CJ(n, J, sim = seq_len(nsim))
set.seed(123)
sim1_dt[, rej := purrr::map2(n, J, cl_grp)]
sim1_dt[, by = .(n, J, sim), 
       c("Pwr Std", "Pwr CRVE", "Size Std", "Size CRVE") := rej[[1]]]
sim1_dt[, rej := NULL]

size1 <- sim1_dt[, by = .(n, J), lapply(.SD, mean),
               .SDcols = c("Size Std", "Size CRVE")] |> 
  dcast(n~J, value.var = c("Size Std", "Size CRVE"))
power1 <- sim1_dt[, by = .(n, J), lapply(.SD, mean),
                .SDcols = c("Pwr Std", "Pwr CRVE")] |> 
  dcast(n~J, value.var = c("Pwr Std", "Pwr CRVE"))

kbl(size1, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))

	Standard			CRVE
n	5	50	500	5	50	500
10	0.614	0.498	0.518	0.162	0.058	0.043
50	0.825	0.789	0.774	0.183	0.061	0.040
500	0.944	0.928	0.941	0.163	0.046	0.036

The test size is completely inadequate (oversized) when using regular inference with cluster randomization. The researcher is better off using CRVE inference in this case.

kbl(power1, col.names = c("n", rep(c("5", "50", "500"), 2))) |> 
  kable_classic(full_width = FALSE) |> 
  add_header_above(c(" " = 1, "Standard" = 3, "CRVE" = 3))

	Standard			CRVE
n	5	50	500	5	50	500
10	0.652	0.830	1	0.191	0.281	0.995
50	0.844	0.935	1	0.196	0.310	0.996
500	0.944	0.975	1	0.194	0.279	0.996

Clusters, Correlations and Inference

Rafael F. Bressan

2021-08-26

Individual RCT inference

Treatment designation by cluster