# Packages
::p_load(ggplot2, scales, fastverse, fixest, here)
pacman# Load data
= here('data-002.csv') %>% fread() ps_dt
Problem Set 2
1 General regression analysis and the CIA
01 Loading packages…
02 Time for some figures.
# Histogram: income gap, 2010
ggplot(
data = ps_dt,
aes(x = income_white_2010 - income_black_2010, fill = as.factor(had_rosenwald_school))
+
) geom_histogram(alpha = 0.8, color = NA, position = 'identity', bins = 50) +
scale_fill_viridis_d('Had Rosenwald School', labels = c('N', 'Y')) +
scale_x_continuous('Income Gap in 2010', labels = dollar) +
scale_y_continuous('Count', labels = comma) +
theme_minimal() +
theme(legend.position = 'bottom')
# Histogram: pct pop enslaved, 2010
ggplot(
data = ps_dt,
aes(x = pct_pop_enslaved_1860/100, fill = as.factor(had_rosenwald_school))
+
) geom_histogram(alpha = 0.8, color = NA, position = 'identity', bins = 50) +
scale_fill_viridis_d('Had Rosenwald School', labels = c('N', 'Y')) +
scale_x_continuous('Percent of Population Enslaved in 1860', labels = percent) +
scale_y_continuous('Count', labels = comma) +
theme_minimal() +
theme(legend.position = 'bottom')
# Scatter plot: income and pct pop enslaved
ggplot(
data = ps_dt,
aes(x = pct_pop_enslaved_1860/100, y = income_white_2010 - income_black_2010, color = as.factor(had_rosenwald_school))
+
) geom_point(alpha = 0.8) +
scale_color_viridis_d('Had Rosenwald School', labels = c('N', 'Y')) +
scale_x_continuous('Percent of Population Enslaved in 1860', labels = percent) +
scale_y_continuous('Income Gap in 2010', labels = dollar) +
theme_minimal() +
theme(legend.position = 'bottom')
03 Regressing the income gap on the indicator for Rosenwald school, we are assuming that schools’ locations are as good as random (in their distribution across counties). In other words, we need anything that affects the gap—other than the schools—to be independent of whether counties had Rosenwald schools.
# Cross-sectional regression
= feols(
reg03 - income_black_2010 ~
income_white_2010
had_rosenwald_school,data = ps_dt
)etable(reg03)
reg03
Dependent Var.: income_white_2010-income_black_2010
Constant 14,537.5*** (802.8)
had_rosenwald_school 4,323.7*** (909.7)
____________________ ______________________________
S.E. type IID
Observations 710
R2 0.03092
Adj. R2 0.02955
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
04 Estimates below. Our CIA updates to within state placement of schools needs to be as good as random (conditional on state, all non-Rosenwald school determinants of the income gap are independent of whether or not a county had a Rosenwald school).
# Add state fixed effects
= feols(
reg04 - income_black_2010 ~
income_white_2010 |
had_rosenwald_school
state,data = ps_dt
)etable(reg04)
reg04
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 4,285.8** (1,087.0)
Fixed-Effects: ------------------------------
state Yes
____________________ ______________________________
S.E.: Clustered by: state
Observations 710
R2 0.09778
Within R2 0.02938
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
05 Estimates below…
# Add 1860 controls
= feols(
reg05 - income_black_2010 ~
income_white_2010 + pct_pop_enslaved_1860 + pop_total_1860 |
had_rosenwald_school
state,data = ps_dt
)etable(reg05)
reg05
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 2,252.9** (669.1)
pct_pop_enslaved_1860 129.1*** (20.06)
pop_total_1860 0.1062* (0.0409)
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 710
R2 0.16186
Within R2 0.09831
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
06 Comparison below. Yes, the movement in the point estimate for the effect of Rosenwald schools seems to match what we would expect. One would like expect that places with more histories of more intense slavery would have larger income gaps today and would also have been more likely to receive a Rosenwald school. When we control for the county’s history of slavery (and total population in 1860), we see the coefficient on the Rosenwald school indicator decrease.
etable(reg04, reg05)
reg04
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 4,285.8** (1,087.0)
pct_pop_enslaved_1860
pop_total_1860
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 710
R2 0.09778
Within R2 0.02938
reg05
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 2,252.9** (669.1)
pct_pop_enslaved_1860 129.1*** (20.06)
pop_total_1860 0.1062* (0.0409)
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 710
R2 0.16186
Within R2 0.09831
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
07 Open answer: Just want to see a DAG that makes the point that we have likely not controlled for everything.
08 I’d be concerned that there could be a bad-controls issue here: if the schools affected current (or 2010) population levels, then we should not control for it. If we don’t think schools affected current population, then we are probably fine (depending on the DAG you have in mind).
2 Matching
09 The histogram of treated units’ distances to their nearest control unit (note: I’m using log10 for the x axis).
# Load MatchIt
::p_load(MatchIt)
pacman# Find distances between treated and control observations
= mahalanobis_dist(
dist_mat formula = had_rosenwald_school ~ pct_pop_enslaved_1860 + pop_total_1860,
data = ps_dt,
)# Find the minimum distance from each treated individual (rows) to the controls (columns)
= apply(X = dist_mat, FUN = min, MARGIN = 1)
trt_dist # Make the histogram
ggplot(
data = data.table(dist = trt_dist),
aes(x = dist)
+
) geom_histogram(alpha = 0.8, fill = viridis::viridis(1), color = NA, position = 'identity', binwidth = 0.1) +
scale_x_log10(bquote(Mahalanobis~distance~(log[10]~scale))) +
scale_y_continuous('Count', labels = comma) +
theme_minimal()
10
# Match each treated unit to its control unit
# Step 1: Find the index of the control units that match to each treated unit
= apply(X = dist_mat, FUN = which.min, MARGIN = 1)
ctrl_j # Step 2: Find the control units (rows) implied by the indices
= colnames(dist_mat)[ctrl_j] |> as.numeric()
ctrl_r # Step 3: Create a dataset of treated individuals and their nearest neighbors
= data.table(
match_dt income_gap_trt = ps_dt[had_rosenwald_school == 1, income_white_2010 - income_black_2010],
income_gap_ctrl = ps_dt[ctrl_r, income_white_2010 - income_black_2010]
)# Step 4: Calculate the differences
:= income_gap_trt - income_gap_ctrl]
match_dt[, trt_effect # Plot a histogram of the individual treatment effects
ggplot(
data = match_dt,
aes(x = trt_effect)
+
) geom_histogram(alpha = 0.8, fill = viridis::viridis(1), color = NA, position = 'identity', bins = 50) +
geom_vline(xintercept = match_dt[,mean(trt_effect)], color = 'orange') +
scale_x_continuous('Estimated effect of Rosenwald school on the income gap', labels = dollar) +
scale_y_continuous('Count', labels = comma) +
theme_minimal()
The estimate for the average treatment effect is approximately -$129.42.
11 The estimate is much smaller and no longer signficantly different from zero.
12 The estimator in 10 is estimating the average treatment effect on the treated (ATET or ATT), because we’ve effectively conditioned on treated counties—we’re only matching control counties to treated counties.
Note The average effect here is at the count level. If we wanted an average effect at the individual level, we would want to weight by population (above, I’ve treated all counties as equal when taking the mean).
3 Propensity-score methods
13 Estimate the propensity scores…
# Estimate propensity scores as function of 1860 attributes
= feglm(
pscore_reg ~
had_rosenwald_school #pct_pop_enslaved_1860 + pop_total_1860,
I(pct_pop_enslaved_1860^2) + I(pop_total_1860^2) + pct_pop_enslaved_1860 * pop_total_1860,
data = ps_dt,
family = 'logit'
)# Add propensity scores to the dataset
:= predict(pscore_reg, newdata = ps_dt)] ps_dt[, p_score
14 Estimates below. The estimated effect of Rosenwald schools is smaller in magnitude and no longer significantly different from zero (still positive).
# Estimate the 1860-controls regression, controlling for the propensity score
= feols(
reg18 - income_black_2010 ~
income_white_2010 + pct_pop_enslaved_1860 + pop_total_1860 + p_score |
had_rosenwald_school
state,data = ps_dt
)etable("OLS" = reg05, "Prop. Scores" = reg18)
OLS
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 2,252.9** (669.1)
pct_pop_enslaved_1860 129.1*** (20.06)
pop_total_1860 0.1062* (0.0409)
p_score
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 710
R2 0.16186
Within R2 0.09831
Prop. Scores
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 1,626.8 (928.2)
pct_pop_enslaved_1860 95.69** (23.67)
pop_total_1860 0.0619. (0.0317)
p_score 7,464.5 (4,561.5)
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 710
R2 0.16806
Within R2 0.10498
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
15 Checking overlap… doesn’t look perfect (yet).
# Check overlap
ggplot(
data = ps_dt,
aes(x = p_score, fill = as.factor(had_rosenwald_school))
+
) geom_histogram(alpha = 0.8, color = NA, position = 'identity', binwidth = 0.01) +
scale_fill_viridis_d('Had Rosenwald School', labels = c('N', 'Y')) +
scale_x_continuous('Estimated Propensity Score', labels = percent) +
scale_y_continuous('Count', labels = comma) +
theme_minimal() +
theme(legend.position = 'bottom')
16 Finding the violators.
# Enforce overlap
= ps_dt[had_rosenwald_school == 0, max(p_score)]
c_max = ps_dt[had_rosenwald_school == 1, min(p_score)]
t_min := 1]
ps_dt[, overlap == 0 & p_score < t_min, overlap := 0]
ps_dt[had_rosenwald_school == 1 & p_score > c_max, overlap := 0] ps_dt[had_rosenwald_school
Approximately 93.9% of the observations comply with our enforced overlap.
17 We need to be able to compare individuals with equal likelihoods of treatment. Without overlap, there is no counterfactual for some observations—and we cannot enforce the CIA.
18 Enforcing overlap here doesn’t actually change much…
# Repeat regression but now enforcing overlap
= feols(
reg18 - income_black_2010 ~
income_white_2010 + pct_pop_enslaved_1860 + pop_total_1860 + p_score |
had_rosenwald_school
state,data = ps_dt[overlap == TRUE]
)etable('No overlap' = reg18, 'Overlap' = reg18)
No overlap
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 1,622.6 (920.9)
pct_pop_enslaved_1860 95.13** (23.88)
pop_total_1860 0.0024 (0.1019)
p_score 8,365.2 (5,651.1)
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 667
R2 0.16321
Within R2 0.09459
Overlap
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 1,622.6 (920.9)
pct_pop_enslaved_1860 95.13** (23.88)
pop_total_1860 0.0024 (0.1019)
p_score 8,365.2 (5,651.1)
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 667
R2 0.16321
Within R2 0.09459
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
19 The coefficient increases a bit when we inversely weight with the estimated propensity scores; it is marginally significant (at the 10% level).
# Add propensity score weights to the dataset
:= (
ps_dt[, p_weight / p_score + (1 - had_rosenwald_school) / (1 - p_score)
had_rosenwald_school
)]# Weighted regression (with controls; no propensity score)
= feols(
reg15 - income_black_2010 ~
income_white_2010 + pct_pop_enslaved_1860 + pop_total_1860 |
had_rosenwald_school
state,weights = ~ p_weight,
data = ps_dt[overlap == TRUE]
)etable(reg15)
reg15
Dependent Var.: income_white_2010-income_black_2010
had_rosenwald_school 1,966.4. (941.6)
pct_pop_enslaved_1860 147.5** (34.88)
pop_total_1860 0.1026 (0.1497)
Fixed-Effects: ------------------------------
state Yes
_____________________ ______________________________
S.E.: Clustered by: state
Observations 667
R2 0.17514
Within R2 0.09152
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
20
# Estimate block-level treatement effects
= lapply(
block_dt X = seq(0.4, 1, 0.1),
FUN = function(b) {
# The block's results
= feols(
b_est - income_black_2010 ~
income_white_2010 + pct_pop_enslaved_1860 + pop_total_1860 | state,
had_rosenwald_school data = ps_dt[overlap == TRUE & p_score > b - 0.1 & p_score <= b]
)# Return
data.table(
block = b,
est = as.matrix(b_est$coeftable)["had_rosenwald_school","Estimate"],
n = b_est$nobs
)
}%>% rbindlist()
) # Show block-level treatment effects
block_dt
block est n
1: 0.4 -6248.40100 25
2: 0.5 4740.73301 29
3: 0.6 -40.67093 36
4: 0.7 -5330.11593 66
5: 0.8 4501.95411 114
6: 0.9 179.68589 206
7: 1.0 2780.19632 182
# Estimate the average treatment effect
weighted.mean(x = est, w = n)] block_dt[,
[1] 1039.899