--- title: "Inference and Simulation" subtitle: "EC 425/525, Set 4" author: "Edward Rubin" date: "`r format(Sys.time(), '%d %B %Y')`" output: xaringan::moon_reader: css: ['default', 'metropolis', 'metropolis-fonts', 'my-css.css'] # self_contained: true nature: highlightStyle: github highlightLines: true countIncrementalSlides: false --- class: inverse, middle ```{R, setup, include = F} # devtools::install_github("dill/emoGG") library(pacman) p_load( broom, tidyverse, ggplot2, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, gridExtra, kableExtra, snakecase, janitor, data.table, dplyr, lubridate, knitr, estimatr, here, magrittr ) # Define pink color red_pink <- "#e64173" turquoise <- "#20B2AA" orange <- "#FFA500" red <- "#fb6107" blue <- "#3b3b9a" green <- "#8bb174" grey_light <- "grey70" grey_mid <- "grey50" grey_dark <- "grey20" purple <- "#6A5ACD" slate <- "#314f4f" # Dark slate grey: #314f4f # Knitr options opts_chunk$set( comment = "#>", fig.align = "center", fig.height = 7, fig.width = 10.5, warning = F, message = F ) opts_chunk$set(dev = "svg") options(device = function(file, width, height) { svg(tempfile(), width = width, height = height) }) options(crayon.enabled = F) options(knitr.table.format = "html") # A blank theme for ggplot theme_empty <- theme_bw() + theme( line = element_blank(), rect = element_blank(), strip.text = element_blank(), axis.text = element_blank(), plot.title = element_blank(), axis.title = element_blank(), plot.margin = structure(c(0, 0, -0.5, -1), unit = "lines", valid.unit = 3L, class = "unit"), legend.position = "none" ) theme_simple <- theme_bw() + theme( line = element_blank(), panel.grid = element_blank(), rect = element_blank(), strip.text = element_blank(), axis.text.x = element_text(size = 18, family = "STIXGeneral"), axis.text.y = element_blank(), axis.ticks = element_blank(), plot.title = element_blank(), axis.title = element_blank(), # plot.margin = structure(c(0, 0, -1, -1), unit = "lines", valid.unit = 3L, class = "unit"), legend.position = "none" ) theme_axes_math <- theme_void() + theme( text = element_text(family = "MathJax_Math"), axis.title = element_text(size = 22), axis.title.x = element_text(hjust = .95, margin = margin(0.15, 0, 0, 0, unit = "lines")), axis.title.y = element_text(vjust = .95, margin = margin(0, 0.15, 0, 0, unit = "lines")), axis.line = element_line( color = "grey70", size = 0.25, arrow = arrow(angle = 30, length = unit(0.15, "inches") )), plot.margin = structure(c(1, 0, 1, 0), unit = "lines", valid.unit = 3L, class = "unit"), legend.position = "none" ) theme_axes_serif <- theme_void() + theme( text = element_text(family = "MathJax_Main"), axis.title = element_text(size = 22), axis.title.x = element_text(hjust = .95, margin = margin(0.15, 0, 0, 0, unit = "lines")), axis.title.y = element_text(vjust = .95, margin = margin(0, 0.15, 0, 0, unit = "lines")), axis.line = element_line( color = "grey70", size = 0.25, arrow = arrow(angle = 30, length = unit(0.15, "inches") )), plot.margin = structure(c(1, 0, 1, 0), unit = "lines", valid.unit = 3L, class = "unit"), legend.position = "none" ) theme_axes <- theme_void() + theme( text = element_text(family = "Fira Sans Book"), axis.title = element_text(size = 18), axis.title.x = element_text(hjust = .95, margin = margin(0.15, 0, 0, 0, unit = "lines")), axis.title.y = element_text(vjust = .95, margin = margin(0, 0.15, 0, 0, unit = "lines")), axis.line = element_line( color = grey_light, size = 0.25, arrow = arrow(angle = 30, length = unit(0.15, "inches") )), plot.margin = 
structure(c(1, 0, 1, 0), unit = "lines", valid.unit = 3L, class = "unit"), legend.position = "none" ) theme_set(theme_gray(base_size = 20)) # Column names for regression results reg_columns <- c("Term", "Est.", "S.E.", "t stat.", "p-Value") # Function for formatting p values format_pvi <- function(pv) { return(ifelse( pv < 0.0001, "<0.0001", round(pv, 4) %>% format(scientific = F) )) } format_pv <- function(pvs) lapply(X = pvs, FUN = format_pvi) %>% unlist() # Tidy regression results table tidy_table <- function(x, terms, highlight_row = 1, highlight_color = "black", highlight_bold = T, digits = c(NA, 3, 3, 2, 5), title = NULL) { x %>% tidy() %>% select(1:5) %>% mutate( term = terms, p.value = p.value %>% format_pv() ) %>% kable( col.names = reg_columns, escape = F, digits = digits, caption = title ) %>% kable_styling(font_size = 20) %>% row_spec(1:nrow(tidy(x)), background = "white") %>% row_spec(highlight_row, bold = highlight_bold, color = highlight_color) } ``` # Prologue --- name: schedule # Schedule ### Last time The *CEF* and least-squares regression ### Today Inference
.note[Read] *MHE* 3.1 ### Upcoming Lab (as usual) on Friday. (Meet Jenni!) No class on Monday. --- name: advice layout: false class: clear, middle .attn[Advice] Don't ride elevators (especially in PLC). --- layout: true # Inference --- class: inverse, middle --- name: why ## Why? .qa[Q] What's the big deal with inference? -- .qa[A] We rarely know the CEF or the population (and its regression vector). We *can* draw statistical inferences about the population using samples. -- .attn[Important] The topic of *statistical inference* is separate from *causality*. We face two separate questions: 1. How do we interpret the estimated coefficient $\hat{\beta}$? 2. What is the sampling distribution of $\hat{\beta}$? --- name: ols ## Moving from population to sample .attn[Recall] The population-regression function gives us the best linear approximation to the CEF. -- We're interested in the (unknown) population-regression vector $$ \begin{align} \beta = \mathop{E}\nolimits\left[ \text{X}_{i} \text{X}_{i}' \right]^{-1} \mathop{E}\left[ \text{X}_{i} \text{Y}_{i} \right] \end{align} $$ -- which we estimate via the ordinary least squares (OLS) estimator.super[.pink[†]] $$ \begin{align} \hat{\beta} = \left( \sum_i \text{X}_{i}\text{X}_{i}' \right)^{-1} \left( \sum_i \text{X}_{i} \text{Y}_{i} \right) \end{align} $$ .footnote[.pink[†] *MHE* presents a method-of-moments motivation for this derivation, where $\dfrac{1}{n}\sum_i \text{X}_{i} \text{X}_{i}'$ is our sample-based estimate of $\mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]$. You've also seen others, _e.g._, minimizing the MSE of $\text{Y}_{i}$ given $\text{X}_{i}$. ] --- ## A classic However you write it, this OLS estimator $$ \begin{align} \hat{\beta} &= \left( \text{X}^\prime \text{X} \right)^{-1} \text{X}^\prime \text{y} \\ &= \left( \sum_i \text{X}_{i}\text{X}_{i}' \right)^{-1} \left( \sum_i \text{X}_{i} \text{Y}_{i} \right) \\ &= \beta + \left[ \sum_i \text{X}_{i}\text{X}_{i}' \right]^{-1} \sum_i \text{X}_{i}e_i \end{align} $$ is the same estimator you've been using since undergrad. -- .note[Note] I'm following *MHE* in defining $e_i = \text{Y}_{i} - \text{X}_{i}'\beta$. --- ## A classic As you've learned, the OLS estimator $$ \begin{align} \hat{\beta} = \left( \sum_i \text{X}_{i}\text{X}_{i}' \right)^{-1} \left( \sum_i \text{X}_{i} \text{Y}_{i} \right) = \beta + \left[ \sum_i \text{X}_{i}\text{X}_{i}' \right]^{-1} \sum_i \text{X}_{i}e_i \end{align} $$ has asymptotic covariance $$ \begin{align} \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' e_i^2 \right] \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} \end{align} $$ -- which we estimate by (**1**) replacing $e_i$ with $\hat{e}_i = \text{Y}_{i} - \text{X}_{i}'\hat{\beta}$ and (**2**) replacing expectations with sample means, _e.g._, $\mathop{E}\left[ \text{X}_{i}\text{X}_{i}'e_i^2 \right]$ becomes $\dfrac{1}{n}\sum_i \left[ \text{X}_{i}\text{X}_{i}'\hat{e}_i^2 \right]$. -- Standard errors of this flavor are known as heteroskedasticity-consistent (or -robust) standard errors, also called Eicker-Huber-White standard errors. --- name: het ## Defaults Statistical packages default to assuming homoskedasticity, _i.e._, $\mathop{E}\left[ e_i^2 \mid \text{X}_{i} \right] = \sigma^2$ for all $i$.
-- With homoskedasticity, $$ \begin{align} \mathop{E}\left[ \text{X}_{i}\text{X}_{i}'e_i^2 \right] = \mathop{E}\left[ \mathop{E}\left[ \text{X}_{i}\text{X}_{i}'e_i^2 \mid \text{X}_{i} \right] \right] = \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \mathop{E}\left[ e_i^2 \mid \text{X}_{i} \right] \right] = \sigma^2 \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right] \end{align} $$ -- Now, returning to the asymptotic covariance matrix of $\hat{\beta}$, $$ \begin{align} \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} \mathop{E}\left[ \text{X}_{i} \text{X}_{i}' e_i^2 \right] \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} &= \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} \sigma^2 \mathop{E}\left[ \text{X}_{i} \text{X}_{i}' \right] \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} \\ &= \sigma^2 \mathop{E}\left[ \text{X}_{i}\text{X}_{i}' \right]^{-1} \end{align} $$ --- ## Defaults Angrist and Pischke argue we should probably make heteroskedasticity-robust standard errors our default. If the CEF is nonlinear, then our linear approximation (linear regression) generates heteroskedastic disturbances. -- $\mathop{E}\left[ \left( \text{Y}_{i} - \text{X}_{i}'\beta \right)^2 \mid \text{X}_{i} \right]$ --
  $= \mathop{E} \left[ \bigg( \big\{ \text{Y}_{i} - \mathop{E}\left[ \text{Y}_{i} \mid \text{X}_{i} \right] \big\} + \big\{ \mathop{E}\left[ \text{Y}_{i} \mid \text{X}_{i} \right] - \text{X}_{i}'\beta \big\} \bigg)^2 \Bigg| \text{X}_{i} \right]$ --
  $= \mathop{\text{Var}} \left( \text{Y}_{i} \mid \text{X}_{i} \right) + \left( \mathop{E}\left[ \text{Y}_{i} \mid \text{X}_{i} \right] - \text{X}_{i}'\beta \right)^2$ (the cross term drops out because $\mathop{E}\left[ \text{Y}_{i} - \mathop{E}\left[ \text{Y}_{i} \mid \text{X}_{i} \right] \mid \text{X}_{i} \right] = 0$) -- Thus, even if $\text{Y}_{i}\mid \text{X}_{i}$ has constant variance, $e_i \mid \text{X}_{i}$ is heteroskedastic. --- ## Two notes 1. Heteroskedasticity is .hi[not our biggest concern] in inference. > ...as an empirical matter, heteroskedasticity may matter very little... If heteroskedasticity matters a lot, say, more than a 30 percent increase or any marked decrease in standard errors, you should worry about possible programming errors or other problems. (*MHE*, p.47) 2. Notice that we've .hi[avoided "standard" stronger assumptions], _e.g._, normality, fixed regressors, linear CEF, homoskedasticity. -- Following (2): We only have large-sample, asymptotic results (consistency) rather than finite-sample results (unbiasedness). --- name: warning ## Warning Because many of the properties we care about for inference are .hi-pink[large-sample] properties, they may not always apply to .hi-purple[small samples]. -- One practical way we can study the behavior of an estimator: .b[simulation]. -- .note[Note] You need to make sure your simulation can actually test/respond to the question you are asking (_e.g._, bias *vs.* consistency). --- name: sim ## Simulation Let's compare false- and true-positive rates.super[.pink[†]] for .footnote[.pink[†] The false-positive rate goes by many names; another common name: *type-I error rate*.] 1. .hi-purple[Homoskedasticity-assuming standard errors] $\color{#6A5ACD}{\left( \mathop{\text{Var}} \left[ e_i | \text{X}_{i} \right] = \sigma^2\right)}$ 1. .hi-pink[Heteroskedasticity-robust standard errors] -- .b[Simulation outline] .pseudocode-small[ 1. Define data-generating process (DGP). 1. Choose sample size n. 1. Set seed. 1. Run 10,000 iterations of
  a. Draw sample of size n from DGP.
  b. Conduct inference.
  c. Record the inferences' outcomes. ] --- layout: true # Simulation --- class: inverse, middle --- name: dgp ## Data-generating process First, we'll define our DGP. -- We've been talking a lot about nonlinear CEFs, so let's use one. Let's keep the disturbances well behaved. -- $$ \begin{align} \text{Y}_{i} = 1 + e^{0.5 \text{X}_{i}} + \varepsilon_i \end{align} $$ where $\text{X}_{i}\sim\mathop{\text{Uniform}}(0, 10)$ and $\varepsilon_i\sim\mathop{N}(0,15^2)$. --- ## Data-generating process $$ \begin{align} \text{Y}_{i} = 1 + e^{0.5 \text{X}_{i}} + \varepsilon_i \end{align} $$ where $\text{X}_{i}\sim\mathop{\text{Uniform}}(0, 10)$ and $\varepsilon_i\sim\mathop{N}(0,15^2)$. -- ```{R, sim_seed, include = F} set.seed(12345) ``` .pull-left[ ```{R, sim_dgp} library(pacman) p_load(dplyr) # Choose a size n <- 1000 # Generate data dgp_df <- tibble( ε = rnorm(n, sd = 15), x = runif(n, min = 0, max = 10), y = 1 + exp(0.5 * x) + ε ) ``` ] -- .pull-right[ ```{R, sim_dply, printed, echo = F} dgp_df ``` ] --- layout: false class: clear, middle .orange[Our CEF] ```{R, sim_pop_plot1, echo = F} ggplot(data = dgp_df, aes(x = x, y = y)) + stat_function(fun = function(x) 1 + exp(0.5 * x), color = orange, size = 1.5) + geom_point(alpha = 0) + theme_pander(base_size = 18, base_family = "MathSansBook") + theme(axis.title.y = element_text(angle = 0, vjust = 0.5)) ``` --- count: false class: clear, middle Our population ```{R, sim_pop_plot2, echo = F} ggplot(data = dgp_df, aes(x = x, y = y)) + stat_function(fun = function(x) 1 + exp(0.5 * x), alpha = 0.9, color = orange, size = 1.5) + geom_point(alpha = 0.7, size = 1.8) + theme_pander(base_size = 18, base_family = "MathSansBook") + theme(axis.title.y = element_text(angle = 0, vjust = 0.5)) ``` --- count: false class: clear, middle .purple[The population least-squares regression line] ```{R, sim_pop_plot3, echo = F} ggplot(data = dgp_df, aes(x = x, y = y)) + stat_function(fun = function(x) 1 + exp(0.5 * x), alpha = 0.9, color = orange, size = 1.5) + geom_point(alpha = 0.2, size = 2) + stat_smooth(method = lm, se = F, color = purple, size = 2) + theme_pander(base_size = 18, base_family = "MathSansBook") + theme(axis.title.y = element_text(angle = 0, vjust = 0.5)) ``` --- layout: true # Simulation --- name: iter ## Iterating To make iterating easier, let's wrap our DGP in a function. ```{R, sim_fun} fun_iter <- function(iter, n = 30) { # Generate data iter_df <- tibble( ε = rnorm(n, sd = 15), x = runif(n, min = 0, max = 10), y = 1 + exp(0.5 * x) + ε ) } ``` We still need to run a regression and draw some inferences. .note[Note] We're defaulting to size-30 samples. --- We will use `lm_robust()` from the `estimatr` package for OLS and inference..super[.pink[†]] .footnote[.pink[†] `lm()` works for "spherical" standard errors but cannot calculate het.-robust standard errors.] - `se_type = "classical"` provides homoskedasticity-assuming SEs - `se_type = "HC2"` provides heteroskedasticity-robust SEs ```{R, ex_lm_robust} lm_robust(y ~ x, data = dgp_df, se_type = "classical") %>% tidy() %>% select(1:5) lm_robust(y ~ x, data = dgp_df, se_type = "HC2") %>% tidy() %>% select(1:5) ``` --- ## Inference Now add these estimators to our iteration function... 
```{R, sim_fun2} fun_iter <- function(iter, n = 30) { # Generate data iter_df <- tibble( ε = rnorm(n, sd = 15), x = runif(n, min = 0, max = 10), y = 1 + exp(0.5 * x) + ε ) # Estimate models lm1 <- lm_robust(y ~ x, data = iter_df, se_type = "classical") lm2 <- lm_robust(y ~ x, data = iter_df, se_type = "HC2") # Stack and return results bind_rows(tidy(lm1), tidy(lm2)) %>% select(1:5) %>% filter(term == "x") %>% mutate(se_type = c("classical", "HC2"), i = iter) } ``` --- ## Run it Now we need to actually run our `fun_iter()` function 10,000 times. -- There are a lot of ways to run a single function over a list/vector of values. - `lapply()`, _e.g._, `lapply(X = 1:3, FUN = sqrt)` - `for()`, _e.g._, `for (x in 1:3) sqrt(x)` - `map()` from `purrr`, _e.g._, `map(1:3, sqrt)` -- We're going to go with `map()` from the `purrr` package because it easily parallelizes across platforms using the `furrr` package. --- name: parallel ## Run it! .pull-left[ Run our function 10,000 times ```{R, ex_sim, eval = F} # Packages p_load(purrr) # Set seed set.seed(12345) # Run 10,000 iterations sim_list <- map(1:1e4, fun_iter) ``` ] -- .pull-right[ Parallelized 10,000 iterations ```{R, ex_sim2, eval = T, cache = T} # Packages p_load(purrr, furrr) # Set options set.seed(123) # Tell R to parallelize plan(multiprocess) # Run 10,000 iterations sim_list <- future_map( 1:1e4, fun_iter, .options = future_options(seed = T) ) ``` ] -- The `furrr` package (`future` + `purrr`) makes parallelization .hi-pink[easy] .hi-orange[and] .hi-turquoise[fun].hi-purple[!]😸 --- ## Run it!! Our `fun_iter()` function returns a `data.frame`, and `future_map()` returns a `list` (of the returned objects). So `sim_list` is going to be a `list` of `data.frame` objects. We can bind them into one `data.frame` with `bind_rows()`. ```{R, sim_bind} # Bind list together sim_df <- bind_rows(sim_list) ``` -- So what are the results? --- name: results layout: false class: clear, middle Comparing the distributions of standard errors for the coefficient on $x$ ```{R, sim_plot1, echo = F} ggplot(data = sim_df, aes(x = std.error, fill = se_type)) + geom_density(color = NA) + geom_hline(yintercept = 0) + xlab("Standard error") + ylab("Density") + scale_fill_viridis( "", labels = c("Classical", "Het. Robust"), discrete = T, option = "B", begin = 0.25, end = 0.85, alpha = 0.9 ) + theme_pander(base_size = 20, base_family = "Fira Sans Book") ``` --- class: clear, middle Comparing the distributions of $t$ statistics for the coefficient on $x$ ```{R, sim_plot2, echo = F} ggplot(data = sim_df, aes(x = statistic, fill = se_type)) + geom_density(color = NA) + geom_hline(yintercept = 0) + geom_vline(xintercept = qt(0.975, df = 28), linetype = "longdash", color = red_pink) + xlab("t statistic") + ylab("Density") + scale_fill_viridis( "", labels = c("Classical", "Het. Robust"), discrete = T, option = "B", begin = 0.25, end = 0.85, alpha = 0.9 ) + theme_pander(base_size = 20, base_family = "Fira Sans Book") ``` --- class: clear, middle .qa[Q] All of these tests are for a false H.sub[0]. How would the simulation change to enforce a *true* null hypothesis? --- layout: true # Simulation --- name: null ## Updating to enforce the null Let's update our simulation function to take arguments `γ` and `δ` such that $$ \begin{align} \text{Y}_{i} = 1 + e^{\gamma \text{X}_{i}} + \varepsilon_i \end{align} $$ where $\varepsilon_i\sim\mathop{\text{N}}(0,\sigma^2\text{X}_{i}^{2\delta})$. -- In other words, - $\gamma = 0$ implies no relationship between $\text{Y}_{i}$ and $\text{X}_{i}$. 
- $\delta = 0$ implies homoskedasticity. --- ## Updating to enforce the null Updating the function... ```{R, sim_null} flex_iter <- function(iter, γ = 0, δ = 1, n = 30) { # Generate data iter_df <- tibble( x = runif(n, min = 0, max = 10), ε = rnorm(n, sd = 15 * x^δ), y = 1 + exp(γ * x) + ε ) # Estimate models lm1 <- lm_robust(y ~ x, data = iter_df, se_type = "classical") lm2 <- lm_robust(y ~ x, data = iter_df, se_type = "HC2") # Stack and return results bind_rows(tidy(lm1), tidy(lm2)) %>% select(1:5) %>% filter(term == "x") %>% mutate(se_type = c("classical", "HC2"), i = iter) } ``` --- ## Run again! Now we run our new function `flex_iter()` 10,000 times. ```{R, ex_sim3, eval = T, cache = T} # Packages p_load(purrr, furrr) # Set options set.seed(123) # Tell R to parallelize plan(multiprocess) # Run 10,000 iterations null_df <- future_map( 1:1e4, flex_iter, # Enforce the null hypothesis γ = 0, # Specify heteroskedasticity δ = 1, .options = future_options(seed = T) ) %>% bind_rows() ``` --- layout: false class: clear, middle Comparing the distributions of standard errors for the coefficient on $x$ ```{R, sim_plot3, echo = F} ggplot(data = null_df, aes(x = std.error, fill = se_type)) + geom_density(color = NA) + geom_hline(yintercept = 0) + xlab("Standard error") + ylab("Density") + scale_fill_viridis( "", labels = c("Classical", "Het. Robust"), discrete = T, option = "B", begin = 0.25, end = 0.85, alpha = 0.9 ) + theme_pander(base_size = 20, base_family = "Fira Sans Book") ``` --- class: clear, middle Comparing the distributions of $t$ statistics for the coefficient on $x$ ```{R, null_plot4, echo = F} ggplot(data = null_df, aes(x = statistic, fill = se_type)) + geom_density(color = NA) + geom_hline(yintercept = 0) + geom_vline(xintercept = qt(0.975, df = 28), linetype = "longdash", color = red_pink) + xlab("t statistic") + ylab("Density") + scale_fill_viridis( "", labels = c("Classical", "Het. Robust"), discrete = T, option = "B", begin = 0.25, end = 0.85, alpha = 0.9 ) + theme_pander(base_size = 20, base_family = "Fira Sans Book") ``` --- class: clear, middle Distributions of *p*-values: both methods slightly over-reject the (true) null ```{R, null_plot5, echo = F} ggplot(data = null_df, aes(x = p.value, fill = se_type)) + geom_density(color = NA) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0.05, linetype = "longdash", color = red_pink) + xlab("p-value") + ylab("Density") + scale_fill_viridis( "", labels = c("Classical", "Het. Robust"), discrete = T, option = "B", begin = 0.25, end = 0.85, alpha = 0.9 ) + theme_pander(base_size = 20, base_family = "Fira Sans Book") ``` --- layout: false # Table of contents .pull-left[ ### Admin .smaller[ 1. [Schedule](#schedule) 1. [Advice](#advice) ] ] .pull-right[ ### Inference .smaller[ 1. [Why?](#why) 1. [OLS](#ols) 1. [Heteroskedasticity](#het) 1. [Small-sample warning](#warning) 1. [Simulation](#sim) - [Outline](#sim) - [DGP](#dgp) - [Iterating](#iter) - [Parallelization](#parallel) - [Results](#results) - [Under the null](#null) ] ] --- exclude: true ```{R, generate pdfs, include = F, eval = T} source("../../ScriptsR/unpause.R") unpause("04Inference.Rmd", ".", T, T) ```
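---
layout: false
# Simulation

## Appendix: Rejection rates

The density plots above compare entire distributions. A quick numerical follow-up is to compute rejection rates directly. Below is a minimal sketch (left unevaluated; it assumes the `sim_df` and `null_df` data frames built above are still in memory): the share of iterations in which each method rejects H.sub[0] at the 5% level gives the true-positive rate in the first simulation (false null) and the false-positive rate in the second (true null).

```{R, rejection_rates, eval = F}
# Sketch: assumes sim_df and null_df (created above) are in memory
# Share of iterations with p < 0.05, by standard-error type
# True-positive rate (the null is false in the nonlinear DGP)
sim_df %>%
  group_by(se_type) %>%
  summarize(rejection_rate = mean(p.value < 0.05))
# False-positive rate (the null is true when γ = 0)
null_df %>%
  group_by(se_type) %>%
  summarize(rejection_rate = mean(p.value < 0.05))
```

For a well-sized 5% test, the second set of rejection rates should sit near 0.05 for each `se_type`.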