class: center, middle, inverse, title-slide

# ECON 3818
## Chapter 5
### Kyle Butts
### 29 November 2021

---
class: clear, middle

<!-- Custom css -->

<style type="text/css">
/* -------------------------------------------------------
 *
 * !! This file was generated by xaringanthemer !!
 *
 * Changes made to this file directly will be overwritten
 * if you used xaringanthemer in your xaringan slides Rmd
 * ------------------------------------------------------- */
@import url(https://fonts.googleapis.com/css?family=Roboto&display=swap);
@import url(https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700&display=swap);
@import url(https://fonts.googleapis.com/css2?family=Atkinson+Hyperlegible&display=swap);

:root {
  /* Fonts */
  /* FIX: family name corrected from the misspelled 'Atkinson Hyperelegible'
     so it matches the imported Google font; the misspelling made the
     browser silently fall back to the Roboto stack. */
  --text-font-family: 'Atkinson Hyperlegible';
  --text-font-is-google: 1;
  --text-font-family-fallback: Roboto, -apple-system, BlinkMacSystemFont, avenir next, avenir, helvetica neue, helvetica, Ubuntu, roboto, noto, segoe ui, arial;
  --text-font-base: sans-serif;
  /* FIX: added the missing ';' here — without it the following
     declaration was swallowed into this value, invalidating both
     --header-font-family and --header-font-is-google. */
  --header-font-family: 'Atkinson Hyperlegible';
  --header-font-is-google: 1;
  --header-font-family-fallback: Georgia, serif;
  --code-font-family: 'Source Code Pro';
  --code-font-is-google: 1;
  --base-font-size: 20px;
  --text-font-size: 1rem;
  --code-font-size: 0.9rem;
  --code-inline-font-size: 1em;
  --header-h1-font-size: 1.75rem;
  --header-h2-font-size: 1.6rem;
  --header-h3-font-size: 1.5rem;
  /* Colors */
  --text-color: #131516;
  --text-color-light: #555F61;
  --header-color: #FFF;
  --background-color: #FFF;
  --link-color: #107895;
  --code-highlight-color: rgba(255,255,0,0.5);
  --inverse-text-color: #d6d6d6;
  --inverse-background-color: #272822;
  --inverse-header-color: #f3f3f3;
  --inverse-link-color: #107895;
  --title-slide-background-color: #272822;
  --title-slide-text-color: #d6d6d6;
  --header-background-color: #FFF;
  --header-background-text-color: #FFF;
}

html {
  font-size: var(--base-font-size);
}

body {
  font-family:
var(--text-font-family), var(--text-font-family-fallback), var(--text-font-base); font-weight: normal; color: var(--text-color); } h1, h2, h3 { font-family: var(--header-font-family), var(--header-font-family-fallback); color: var(--text-color-light); } .remark-slide-content { background-color: var(--background-color); font-size: 1rem; padding: 24px 32px 16px 32px; width: 100%; height: 100%; } .remark-slide-content h1 { font-size: var(--header-h1-font-size); } .remark-slide-content h2 { font-size: var(--header-h2-font-size); } .remark-slide-content h3 { font-size: var(--header-h3-font-size); } .remark-code, .remark-inline-code { font-family: var(--code-font-family), Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; } .remark-code { font-size: var(--code-font-size); } .remark-inline-code { font-size: var(--code-inline-font-size); color: #000; } .remark-slide-number { color: #107895; opacity: 1; font-size: 0.9em; } a, a > code { color: var(--link-color); text-decoration: none; } .footnote { position: absolute; bottom: 60px; padding-right: 6em; font-size: 0.9em; } .remark-code-line-highlighted { background-color: var(--code-highlight-color); } .inverse { background-color: var(--inverse-background-color); color: var(--inverse-text-color); } .inverse h1, .inverse h2, .inverse h3 { color: var(--inverse-header-color); } .inverse a, .inverse a > code { color: var(--inverse-link-color); } img, video, iframe { max-width: 100%; } blockquote { border-left: solid 5px lightgray; padding-left: 1em; } @page { margin: 0; } @media print { .remark-slide-scaler { width: 100% !important; height: 100% !important; transform: scale(1) !important; top: 0 !important; left: 0 !important; } } /* Modified metropolis */ .clear{ border-top: 0px solid #FAFAFA; } h1 { margin-top: -5px; margin-left: -00px; margin-bottom: 30px; color: var(--text-color-light); font-weight: 200; } h2, h3, h4 { padding-top: -15px; padding-bottom: 00px; color: #1A292C; text-shadow: none; font-weight: 
400; text-align: left; margin-left: 00px; margin-bottom: -10px; } .title-slide .inverse .remark-slide-content { background-color: #FAFAFA; } .title-slide { background-color: #FAFAFA; border-top: 80px solid #FAFAFA; } .title-slide h1 { color: var(--text-color); font-size: 40px; text-shadow: none; font-weight: 400; text-align: left; margin-left: 15px; } .title-slide h2 { margin-top: -15px; color: var(--link-color); text-shadow: none; font-weight: 300; font-size: 35px; text-align: left; margin-left: 15px; } .title-slide h3 { color: var(--text-color-light); text-shadow: none; font-weight: 300; font-size: 25px; text-align: left; margin-left: 15px; margin-bottom: 0px; } .title-slide h3:last-of-type { font-style: italic; font-size: 1rem; } /* Remove orange line */ hr, .title-slide h2::after, .mline h1::after { content: ''; display: block; border: none; background-color: #e5e5e5; color: #e5e5e5; height: 1px; } hr, .mline h1::after { margin: 1em 15px 0 15px; } .title-slide h2::after { margin: 10px 15px 35px 0; } .mline h1::after { margin: 10px 15px 0 15px; } /* turns off slide numbers for title page: https://github.com/gnab/remark/issues/298 */ .title-slide .remark-slide-number { display: none; } /* Custom CSS */ /* More line spacing */ body { line-height: 1.5; } /* Font styling */ .hi { font-weight: 600; } .mono { font-family: monospace; } .ul { text-decoration: underline; } .ol { text-decoration: overline; } .st { text-decoration: line-through; } .bf { font-weight: bold; } .it { font-style: italic; } /* Font Sizes */ .bigger { font-size: 125%; } .huge{ font-size: 150%; } .small { font-size: 95%; } .smaller { font-size: 85%; } .smallest { font-size: 75%; } .tiny { font-size: 50%; } /* Remark customization */ .clear .remark-slide-number { display: none; } .inverse .remark-slide-number { display: none; } .remark-code-line-highlighted { background-color: rgba(249, 39, 114, 0.5); } /* Xaringan tweeks */ .inverse { background-color: #23373B; text-shadow: 0 0 20px #333; /* 
text-shadow: none; */ } .title-slide { background-color: #ffffff; border-top: 80px solid #ffffff; } .footnote { bottom: 1em; font-size: 80%; color: #7f7f7f; } /* Lists */ li { margin-top: 4px; } /* Mono-spaced font, smaller */ .mono-small { font-family: monospace; font-size: 16px; } .mono-small .mjx-chtml { font-size: 103% !important; } .pseudocode, .pseudocode-small { font-family: monospace; background: #f8f8f8; border-radius: 3px; padding: 10px; padding-top: 0px; padding-bottom: 0px; } .pseudocode-small { font-size: 16px; } .remark-code { font-size: 68%; } .remark-inline-code { background: #F5F5F5; /* lighter */ /* background: #e7e8e2; /* darker */ border-radius: 3px; padding: 4px; } /* Super and Subscripts */ .super{ vertical-align: super; font-size: 70%; line-height: 1%; } .sub{ vertical-align: sub; font-size: 70%; line-height: 1%; } /* Subheader */ .subheader{ font-weight: 100; font-style: italic; display: block; margin-top: -25px; margin-bottom: 25px; } /* 2/3 left; 1/3 right */ .more-left { float: left; width: 63%; } .less-right { float: right; width: 31%; } .more-right ~ * { clear: both; } /* 9/10 left; 1/10 right */ .left90 { padding-top: 0.7em; float: left; width: 85%; } .right10 { padding-top: 0.7em; float: right; width: 9%; } /* 95% left; 5% right */ .left95 { padding-top: 0.7em; float: left; width: 91%; } .right05 { padding-top: 0.7em; float: right; width: 5%; } .left5 { padding-top: 0.7em; margin-left: 0em; margin-right: -0.4em; float: left; width: 7%; } .left10 { padding-top: 0.7em; margin-left: -0.2em; margin-right: -0.5em; float: left; width: 10%; } .left30 { padding-top: 0.7em; float: left; width: 30%; } .right30 { padding-top: 0.7em; float: right; width: 30%; } .thin-left { padding-top: 0.7em; margin-left: -1em; margin-right: -0.5em; float: left; width: 27.5%; } /* Example */ .ex { font-weight: 300; color: #555F61 !important; font-style: italic; } .col-left { float: left; width: 47%; margin-top: -1em; } .col-right { float: right; width: 47%; 
margin-top: -1em; } .clear-up { clear: both; margin-top: -1em; } /* Format tables */ table { color: #000000; font-size: 14pt; line-height: 100%; border-top: 1px solid #ffffff !important; border-bottom: 1px solid #ffffff !important; } th, td { background-color: #ffffff; } table th { font-weight: 400; } /* Attention */ .attn { font-weight: 500; color: #e64173 !important; font-family: 'Zilla Slab' !important; } /* Note */ .note { font-weight: 300; font-style: italic; color: #314f4f !important; /* color: #cccccc !important; */ font-family: 'Zilla Slab' !important; } /* Question and answer */ .qa { font-weight: 500; /* color: #314f4f !important; */ color: #e64173 !important; font-family: 'Zilla Slab' !important; } /* Figure Caption */ .caption { font-size: 0.8888889em; line-height: 1.5; margin-top: 1em; color: #6b7280; } </style> <!-- From xaringancolor --> <div style = "position:fixed; visibility: hidden"> $$ \require{color} \definecolor{purple}{rgb}{0.337254901960784, 0.00392156862745098, 0.643137254901961} \definecolor{navy}{rgb}{0.0509803921568627, 0.23921568627451, 0.337254901960784} \definecolor{ruby}{rgb}{0.603921568627451, 0.145098039215686, 0.0823529411764706} \definecolor{alice}{rgb}{0.0627450980392157, 0.470588235294118, 0.584313725490196} \definecolor{daisy}{rgb}{0.92156862745098, 0.788235294117647, 0.266666666666667} \definecolor{coral}{rgb}{0.949019607843137, 0.427450980392157, 0.129411764705882} \definecolor{kelly}{rgb}{0.509803921568627, 0.576470588235294, 0.337254901960784} \definecolor{jet}{rgb}{0.0745098039215686, 0.0823529411764706, 0.0862745098039216} \definecolor{asher}{rgb}{0.333333333333333, 0.372549019607843, 0.380392156862745} \definecolor{slate}{rgb}{0.192156862745098, 0.309803921568627, 0.309803921568627} \definecolor{cranberry}{rgb}{0.901960784313726, 0.254901960784314, 0.450980392156863} $$ </div> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ TeX: { Macros: { purple: ["{\\color{purple}{#1}}", 1], navy: ["{\\color{navy}{#1}}", 
1], ruby: ["{\\color{ruby}{#1}}", 1], alice: ["{\\color{alice}{#1}}", 1], daisy: ["{\\color{daisy}{#1}}", 1], coral: ["{\\color{coral}{#1}}", 1], kelly: ["{\\color{kelly}{#1}}", 1], jet: ["{\\color{jet}{#1}}", 1], asher: ["{\\color{asher}{#1}}", 1], slate: ["{\\color{slate}{#1}}", 1], cranberry: ["{\\color{cranberry}{#1}}", 1] }, loader: {load: ['[tex]/color']}, tex: {packages: {'[+]': ['color']}} } }); </script> <style> .purple {color: #5601A4;} .navy {color: #0D3D56;} .ruby {color: #9A2515;} .alice {color: #107895;} .daisy {color: #EBC944;} .coral {color: #F26D21;} .kelly {color: #829356;} .jet {color: #131516;} .asher {color: #555F61;} .slate {color: #314F4F;} .cranberry {color: #E64173;} </style> ## Chapter 5: Regression --- # Review from Last Class Recall the ways we discussed relationships between two random variables `\(X\)` and `\(Y\)`: Covariance, `\(\sigma_{XY}\)` (sample analogue: \\(s_{XY}\\) ) - Direction matters, but magnitude is hard to interpret Correlation, `\(\rho_{XY}\)` (sample analogue: \\(r_{XY}\\) ) - Direction and magnitude matter - Correlation is always value between `\([-1,1]\)` --- # Review from Last Class Recall: `$$r=\frac{Cov(X,Y)}{\sqrt{Var(X)}\cdot \sqrt{Var(Y)}}$$` - Correlation is a function of covariance, just normalizes the magnitudes so we can interpret. --- # Clicker Question Suppose you calculate the sample covariance, `\(s_{XY}=1.2\)`, and the sample standard deviations `\(s_X=2\)` and `\(s_Y=2.5\)`. What is the sample correlation, `\(r_{XY}\)`? <ol type = "a"> <li>0.0576</li> <li>0.24</li> <li>0.048</li> <li>4.17</li> </ol> --- # Relationship between X and Y We often summarize the relationship between `\(X\)` and `\(Y\)` using a straight line: <img src="data:image/png;base64,#ch5_files/figure-html/unnamed-chunk-1-1.svg" width="70%" style="display: block; margin: auto;" /> This is called the line of best fit, or the .hi.coral[regression line]. 
--- # Regression Line Example In the previous example, the regression line is given by `$$\widehat{\text{Average SAT Reading}} = 78.87 + 0.7983 * \text{Average SAT Math}$$` The `\(\widehat{ }\)` symbol means that we are *predicting* average SAT reading score. -- <br/> .ex[Example:] If a school has an average SAT math score of 600, we would predict their SAT reading score would be `$$\widehat{\text{Average SAT Reading}} = 78.87 + 0.7983 * 600 = 557.85.$$` --- # Regression Line Let `\(X\)` and `\(Y\)` be two random variables. A .hi.coral[regression line] is a straight line that describes how the response variable, `\(Y\)`, changes as the explanatory variable `\(X\)` changes. We often use a regression line to predict the value of `\(Y\)` for a given value of `\(X\)`, *when we believe the relationship between `\(X\)` and `\(Y\)` is linear*. -- Since we are predicting `\(Y\)` with a line, our regression line will be given by: `$$\underbrace{\coral{\hat{Y}_i}}_{\text{Prediction for } Y_i} = \underbrace{\hat{a}}_{\text{Intercept}} + \underbrace{\hat{b}}_{\text{Slope}} * X_i$$` `\(\coral{\hat{Y}_i}\)` is called the .hi.coral[predicted value] --- # Residual Errors However, `\(\coral{\hat{Y}}\)` will not be equal to `\(Y\)`. Therefore our .hi.kelly[prediction error] for unit `\(i\)` is given by $$ \kelly{\varepsilon_i} = Y_i - \coral{\hat{Y}_i} $$ <img src="data:image/png;base64,#ch5_files/figure-html/unnamed-chunk-2-1.svg" width="70%" style="display: block; margin: auto;" /> --- # Being the "best" **Question:** How do we estimate the intercept `\(\hat{a}\)` and the slope `\(\hat{b}\)` line of "best fit"? -- **Answer:** 1. What about minimizing of `\(\sum_i \kelly{\varepsilon_i}\)`, *i.e. lower total prediction error*? -- - `\(\kelly{\varepsilon_i}\)` could be positive or negative if we over/under-estimate `\(\coral{\hat{Y}_i}\)`. If we add up all the residuals, the positive and negatives will sometimes cancel out (same reasoning as variance). 
This isn't good because we want to be 'penalized' for all prediction errors. -- 2. Therefore we minimize `\(\kelly{\varepsilon_i}^2\)` because each `\(\kelly{\varepsilon_i}^2\)` will be positive. - In general,*the line of best-fit* means the line that minimizes the .kelly[sum of squared errors] (SSE): --- # Sum of Squared Errors $$ SSE = \sum_{i=1}^n (y_i - \hat{a} - \hat{b} X_i)^2 $$ --- # OLS *vs.* other lines/estimators Let's consider a random dataset <img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-1-1.svg" style="display: block; margin: auto;" /> --- count: false # OLS *vs.* other lines/estimators For any line (_i.e._, `\(\hat{y} = \hat{a} + \hat{b} x\)`) that we draw <img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-2-1.svg" style="display: block; margin: auto;" /> --- count: false # OLS *vs.* other lines/estimators For any line (*i.e.*, `\(\hat{y} = \hat{a} + \hat{b} x\)`) that we draw, we can calculate the errors: `\(e_i = y_i - \coral{\hat{Y}_i}\)` <img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-3-1.svg" style="display: block; margin: auto;" /> --- count: false # OLS *vs.* other lines/estimators For any line (_i.e._, `\(\hat{y} = \hat{a} + \hat{b} x\)`) that we draw, we can calculate the errors: `\(e_i = y_i - \coral{\hat{Y}_i}\)` <img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-4-1.svg" style="display: block; margin: auto;" /> --- count: false # OLS *vs.* other lines/estimators For any line (_i.e._, `\(\hat{y} = \hat{a} + \hat{b} x\)`) that we draw, we can calculate the errors: `\(e_i = y_i - \coral{\hat{Y}_i}\)` <img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-5-1.svg" style="display: block; margin: auto;" /> --- count: false # OLS *vs.* other lines/estimators Because SSE squares the errors (_i.e._, `\(\sum e_i^2\)`), big errors are penalized more than small ones. 
<img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-6-1.svg" style="display: block; margin: auto;" /> --- count: false # OLS *vs.* other lines/estimators The OLS estimate is the combination of `\(\hat{a}\)` and `\(\hat{b}\)` that minimize SSE. <img src="data:image/png;base64,#ch5_files/figure-html/ols-vs-lines-7-1.svg" style="display: block; margin: auto;" /> --- # Least Squares Regression Line Let `\(\{X_i,Y_i\}_{i=1}^n\)` be a *simple random sample* of `\(X\)` and `\(Y\)`. The .hi.coral[least squares regression line] is the equation `\(\widehat{Y}_i = \hat{a} + \hat{b} X_i\)`, where `\(\hat{a}\)` and `\(\hat{b}\)` minimize $$ \sum_{i=1}^n (Y_i - a - b * X_i)^2 $$ -- The solution is: `$$\hat{b} = r_{XY}\frac{s_Y}{s_X}$$` `$$\hat{a} = \bar{Y} - \hat{b} \bar{X}$$` --- # Clicker Question Consider the NHIS dataset. Let `\(Y\)` be a person's weight in pounds, and `\(X\)` be the number of drinks per day they consume (on average). You calculate the following: `\(\bar{Y} = 176.5889\)`, `\(\bar{X} = 2.2489\)`, `\(s_Y = 39.86577\)`, `\(s_X = 1.804856\)`, `\(r_{XY} = 0.1187268\)` What is the regression line you fit to the data? 
<ol type = "a"> <li>\( \widehat{Y}_i = 176.5889 + 2.6224 X_i \)</li> <li>\( \widehat{Y}_i = 176.5768 + 0.0054 X_i \)</li> <li>\( \widehat{Y}_i = 126.9151 + 22.08814 X_i \)</li> <li>\( \widehat{Y}_i = 170.6913 + 2.6224 X_i \)</li> </ol> --- # Interpreting a Regression A regression line is a straight line that describes how a response variable `\(y\)` changes as an explanatory variable `\(x\)` changes We often use a regression line to predict the value of `\(y\)` for a given value of `\(x\)`, when we believe the relationship is linear A linear relationship is of the form: `$$y=a + b x$$` - `\(a\)` is the value of `\(y\)` whenever `\(x=0\)` - `\(b\)` is the amount `\(y\)` changes when `\(x\)` increases by one --- # Interpreting a Regression Lets go back to our clicker question, we calculated the line of best fit to be: $$ \coral{\hat{Y}_i}=170.69+2.62X_i$$ where `\(Y\)` is a person's weight in pounds, and `\(X\)` is the number of drinks per day they consume (on average) - Our intercept, 170.69 is the predicted weight for someone who doesn't consume any alcohol - Our best guess at `\(Y\)` when `\(X = 0\)`, - Our slope, 2.62 is the amount predicted weight increases when number of drinks per day increases by 1 - The amount `\(\coral{\hat{Y}_i}\)` changes when `\(X_i\)` increases by 1 --- # Interpreting a Regression Say we calculate the following regression line from hours studied and final exam grades: $$ \coral{\widehat{\text{Final Exam}}} = 38 + 5.7 * \text{Hours of Studying} $$ -- - 38 is predicted score with no studying -- - Each hour of studying increases predicted final exam score by 5.7 points --- # Clicker Question Given that same regression line, `\(\text{Final Exam}=38 + 5.7*\text{Hours of Studying}\)`, what is the predicted final exam score if you study 8 hours? 
<ol type = "a"> <li> 83.6 </li> <li> 45.6 </li> <li> 96.3 </li> </ol> --- # Clicker Question A store in Boulder calculates a least squares line that describes how price (in dollars) of juuls affects the quantity sold; `$$\widehat{\text{Juuls sold}} = 117 - 12.4 * \text{price}$$` If price *decreases* by 1 dollar, what happens to number of juuls sold? <ol type = "a"> <li>Quantity decreases by 12.4</li> <li>Quantity increases 12.4</li> <li>Quantity decreases by 117</li> <li>Quantity increases by 117</li> </ol> --- # Properties of Regression Residuals The slope, `\(b\)`, and intercept, `\(a\)`, of the regression line are found by minimizing `\(\sum_{i=1}^n \varepsilon_i^2\)`. This forces `\(\bar{\varepsilon}=0\)`. Hence, an .hi.purple[assumption] of regression is that `\(E(\varepsilon)=0\)`. Intuitively, this assumption means that the error in your prediction is due entirely to randomness. --- # Overview of Regression Analysis A researcher is studying the relationship between high school students' SAT scores and their GPA during their freshman year of college. The data has a linear correlation coefficient of 0.503. Additional sample statistics are summarized in the table below:
<table>
  <thead>
    <tr>
      <th>Variable</th>
      <th>Description</th>
      <th>Sample Mean</th>
      <th>Sample Std. Dev</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>X</td>
      <td>SAT score</td>
      <td>\( \bar{X}=1501.72 \)</td>
      <td>\( s_X=104.14 \)</td>
    </tr>
    <tr>
      <td>Y</td>
      <td>GPA</td>
      <td>\( \bar{Y}=3.3 \)</td>
      <td>\( s_Y=0.45 \)</td>
    </tr>
  </tbody>
</table>
- What is the slope and intercept of this regression line? Write the linear regression using the `\(Y=a+bX\)` format. - Interpret the slope and intercept coefficients - What is the predicted GPA if the student got a 1600 on the SAT? --- # `\(R^2\)` Next we define a measure to evaluate how well the regression line fits: $$ R^2 = \frac{\sum_{i=1}^n (\hat{Y}_i - \bar{Y})^2}{\sum_{i=1}^n (Y_i - \bar{Y})^2} $$ --- # Intuition of `\(R^2\)` Intuitively, `\(R^2\)` measures the percent of variation in `\(Y\)` explained by the model. `$$\cranberry{R^2} = \frac{\text{variation in } \hat{y} \text{ along the regression line as x varies}}{\text{total variation in observed values of y}}$$` --- class: clear,middle <img src="data:image/png;base64,#Rsquaredcomparisons.png" width="90%" style="display: block; margin: auto;" /> --- # `\(r\)` and `\(R^2\)` Correlation, `\(r\)`, describes the strength of a straight-line relationship between two variables `\(R^2\)`, is the fraction of the variation in the values of y that is explained by the least-squares regression of y on x `$$\cranberry{R^2} = r^2$$` --- # `\(r\)` and `\(R^2\)` Lets say we have `\(r = -0.7786\)` and `\(\cranberry{R^2} = (-0.7786)^2 = 0.6062\)` between exercise and fat gain. - `\(r = -0.7786\)`, there is a strong negative linear relationship between time exercised and amount of weight gained - `\(\cranberry{R^2} = 0.6062\)`, about 61% of the variation in fat gained is accounted for by the linear relationship between fat gain and exercise. This means about 39% of the change in fat gained is not explained by this relationship --- # Clicker Question Say we run a regression on the temperature and the amount of gas used to heat a home. We find that the `\(r=-0.99\)` and `\(R^2=0.98\)`. 
This suggests that: <ol type = "a"> <li>although temperature and gas used are very correlated, the temperature does not make very good predictions of the amount of gas used</li> <li>gas used increases by 0.99 cubic feet for each additional degree colder it is outside </li> <li>prediction of gas used based off temperature will be quite accurate</li> </ol> --- # `\(R^2\)` Sidebar A small `\(R^2\)` does not mean the result is uninteresting. All it means is that the x variable alone does not explain a large portion of the variation in y. <br/> .ex[Example:] You find significant relationship between exercise and income, but it has a small `\(R^2\)`. We know income is determined by a variety of variables -- parent's income, education, innate ability, experience, etc. - Your result isn't uninteresting, it just means there is a lot of variation in income .it[not due] to exercise, which is exactly what we'd expect. --- # `\(R^2\)` Example Recall from our previous example that a researcher calculated a correlation coefficient 0.503 between SAT scores and college freshman GPA. This implies an `\(R^2\)` of 0.253. What does this `\(R^2\)` mean? -- Does this make sense? What other things could explain the variation in freshman year GPA? --- # Influential Observations Our regression line is sensitive to outliers, either in the x or y dimension - We say an outlier is .hi.daisy[influential] if deleting it changes our regression line substantially - The amount by which the line changes is called the .hi.daisy[leverage] an influential observation has --- # Example of Influential Observation Let's revisit the NYC Math and Reading SAT Scores example. <img src="data:image/png;base64,#ch5_files/figure-html/nyc-sat-outlier-1-1.svg" width="80%" style="display: block; margin: auto;" /> --- # Example of Influential Observation Suppose we had an outlier of a Math score of 700 and a Reading score of 340. 
<img src="data:image/png;base64,#ch5_files/figure-html/nyc-sat-outlier-2-1.svg" width="80%" style="display: block; margin: auto;" /> --- # Example of Influential Observation Suppose we had an outlier of a Math score of 700 and a Reading score of 340. That data point has quite a bit of leverage because it is an extreme outlier. <img src="data:image/png;base64,#ch5_files/figure-html/nyc-sat-outlier-3-1.svg" width="80%" style="display: block; margin: auto;" /> --- # Cautions about Correlation and Regression Correlation and regression are powerful tools for describing the relationship between two variables, but they have their limitations - These tools only describe linear relationships - They are not resistant to outliers -- There is one more major issue with discussing correlations: -- .center.huge.hi.ruby[Correlation does not imply causation] --- # Spurious Correlations .hi[Spurious Correlation] is things move in the same direction, even if they are completely unrelated. <img src="data:image/png;base64,#spcorr1.png" width="100%" style="display: block; margin: auto;" /> .footnote[http://www.tylervigen.com/spurious-correlations] --- # Spurious Correlations .hi[Spurious Correlation] is things move in the same direction, even if they are completely unrelated. <img src="data:image/png;base64,#spcorr2.png" width="100%" style="display: block; margin: auto;" /> .footnote[http://www.tylervigen.com/spurious-correlations] --- # Lurking Variable A .hi[lurking variable] is a variable that is not among the explanatory or response in a study and yet may influence the interpretation of relationships among those variables. Also known as .hi[omitted variable bias]. - Experience in music and test scores -- family background is a lurking variable - Ice cream sales and number of violent police reports -- both of these things increase when the weather is warm --- # Ecological Correlation A correlation based on averages rather than on individuals is called an .hi[ecological correlation]. 
The idea being that the correlation between averages may be stronger than the correlation at the individual level Examples: - Number of years of education and average income level - Hours of weekly exercise and body mass index These relationships are very strong when we look at everyone, but may not be as strong when we analyze at the individual level --- # Extrapolation .hi[Extrapolation] is the use of a regression line for a prediction far outside the range of values of the explanatory variable `\(x\)` that you used to obtain the line The idea here is that not many relationships are linear for .it[all values of x] Example: - Age and height, eventually you stop growing - Experience and pay, eventually your salary levels off (usually) --- # Clicker Question A study of elementary school children, ages 6 to 11, finds a high positive correlation between shoe size and score on a test of reading comprehension. The observed correlation is most likely due to: <ol type = "a"> <li>cause and effect (larger shoe size causes higher reading comprehension) </li> <li>a mistake, because the correlation must be negative </li> <li>the effect of a lurking variable, such as age or years of reading experience </li> <li>reverse cause and effect (higher reading comprehension causes larger shoe size) </li> </ol>