Homework 1

This homework is meant to get you thinking about the tools we’ve learned so far. In particular, I want you to think about what feature selection and regularization might tell you about the relationships you’re trying to understand. I want you to take the following steps - each time showing me the code that you used and explaining what the output means.

Import the data set that you want to use.

## Importing data----

anes16<-import("./ML/ANES/raw/anes_timeseries_2016_dta/anes_timeseries_2016.dta")

## Wrangling the data

d1 <- anes16 %>%
  select(V161007, V161267x,V161342,V161310x,V161270,V161010d,
         V161268, V161115, V161217:V161223, V161326:V161329,
         V161522, ends_with('x')) %>% 
  rename(internet= V161007,
         age     = V161267x,
         gender  = V161342,
         race    = V161310x,
         education = V161270,
         region = V161010d,
         income = V161361x,
         trust_peple = V161219,
         gov_waste = V161217,
         unemp = V161142x,
         marital = V161268,
         Pres_for_rel = V161084x,
         occup = V161276x) %>% 
  mutate( internet = case_when(
    internet == 2 ~ 0,
    internet == 1 ~ 1)) %>% 
  filter(!is.na(internet)) %>%
  # drop_na(internet) %>% #I could have also used
  factorize() #%>%

Immediately split your data into a training set (we’ll call R) and testing set we’ll call S.

##Splitting data----
set.seed(66636)
s <- initial_split(d1, prop=.6)
R <- training(s)
S <- testing(s)

blueprint <- recipe(internet~.,data=R) %>% 
  step_nzv(all_predictors())  %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_other(all_nominal(), threshold = 0.01,
           other = "other") %>%
  step_unknown(all_nominal()) %>%
  step_dummy(all_nominal(), one_hot = FALSE)

train_dat <- prep(blueprint, training = R) %>% 
  bake(., new_data=R)

names(train_dat)<-str_replace_all(names(train_dat),"[.]","_")

test_dat <- prep(blueprint, training = S) %>% 
  bake(., new_data=S)

names(test_dat)<-str_replace_all(names(test_dat),"[.]","_")

In the steps below, use the training set R.
- Estimate a theoretically derived model.

## Running theoretical model with training data ####
teo_d <-train_dat %>%
    select(starts_with(c('internet','age','gender',
                         'race','education','region'))) %>% 
  rowwise() %>%
  mutate( age_g40_64 =
            sum(c_across(age_X06__Age_group_40_44:age_X10__Age_group_60_64)),
          age_g65 = sum(c_across(age_X11__Age_group_65_69:age_X13__Age_group_75_or_older)),
          south = sum(c_across(contains(c('Alabama', 'Arkansas', 'Delaware',
          'Washington', 'Florida', 'Georgia', 'Kentucky', 'Louisiana', 'Maryland',
          'Mississippi', 'North Carolina', 'Oklahoma', 'South_Carolina', 
          'Tennessee','Texas', 'Virginia', 'West_Virginia')))))


tmod <- glm(internet~ age_g40_64+ age_g65+gender_X1__Male 
            +race_X2__Black__non_Hispanic 
            + race_X5__Hispanic
            + race_X6__Other_non_Hispanic_incl_multiple_races__WEB__blank__Other__counted_as_a_race_
            + education_X9__High_school_graduate__high_school_diploma_or_equivalent__for_example__GED_
            + education_X10__Some_college_but_no_degree +south, 
            data=teo_d, family = binomial())

stargazer(tmod, title="Results", align=TRUE, type = "html", 
          covariate.labels = c("Age group:40-64","Age:65+","Male",
                               "Race:Black","Race:Hispanic","Race:Other",
                               "Ed:HS","Ed:Some college (ND)",
                               "South region"))

**Results**

	Dependent variable:

	internet

Age group:40-64	-1.045^**
	(0.421)

Age:65+	-2.620^***
	(0.422)

Male	-0.092
	(0.263)

Race:Black	-1.241^***
	(0.372)

Race:Hispanic	-0.883^**
	(0.392)

Race:Other	-1.000^*
	(0.511)

Ed:HS	-0.722^**
	(0.296)

Ed:Some college (ND)	0.249
	(0.393)

South region	-0.192
	(0.266)

Constant	4.142^***
	(0.454)


Observations	708
Log Likelihood	-204.980
Akaike Inf. Crit.	429.960

Note:	p<0.1; p<0.05; p<0.01

*The objective of the paper I selected is to argue that internet is not driving political polarization in the US. As a first step, authors estimate a model that try to predict internet use in terms of characteristics of the consumers.*

Use subset selection methods to find the best model that could be made from the covariates in the model.

## FIND BEST MODEL ORIGINAL THEORY MODEL----
X1 <- teo_d %>%
  select(-internet) %>%
  as.matrix()

y1 <- teo_d %>%
  select(internet) %>%
  pull

rmodst <- regsubsets(x=X1, y=y1, method="forward",
                     all.best=FALSE, nbest=3, nvmax=10, 
                     really.big = TRUE) #I used "forward" because "exhaustive"

Reordering variables and trying again:

                                        #was taking forever

get_best<-function(regsubs=rmodst, depv="internet", data=teo_d){
    m <- as_tibble(summary(regsubs)$which, rownames = "size")
    m <- m %>% add_column(adjr2=summary(regsubs)$adjr2)
    m <- m %>% group_by(size) %>% filter(adjr2 == max(adjr2))
    m <- m %>% select(-`(Intercept)`)
    print(m$adjr2)
    f<-list()
    for (i in 1:nrow(m)){
      v<-names(m[,as.vector(m[i,]==TRUE)])
      f[[i]]<-paste(depv," ~ ", paste(v, collapse="+"))
    }
    b_m <- lapply(f, function(x)glm(x, data=data, family = binomial()))
    b_m
}

best_t<-get_best(regsubs = rmodst, depv="internet", data=teo_d)

[1] 0.0770812 0.1102643 0.1371978 0.1594509 0.1765303 0.1878448 0.1969313 [8] 0.2022197 0.2066311 0.2109234 0.2132535

stargazer(best_t[[8]],best_t[[9]],best_t[[10]],best_t[[11]], 
          title="Results", align=TRUE, type = "html",
          covariate.labels = c("Age:18-20","Age:60-64","Age:70-74","Age:75+",
                               "Race:Black","Ed:12th (ND)","Ed:HS","Ed:other",
                               "MS state", "NC state","TX state"))

**Results**

	Dependent variable:

	internet
	(1)	(2)	(3)	(4)

Age:18-20				15.779
				(936.864)

Age:60-64	-1.199^***	-1.184^***	-1.119^**	-1.048^**
	(0.447)	(0.452)	(0.455)	(0.458)

Age:70-74	-2.494^***	-2.547^***	-2.613^***	-2.568^***
	(0.439)	(0.444)	(0.451)	(0.454)

Age:75+	-2.587^***	-2.641^***	-2.746^***	-2.716^***
	(0.365)	(0.369)	(0.376)	(0.377)

Race:Black	-1.171^***	-0.952^**	-0.949^**	-0.926^**
	(0.385)	(0.414)	(0.417)	(0.419)

Ed:12th (ND)	-2.186^***	-2.067^***	-2.010^***	-2.212^***
	(0.483)	(0.494)	(0.504)	(0.522)

Ed:HS	-1.116^***	-1.058^***	-1.074^***	-1.084^***
	(0.321)	(0.324)	(0.325)	(0.325)

Ed:other	-2.534^***	-2.561^***	-2.511^***	-2.496^***
	(0.512)	(0.517)	(0.523)	(0.522)

MS state	-1.432^***	-1.602^***	-1.764^***	-1.875^***
	(0.544)	(0.553)	(0.561)	(0.571)

NC state		-1.014^*	-1.185^**	-1.221^**
		(0.560)	(0.568)	(0.570)

TX state			-0.997^**	-1.087^**
			(0.427)	(0.433)

Constant	3.822^***	3.855^***	4.012^***	3.996^***
	(0.285)	(0.288)	(0.305)	(0.305)


Observations	708	708	708	708
Log Likelihood	-179.541	-178.067	-175.600	-173.454
Akaike Inf. Crit.	377.081	376.135	373.200	370.908

Note:	p<0.1; p<0.05; p<0.01

Find at least a few more variables that you hadn’t added to the original model, add them in and re-estimate the subset selection models. What changes?

#### FINDING THE BEST MODEL ADD VARS----
X <- train_dat %>%
  select(starts_with(c('age','gender',
                       'race','education','region',
                       'income',
                       'trust_peple',
                       'gov_waste',
                       'unemp',
                       'marital',
                       'Pres_for_re',
                       'occup')) ) %>%
  as.matrix()

y <- train_dat %>%
  select(internet) %>%
  pull

rmods <- regsubsets(x=X, y=y, method="forward",
                    all.best=FALSE, nbest=3, 
                    nvmax=10, really.big = TRUE)

Reordering variables and trying again:

best_m<-get_best(regsubs = rmods, depv="internet", data=train_dat)

[1] 0.08615856 0.11524528 0.15143676 0.17839207 0.19530867 0.20842379 [7] 0.21721684 0.22644225 0.24096999 0.25162179 0.26028759

stargazer(best_m[[8]],best_m[[9]],best_m[[10]],best_m[[11]], 
          title="Results", align=TRUE, type = "html",
          covariate.labels = c("Age:70-74","Age:75+",
                               "Education: 12th (ND)",
                               "Education:other",
                               "MS state","NC state",
                               "Trust people: MoT","Widowed",
                               "Pres. for. rel.:DS ","Occupation:Retired",
                               "Occupation:Permanently disabled"))

**Results**

	Dependent variable:

	internet
	(1)	(2)	(3)	(4)

Age:70-74		-1.732^***	-1.902^***	-1.997^***
		(0.538)	(0.548)	(0.552)

Age:75+	-1.021^**	-1.693^***	-1.958^***	-2.062^***
	(0.440)	(0.512)	(0.532)	(0.538)

Education: 12th (ND)	-1.620^***	-1.606^***	-1.744^***	-1.558^***
	(0.472)	(0.492)	(0.494)	(0.511)

Education:other	-1.401^***	-1.574^***	-1.501^***	-1.579^***
	(0.482)	(0.492)	(0.502)	(0.514)

MS state	-1.525^***	-1.755^***	-1.882^***	-2.018^***
	(0.559)	(0.567)	(0.592)	(0.596)

NC state				-1.453^***
				(0.561)

Trust people: MoT	1.630^***	1.642^***	1.667^***	1.669^***
	(0.361)	(0.363)	(0.370)	(0.374)

Widowed	-1.153^***	-0.961^**	-0.918^**	-0.962^**
	(0.410)	(0.417)	(0.425)	(0.429)

Pres. for. rel.:DS			1.079^***	1.112^***
			(0.337)	(0.340)

Occupation:Retired	-1.576^***	-0.989^**	-0.908^**	-0.848^**
	(0.343)	(0.403)	(0.410)	(0.411)

Occupation:Permanently disabled	-1.975^***	-2.043^***	-2.118^***	-1.936^***
	(0.461)	(0.470)	(0.471)	(0.486)

Constant	2.962^***	3.057^***	2.749^***	2.835^***
	(0.240)	(0.250)	(0.260)	(0.268)


Observations	708	708	708	708
Log Likelihood	-171.395	-166.357	-160.669	-157.637
Akaike Inf. Crit.	360.790	352.713	343.338	339.274

Note:	p<0.1; p<0.05; p<0.01

We can see that, with respect the orginal model, there were variables such as occupation, marital status and even other surprising characteristics like trust in people or disapproval of the president handling foreing relations that improved prediction.

On this full set of covariates in the training set R, estimate a ridge regression, lasso, elastic-net and adaptive lasso. Present the coefficients from these different models in a way that makes clear what happens.

## Selecting vars
X2<- train_dat %>% 
    select(age_X12__Age_group_70_74     ,
    age_X13__Age_group_75_or_older      ,
    education_X8__12th_grade_no_diploma ,
    education_other                     ,
    region_X28__Mississippi             ,
    region_X37__North_Carolina          ,
    trust_peple_X2__Most_of_the_time    ,
    marital_X3__Widowed                 ,
    Pres_for_rel_X4__Disapprove_strongly,
    occup_X5__R_retired__if_also_working__working__20_hrs_wk_,
    occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_) %>% 
  as.matrix()

y2 <- train_dat %>% 
  select(internet) %>% 
  pull

## Ridge
rcv <- cv.glmnet(X2, y2, alpha=0, family = 'binomial')
ridge <- glmnet(X2, y2, alpha=0, family = 'binomial', lambda = rcv$lambda.min)

## LASSO

lcv <- cv.glmnet(X2, y2, alpha=1, family = 'binomial')
lasso <- glmnet(X2, y2, alpha=1, family = 'binomial',lambda = lcv$lambda.min)


## Elastic Net

# grid search across 
e_netcv <- train(
  x = X2,
  y = y2,
  method = "glmnet",
  family = "binomial",
  nested = TRUE,
  # preProc = c("zv", "center", "scale"),
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 10
)

e_net <- glmnet(X2, y2, family = 'binomial',
                alpha = e_netcv$bestTune$alpha, 
                lambda = e_netcv$bestTune$lambda)

## Adaptive lasso
g<-1
b.e_net<-coef(e_net)
b.e_net <- ifelse(b.e_net == 0, .01, b.e_net)
w<-1/(abs(b.e_net)^g)
a_lasso<-glmnet(X2, y2, alpha=1, family = 'binomial',
                lambda = lcv$lambda.min,
                penalty.factor = w)

##Plotting coefficients of different methods

varnames <- c("Age:70-74","Age:75+",
             "Education: 12th (ND)",
             "Education:other",
             "MS state","NC state",
             "Trust people: MoT","Widowed",
             "Pres. for. rel.:DS ","Occupation:Retired",
             "Occupation:Permanently disabled")

coefs <- tibble(
  b = c(as.vector(coef(ridge)[-1]), 
        as.vector(coef(lasso)[-1]), 
        as.vector(coef(e_net)[-1]),
        as.vector(coef(a_lasso)[-1])), 
  model = factor(rep(1:4, each=length(coef(ridge)[-1])), 
                 labels=c("Ridge", "LASSO","ENET","A-LASSO")), 
  variable = rep(varnames, 4))

p1 <- ggplot(coefs, aes(x=b, y=variable, 
                        colour=model, shape=model)) + 
  geom_point() + 
  theme_bw() + 
  geom_vline(xintercept=0, lty=3)+
  ggtitle("Coefficients")

p1

Using the best subset model, ridge regression, lasso, elastic net and adaptive lasso - use cross-validation to find the best model in the set. For this step, you can fix the tuning parameters in the regularization methods at their optimal values you found above. Here, we’re not trying to tune the models, but to see how the tuned models behave in sample. This should leave you with a single model that is “best”.

### CV for the very best ----
## Using the caret package
glmnet_grid<-expand.grid(alpha = c(0, e_netcv$bestTune$alpha,1),
                         lambda = c(rcv$lambda.min, lcv$lambda.min,
                                    e_netcv$bestTune$lambda))

vbest <- train(
  x = X2,
  y = y2,
  method = "glmnet",
  family = "binomial",
  nested = TRUE,
  # preProc = c("zv", "center", "scale"),
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 10,
  tuneGrid = glmnet_grid,
  # penalty.factor = c(0,w.enet,w.ridge,w.lasso)
)
vbest

## glmnet 
## 
## 708 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 637, 637, 638, 637, 637, 637, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda       RMSE      Rsquared   MAE     
##   0      0.001526993  2.427456  0.2519522  2.170900
##   0      0.009154082  2.423548  0.2520885  2.167601
##   0      0.079251666  1.772691  0.2615088  1.627331
##   1      0.001526993  2.602312  0.2461144  2.313875
##   1      0.009154082  2.162329  0.2446710  1.943335
##   1      0.079251666  1.274217  0.0776811  1.234760
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.07925167.

Estimate that single best model on the testing set S. What do you find about the relationships of interest? Do all of the theoretical variables you started with withstand the scrutiny that these tools provide?

X3<- test_dat %>% 
    select(age_X12__Age_group_70_74     ,
    age_X13__Age_group_75_or_older      ,
    education_X8__12th_grade_no_diploma ,
    education_other                     ,
    region_X28__Mississippi             ,
    region_X37__North_Carolina          ,
    trust_peple_X2__Most_of_the_time    ,
    marital_X3__Widowed                 ,
    Pres_for_rel_X4__Disapprove_strongly,
    occup_X5__R_retired__if_also_working__working__20_hrs_wk_,
    occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_) %>% 
  as.matrix()

y3 <- test_dat %>% 
  select(internet) %>% 
  pull

vbestm<- glmnet(X3, y3, family = 'binomial',
                alpha = vbest$bestTune$alpha, 
                lambda = vbest$bestTune$lambda)
coef(vbestm)

## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                                                                s0
## (Intercept)                                                             2.5110143
## age_X12__Age_group_70_74                                                .        
## age_X13__Age_group_75_or_older                                         -0.5673007
## education_X8__12th_grade_no_diploma                                     .        
## education_other                                                         .        
## region_X28__Mississippi                                                 .        
## region_X37__North_Carolina                                              .        
## trust_peple_X2__Most_of_the_time                                        .        
## marital_X3__Widowed                                                     .        
## Pres_for_rel_X4__Disapprove_strongly                                    .        
## occup_X5__R_retired__if_also_working__working__20_hrs_wk_               .        
## occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_  .

Only one vaiable out of the set f original model survived. Apparently, only the intercept and age group 75 and older provides better predictions, than the whole set.

Estimate the theoretically derived on your testing sample S. How does that compare to the model you estimated in step 4?

teo_dt <-test_dat %>%
    select(starts_with(c('internet','age','gender',
                         'race','education','region'))) %>% 
  rowwise() %>%
  mutate( age_g40_64 =
            sum(c_across(age_X06__Age_group_40_44:age_X10__Age_group_60_64)),
          age_g65 = sum(c_across(age_X11__Age_group_65_69:age_X13__Age_group_75_or_older)),
          south = sum(c_across(contains(c('Alabama', 'Arkansas', 'Delaware',
          'Washington', 'Florida', 'Georgia', 'Kentucky', 'Louisiana', 'Maryland',
          'Mississippi', 'North Carolina', 'Oklahoma', 'South_Carolina', 
          'Tennessee','Texas', 'Virginia', 'West_Virginia')))))


tmod2 <- glm(internet~ age_g40_64+ age_g65+gender_X1__Male 
            +race_X2__Black__non_Hispanic 
            + race_X5__Hispanic
            + race_X6__Other_non_Hispanic_incl_multiple_races__WEB__blank__Other__counted_as_a_race_
            + education_X9__High_school_graduate__high_school_diploma_or_equivalent__for_example__GED_
            + education_X10__Some_college_but_no_degree +south, 
            data=teo_dt, family = binomial())

stargazer(tmod2, title="Results", align=TRUE, type = "html", 
          covariate.labels = c("Age group:40-64","Age:65+","Male",
                               "Race:Black","Race:Hispanic","Race:Other",
                               "Ed:HS","Ed:Some college (ND)",
                               "South region"))

**Results**

	Dependent variable:

	internet

Age group:40-64	-0.629
	(0.581)

Age:65+	-2.327^***
	(0.536)

Male	0.430
	(0.378)

Race:Black	-0.373
	(0.621)

Race:Hispanic	-1.089^**
	(0.461)

Race:Other	-1.195
	(0.860)

Ed:HS	-0.861^**
	(0.393)

Ed:Some college (ND)	0.731
	(0.658)

South region	-0.686^*
	(0.370)

Constant	4.212^***
	(0.604)


Observations	472
Log Likelihood	-109.283
Akaike Inf. Crit.	238.566

Note:	p<0.1; p<0.05; p<0.01

While we can’t compare the magnitude of the coefficients of both modesl because the lasso model is biased, in terms of prediction, the Cross-Validated regularized regression performs better.

mse_o = mse(teo_dt$internet, predict(tmod2))
mse_cvl = mse(teo_dt$internet, as.vector(predict(vbest, newx = teo_dt)))

tibble ( Model = c("CV Regularized Regression","Original Model"),
         MSE = c(mse_cvl,mse_o))

## # A tibble: 2 x 2
##   Model                       MSE
##   <chr>                     <dbl>
## 1 CV Regularized Regression  1.54
## 2 Original Model             5.61

Homework 1

Hans Martinez

10/8/2020 (due 10/22/2020)