This homework is meant to get you thinking about the tools we’ve learned so far. In particular, I want you to think about what feature selection and regularization might tell you about the relationships you’re trying to understand. I want you to take the following steps - each time showing me the code that you used and explaining what the output means.

  1. Import the data set that you want to use.
## Importing data----

anes16<-import("./ML/ANES/raw/anes_timeseries_2016_dta/anes_timeseries_2016.dta")

## Wrangling the data

d1 <- anes16 %>%
  select(V161007, V161267x,V161342,V161310x,V161270,V161010d,
         V161268, V161115, V161217:V161223, V161326:V161329,
         V161522, ends_with('x')) %>% 
  rename(internet= V161007,
         age     = V161267x,
         gender  = V161342,
         race    = V161310x,
         education = V161270,
         region = V161010d,
         income = V161361x,
         trust_peple = V161219,
         gov_waste = V161217,
         unemp = V161142x,
         marital = V161268,
         Pres_for_rel = V161084x,
         occup = V161276x) %>% 
  mutate( internet = case_when(
    internet == 2 ~ 0,
    internet == 1 ~ 1)) %>% 
  filter(!is.na(internet)) %>%
  # drop_na(internet) %>% #I could have also used
  factorize() #%>%
  1. Immediately split your data into a training set (we’ll call R) and testing set we’ll call S.
##Splitting data----
set.seed(66636)
s <- initial_split(d1, prop=.6)
R <- training(s)
S <- testing(s)

blueprint <- recipe(internet~.,data=R) %>% 
  step_nzv(all_predictors())  %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_other(all_nominal(), threshold = 0.01,
           other = "other") %>%
  step_unknown(all_nominal()) %>%
  step_dummy(all_nominal(), one_hot = FALSE)

train_dat <- prep(blueprint, training = R) %>% 
  bake(., new_data=R)

names(train_dat)<-str_replace_all(names(train_dat),"[.]","_")

test_dat <- prep(blueprint, training = S) %>% 
  bake(., new_data=S)

names(test_dat)<-str_replace_all(names(test_dat),"[.]","_")
  1. In the steps below, use the training set R.
    • Estimate a theoretically derived model.
## Running theoretical model with training data ####
teo_d <-train_dat %>%
    select(starts_with(c('internet','age','gender',
                         'race','education','region'))) %>% 
  rowwise() %>%
  mutate( age_g40_64 =
            sum(c_across(age_X06__Age_group_40_44:age_X10__Age_group_60_64)),
          age_g65 = sum(c_across(age_X11__Age_group_65_69:age_X13__Age_group_75_or_older)),
          south = sum(c_across(contains(c('Alabama', 'Arkansas', 'Delaware',
          'Washington', 'Florida', 'Georgia', 'Kentucky', 'Louisiana', 'Maryland',
          'Mississippi', 'North Carolina', 'Oklahoma', 'South_Carolina', 
          'Tennessee','Texas', 'Virginia', 'West_Virginia')))))


tmod <- glm(internet~ age_g40_64+ age_g65+gender_X1__Male 
            +race_X2__Black__non_Hispanic 
            + race_X5__Hispanic
            + race_X6__Other_non_Hispanic_incl_multiple_races__WEB__blank__Other__counted_as_a_race_
            + education_X9__High_school_graduate__high_school_diploma_or_equivalent__for_example__GED_
            + education_X10__Some_college_but_no_degree +south, 
            data=teo_d, family = binomial())

stargazer(tmod, title="Results", align=TRUE, type = "html", 
          covariate.labels = c("Age group:40-64","Age:65+","Male",
                               "Race:Black","Race:Hispanic","Race:Other",
                               "Ed:HS","Ed:Some college (ND)",
                               "South region"))
Results
Dependent variable:
internet
Age group:40-64 -1.045**
(0.421)
Age:65+ -2.620***
(0.422)
Male -0.092
(0.263)
Race:Black -1.241***
(0.372)
Race:Hispanic -0.883**
(0.392)
Race:Other -1.000*
(0.511)
Ed:HS -0.722**
(0.296)
Ed:Some college (ND) 0.249
(0.393)
South region -0.192
(0.266)
Constant 4.142***
(0.454)
Observations 708
Log Likelihood -204.980
Akaike Inf. Crit. 429.960
Note: p<0.1; p<0.05; p<0.01
*The objective of the paper I selected is to argue that internet is not driving political polarization in the US. As a first step, authors estimate a model that try to predict internet use in terms of characteristics of the consumers.*
## FIND BEST MODEL ORIGINAL THEORY MODEL----
X1 <- teo_d %>%
  select(-internet) %>%
  as.matrix()

y1 <- teo_d %>%
  select(internet) %>%
  pull

rmodst <- regsubsets(x=X1, y=y1, method="forward",
                     all.best=FALSE, nbest=3, nvmax=10, 
                     really.big = TRUE) #I used "forward" because "exhaustive" 

Reordering variables and trying again:

                                        #was taking forever

get_best<-function(regsubs=rmodst, depv="internet", data=teo_d){
    m <- as_tibble(summary(regsubs)$which, rownames = "size")
    m <- m %>% add_column(adjr2=summary(regsubs)$adjr2)
    m <- m %>% group_by(size) %>% filter(adjr2 == max(adjr2))
    m <- m %>% select(-`(Intercept)`)
    print(m$adjr2)
    f<-list()
    for (i in 1:nrow(m)){
      v<-names(m[,as.vector(m[i,]==TRUE)])
      f[[i]]<-paste(depv," ~ ", paste(v, collapse="+"))
    }
    b_m <- lapply(f, function(x)glm(x, data=data, family = binomial()))
    b_m
}

best_t<-get_best(regsubs = rmodst, depv="internet", data=teo_d)

[1] 0.0770812 0.1102643 0.1371978 0.1594509 0.1765303 0.1878448 0.1969313 [8] 0.2022197 0.2066311 0.2109234 0.2132535

stargazer(best_t[[8]],best_t[[9]],best_t[[10]],best_t[[11]], 
          title="Results", align=TRUE, type = "html",
          covariate.labels = c("Age:18-20","Age:60-64","Age:70-74","Age:75+",
                               "Race:Black","Ed:12th (ND)","Ed:HS","Ed:other",
                               "MS state", "NC state","TX state"))
Results
Dependent variable:
internet
(1) (2) (3) (4)
Age:18-20 15.779
(936.864)
Age:60-64 -1.199*** -1.184*** -1.119** -1.048**
(0.447) (0.452) (0.455) (0.458)
Age:70-74 -2.494*** -2.547*** -2.613*** -2.568***
(0.439) (0.444) (0.451) (0.454)
Age:75+ -2.587*** -2.641*** -2.746*** -2.716***
(0.365) (0.369) (0.376) (0.377)
Race:Black -1.171*** -0.952** -0.949** -0.926**
(0.385) (0.414) (0.417) (0.419)
Ed:12th (ND) -2.186*** -2.067*** -2.010*** -2.212***
(0.483) (0.494) (0.504) (0.522)
Ed:HS -1.116*** -1.058*** -1.074*** -1.084***
(0.321) (0.324) (0.325) (0.325)
Ed:other -2.534*** -2.561*** -2.511*** -2.496***
(0.512) (0.517) (0.523) (0.522)
MS state -1.432*** -1.602*** -1.764*** -1.875***
(0.544) (0.553) (0.561) (0.571)
NC state -1.014* -1.185** -1.221**
(0.560) (0.568) (0.570)
TX state -0.997** -1.087**
(0.427) (0.433)
Constant 3.822*** 3.855*** 4.012*** 3.996***
(0.285) (0.288) (0.305) (0.305)
Observations 708 708 708 708
Log Likelihood -179.541 -178.067 -175.600 -173.454
Akaike Inf. Crit. 377.081 376.135 373.200 370.908
Note: p<0.1; p<0.05; p<0.01
#### FINDING THE BEST MODEL ADD VARS----
X <- train_dat %>%
  select(starts_with(c('age','gender',
                       'race','education','region',
                       'income',
                       'trust_peple',
                       'gov_waste',
                       'unemp',
                       'marital',
                       'Pres_for_re',
                       'occup')) ) %>%
  as.matrix()

y <- train_dat %>%
  select(internet) %>%
  pull

rmods <- regsubsets(x=X, y=y, method="forward",
                    all.best=FALSE, nbest=3, 
                    nvmax=10, really.big = TRUE)

Reordering variables and trying again:

best_m<-get_best(regsubs = rmods, depv="internet", data=train_dat)

[1] 0.08615856 0.11524528 0.15143676 0.17839207 0.19530867 0.20842379 [7] 0.21721684 0.22644225 0.24096999 0.25162179 0.26028759

stargazer(best_m[[8]],best_m[[9]],best_m[[10]],best_m[[11]], 
          title="Results", align=TRUE, type = "html",
          covariate.labels = c("Age:70-74","Age:75+",
                               "Education: 12th (ND)",
                               "Education:other",
                               "MS state","NC state",
                               "Trust people: MoT","Widowed",
                               "Pres. for. rel.:DS ","Occupation:Retired",
                               "Occupation:Permanently disabled"))
Results
Dependent variable:
internet
(1) (2) (3) (4)
Age:70-74 -1.732*** -1.902*** -1.997***
(0.538) (0.548) (0.552)
Age:75+ -1.021** -1.693*** -1.958*** -2.062***
(0.440) (0.512) (0.532) (0.538)
Education: 12th (ND) -1.620*** -1.606*** -1.744*** -1.558***
(0.472) (0.492) (0.494) (0.511)
Education:other -1.401*** -1.574*** -1.501*** -1.579***
(0.482) (0.492) (0.502) (0.514)
MS state -1.525*** -1.755*** -1.882*** -2.018***
(0.559) (0.567) (0.592) (0.596)
NC state -1.453***
(0.561)
Trust people: MoT 1.630*** 1.642*** 1.667*** 1.669***
(0.361) (0.363) (0.370) (0.374)
Widowed -1.153*** -0.961** -0.918** -0.962**
(0.410) (0.417) (0.425) (0.429)
Pres. for. rel.:DS 1.079*** 1.112***
(0.337) (0.340)
Occupation:Retired -1.576*** -0.989** -0.908** -0.848**
(0.343) (0.403) (0.410) (0.411)
Occupation:Permanently disabled -1.975*** -2.043*** -2.118*** -1.936***
(0.461) (0.470) (0.471) (0.486)
Constant 2.962*** 3.057*** 2.749*** 2.835***
(0.240) (0.250) (0.260) (0.268)
Observations 708 708 708 708
Log Likelihood -171.395 -166.357 -160.669 -157.637
Akaike Inf. Crit. 360.790 352.713 343.338 339.274
Note: p<0.1; p<0.05; p<0.01

We can see that, with respect the orginal model, there were variables such as occupation, marital status and even other surprising characteristics like trust in people or disapproval of the president handling foreing relations that improved prediction.

## Selecting vars
X2<- train_dat %>% 
    select(age_X12__Age_group_70_74     ,
    age_X13__Age_group_75_or_older      ,
    education_X8__12th_grade_no_diploma ,
    education_other                     ,
    region_X28__Mississippi             ,
    region_X37__North_Carolina          ,
    trust_peple_X2__Most_of_the_time    ,
    marital_X3__Widowed                 ,
    Pres_for_rel_X4__Disapprove_strongly,
    occup_X5__R_retired__if_also_working__working__20_hrs_wk_,
    occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_) %>% 
  as.matrix()

y2 <- train_dat %>% 
  select(internet) %>% 
  pull

## Ridge
rcv <- cv.glmnet(X2, y2, alpha=0, family = 'binomial')
ridge <- glmnet(X2, y2, alpha=0, family = 'binomial', lambda = rcv$lambda.min)

## LASSO

lcv <- cv.glmnet(X2, y2, alpha=1, family = 'binomial')
lasso <- glmnet(X2, y2, alpha=1, family = 'binomial',lambda = lcv$lambda.min)


## Elastic Net

# grid search across 
e_netcv <- train(
  x = X2,
  y = y2,
  method = "glmnet",
  family = "binomial",
  nested = TRUE,
  # preProc = c("zv", "center", "scale"),
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 10
)

e_net <- glmnet(X2, y2, family = 'binomial',
                alpha = e_netcv$bestTune$alpha, 
                lambda = e_netcv$bestTune$lambda)

## Adaptive lasso
g<-1
b.e_net<-coef(e_net)
b.e_net <- ifelse(b.e_net == 0, .01, b.e_net)
w<-1/(abs(b.e_net)^g)
a_lasso<-glmnet(X2, y2, alpha=1, family = 'binomial',
                lambda = lcv$lambda.min,
                penalty.factor = w)

##Plotting coefficients of different methods

varnames <- c("Age:70-74","Age:75+",
             "Education: 12th (ND)",
             "Education:other",
             "MS state","NC state",
             "Trust people: MoT","Widowed",
             "Pres. for. rel.:DS ","Occupation:Retired",
             "Occupation:Permanently disabled")

coefs <- tibble(
  b = c(as.vector(coef(ridge)[-1]), 
        as.vector(coef(lasso)[-1]), 
        as.vector(coef(e_net)[-1]),
        as.vector(coef(a_lasso)[-1])), 
  model = factor(rep(1:4, each=length(coef(ridge)[-1])), 
                 labels=c("Ridge", "LASSO","ENET","A-LASSO")), 
  variable = rep(varnames, 4))

p1 <- ggplot(coefs, aes(x=b, y=variable, 
                        colour=model, shape=model)) + 
  geom_point() + 
  theme_bw() + 
  geom_vline(xintercept=0, lty=3)+
  ggtitle("Coefficients")

p1

### CV for the very best ----
## Using the caret package
glmnet_grid<-expand.grid(alpha = c(0, e_netcv$bestTune$alpha,1),
                         lambda = c(rcv$lambda.min, lcv$lambda.min,
                                    e_netcv$bestTune$lambda))

vbest <- train(
  x = X2,
  y = y2,
  method = "glmnet",
  family = "binomial",
  nested = TRUE,
  # preProc = c("zv", "center", "scale"),
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 10,
  tuneGrid = glmnet_grid,
  # penalty.factor = c(0,w.enet,w.ridge,w.lasso)
)
vbest
## glmnet 
## 
## 708 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 637, 637, 638, 637, 637, 637, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda       RMSE      Rsquared   MAE     
##   0      0.001526993  2.427456  0.2519522  2.170900
##   0      0.009154082  2.423548  0.2520885  2.167601
##   0      0.079251666  1.772691  0.2615088  1.627331
##   1      0.001526993  2.602312  0.2461144  2.313875
##   1      0.009154082  2.162329  0.2446710  1.943335
##   1      0.079251666  1.274217  0.0776811  1.234760
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.07925167.
  1. Estimate that single best model on the testing set S. What do you find about the relationships of interest? Do all of the theoretical variables you started with withstand the scrutiny that these tools provide?
X3<- test_dat %>% 
    select(age_X12__Age_group_70_74     ,
    age_X13__Age_group_75_or_older      ,
    education_X8__12th_grade_no_diploma ,
    education_other                     ,
    region_X28__Mississippi             ,
    region_X37__North_Carolina          ,
    trust_peple_X2__Most_of_the_time    ,
    marital_X3__Widowed                 ,
    Pres_for_rel_X4__Disapprove_strongly,
    occup_X5__R_retired__if_also_working__working__20_hrs_wk_,
    occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_) %>% 
  as.matrix()

y3 <- test_dat %>% 
  select(internet) %>% 
  pull

vbestm<- glmnet(X3, y3, family = 'binomial',
                alpha = vbest$bestTune$alpha, 
                lambda = vbest$bestTune$lambda)
coef(vbestm)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                                                                s0
## (Intercept)                                                             2.5110143
## age_X12__Age_group_70_74                                                .        
## age_X13__Age_group_75_or_older                                         -0.5673007
## education_X8__12th_grade_no_diploma                                     .        
## education_other                                                         .        
## region_X28__Mississippi                                                 .        
## region_X37__North_Carolina                                              .        
## trust_peple_X2__Most_of_the_time                                        .        
## marital_X3__Widowed                                                     .        
## Pres_for_rel_X4__Disapprove_strongly                                    .        
## occup_X5__R_retired__if_also_working__working__20_hrs_wk_               .        
## occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_  .

Only one vaiable out of the set f original model survived. Apparently, only the intercept and age group 75 and older provides better predictions, than the whole set.

  1. Estimate the theoretically derived on your testing sample S. How does that compare to the model you estimated in step 4?
teo_dt <-test_dat %>%
    select(starts_with(c('internet','age','gender',
                         'race','education','region'))) %>% 
  rowwise() %>%
  mutate( age_g40_64 =
            sum(c_across(age_X06__Age_group_40_44:age_X10__Age_group_60_64)),
          age_g65 = sum(c_across(age_X11__Age_group_65_69:age_X13__Age_group_75_or_older)),
          south = sum(c_across(contains(c('Alabama', 'Arkansas', 'Delaware',
          'Washington', 'Florida', 'Georgia', 'Kentucky', 'Louisiana', 'Maryland',
          'Mississippi', 'North Carolina', 'Oklahoma', 'South_Carolina', 
          'Tennessee','Texas', 'Virginia', 'West_Virginia')))))


tmod2 <- glm(internet~ age_g40_64+ age_g65+gender_X1__Male 
            +race_X2__Black__non_Hispanic 
            + race_X5__Hispanic
            + race_X6__Other_non_Hispanic_incl_multiple_races__WEB__blank__Other__counted_as_a_race_
            + education_X9__High_school_graduate__high_school_diploma_or_equivalent__for_example__GED_
            + education_X10__Some_college_but_no_degree +south, 
            data=teo_dt, family = binomial())

stargazer(tmod2, title="Results", align=TRUE, type = "html", 
          covariate.labels = c("Age group:40-64","Age:65+","Male",
                               "Race:Black","Race:Hispanic","Race:Other",
                               "Ed:HS","Ed:Some college (ND)",
                               "South region"))
Results
Dependent variable:
internet
Age group:40-64 -0.629
(0.581)
Age:65+ -2.327***
(0.536)
Male 0.430
(0.378)
Race:Black -0.373
(0.621)
Race:Hispanic -1.089**
(0.461)
Race:Other -1.195
(0.860)
Ed:HS -0.861**
(0.393)
Ed:Some college (ND) 0.731
(0.658)
South region -0.686*
(0.370)
Constant 4.212***
(0.604)
Observations 472
Log Likelihood -109.283
Akaike Inf. Crit. 238.566
Note: p<0.1; p<0.05; p<0.01

While we can’t compare the magnitude of the coefficients of both modesl because the lasso model is biased, in terms of prediction, the Cross-Validated regularized regression performs better.

mse_o = mse(teo_dt$internet, predict(tmod2))
mse_cvl = mse(teo_dt$internet, as.vector(predict(vbest, newx = teo_dt)))

tibble ( Model = c("CV Regularized Regression","Original Model"),
         MSE = c(mse_cvl,mse_o))
## # A tibble: 2 x 2
##   Model                       MSE
##   <chr>                     <dbl>
## 1 CV Regularized Regression  1.54
## 2 Original Model             5.61