This homework is meant to get you thinking about the tools we’ve learned so far. In particular, I want you to think about what feature selection and regularization might tell you about the relationships you’re trying to understand. I want you to take the following steps - each time showing me the code that you used and explaining what the output means.
## Loading packages----
# Packages assumed for the code below: rio (import, factorize), the tidyverse,
# rsample, recipes, leaps, glmnet, caret, stargazer, and Metrics (for mse)
library(rio); library(tidyverse); library(rsample); library(recipes); library(leaps)
library(glmnet); library(caret); library(stargazer); library(Metrics)
## Importing data----
anes16 <- import("./ML/ANES/raw/anes_timeseries_2016_dta/anes_timeseries_2016.dta")
## Wrangling the data
d1 <- anes16 %>%
select(V161007, V161267x,V161342,V161310x,V161270,V161010d,
V161268, V161115, V161217:V161223, V161326:V161329,
V161522, ends_with('x')) %>%
rename(internet= V161007,
age = V161267x,
gender = V161342,
race = V161310x,
education = V161270,
region = V161010d,
income = V161361x,
trust_peple = V161219,
gov_waste = V161217,
unemp = V161142x,
marital = V161268,
Pres_for_rel = V161084x,
occup = V161276x) %>%
mutate( internet = case_when(
internet == 2 ~ 0,
internet == 1 ~ 1)) %>%
filter(!is.na(internet)) %>% # I could also have used drop_na(internet) here
factorize() # rio::factorize() turns the labelled ANES columns into factors
Split the data into a training set (we'll call it R) and a testing set we'll call S.
## Splitting data----
set.seed(66636)
s <- initial_split(d1, prop=.6)
R <- training(s)
S <- testing(s)
blueprint <- recipe(internet~.,data=R) %>%
step_nzv(all_predictors()) %>%
step_knnimpute(all_numeric(), neighbors = 6) %>%
step_other(all_nominal(), threshold = 0.01,
other = "other") %>%
step_unknown(all_nominal()) %>%
step_dummy(all_nominal(), one_hot = FALSE)
train_dat <- prep(blueprint, training = R) %>%
bake(., new_data=R)
names(train_dat)<-str_replace_all(names(train_dat),"[.]","_")
# Note: prepping the recipe on the test split is a potential source of leakage; the
# conventional approach would be prep(blueprint, training = R) %>% bake(new_data = S)
test_dat <- prep(blueprint, training = S) %>%
bake(., new_data=S)
names(test_dat)<-str_replace_all(names(test_dat),"[.]","_")
Estimate the theoretical model from the paper using the training data R.
## Running theoretical model with training data ####
teo_d <-train_dat %>%
select(starts_with(c('internet','age','gender',
'race','education','region'))) %>%
rowwise() %>%
mutate( age_g40_64 =
sum(c_across(age_X06__Age_group_40_44:age_X10__Age_group_60_64)),
age_g65 = sum(c_across(age_X11__Age_group_65_69:age_X13__Age_group_75_or_older)),
south = sum(c_across(contains(c('Alabama', 'Arkansas', 'Delaware',
'Washington', 'Florida', 'Georgia', 'Kentucky', 'Louisiana', 'Maryland',
'Mississippi', 'North Carolina', 'Oklahoma', 'South_Carolina',
'Tennessee','Texas', 'Virginia', 'West_Virginia')))))
# Caution: 'Washington' is presumably meant to capture Washington DC (part of the Census
# South), but contains('Washington') will also match any Washington state dummy.
tmod <- glm(internet~ age_g40_64+ age_g65+gender_X1__Male
+race_X2__Black__non_Hispanic
+ race_X5__Hispanic
+ race_X6__Other_non_Hispanic_incl_multiple_races__WEB__blank__Other__counted_as_a_race_
+ education_X9__High_school_graduate__high_school_diploma_or_equivalent__for_example__GED_
+ education_X10__Some_college_but_no_degree +south,
data=teo_d, family = binomial())
stargazer(tmod, title="Results", align=TRUE, type = "html",
covariate.labels = c("Age group:40-64","Age:65+","Male",
"Race:Black","Race:Hispanic","Race:Other",
"Ed:HS","Ed:Some college (ND)",
"South region"))
| | Dependent variable: internet |
|---|---|
| Age group: 40-64 | -1.045** (0.421) |
| Age: 65+ | -2.620*** (0.422) |
| Male | -0.092 (0.263) |
| Race: Black | -1.241*** (0.372) |
| Race: Hispanic | -0.883** (0.392) |
| Race: Other | -1.000* (0.511) |
| Ed: HS | -0.722** (0.296) |
| Ed: Some college (ND) | 0.249 (0.393) |
| South region | -0.192 (0.266) |
| Constant | 4.142*** (0.454) |
| Observations | 708 |
| Log Likelihood | -204.980 |
| Akaike Inf. Crit. | 429.960 |

Standard errors in parentheses. *p<0.1; **p<0.05; ***p<0.01
*The objective of the paper I selected is to argue that the internet is not driving political polarization in the US. As a first step, the authors estimate a model that tries to predict internet use from characteristics of the consumers.*
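To read the logit coefficients above on the probability scale, plogis() can be applied to the linear predictor; a minimal sketch using the estimates in the table (values are approximate):
# Implied probability that internet = 1 from the theory model (tmod),
# holding every other dummy at 0: baseline group vs. respondents 65 and older
plogis(coef(tmod)["(Intercept)"])                          # ~0.98 for the baseline group
plogis(coef(tmod)["(Intercept)"] + coef(tmod)["age_g65"])  # ~0.82 for the 65+ group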
## FIND THE BEST MODEL: ORIGINAL THEORY VARIABLES----
X1 <- teo_d %>%
select(-internet) %>%
as.matrix()
y1 <- teo_d %>%
select(internet) %>%
pull
rmodst <- regsubsets(x=X1, y=y1, method="forward",
all.best=FALSE, nbest=3, nvmax=10,
really.big = TRUE) # I used "forward" because "exhaustive" was taking forever
## (printed by regsubsets): Reordering variables and trying again:
# Helper: from a regsubsets object, keep the best (highest adj-R2) subset of each
# size, rebuild the corresponding formula, and refit it as a logit model.
get_best<-function(regsubs=rmodst, depv="internet", data=teo_d){
m <- as_tibble(summary(regsubs)$which, rownames = "size") # which variables enter each subset
m <- m %>% add_column(adjr2=summary(regsubs)$adjr2)
m <- m %>% group_by(size) %>% filter(adjr2 == max(adjr2)) # best subset of each size
m <- m %>% select(-`(Intercept)`)
print(m$adjr2)
f<-list()
for (i in 1:nrow(m)){
v<-names(m[,as.vector(m[i,]==TRUE)]) # variables included in subset i
f[[i]]<-paste(depv," ~ ", paste(v, collapse="+"))
}
b_m <- lapply(f, function(x)glm(x, data=data, family = binomial())) # refit each subset as a GLM
b_m
}
best_t<-get_best(regsubs = rmodst, depv="internet", data=teo_d)
## [1] 0.0770812 0.1102643 0.1371978 0.1594509 0.1765303 0.1878448 0.1969313
## [8] 0.2022197 0.2066311 0.2109234 0.2132535
stargazer(best_t[[8]],best_t[[9]],best_t[[10]],best_t[[11]],
title="Results", align=TRUE, type = "html",
covariate.labels = c("Age:18-20","Age:60-64","Age:70-74","Age:75+",
"Race:Black","Ed:12th (ND)","Ed:HS","Ed:other",
"MS state", "NC state","TX state"))
Dependent variable: internet (standard errors in parentheses)

| | (1) | (2) | (3) | (4) |
|---|---|---|---|---|
| Age: 18-20 | | | | 15.779 (936.864) |
| Age: 60-64 | -1.199*** (0.447) | -1.184*** (0.452) | -1.119** (0.455) | -1.048** (0.458) |
| Age: 70-74 | -2.494*** (0.439) | -2.547*** (0.444) | -2.613*** (0.451) | -2.568*** (0.454) |
| Age: 75+ | -2.587*** (0.365) | -2.641*** (0.369) | -2.746*** (0.376) | -2.716*** (0.377) |
| Race: Black | -1.171*** (0.385) | -0.952** (0.414) | -0.949** (0.417) | -0.926** (0.419) |
| Ed: 12th (ND) | -2.186*** (0.483) | -2.067*** (0.494) | -2.010*** (0.504) | -2.212*** (0.522) |
| Ed: HS | -1.116*** (0.321) | -1.058*** (0.324) | -1.074*** (0.325) | -1.084*** (0.325) |
| Ed: other | -2.534*** (0.512) | -2.561*** (0.517) | -2.511*** (0.523) | -2.496*** (0.522) |
| MS state | -1.432*** (0.544) | -1.602*** (0.553) | -1.764*** (0.561) | -1.875*** (0.571) |
| NC state | | -1.014* (0.560) | -1.185** (0.568) | -1.221** (0.570) |
| TX state | | | -0.997** (0.427) | -1.087** (0.433) |
| Constant | 3.822*** (0.285) | 3.855*** (0.288) | 4.012*** (0.305) | 3.996*** (0.305) |
| Observations | 708 | 708 | 708 | 708 |
| Log Likelihood | -179.541 | -178.067 | -175.600 | -173.454 |
| Akaike Inf. Crit. | 377.081 | 376.135 | 373.200 | 370.908 |

*p<0.1; **p<0.05; ***p<0.01
#### FINDING THE BEST MODEL: ADDED VARIABLES----
X <- train_dat %>%
select(starts_with(c('age','gender',
'race','education','region',
'income',
'trust_peple',
'gov_waste',
'unemp',
'marital',
'Pres_for_re',
'occup')) ) %>%
as.matrix()
y <- train_dat %>%
select(internet) %>%
pull
rmods <- regsubsets(x=X, y=y, method="forward",
all.best=FALSE, nbest=3,
nvmax=10, really.big = TRUE)
## (printed by regsubsets): Reordering variables and trying again:
best_m<-get_best(regsubs = rmods, depv="internet", data=train_dat)
## [1] 0.08615856 0.11524528 0.15143676 0.17839207 0.19530867 0.20842379
## [7] 0.21721684 0.22644225 0.24096999 0.25162179 0.26028759
stargazer(best_m[[8]],best_m[[9]],best_m[[10]],best_m[[11]],
title="Results", align=TRUE, type = "html",
covariate.labels = c("Age:70-74","Age:75+",
"Education: 12th (ND)",
"Education:other",
"MS state","NC state",
"Trust people: MoT","Widowed",
"Pres. for. rel.:DS ","Occupation:Retired",
"Occupation:Permanently disabled"))
Dependent variable: internet (standard errors in parentheses)

| | (1) | (2) | (3) | (4) |
|---|---|---|---|---|
| Age: 70-74 | | -1.732*** (0.538) | -1.902*** (0.548) | -1.997*** (0.552) |
| Age: 75+ | -1.021** (0.440) | -1.693*** (0.512) | -1.958*** (0.532) | -2.062*** (0.538) |
| Education: 12th (ND) | -1.620*** (0.472) | -1.606*** (0.492) | -1.744*** (0.494) | -1.558*** (0.511) |
| Education: other | -1.401*** (0.482) | -1.574*** (0.492) | -1.501*** (0.502) | -1.579*** (0.514) |
| MS state | -1.525*** (0.559) | -1.755*** (0.567) | -1.882*** (0.592) | -2.018*** (0.596) |
| NC state | | | | -1.453*** (0.561) |
| Trust people: MoT | 1.630*** (0.361) | 1.642*** (0.363) | 1.667*** (0.370) | 1.669*** (0.374) |
| Widowed | -1.153*** (0.410) | -0.961** (0.417) | -0.918** (0.425) | -0.962** (0.429) |
| Pres. for. rel.: DS | | | 1.079*** (0.337) | 1.112*** (0.340) |
| Occupation: Retired | -1.576*** (0.343) | -0.989** (0.403) | -0.908** (0.410) | -0.848** (0.411) |
| Occupation: Permanently disabled | -1.975*** (0.461) | -2.043*** (0.470) | -2.118*** (0.471) | -1.936*** (0.486) |
| Constant | 2.962*** (0.240) | 3.057*** (0.250) | 2.749*** (0.260) | 2.835*** (0.268) |
| Observations | 708 | 708 | 708 | 708 |
| Log Likelihood | -171.395 | -166.357 | -160.669 | -157.637 |
| Akaike Inf. Crit. | 360.790 | 352.713 | 343.338 | 339.274 |

*p<0.1; **p<0.05; ***p<0.01
We can see that, relative to the original model, variables such as occupation and marital status, and even more surprising characteristics such as trust in people and strong disapproval of the president's handling of foreign relations, improved prediction.
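One quick way to make the "improved prediction" claim concrete (a small sketch reusing the model lists built above; the values correspond to the AIC rows of the two tables):
AIC(best_t[[11]]) # best theory-only specification, ~370.9
AIC(best_m[[11]]) # best specification with the added variables, ~339.3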
Using the training data R, estimate a ridge regression, lasso, elastic-net and adaptive lasso. Present the coefficients from these different models in a way that makes clear what happens.
## Selecting vars
X2<- train_dat %>%
select(age_X12__Age_group_70_74 ,
age_X13__Age_group_75_or_older ,
education_X8__12th_grade_no_diploma ,
education_other ,
region_X28__Mississippi ,
region_X37__North_Carolina ,
trust_peple_X2__Most_of_the_time ,
marital_X3__Widowed ,
Pres_for_rel_X4__Disapprove_strongly,
occup_X5__R_retired__if_also_working__working__20_hrs_wk_,
occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_) %>%
as.matrix()
y2 <- train_dat %>%
select(internet) %>%
pull
## Ridge
rcv <- cv.glmnet(X2, y2, alpha=0, family = 'binomial')
ridge <- glmnet(X2, y2, alpha=0, family = 'binomial', lambda = rcv$lambda.min)
## LASSO
lcv <- cv.glmnet(X2, y2, alpha=1, family = 'binomial')
lasso <- glmnet(X2, y2, alpha=1, family = 'binomial',lambda = lcv$lambda.min)
## Elastic Net
# grid search across alpha and lambda values
e_netcv <- train(
x = X2,
y = y2,
method = "glmnet",
family = "binomial",
nested = TRUE,
# preProc = c("zv", "center", "scale"),
trControl = trainControl(method = "cv", number = 10),
tuneLength = 10
)
e_net <- glmnet(X2, y2, family = 'binomial',
alpha = e_netcv$bestTune$alpha,
lambda = e_netcv$bestTune$lambda)
## Adaptive lasso
g<-1 # gamma exponent for the adaptive weights
b.e_net<-coef(e_net)[-1] # elastic-net coefficients, dropping the intercept so weights match ncol(X2)
b.e_net <- ifelse(b.e_net == 0, .01, b.e_net) # avoid infinite weights for coefficients shrunk to zero
w<-1/(abs(b.e_net)^g) # adaptive weights: smaller coefficients are penalized more heavily
a_lasso<-glmnet(X2, y2, alpha=1, family = 'binomial',
lambda = lcv$lambda.min,
penalty.factor = w)
##Plotting coefficients of different methods
varnames <- c("Age:70-74","Age:75+",
"Education: 12th (ND)",
"Education:other",
"MS state","NC state",
"Trust people: MoT","Widowed",
"Pres. for. rel.:DS ","Occupation:Retired",
"Occupation:Permanently disabled")
coefs <- tibble(
b = c(as.vector(coef(ridge)[-1]),
as.vector(coef(lasso)[-1]),
as.vector(coef(e_net)[-1]),
as.vector(coef(a_lasso)[-1])),
model = factor(rep(1:4, each=length(coef(ridge)[-1])),
labels=c("Ridge", "LASSO","ENET","A-LASSO")),
variable = rep(varnames, 4))
p1 <- ggplot(coefs, aes(x=b, y=variable,
colour=model, shape=model)) +
geom_point() +
theme_bw() +
geom_vline(xintercept=0, lty=3)+
ggtitle("Coefficients")
p1
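A side-by-side table is another way to present the same coefficients; a minimal sketch reusing the objects created above:
# One row per predictor, one column per penalized model
coef_tab <- tibble(variable = varnames,
Ridge = as.vector(coef(ridge)[-1]),
LASSO = as.vector(coef(lasso)[-1]),
ENET = as.vector(coef(e_net)[-1]),
`A-LASSO` = as.vector(coef(a_lasso)[-1]))
coef_tab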
### CV for the very best ----
## Using the caret package
glmnet_grid<-expand.grid(alpha = c(0, e_netcv$bestTune$alpha,1),
lambda = c(rcv$lambda.min, lcv$lambda.min,
e_netcv$bestTune$lambda))
vbest <- train(
x = X2,
y = y2,
method = "glmnet",
family = "binomial",
nested = TRUE,
# preProc = c("zv", "center", "scale"),
trControl = trainControl(method = "cv", number = 10),
tuneLength = 10,
tuneGrid = glmnet_grid # tuneLength is ignored when an explicit tuneGrid is supplied
# penalty.factor = c(0,w.enet,w.ridge,w.lasso)
)
vbest
## glmnet
##
## 708 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 637, 637, 638, 637, 637, 637, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0 0.001526993 2.427456 0.2519522 2.170900
## 0 0.009154082 2.423548 0.2520885 2.167601
## 0 0.079251666 1.772691 0.2615088 1.627331
## 1 0.001526993 2.602312 0.2461144 2.313875
## 1 0.009154082 2.162329 0.2446710 1.943335
## 1 0.079251666 1.274217 0.0776811 1.234760
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.07925167.
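The chosen tuning values and the refit coefficients can be pulled directly from the caret object; a short sketch:
vbest$bestTune # alpha = 1 and lambda ~ 0.079, as reported above
coef(vbest$finalModel, s = vbest$bestTune$lambda) # coefficients of the refit glmnet at that lambda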
Now use the testing set S. What do you find about the relationships of interest? Do all of the theoretical variables you started with withstand the scrutiny that these tools provide?
X3<- test_dat %>%
select(age_X12__Age_group_70_74 ,
age_X13__Age_group_75_or_older ,
education_X8__12th_grade_no_diploma ,
education_other ,
region_X28__Mississippi ,
region_X37__North_Carolina ,
trust_peple_X2__Most_of_the_time ,
marital_X3__Widowed ,
Pres_for_rel_X4__Disapprove_strongly,
occup_X5__R_retired__if_also_working__working__20_hrs_wk_,
occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_) %>%
as.matrix()
y3 <- test_dat %>%
select(internet) %>%
pull
# Refit the tuned (alpha, lambda) specification on the test split to see which coefficients survive there
vbestm<- glmnet(X3, y3, family = 'binomial',
alpha = vbest$bestTune$alpha,
lambda = vbest$bestTune$lambda)
coef(vbestm)
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.5110143
## age_X12__Age_group_70_74 .
## age_X13__Age_group_75_or_older -0.5673007
## education_X8__12th_grade_no_diploma .
## education_other .
## region_X28__Mississippi .
## region_X37__North_Carolina .
## trust_peple_X2__Most_of_the_time .
## marital_X3__Widowed .
## Pres_for_rel_X4__Disapprove_strongly .
## occup_X5__R_retired__if_also_working__working__20_hrs_wk_ .
## occup_X6__R_permanently_disabled__if_also_working__working__20_hrs_wk_ .
Only one variable out of the original set survived. Apparently, the intercept plus the 75-and-older age group alone predicts better than keeping the whole set.
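A quick programmatic check of which predictors survive (a small sketch using the fitted object above):
b <- coef(vbestm)
rownames(b)[as.vector(b != 0)] # the intercept and any predictors with nonzero coefficients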
Finally, estimate the theoretical model on the testing set S. How does that compare to the model you estimated in step 4?
teo_dt <-test_dat %>%
select(starts_with(c('internet','age','gender',
'race','education','region'))) %>%
rowwise() %>%
mutate( age_g40_64 =
sum(c_across(age_X06__Age_group_40_44:age_X10__Age_group_60_64)),
age_g65 = sum(c_across(age_X11__Age_group_65_69:age_X13__Age_group_75_or_older)),
south = sum(c_across(contains(c('Alabama', 'Arkansas', 'Delaware',
'Washington', 'Florida', 'Georgia', 'Kentucky', 'Louisiana', 'Maryland',
'Mississippi', 'North Carolina', 'Oklahoma', 'South_Carolina',
'Tennessee','Texas', 'Virginia', 'West_Virginia')))))
tmod2 <- glm(internet~ age_g40_64+ age_g65+gender_X1__Male
+race_X2__Black__non_Hispanic
+ race_X5__Hispanic
+ race_X6__Other_non_Hispanic_incl_multiple_races__WEB__blank__Other__counted_as_a_race_
+ education_X9__High_school_graduate__high_school_diploma_or_equivalent__for_example__GED_
+ education_X10__Some_college_but_no_degree +south,
data=teo_dt, family = binomial())
stargazer(tmod2, title="Results", align=TRUE, type = "html",
covariate.labels = c("Age group:40-64","Age:65+","Male",
"Race:Black","Race:Hispanic","Race:Other",
"Ed:HS","Ed:Some college (ND)",
"South region"))
| | Dependent variable: internet |
|---|---|
| Age group: 40-64 | -0.629 (0.581) |
| Age: 65+ | -2.327*** (0.536) |
| Male | 0.430 (0.378) |
| Race: Black | -0.373 (0.621) |
| Race: Hispanic | -1.089** (0.461) |
| Race: Other | -1.195 (0.860) |
| Ed: HS | -0.861** (0.393) |
| Ed: Some college (ND) | 0.731 (0.658) |
| South region | -0.686* (0.370) |
| Constant | 4.212*** (0.604) |
| Observations | 472 |
| Log Likelihood | -109.283 |
| Akaike Inf. Crit. | 238.566 |

Standard errors in parentheses. *p<0.1; **p<0.05; ***p<0.01
While we can't compare the magnitudes of the coefficients across the two models, because the lasso estimates are biased by the penalty, in terms of prediction the cross-validated regularized regression performs better.
mse_o = mse(teo_dt$internet, predict(tmod2)) # theory model; predict() returns link-scale (log-odds) values
mse_cvl = mse(teo_dt$internet, as.vector(predict(vbest, newdata = X3))) # predict.train() takes newdata; X3 holds the test predictors
tibble ( Model = c("CV Regularized Regression","Original Model"),
MSE = c(mse_cvl,mse_o))
## # A tibble: 2 x 2
## Model MSE
## <chr> <dbl>
## 1 CV Regularized Regression 1.54
## 2 Original Model 5.61
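Because internet is binary, squared error on the link scale is hard to interpret. A probability-scale (Brier score) comparison is sketched below; it refits the tuned specification on the training matrix X2 and scores both models on the test split (an illustration only, not part of the original results; glmnet and the objects defined above are assumed):
# Refit the selected (alpha, lambda) on the training data, then score on the test split
final_fit <- glmnet(X2, y2, family = "binomial",
alpha = vbest$bestTune$alpha,
lambda = vbest$bestTune$lambda)
p_reg <- as.vector(predict(final_fit, newx = X3, type = "response")) # predicted Pr(internet = 1)
p_theo <- predict(tmod2, type = "response") # theory model, fitted on the test split itself
tibble(Model = c("CV regularized (refit on R)", "Theoretical logit (fit on S)"),
Brier = c(mean((y3 - p_reg)^2),
mean((teo_dt$internet - p_theo)^2)))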