Here is assignment 2, reusing the variables from my previous assignment.
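None of the chunks below load their packages explicitly, so the preamble is assumed to look roughly like this (a sketch: rmse() and mse() are attributed here to the Metrics package, and factorize()/remove_noresponse() to a course-provided helpers.R; both attributions are my guesses):
library(rio)        # import()
library(dplyr)      # select(), rename(), mutate(), filter()
library(tibble)     # as_tibble(), add_column()
library(stringr)    # str_replace_all()
library(recipes)    # recipe(), step_*(), prep(), bake()
library(leaps)      # regsubsets()
library(stargazer)  # regression tables
library(caret)      # train(), trainControl(), varImp()
library(earth)      # MARS, fit via method = "earth"
library(polywog)    # polywog()
library(h2o)        # random forest and GBM
library(Metrics)    # rmse(), mse() -- assumed source; ModelMetrics also provides them
# source("helpers.R")  # hypothetical home of factorize() and remove_noresponse()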
## Importing data----
# getwd()
anes16 <- import("./ML/ANES/raw/anes_timeseries_2016_dta/anes_timeseries_2016.dta")
## Wrangling the data
d1 <- anes16 %>%
  select(V161007, V161267x, V161342, V161310x, V161270, V161010d,
         V161268, V161115, V161217:V161223, V161326:V161329,
         V161522, ends_with('x')) %>%  # ends_with('x') also picks up the summary variables renamed below
  rename(internet = V161007,
         int_home = V161326,
         age = V161267x,
         gender = V161342,
         race = V161310x,
         education = V161270,
         region = V161010d,
         income = V161361x,
         trust_people = V161219,
         gov_waste = V161217,
         unemp = V161142x,
         marital = V161268,
         Pres_for_rel = V161084x,
         occup = V161276x) %>%
# remove_noresponse() %>%
  mutate(internet = case_when(   # recode the 1/2 (yes/no) codes to a 0/1 indicator
           internet == 2 ~ 0,
           internet == 1 ~ 1),
         int_home = case_when(
           int_home == 2 ~ 0,
           int_home == 1 ~ 1)
  ) %>%
# filter(!is.na(int_home)) %>%
filter(!is.na(internet)) %>%
  # drop_na(internet) %>%   # I could also have used drop_na() here
factorize() #%>%
# reduce_fct_levels()
## Splitting data----
library(rsample)
set.seed(333363)
s <- initial_split(d1, prop=.6, strata = "internet")
R <- training(s)
S <- testing(s)
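As a quick sanity check (my addition, not in the original run), stratifying on internet should leave both splits with roughly the same base rate:
prop.table(table(R$internet))
prop.table(table(S$internet))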
blueprint <- recipe(internet ~ ., data = R) %>%  # an earlier draft used int_home as the outcome
step_nzv(all_predictors()) %>%
  step_knnimpute(all_numeric(), neighbors = 6) %>%  # renamed step_impute_knn() in later recipes releases
# step_other(all_nominal(), threshold = 0.01,
# other = "other") %>%
step_unknown(all_nominal()) %>%
step_dummy(all_nominal(), one_hot = FALSE)
train_dat <- prep(blueprint, training = R) %>%
bake(., new_data=R)
names(train_dat)<-str_replace_all(names(train_dat),"[.]","_")
test_dat <- prep(blueprint, training = R) %>%  # prep on the training split only, so test preprocessing uses training-set statistics
  bake(., new_data = S)
names(test_dat)<-str_replace_all(names(test_dat),"[.]","_")
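A quick check (again my addition) that the two baked data sets line up column for column:
setdiff(names(train_dat), names(test_dat))  # should return character(0)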
X <- train_dat %>%
select(starts_with(c('age','gender',
'race','education','region',
'income',
'trust_people',
'gov_waste',
'unemp',
'marital',
'Pres_for_re',
'occup')) ) %>%
as.matrix()
y <- train_dat %>%
select(internet) %>%
pull
rmods <- regsubsets(x=X, y=y, method="forward",
all.best=FALSE, nbest=3,
nvmax=10, really.big = TRUE)
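The helper defined below leans on summary(rmods)$which, a logical matrix with one row per candidate model and one column per term marking whether it is included; a quick peek (my addition) shows the layout:
head(summary(rmods)$which[, 1:4])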
After reordering the variables and trying again, I wrote a helper that pulls out the lowest-RSS model of each size and refits it as a logistic regression:
get_best <- function(regsubs, depv = "internet", data){
  # one row per candidate model; logical columns mark which terms are included
  m <- as_tibble(summary(regsubs)$which, rownames = "size")
  m <- m %>% add_column(rss = summary(regsubs)$rss)
  # nbest = 3 gives several models per size; keep the lowest-RSS one of each size
  m <- m %>% group_by(size) %>% filter(rss == min(rss))
  m <- m %>% select(-`(Intercept)`)
  print(m$rss)
  f <- list()
  for (i in 1:nrow(m)){
    v <- names(m[, as.vector(m[i, ] == TRUE)])  # names of the terms in model i
    f[[i]] <- paste(depv, " ~ ", paste(v, collapse = "+"))
  }
  # refit each selected specification as a logistic regression
  b_m <- lapply(f, function(x) glm(x, data = data, family = binomial()))
  b_m
}
best_m<-get_best(regsubs = rmods, depv="internet", data=train_dat)
## [1] 60.07696 57.51776 54.60051 52.88004 51.64883 50.64447 50.01740 49.11263
## [9] 48.49350 47.98257 47.59355
stargazer(best_m[[8]],best_m[[9]],best_m[[10]],best_m[[11]],
title="Results", align=TRUE, type = "html",
covariate.labels = c("Age:70-74","Age:75+", "White",
"Ed:4th or less",
"Ed:5th or 6th",
"Ed:10th",
"Ed:12th (ND)","Ed:HS",
"OK state", "Trust people:MT",
"Married", "Occ:Permanently disabled")
)
Dependent variable: internet

| | (1) | (2) | (3) | (4) |
|---|---|---|---|---|
| Age:70-74 | -2.109*** | -2.127*** | -2.250*** | -2.066*** |
| | (0.490) | (0.514) | (0.513) | (0.500) |
| Age:75+ | -2.932*** | -2.853*** | -2.778*** | -2.371*** |
| | (0.406) | (0.413) | (0.422) | (0.382) |
| White | 1.058*** | 1.019*** | 1.000*** | |
| | (0.341) | (0.346) | (0.350) | |
| Ed:4th or less | -16.810 | -17.555 | -17.883 | -18.644 |
| | (943.634) | (900.702) | (907.455) | (918.995) |
| Ed:5th or 6th | -2.328** | -2.566** | -2.927*** | -3.542*** |
| | (0.981) | (1.036) | (1.049) | (1.067) |
| Ed:10th | | | | -1.897** |
| | | | | (0.838) |
| Ed:12th (ND) | -1.931*** | -2.044*** | -2.378*** | -2.453*** |
| | (0.486) | (0.501) | (0.522) | (0.532) |
| Ed:HS | | | -0.918*** | -1.020*** |
| | | | (0.331) | (0.336) |
| OK state | | | | -1.216* |
| | | | | (0.697) |
| Trust people:MT | 1.605*** | 1.487*** | 1.368*** | 1.393*** |
| | (0.388) | (0.396) | (0.400) | (0.399) |
| Married | | 1.119*** | 1.145*** | 1.193*** |
| | | (0.369) | (0.373) | (0.377) |
| Occ:Permanently disabled | -1.763*** | -1.600*** | -1.541*** | -1.514*** |
| | (0.496) | (0.496) | (0.506) | (0.496) |
| Constant | 2.208*** | 1.922*** | 2.267*** | 2.895*** |
| | (0.256) | (0.265) | (0.304) | (0.293) |
| Observations | 708 | 708 | 708 | 708 |
| Log Likelihood | -162.926 | -157.641 | -153.892 | -154.642 |
| Akaike Inf. Crit. | 343.851 | 335.283 | 329.783 | 333.283 |

Note: *p<0.1; **p<0.05; ***p<0.01
#### MARS ####
hyper_grid <- expand.grid(
degree = 1:3,
nprune = 2:15
)
set.seed(3333) # for reproducibility
cv_mars <- train(
x = X,
y = y,
method = "earth",
  metric = "RMSE",  # internet comes out of the recipe as numeric 0/1, so MARS is fit as a regression scored by RMSE
trControl = trainControl(method = "cv", number = 10),
tuneGrid = hyper_grid
)
# cv_mars$bestTune
# cv_mars$results %>%
# filter(nprune == cv_mars$bestTune$nprune, degree == cv_mars$bestTune$degree)
# ggplot(cv_mars)
#
# cv_mars$finalModel %>%
# summary()
#
# cv_mars$finalModel %>%
# fitted()
# variable importance plots
# dotPlot(varImp(cv_mars))
#### Polywog ####
p <- train_dat %>%
select(starts_with(c('age','gender',
'race','education','region',
'income',
'trust_people',
'gov_waste',
'unemp',
'marital',
'Pres_for_re',
'occup')), 'internet' )
poly <- polywog(internet ~ ., data = p, degree = 2, family = "binomial")
# summary(poly)$coefficients %>%
# as_tibble(rownames = "variable") %>%
# filter(Estimate != 0)
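The degree = 2 above is asserted rather than tuned. If it fits in memory, polywog's own cv.polywog() can choose the degree by cross-validation; a sketch, untested on this data:
cvp <- cv.polywog(internet ~ ., data = p, degrees.cv = 1:3, nfolds = 5)
cvp$degree.min  # degree with the lowest cross-validation error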
#### CART ####
library(rpart)
library(dplyr)
library(car)
bp <- recipe(internet ~ ., data = R) %>%  # as above, an earlier draft used int_home
step_nzv(all_predictors()) %>%
step_knnimpute(all_numeric(), neighbors = 6) %>%
# step_other(all_nominal(), threshold = 0.01,
# other = "other") %>%
  step_unknown(all_nominal())  # no step_dummy() this time: rpart and h2o handle factors directly
  # step_dummy(all_nominal(), one_hot = FALSE)
train_dat1 <- prep(bp, training = R) %>%
bake(., new_data=R)
names(train_dat1)<-str_replace_all(names(train_dat1),"[.]","_")
test_dat1 <- prep(bp, training = R) %>%  # again, prep on the training split only
  bake(., new_data = S)
names(test_dat1)<-str_replace_all(names(test_dat1),"[.]","_")
crt <- train_dat1 %>%
select(starts_with(c('age','gender',
'race','education','region',
'income',
'trust_people',
'gov_waste',
'unemp',
'marital',
'Pres_for_re',
'occup')), 'internet' )
# summary(cart)
# caret cross validation results
cart_cv <- train(
internet ~ .,
data = crt,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20
)
# cart_cv$bestTune$cp
# library(vip)
# vip(cart_cv)
#
# dotPlot(varImp(cart_cv))
#
# ggplot(cart_cv)
cart <- rpart(internet ~ ., data = crt, cp = cart_cv$bestTune$cp)
# cart
#
# plot(cart)
# text(cart)
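For a more readable tree than the plot()/text() pair commented out above, the rpart.plot package (my suggestion; it is not used in the original) draws labeled nodes directly:
library(rpart.plot)
rpart.plot(cart)  # each node shows the fitted value and the share of observations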
#Initialize
h2o.no_progress()
h2o.init(max_mem_size = "5g")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 hours 41 minutes
## H2O cluster timezone: America/Toronto
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.0.1
## H2O cluster version age: 1 month and 11 days
## H2O cluster name: H2O_started_from_R_Hans_jtn719
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.51 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.3 (2020-10-10)
#Convert data to h2o
h2o_train<-as.h2o(crt)
# set the response column to internet
response <- "internet"
# set the predictor names
predictors <- setdiff(colnames(crt), response)
# number of features
n_features <- length(setdiff(colnames(crt), response))
#### Random forest ####
# hyperparameter grid
hyper_grid <- list(
  mtries = pmax(1, floor(n_features * c(.05, .15, .25, .333, .4))),  # pmax() guards against floor() returning 0, which h2o rejects
  min_rows = c(1, 3, 5, 10),
  max_depth = c(10, 20, 30),
  sample_rate = c(.55, .632, .70, .80)
)
# random grid search strategy
search_criteria <- list(
strategy = "RandomDiscrete",
stopping_metric = "mse",
stopping_tolerance = 0.001, # stop if improvement is < 0.1%
stopping_rounds = 10, # over the last 10 models
  max_runtime_secs = 60*3         # or stop the search after 3 minutes
)
# perform grid search
random_grid <- h2o.grid(
algorithm = "randomForest",
grid_id = "rf_random_grid_1",
x = predictors,
y = response,
training_frame = h2o_train,
hyper_params = hyper_grid,
ntrees = n_features * 5,
seed = 1333,
  stopping_metric = "RMSE",
  stopping_rounds = 10,        # stop growing a model if the last 10 trees
  stopping_tolerance = 0.005,  # fail to improve RMSE by at least 0.5%
search_criteria = search_criteria
)
# random_grid
# collect the results and sort by our model performance metric
# of choice
random_grid_perf <- h2o.getGrid(
grid_id = "rf_random_grid_1",
sort_by = "mse",
decreasing = FALSE
)
# random_grid_perf
# h2o_rf2
#
# vip(h2o_rf2)
#
# as.vector(predict(h2o_rf2,h2o_train)$predict)
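To read off the winning hyperparameter combination (my addition; these values appear to be hard-coded into h2o_rf2 further down):
best_rf <- h2o.getModel(random_grid_perf@model_ids[[1]])
best_rf@parameters[c("mtries", "min_rows", "max_depth", "sample_rate")]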
#### GBM ####
# h2o.no_progress()
# h2o.init(max_mem_size = "5g")
hyper_grid <- list(
max_depth=1:7,
learn_rate = c(.001, .01, .1, .25),
sample_rate = c(.5, .7, .9, 1),
col_sample_rate = c(.5, .7, .9, 1),
col_sample_rate_per_tree = c(.5, .7, .9, 1)
)
search_criteria <- list(
strategy = "RandomDiscrete",
stopping_metric = "mse",
stopping_tolerance = 0.001, # stop if improvement is < 0.1%
stopping_rounds = 10, # over the last 10 models
  max_runtime_secs = 60*3         # or stop the search after 3 minutes
)
# perform grid search
grid <- h2o.grid(
algorithm = "gbm",
grid_id = "gbm_random_grid_1",
x = predictors,
y = response,
training_frame = h2o_train,
hyper_params = hyper_grid,
ntrees = 600,
# learn_rate = 0.01,
# max_depth = 7,
# min_rows = 5,
nfolds = 10,
stopping_metric = "RMSE",
stopping_rounds = 10,
stopping_tolerance = 0.005,
search_criteria = search_criteria,
seed = 3333
)
# grid
# collect the results and sort by our model performance metric of choice
grid_perf <- h2o.getGrid(
grid_id = "gbm_random_grid_1",
sort_by = "mse",
decreasing = FALSE
)
# grid_perf
# Grab the model_id for the top model, chosen by cross validation error
# best_model_id <- grid_perf@model_ids[[1]]
# best_model <- h2o.getModel(best_model_id)
#
# # Now let’s get performance metrics on the best model
# h2o.performance(model = best_model, xval = TRUE)
# g
# vip(g)
h2o_rf2 <- h2o.randomForest(
x = predictors,
y = response,
training_frame = h2o_train,
nfolds=10,
max_depth=30,
min_rows=5.0,
mtries=3,
sample_rate=.8,
ntrees = 2500,
seed = 3333
)
g <- h2o.gbm(x = predictors,
             y = response,
             training_frame = h2o_train,
             learn_rate = .1,
             max_depth = 5,
             ntrees = 100,
             sample_rate = 0.9,
             col_sample_rate = 0.5,
             col_sample_rate_per_tree = 0.7,
             nfolds = 10)
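Both final models are fit with nfolds = 10, so their cross-validated performance can be pulled directly (my addition):
h2o.performance(h2o_rf2, xval = TRUE)
h2o.performance(g, xval = TRUE)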
Answer: In the training data, the GBM does best, as it has the highest R-squared and the lowest RMSE.
#### Comparison on training data ####
# load('h2o_mods.Rdata')
#Initialize
# h2o.no_progress()
# h2o.init(max_mem_size = "5g")
fit_training <- data.frame(
  lm = predict(best_m[[11]]),  # note: predict() on a glm defaults to the link (logit) scale, hence the large lm RMSE below
  mars = as.vector(predict(cv_mars$finalModel)),
  poly = predict(poly),        # polywog predictions are also on the link scale by default
  cart = as.vector(predict(cart_cv$finalModel)),
  rf = as.vector(predict(h2o_rf2, h2o_train)$predict),
  gbm = as.vector(predict(g, h2o_train)$predict)
)
# fit_training
#
# cor(fit_training,y)^2
#
# apply(fit_training,2,function(x)rmse(y,x))
# apply(fit_training,2,function(x)mse(y,x))
knitr::kable(data.frame(
cor.y_hat.y= as.vector(cor(fit_training, y)^2),
rmse=apply(fit_training, 2,function(x)rmse(y,x)),
mse=apply(fit_training, 2,function(x)mse(y,x))
),
caption = "Comparison table using the training data."
)
| model | cor.y_hat.y | rmse | mse |
|-------|-------------|------|-----|
| lm | 0.2134159 | 2.8877887 | 8.3393238 |
| mars | 0.2241291 | 0.2629183 | 0.0691260 |
| poly | 0.2135695 | 2.6345286 | 6.9407408 |
| cart | 0.1082754 | 0.2818652 | 0.0794480 |
| rf | 0.8413258 | 0.1562684 | 0.0244198 |
| gbm | 0.9652475 | 0.0637714 | 0.0040668 |
Answer: In the testing data, the random forest does best: it has both the highest R-squared and the lowest RMSE. The GBM's near-perfect training fit does not carry over to the test set, which suggests it was overfitting.
#### Comparison on testing data ####
fit_test <- data.frame(
  lm = predict(best_m[[11]], test_dat),  # link scale again, as in the training comparison
  mars = as.vector(predict(cv_mars$finalModel, test_dat)),
  poly = predict(poly, test_dat),
  cart = as.vector(predict(cart, newdata = test_dat1)),
  rf = as.vector(predict(h2o_rf2, as.h2o(test_dat1))$predict),
  gbm = as.vector(predict(g, as.h2o(test_dat1))$predict)
)
yt <- test_dat %>%
select(internet) %>%
pull
# fit_test
knitr::kable(data.frame(
cor.y_hat.y= as.vector(cor(fit_test,yt)^2),
rmse=apply(fit_test,2,function(x)rmse(yt,x)),
mse=apply(fit_test,2,function(x)mse(yt,x))
),
caption = "Comparison table using the testing data."
)
| model | cor.y_hat.y | rmse | mse |
|-------|-------------|------|-----|
| lm | 0.1847967 | 2.6975930 | 7.2770079 |
| mars | 0.1294121 | 0.2745340 | 0.0753689 |
| poly | 0.0957977 | 2.6083647 | 6.8035663 |
| cart | 0.0146177 | 0.3211841 | 0.1031593 |
| rf | 0.2079776 | 0.2603597 | 0.0677872 |
| gbm | 0.1584355 | 0.2710248 | 0.0734544 |