Here is assignment 2:

Using the variables that you used in your previous assignment,

  1. Split your data into a training and testing set (or use the ones you defined previously, if you had done so).
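The chunks below lean on several packages that are not all loaded explicitly; here is a sketch of the assumed setup (rsample, rpart, dplyr, and car are also re-attached later where they first appear):

# Packages assumed by the code in this assignment (approximate setup, not the exact original)
library(rio)        # import()
library(tidyverse)  # dplyr verbs, stringr::str_replace_all(), tibble helpers
library(rsample)    # initial_split(), training(), testing()
library(recipes)    # recipe(), step_*(), prep(), bake()
library(leaps)      # regsubsets()
library(caret)      # train(), trainControl(), varImp(), dotPlot()
library(earth)      # MARS engine behind caret's method = "earth"
library(polywog)    # polywog()
library(rpart)      # rpart()
library(h2o)        # random forests and GBMs
library(stargazer)  # regression tables
library(Metrics)    # rmse(), mse() (assumed source of these helpers)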
## Importing data----
# getwd()
anes16<-import("./ML/ANES/raw/anes_timeseries_2016_dta/anes_timeseries_2016.dta")

## Wrangling the data

d1 <- anes16 %>%
  select(V161007, V161267x,V161342,V161310x,V161270,V161010d,
         V161268, V161115, V161217:V161223, V161326:V161329,
         V161522, ends_with('x')) %>% 
  rename(internet= V161007,
         int_home = V161326,
         age     = V161267x,
         gender  = V161342,
         race    = V161310x,
         education = V161270,
         region = V161010d,
         income = V161361x,
         trust_peple = V161219,
         gov_waste = V161217,
         unemp = V161142x,
         marital = V161268,
         Pres_for_rel = V161084x,
         occup = V161276x) %>% 
  # remove_noresponse() %>%
  mutate( internet = case_when(
    internet == 2 ~ 0,
    internet == 1 ~ 1),
    int_home = case_when(
      int_home == 2 ~ 0,
      int_home == 1 ~ 1)
  ) %>% 
  # filter(!is.na(int_home)) %>%
  filter(!is.na(internet)) %>%
  # drop_na(internet) %>% #I could have also used
  factorize() #%>%
# reduce_fct_levels()

## Splitting data----
library(rsample)
set.seed(333363)
s <- initial_split(d1, prop=.6, strata = "internet")
R <- training(s)
S <- testing(s)
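A quick check that the stratified split kept the outcome balance similar across the two pieces (a small sketch using the R and S splits above):

# the share of internet users should be roughly equal in the training and testing sets
prop.table(table(R$internet))
prop.table(table(S$internet))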

blueprint <- #recipe(int_home~.,data=R) %>% 
  recipe(internet~.,data=R) %>% 
  step_nzv(all_predictors())  %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  # step_other(all_nominal(), threshold = 0.01,
  #            other = "other") %>%
  step_unknown(all_nominal()) %>%
  step_dummy(all_nominal(), one_hot = FALSE)

train_dat <- prep(blueprint, training = R) %>% 
  bake(., new_data=R)

names(train_dat)<-str_replace_all(names(train_dat),"[.]","_")

# prep the recipe on the training split only, then bake the held-out split with those estimates
test_dat <- prep(blueprint, training = R) %>% 
  bake(., new_data=S)

names(test_dat)<-str_replace_all(names(test_dat),"[.]","_")
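A sanity check that the kNN imputation and the unknown-level step left no missing values in the baked data (a quick sketch):

# both counts should be 0 if the imputation and step_unknown() did their job
sum(is.na(train_dat))
sum(is.na(test_dat))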
  2. Estimate the best linear, additive model that you can - you can do this with one of the regularization tools if you like.
X <- train_dat %>%
  select(starts_with(c('age','gender',
                       'race','education','region',
                       'income',
                       'trust_peple',
                       'gov_waste',
                       'unemp',
                       'marital',
                       'Pres_for_re',
                       'occup')) ) %>%
  as.matrix()

y <- train_dat %>%
  select(internet) %>%
  pull


# forward stepwise search, keeping the 3 best subsets at each size, up to 10 predictors
rmods <- regsubsets(x=X, y=y, method="forward",
                    all.best=FALSE, nbest=3, 
                    nvmax=10, really.big = TRUE)

## Reordering variables and trying again:

# Build one candidate formula per model size from a regsubsets object
# and fit each as a logit on `data`; returns a list of glm objects.
get_best <- function(regsubs, depv = "internet", data){
    m <- as_tibble(summary(regsubs)$which, rownames = "size")
    m <- m %>% add_column(rss = summary(regsubs)$rss)
    m <- m %>% group_by(size) %>% filter(rss == max(rss))
    m <- m %>% select(-`(Intercept)`)
    print(m$rss)
    f <- list()
    for (i in 1:nrow(m)){
      v <- names(m[, as.vector(m[i, ] == TRUE)])
      f[[i]] <- paste(depv, " ~ ", paste(v, collapse = "+"))
    }
    b_m <- lapply(f, function(x) glm(x, data = data, family = binomial()))
    b_m
}

best_m<-get_best(regsubs = rmods, depv="internet", data=train_dat)

## [1] 60.07696 57.51776 54.60051 52.88004 51.64883 50.64447 50.01740 49.11263
## [9] 48.49350 47.98257 47.59355
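The candidate logits can also be compared directly on AIC, which is what the table below reports for the last four models (a quick sketch):

# AIC across all candidate models returned by get_best(); smaller is better
sapply(best_m, AIC)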

stargazer(best_m[[8]],best_m[[9]],best_m[[10]],best_m[[11]], 
          title="Results", align=TRUE, type = "html",
          covariate.labels = c("Age:70-74","Age:75+", "White",
          "Ed:4th or less",
          "Ed:5th or 6th",
          "Ed:10th",
          "Ed:12th (ND)","Ed:HS",
          "OK state", "Trust people:MT",
          "Married", "Occ:Permanently disabled")
)
Results
Dependent variable: internet
(1) (2) (3) (4)
Age:70-74 -2.109*** -2.127*** -2.250*** -2.066***
(0.490) (0.514) (0.513) (0.500)
Age:75+ -2.932*** -2.853*** -2.778*** -2.371***
(0.406) (0.413) (0.422) (0.382)
White 1.058*** 1.019*** 1.000***
(0.341) (0.346) (0.350)
Ed:4th or less -16.810 -17.555 -17.883 -18.644
(943.634) (900.702) (907.455) (918.995)
Ed:5th or 6th -2.328** -2.566** -2.927*** -3.542***
(0.981) (1.036) (1.049) (1.067)
Ed:10th -1.897**
(0.838)
Ed:12th (ND) -1.931*** -2.044*** -2.378*** -2.453***
(0.486) (0.501) (0.522) (0.532)
Ed:HS -0.918*** -1.020***
(0.331) (0.336)
OK state -1.216*
(0.697)
Trust people:MT 1.605*** 1.487*** 1.368*** 1.393***
(0.388) (0.396) (0.400) (0.399)
Married 1.119*** 1.145*** 1.193***
(0.369) (0.373) (0.377)
Occ:Permanently disabled -1.763*** -1.600*** -1.541*** -1.514***
(0.496) (0.496) (0.506) (0.496)
Constant 2.208*** 1.922*** 2.267*** 2.895***
(0.256) (0.265) (0.304) (0.293)
Observations 708 708 708 708
Log Likelihood -162.926 -157.641 -153.892 -154.642
Akaike Inf. Crit. 343.851 335.283 329.783 333.283
Note: *p<0.1; **p<0.05; ***p<0.01
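The prompt also allows a regularization tool. As an alternative to the forward search above, a lasso logit could be fit with glmnet (a minimal sketch, assuming the X matrix and y vector built earlier; glmnet is not used elsewhere in this document):

library(glmnet)
set.seed(3333)
cv_lasso <- cv.glmnet(x = X, y = y,
                      family = "binomial",  # logit loss for the 0/1 outcome
                      alpha = 1,            # alpha = 1 gives the lasso penalty
                      nfolds = 10)
coef(cv_lasso, s = "lambda.min")            # coefficients kept at the CV-chosen penalty
p_lasso <- predict(cv_lasso, newx = X, s = "lambda.min", type = "response")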
  3. Using the tools from class - MARS, Polywog (if it doesn’t take too long), CART, Random Forest and Gradient Boosted Machines - generate predictions for the training data. Make sure to tune your models appropriately.
#### MARS ####

hyper_grid <- expand.grid(
  degree = 1:3, 
  nprune = 2:15
)

set.seed(3333)  # for reproducibility
cv_mars <- train(
  x = X,
  y = y,
  method = "earth",
  metric = "RMSE",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = hyper_grid
)

# cv_mars$bestTune
# cv_mars$results %>%
#   filter(nprune == cv_mars$bestTune$nprune, degree == cv_mars$bestTune$degree)

# ggplot(cv_mars)
# 
# cv_mars$finalModel %>%
#   summary()
# 
# cv_mars$finalModel %>%
#   fitted()


# variable importance plots
# dotPlot(varImp(cv_mars))


#### Polywog ####

p <- train_dat %>%
  select(starts_with(c('age','gender',
                       'race','education','region',
                       'income',
                       'trust_peple',
                       'gov_waste',
                       'unemp',
                       'marital',
                       'Pres_for_re',
                       'occup')), 'internet' )

poly <-polywog(internet ~ ., data=p, degree=2, family = "binomial")

# summary(poly)$coefficients %>% 
#   as_tibble(rownames = "variable") %>% 
#   filter(Estimate != 0)


#### CART ####
library(rpart)
library(dplyr)
library(car)


bp <- #recipe(int_home~.,data=R) %>% 
  recipe(internet~.,data=R) %>% 
  step_nzv(all_predictors())  %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  # step_other(all_nominal(), threshold = 0.01,
  #            other = "other") %>%
  step_unknown(all_nominal()) #%>%
  # step_dummy(all_nominal(), one_hot = FALSE)

train_dat1 <- prep(bp, training = R) %>% 
  bake(., new_data=R)

names(train_dat1)<-str_replace_all(names(train_dat1),"[.]","_")

# again prep on the training split only before baking the test split
test_dat1 <- prep(bp, training = R) %>% 
  bake(., new_data=S)

names(test_dat1)<-str_replace_all(names(test_dat1),"[.]","_")

crt <- train_dat1 %>%
  select(starts_with(c('age','gender',
                       'race','education','region',
                       'income',
                       'trust_peple',
                       'gov_waste',
                       'unemp',
                       'marital',
                       'Pres_for_re',
                       'occup')), 'internet' )

# summary(cart)

# caret cross validation results
cart_cv <- train(
  internet ~ .,
  data = crt,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20
)

# cart_cv$bestTune$cp

# library(vip)
# vip(cart_cv)
# 
# dotPlot(varImp(cart_cv))
# 
# ggplot(cart_cv)

cart <- rpart(internet ~ ., data=crt, cp =cart_cv$bestTune$cp)
# cart
# 
# plot(cart)
# text(cart)
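For a more readable tree than the base plot()/text() pair, the rpart.plot package could be used (an optional sketch, assuming rpart.plot is installed; it is not used elsewhere here):

library(rpart.plot)
rpart.plot(cart)   # draws the pruned tree with split rules and node predictions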
#Initialize
h2o.no_progress()
h2o.init(max_mem_size = "5g")
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 hours 41 minutes 
##     H2O cluster timezone:       America/Toronto 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.0.1 
##     H2O cluster version age:    1 month and 11 days  
##     H2O cluster name:           H2O_started_from_R_Hans_jtn719 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.51 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.3 (2020-10-10)
#Convert data to h2o
h2o_train<-as.h2o(crt)

# set the response column
response <- "internet"

# set the predictor names
predictors <- setdiff(colnames(crt), response)

# number of features
n_features <- length(setdiff(colnames(crt), response))
#### Random forest ####

# hyperparameter grid
hyper_grid <- list(
  mtries = floor(n_features * c(.05, .15, .25, .333, .4)),
  min_rows = c(1, 3, 5, 10),
  max_depth = c(10, 20, 30),
  sample_rate = c(.55, .632, .70, .80)
)

# random grid search strategy
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001,   # stop if improvement is < 0.1%
  stopping_rounds = 10,         # over the last 10 models
  max_runtime_secs = 60*3     # or stop search after 3 min.
)

# perform grid search 
random_grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_random_grid_1",
  x = predictors, 
  y = response, 
  training_frame = h2o_train,
  hyper_params = hyper_grid,
  ntrees = n_features * 5,
  seed = 1333,
  stopping_metric = "RMSE",   
  stopping_rounds = 10,           # stop if last 10 trees added 
  stopping_tolerance = 0.005,     # don't improve RMSE by 0.5%
  search_criteria = search_criteria
)

# random_grid

# collect the results and sort by our model performance metric 
# of choice
random_grid_perf <- h2o.getGrid(
  grid_id = "rf_random_grid_1", 
  sort_by = "mse", 
  decreasing = FALSE
)
# random_grid_perf
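Rather than copying the winning settings by hand, they can be pulled straight from the sorted grid; a sketch (presumably where the values hard-coded in the later h2o.randomForest() call came from):

best_rf_id <- random_grid_perf@model_ids[[1]]   # top model after sorting by MSE
best_rf    <- h2o.getModel(best_rf_id)
best_rf@parameters[c("mtries", "min_rows", "max_depth", "sample_rate")]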



# h2o_rf2
# 
# vip(h2o_rf2)
# 
# as.vector(predict(h2o_rf2,h2o_train)$predict)

#### GBM ####
# h2o.no_progress()
# h2o.init(max_mem_size = "5g")


hyper_grid <- list(
  max_depth=1:7,
  learn_rate = c(.001, .01, .1, .25),
  sample_rate = c(.5, .7, .9, 1), 
  col_sample_rate = c(.5, .7, .9, 1),
  col_sample_rate_per_tree = c(.5, .7, .9, 1)
  )

search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001,   # stop if improvement is < 0.1%
  stopping_rounds = 10,         # over the last 10 models
  max_runtime_secs = 60*3      # or stop search after 3 min.
)

# perform grid search 
grid <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_random_grid_1",
  x = predictors, 
  y = response,
  training_frame = h2o_train,
  hyper_params = hyper_grid,
  ntrees = 600,
  # learn_rate = 0.01,
  # max_depth = 7,
  # min_rows = 5,
  nfolds = 10,
  stopping_metric = "RMSE",
  stopping_rounds = 10,
  stopping_tolerance = 0.005,
  search_criteria = search_criteria,
  seed = 3333
)

# grid
# collect the results and sort by our model performance metric of choice
grid_perf <- h2o.getGrid(
  grid_id = "gbm_random_grid_1", 
  sort_by = "mse", 
  decreasing = FALSE
)

# grid_perf

# Grab the model_id for the top model, chosen by cross validation error
# best_model_id <- grid_perf@model_ids[[1]]
# best_model <- h2o.getModel(best_model_id)
# 
# # Now let’s get performance metrics on the best model
# h2o.performance(model = best_model, xval = TRUE)


# g
# vip(g)
h2o_rf2 <- h2o.randomForest(
  x = predictors, 
  y = response,
  training_frame = h2o_train, 
  nfolds=10, 
  max_depth=30, 
  min_rows=5.0, 
  mtries=3, 
  sample_rate=.8, 
  ntrees = 2500,
  seed = 3333
)

g <- h2o.gbm(predictors, 
             response, 
             training_frame = h2o_train, 
             learn_rate = .1,
             max_depth=5,
             ntrees=100, 
             sample_rate=0.9,
             col_sample_rate = 0.5,
             col_sample_rate_per_tree = 0.7,
             nfolds=10)
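Both final models were fit with nfolds = 10, so their cross-validated performance can be inspected directly before turning to the held-out data (a sketch):

h2o.performance(h2o_rf2, xval = TRUE)   # cross-validated metrics for the random forest
h2o.performance(g, xval = TRUE)         # cross-validated metrics for the GBM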

Answer: In the training data, the GBM does best, as it has the highest R-squared and the lowest RMSE.

#### Comparison on training data ####

# load('h2o_mods.Rdata')

#Initialize
# h2o.no_progress()
# h2o.init(max_mem_size = "5g")

fit_training<- data.frame(
  lm = predict(best_m[[11]]),
  mars = as.vector(predict(cv_mars$finalModel)),
  poly = predict(poly),
  cart = as.vector(predict(cart_cv$finalModel)),
  rf = as.vector(predict(h2o_rf2,h2o_train)$predict),
  gbm = as.vector(predict(g,h2o_train)$predict)
)
  
# fit_training
# 
# cor(fit_training,y)^2
# 
# apply(fit_training,2,function(x)rmse(y,x))
# apply(fit_training,2,function(x)mse(y,x))


knitr::kable(data.frame(
  R2   = as.vector(cor(fit_training, y)^2),   # squared correlation of predictions with y
  rmse = apply(fit_training, 2, function(x) rmse(y, x)),
  mse  = apply(fit_training, 2, function(x) mse(y, x))
  ),
  caption = "Comparison table using training data."
)
Table: Comparison table using training data.

|     |        R2|      rmse|       mse|
|:----|---------:|---------:|---------:|
|lm   | 0.2134159| 2.8877887| 8.3393238|
|mars | 0.2241291| 0.2629183| 0.0691260|
|poly | 0.2135695| 2.6345286| 6.9407408|
|cart | 0.1082754| 0.2818652| 0.0794480|
|rf   | 0.8413258| 0.1562684| 0.0244198|
|gbm  | 0.9652475| 0.0637714| 0.0040668|
  4. Use the trained models to estimate test set predictions.
    • Which model does best?
    • How do the effects differ from the model you estimated in 2) above?

Answer: In the testing data, the GBM has the highest R-squared; however, the random forest has the lowest RMSE and MSE. One way to compare effects with the model from 2) is sketched after the testing-data table below.

#### Comparison on testing data ####

fit_test<- data.frame(
  lm = predict(best_m[[11]],test_dat),
  mars = as.vector(predict(cv_mars$finalModel, test_dat)),
  poly = predict(poly, test_dat),
  cart = as.vector(predict(cart, newdata =  test_dat1)),
  rf = as.vector(predict(h2o_rf2,as.h2o(test_dat1))$predict),
  gbm = as.vector(predict(g,as.h2o(test_dat1))$predict)
)

yt <- test_dat %>%
  select(internet) %>%
  pull

# fit_test

knitr::kable(data.frame(
  R2   = as.vector(cor(fit_test, yt)^2),   # squared correlation of predictions with yt
  rmse = apply(fit_test, 2, function(x) rmse(yt, x)),
  mse  = apply(fit_test, 2, function(x) mse(yt, x))
  ),
  caption = "Comparison table using testing data."
)
Table: Comparison table using testing data.

|     |        R2|      rmse|       mse|
|:----|---------:|---------:|---------:|
|lm   | 0.1847967| 2.6975930| 7.2770079|
|mars | 0.1294121| 0.2745340| 0.0753689|
|poly | 0.0957977| 2.6083647| 6.8035663|
|cart | 0.0146177| 0.3211841| 0.1031593|
|rf   | 0.2079776| 0.2603597| 0.0677872|
|gbm  | 0.1584355| 0.2710248| 0.0734544|
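For the second bullet, one way to compare effects is to put the ensembles' variable-importance rankings next to the coefficients of the logit chosen in 2); a sketch (note that h2o.varimp() gives relative importance, not signed or interpretable effect sizes):

# signed, directly interpretable effects from the selected logit
summary(best_m[[11]])$coefficients
# relative-importance rankings from the tree ensembles (no sign, no functional form)
head(h2o.varimp(h2o_rf2), 10)
head(h2o.varimp(g), 10)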