Loading the relevant packages and the training data:
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, tidymodels, kableExtra, glmnet, ranger, vip, xgboost, stacks)
PSID <- read.csv("train.csv") %>%
as_tibble() %>%
select(-farmer) # removed because there are no occurrences in the data
Most of the data exploration is not documented here; a few objects are created below for my own reference. The following block creates a tibble with summary statistics of the variables, and a correlation matrix:
summary_stats <- tibble(mean = sapply(PSID, mean),
sd = sapply(PSID, sd))
# Full pairwise correlation matrix (equivalent to building it column by
# column with sapply() and cbind(), but in one idiomatic call)
correlations <- cor(PSID)
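For example, the variables most correlated with the outcome can be listed directly (a quick look; the first entry, lnwage itself, is skipped):
# Strongest absolute correlations with lnwage
sort(abs(correlations[, "lnwage"]), decreasing = TRUE)[2:11]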
The next block creates a matrix of average lnwage and the standard deviation of lnwage by occupation, alongside each group’s size, average schooling, and proportion with higher education:
# First creating the standard-deviation column separately
# (the occupation columns are 0/1 dummies, so masking the zeros as NA
# leaves each occupation's own wages)
temp <- PSID[, 18:38] * PSID$lnwage
temp[temp == 0] <- NA
temp <- as.matrix(sapply(temp, function(x) sd(x, na.rm = TRUE)))
# Now creating the rest and joining them
by_occupation <- cbind(
as.matrix(PSID[,18:38] %>% sapply(function(x) sum(PSID$lnwage * x) / sum(x))),
temp,
as.matrix(PSID[,18:38] %>% sapply(function(x) sum(x))),
as.matrix(PSID[,18:38] %>% sapply(function(x) sum(PSID$edyrs * x) / sum(x))),
as.matrix(PSID[,18:38] %>% sapply(function(x) sum((PSID$colldeg+PSID$advdeg) * x) / sum(x)))
)
rm(temp)
colnames(by_occupation) <- c("m_wage", "v_wage", "n", "m_ed", "p_coll")
For example, we can see that there are only three lawyer-physicians, and that artists and scientists are also few in number with large wage variance, so these groups may not be very indicative of their out-of-sample counterparts’ wages. We can also recognize some profession-type groupings we could make – for example, low-skill, medium-skill, and high-skill professions. Additionally, around a tenth of the sample are classified as managers, but they vary considerably in wages and education.
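The thinnest of these cells can be listed directly (a quick diagnostic sketch, not part of the pipeline):
# Smallest occupation groups, with their wage dispersion
as_tibble(by_occupation, rownames = "occupation") %>%
  arrange(n) %>%
  head(5)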
Next, I look for heterogeneity in wage differences with respect to a third variable (for example, the following block shows that the gender wage gap is larger for less educated people).
# After summarize() the rows are grouped by edyrs, with the male (female == 0)
# row preceding the female row, so lag(lnwage) - lnwage is the male-female gap
h_female <- PSID %>% group_by(edyrs, female) %>%
  summarize(lnwage = mean(lnwage), n = n()) %>%
  filter(edyrs > 5) %>%
  mutate(wagediff = lag(lnwage, 1) - lnwage) %>%
  filter(female == 1)
ggplot(h_female, aes(edyrs, wagediff)) + geom_path()
Here I split the sample 80/20 into training and test sets, as a first step for my own reference (I use the full training set when submitting my estimates), and create five cross-validation folds.
I also add some features that the data exploration suggests may improve predictive ability.
seed <- 2011
set.seed(seed)
# Adding various features constructed from the data
PSID1 <- PSID %>% mutate(edfemale = as.numeric(edyrs < 14) * female,
lowskill = healthsupport + building + foodcare + transport + production + constructextractinstall + protective + officeadmin + sales,
medskill = healthcare + business + architect + socialworker,
highskill = financialop + computer + legaleduc + postseceduc + lawyerphysician,
white = 1 - (black + hisp + otherrace),
managerskill = manager * edyrs,
productionf = production * female,
officef = officeadmin * female)
# Split
split <- PSID1 %>%
initial_split(prop = 0.8)
train <- training(split)
test <- testing(split)
folds <- train %>% vfold_cv(v = 5)
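One optional refinement, not used here, is to stratify the split on the outcome so the training and test lnwage distributions match more closely; rsample supports this directly:
# Hedged alternative: outcome-stratified 80/20 split
# split <- PSID1 %>% initial_split(prop = 0.8, strata = lnwage)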
Specifying the models that will be used (lasso, ridge, and elastic net with \(\alpha=0.5\)):
lasso_mod <- linear_reg() %>%
set_args(penalty = tune(), mixture = 1) %>%
set_engine("glmnet") %>%
set_mode("regression")
ridge_mod <- linear_reg() %>%
set_args(penalty = tune(), mixture = 0) %>%
set_engine("glmnet") %>%
set_mode("regression")
en0.5_mod <- linear_reg() %>%
set_args(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet") %>%
set_mode("regression")
Specifying a recipe to be used for all three models (using the training subset; should be changed before submission):
recipe <- recipe(lnwage ~ ., data = train) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_zv(all_predictors()) %>%
step_rm(ID) # ID is almost certainly uninformative
Specifying workflows for each model:
lasso_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(lasso_mod)
ridge_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(ridge_mod)
en0.5_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(en0.5_mod)
Tuning lambda in each model and selecting the penalty with the lowest cross-validated RMSE (despite their names, the *_1se objects hold the minimum-RMSE penalty, not the 1se one):
lasso_1se <- lasso_wf %>%
tune_grid(resamples = folds) %>%
select_best(metric = "rmse") %>%
select(penalty)
ridge_1se <- ridge_wf %>%
tune_grid(resamples = folds) %>%
select_best(metric = "rmse") %>%
select(penalty)
en0.5_1se <- en0.5_wf %>%
tune_grid(resamples = folds) %>%
select_best(metric = "rmse") %>%
select(penalty)
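If the one-standard-error rule is actually desired (the most regularized model whose CV error is within one standard error of the minimum), tune provides select_by_one_std_err(); a sketch, not used downstream:
# 1se rule: prefer the largest penalty within one SE of the minimum RMSE
lasso_1se_alt <- lasso_wf %>%
  tune_grid(resamples = folds) %>%
  select_by_one_std_err(desc(penalty), metric = "rmse") %>%
  select(penalty)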
Finalizing the models, recipes and workflows:
# Models
lasso_mod_final <- lasso_mod %>%
finalize_model(lasso_1se)
ridge_mod_final <- ridge_mod %>%
finalize_model(ridge_1se)
en0.5_mod_final <- en0.5_mod %>%
finalize_model(en0.5_1se)
# Recipes
lasso_rec_final <- recipe %>%
finalize_recipe(lasso_1se)
ridge_rec_final <- recipe %>%
finalize_recipe(ridge_1se)
en0.5_rec_final <- recipe %>%
finalize_recipe(en0.5_1se)
# Workflows
lasso_wf_final <- lasso_wf %>%
finalize_workflow(lasso_1se)
ridge_wf_final <- ridge_wf %>%
finalize_workflow(ridge_1se)
en0.5_wf_final <- en0.5_wf %>%
finalize_workflow(en0.5_1se)
Applying the recipes to the train and test set:
lasso_juice <- lasso_rec_final %>%
prep() %>%
juice()
ridge_juice <- ridge_rec_final %>%
prep() %>%
juice()
en0.5_juice <- en0.5_rec_final %>%
prep() %>%
juice()
lasso_bake <- lasso_rec_final %>%
prep() %>%
bake(new_data = test)
ridge_bake <- ridge_rec_final %>%
prep() %>%
bake(new_data = test)
en0.5_bake <- en0.5_rec_final %>%
prep() %>%
bake(new_data = test)
Fitting the models to the training sets and creating predictions for the train and test sets:
# In-sample predictions
lasso_train_pred <- lasso_mod_final %>%
fit(lnwage ~ ., data = lasso_juice) %>%
predict(new_data = lasso_juice) %>%
rename(lasso_train_pred = .pred)
ridge_train_pred <- ridge_mod_final %>%
fit(lnwage ~ ., data = ridge_juice) %>%
predict(new_data = ridge_juice) %>%
rename(ridge_train_pred = .pred)
en0.5_train_pred <- en0.5_mod_final %>%
fit(lnwage ~ ., data = en0.5_juice) %>%
predict(new_data = en0.5_juice) %>%
rename(en0.5_train_pred = .pred)
reg_train_preds <- bind_cols(train, lasso_train_pred, ridge_train_pred, en0.5_train_pred) %>%
select(ID, lnwage, lasso_train_pred, ridge_train_pred, en0.5_train_pred) %>%
mutate(
avg_regularized_pred = 0.85 * lasso_train_pred + 0.05 * ridge_train_pred + 0.1 * en0.5_train_pred,
lasso_sqerror = (lnwage - lasso_train_pred)^2,
ridge_sqerror = (lnwage - ridge_train_pred)^2,
en0.5_sqerror = (lnwage - en0.5_train_pred)^2,
avg_reg_sqerror = (lnwage - avg_regularized_pred)^2
)
# MSE (mean of each squared-error column)
sapply(reg_train_preds, mean)[7:10]
## lasso_sqerror ridge_sqerror en0.5_sqerror avg_reg_sqerror
## 0.1393844 0.1405281 0.1407558 0.1395127
# Out-of-sample predictions
lasso_test_pred <- lasso_mod_final %>%
fit(lnwage ~ ., data = lasso_juice) %>%
predict(new_data = lasso_bake) %>%
rename(lasso_test_pred = .pred)
ridge_test_pred <- ridge_mod_final %>%
fit(lnwage ~ ., data = ridge_juice) %>%
predict(new_data = ridge_bake) %>%
rename(ridge_test_pred = .pred)
en0.5_test_pred <- en0.5_mod_final %>%
fit(lnwage ~ ., data = en0.5_juice) %>%
predict(new_data = en0.5_bake) %>%
rename(en0.5_test_pred = .pred)
reg_test_preds <- bind_cols(test, lasso_test_pred, ridge_test_pred, en0.5_test_pred) %>%
select(ID, lnwage, lasso_test_pred, ridge_test_pred, en0.5_test_pred) %>%
mutate(
avg_regularized_pred = 0.85 * lasso_test_pred + 0.05 * ridge_test_pred + 0.1 * en0.5_test_pred,
lasso_sqerror = (lnwage - lasso_test_pred)^2,
ridge_sqerror = (lnwage - ridge_test_pred)^2,
en0.5_sqerror = (lnwage - en0.5_test_pred)^2,
avg_reg_sqerror = (lnwage - avg_regularized_pred)^2
)
# MSE (mean of each squared-error column)
sapply(reg_test_preds, mean)[7:10]
## lasso_sqerror ridge_sqerror en0.5_sqerror avg_reg_sqerror
## 0.1593286 0.1599706 0.1605919 0.1594132
Note that since lasso does especially well at prediction – particularly in this case, where many of the variables contribute little to the regression – I give it the majority of the weight in the averaged prediction.
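The 0.85/0.05/0.10 weights were chosen by hand. As a rough alternative (a sketch only, not part of the pipeline), one could grid-search convex weights that minimize the training MSE:
# Hedged sketch: search convex weight combinations over the three models
weight_grid <- expand.grid(w_lasso = seq(0, 1, 0.05),
                           w_ridge = seq(0, 1, 0.05)) %>%
  filter(w_lasso + w_ridge <= 1 + 1e-9) %>% # tolerance for floating point
  mutate(w_en = 1 - w_lasso - w_ridge)
weight_grid$mse <- apply(weight_grid[, c("w_lasso", "w_ridge", "w_en")], 1, function(w) {
  pred <- w[1] * reg_train_preds$lasso_train_pred +
    w[2] * reg_train_preds$ridge_train_pred +
    w[3] * reg_train_preds$en0.5_train_pred
  mean((reg_train_preds$lnwage - pred)^2)
})
weight_grid[which.min(weight_grid$mse), ]
Minimizing in-sample error risks overfitting the weights; out-of-fold predictions would be a safer objective, which is essentially what the stacking section below does.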
Creating the RF object, for now using only the smaller training set (note that I leave out the interaction variables constructed earlier so that the trees can generate their own interactions):
set.seed(seed)
rf_fit <- ranger(
formula = lnwage ~ . - ID - edfemale - managerskill - productionf - officef,
data = train,
#mtry = default (square root of number of variables rounded down)
num.trees = 1000,
importance = "impurity"
)
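Since the fit stores impurity importances and vip is already loaded, we can glance at which features drive the forest (a diagnostic only, not used downstream):
# Top features by impurity importance
vip(rf_fit, num_features = 15)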
Predicting training and test set values:
# In sample predictions
rf_train_pred <- as_tibble(predict(rf_fit, data = train)$predictions) %>%
rename(rf_train_pred = value) %>%
bind_cols(train) %>%
select(ID, lnwage, rf_train_pred) %>%
mutate(rf_sqerror = (lnwage - rf_train_pred)^2)
mean(rf_train_pred$rf_sqerror)
## [1] 0.06667919
# Out of sample predictions
rf_test_pred <- as_tibble(predict(rf_fit, data = test)$predictions) %>%
rename(rf_test_pred = value) %>%
bind_cols(test) %>%
select(ID, lnwage, rf_test_pred) %>%
mutate(rf_sqerror = (lnwage - rf_test_pred)^2)
mean(rf_test_pred$rf_sqerror)
## [1] 0.1649844
The out-of-sample MSE is quite similar to that obtained using the regularized regressions (as verified over several seeds).
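As a further cross-check, ranger reports an out-of-bag error alongside the fit; for regression forests this is an MSE, and it avoids the optimism of predicting rows the trees have already seen:
# OOB MSE: an honest error estimate without a held-out set
rf_fit$prediction.error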
The implementation of gradient boosting here closely follows the tutorial at http://uc-r.github.io/gbm_regression.
We begin by tuning the hyperparameters:
# Creating a hyperparameter grid
hyper_grid <- expand.grid(
eta = c(.01, .05, .1, .3),
max_depth = c(1, 3, 5, 7),
min_child_weight = c(1, 3, 5, 7),
subsample = c(.65, .8, 1),
colsample_bytree = c(.8, .9, 1),
optimal_trees = 0, # will store results here
min_rmse = 0 # will store results here
)
# Grid search over all 576 parameter combinations (computationally heavy)
for (i in 1:nrow(hyper_grid)){
set.seed(seed)
# Training the models
xgb_tune <- xgb.cv(
data = as.matrix(train %>% select(-c("ID", "lnwage"))),
label = as.matrix(train$lnwage),
nrounds = 1000,
nfold = 5,
metrics = "rmse",
verbose = 0,
early_stopping_rounds = 10, # stop if no improvement for 10 consecutive rounds
params = list(
eta = hyper_grid$eta[i],
max_depth = hyper_grid$max_depth[i],
min_child_weight = hyper_grid$min_child_weight[i],
subsample = hyper_grid$subsample[i],
colsample_bytree = hyper_grid$colsample_bytree[i]
)
)
# Adding the error and tree number to the grid
hyper_grid$optimal_trees[i] <- which.min(xgb_tune$evaluation_log$test_rmse_mean)
hyper_grid$min_rmse[i] <- min(xgb_tune$evaluation_log$test_rmse_mean)
}
# Taking the parameters with the lowest rmse, and fitting the model with them
min_rmse <- which.min(hyper_grid$min_rmse)
set.seed(seed)
xgb_fit_tuned <- xgboost(
data = as.matrix(train %>% select(-c("ID", "lnwage"))),
label = as.matrix(train$lnwage),
nrounds = hyper_grid$optimal_trees[min_rmse],
verbose = 0,
early_stopping_rounds = 10,
params = list(
eta = hyper_grid$eta[min_rmse],
max_depth = hyper_grid$max_depth[min_rmse],
min_child_weight = hyper_grid$min_child_weight[min_rmse],
subsample = hyper_grid$subsample[min_rmse],
colsample_bytree = hyper_grid$colsample_bytree[min_rmse]
)
)
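As with the forest, we can inspect which features the tuned booster relies on (a diagnostic only):
# Gain-based feature importance of the tuned model
xgb.importance(model = xgb_fit_tuned) %>% head(10)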
We can now use our tuned model to generate predictions:
# Predicting in sample
xgb_train_pred <- as_tibble(predict(xgb_fit_tuned, as.matrix(train %>% select(-c("ID", "lnwage"))))) %>%
rename(xgb_pred = value) %>%
bind_cols(train) %>%
select(ID, lnwage, xgb_pred) %>%
mutate(xgb_sqerror = (lnwage - xgb_pred)^2)
mean(xgb_train_pred$xgb_sqerror)
## [1] 0.1415992
# Predicting out of sample
xgb_test_pred <- as_tibble(predict(xgb_fit_tuned, as.matrix(test %>% select(-c("ID", "lnwage"))))) %>%
rename(xgb_pred = value) %>%
bind_cols(test) %>%
select(ID, lnwage, xgb_pred) %>%
mutate(xgb_sqerror = (lnwage - xgb_pred)^2)
mean(xgb_test_pred$xgb_sqerror)
## [1] 0.1622138
Over several iterations, boosting does very slightly better than the previous methods on average, but the difference does not appear significant.
Finally, averaging the regularized, random forest, and boosting predictions:
# In sample predictions
train_preds <- left_join(
reg_train_preds,
rf_train_pred
) %>%
left_join(
xgb_train_pred
) %>%
mutate(
avg_pred = (avg_regularized_pred + rf_train_pred + xgb_pred)/3,
avg_sqerror = (lnwage - avg_pred)^2
)
sapply(train_preds, mean)[c(7:10, 12, 14, 16)]
## lasso_sqerror ridge_sqerror en0.5_sqerror avg_reg_sqerror rf_sqerror
## 0.13938438 0.14052813 0.14075584 0.13951265 0.06667919
## xgb_sqerror avg_sqerror
## 0.14159916 0.10993625
# Out of sample predictions
test_preds <- left_join(
reg_test_preds,
rf_test_pred
) %>%
left_join(
xgb_test_pred
) %>%
mutate(
avg_pred = (avg_regularized_pred + rf_test_pred + xgb_pred)/3,
avg_sqerror = (lnwage - avg_pred)^2
)
sapply(test_preds, mean)[c(7:10, 12, 14, 16)]
## lasso_sqerror ridge_sqerror en0.5_sqerror avg_reg_sqerror rf_sqerror
## 0.1593286 0.1599706 0.1605919 0.1594132 0.1649844
## xgb_sqerror avg_sqerror
## 0.1622138 0.1578668
Stacking requires creating fresh specifications of the models above so that they can be tuned by the stacks functions; stacks collects each candidate's out-of-fold predictions and blends them with a regularized (non-negative) meta-learner.
The application here loosely follows the tutorials at https://stacks.tidymodels.org/articles/basics.html, https://www.hfshr.xyz/posts/2020-11-30-model-stacking/, and https://juliasilge.com/blog/xgboost-tune-volleyball/.
# Specifying the general recipe
general_rec <- recipe(lnwage ~ ., data = train) %>%
step_rm(ID)
### Lasso
# Regression model definition
lasso_spec <- linear_reg() %>%
set_args(penalty = tune(), mixture = 1) %>%
set_engine("glmnet") %>%
set_mode("regression")
# Regression recipe extension (normalizing the predictors; glmnet also
# standardizes internally by default)
reg_rec <- general_rec %>%
  step_normalize(all_predictors())
# Lasso workflow (adding model and recipe)
lasso_wf <- workflow() %>%
  add_model(lasso_spec) %>%
  add_recipe(reg_rec)
# Tuning the penalty, and fitting to the 5-fold cv
set.seed(seed)
lasso_res <- lasso_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(penalty(), size = 20),
control = control_stack_grid()
)
### Ridge
# Regression model definition
ridge_spec <- linear_reg() %>%
set_args(penalty = tune(), mixture = 0) %>%
set_engine("glmnet") %>%
set_mode("regression")
# Ridge workflow (adding model and recipe)
ridge_wf <- workflow() %>%
add_model(ridge_spec) %>%
add_recipe(reg_rec)
# Tuning the penalty, and fitting to the 5-fold cv
set.seed(seed)
ridge_res <- ridge_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(penalty(), size = 20),
control = control_stack_grid()
)
### Random Forest
# Random forest model definition
rf_spec <- rand_forest(
mtry = tune(),
min_n = tune(),
trees = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger")
# Random forest recipe extension
rf_rec <- general_rec %>%
step_rm(edfemale, managerskill, productionf, officef)
# Random forest workflow
rf_wf <- workflow() %>%
add_model(rf_spec) %>%
add_recipe(rf_rec)
# Tuning parameters, and fitting to the 5-fold cv
set.seed(seed)
rf_res <- rf_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(
mtry(range = c(1L, 10L)),
min_n(),
trees(range = c(500L, 1500L)),
size = 50
),
control = control_stack_grid()
)
### Gradient Boosting
# Boosting model definition
xgb_spec <- boost_tree(
mtry = tune(),
min_n = tune(),
trees = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune(),
sample_size = tune()
) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Boosting workflow
xgb_wf <- workflow() %>%
add_model(xgb_spec) %>%
add_recipe(general_rec)
# Tuning parameters, and fitting to the 5-fold cv
set.seed(seed)
xgb_res <- xgb_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(
tree_depth(),
min_n(),
loss_reduction(),
sample_size = sample_prop(range = c(0.65, 1)),
mtry(range = c(35L, 44L)),
trees(range = c(500L, 1500L)),
learn_rate(),
size = 100
),
control = control_stack_grid()
)
### Neural Network
# NN model definition
nnet_spec <- mlp(
hidden_units = tune(),
penalty = tune(),
epochs = tune()
) %>%
set_engine("nnet") %>%
set_mode("regression")
# NN recipe extension
nnet_rec <- general_rec %>%
step_normalize(all_predictors()) %>%
step_corr(all_predictors())
# NN workflow (adding model and recipe)
nnet_wf <- workflow() %>%
add_model(nnet_spec) %>%
add_recipe(nnet_rec)
# Tuning the network hyperparameters, and fitting to the 5-fold cv
set.seed(seed)
nnet_res <- nnet_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(
penalty(),
hidden_units(),
epochs(),
size = 100
),
control = control_stack_grid()
)
Now we can stack the workflows and blend their predictions:
stack <- stacks() %>%
add_candidates(lasso_res) %>%
add_candidates(ridge_res) %>%
add_candidates(rf_res) %>%
add_candidates(xgb_res) %>%
add_candidates(nnet_res) %>%
blend_predictions() %>%
fit_members()
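Before predicting, it is worth checking which candidates received non-zero blending weights; stacks provides an autoplot method for this (an optional diagnostic):
# Candidate coefficients chosen by blend_predictions()
autoplot(stack, type = "weights")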
And then generate predictions:
# In sample
stack_train_pred <- train %>% bind_cols(predict(stack, new_data = train)) %>%
rename(stack_pred = .pred) %>%
select(ID, lnwage, stack_pred) %>%
mutate(stack_sqerror = (lnwage - stack_pred)^2)
mean(stack_train_pred$stack_sqerror)
## [1] 0.1052997
# Out of sample
stack_test_pred <- test %>% bind_cols(predict(stack, new_data = test)) %>%
rename(stack_pred = .pred) %>%
select(ID, lnwage, stack_pred) %>%
mutate(stack_sqerror = (lnwage - stack_pred)^2)
mean(stack_test_pred$stack_sqerror)
## [1] 0.159528
The stacked results tend to be very slightly better on average than my manual averaging above.
We now apply the methods to the full training set in order to generate predictions for the held-out test set, which we load (and augment with the same engineered features) below:
PSID_test <- read.csv("test.csv") %>%
as_tibble() %>%
select(-farmer) %>%
mutate(edfemale = as.numeric(edyrs < 14) * female,
lowskill = healthsupport + building + foodcare + transport + production + constructextractinstall + protective + officeadmin + sales,
medskill = healthcare + business + architect + socialworker,
highskill = financialop + computer + legaleduc + postseceduc + lawyerphysician,
white = 1 - (black + hisp + otherrace),
managerskill = manager * edyrs,
productionf = production * female,
officef = officeadmin * female)
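Because this mutate() must stay identical to the one applied to the training data, a safer pattern (a hypothetical refactor, not used in this script) is a single helper applied to both sets:
# Hypothetical helper to keep train/test feature engineering in sync
add_features <- function(df) {
  df %>% mutate(
    edfemale = as.numeric(edyrs < 14) * female,
    lowskill = healthsupport + building + foodcare + transport + production +
      constructextractinstall + protective + officeadmin + sales,
    medskill = healthcare + business + architect + socialworker,
    highskill = financialop + computer + legaleduc + postseceduc + lawyerphysician,
    white = 1 - (black + hisp + otherrace),
    managerskill = manager * edyrs,
    productionf = production * female,
    officef = officeadmin * female
  )
}
# e.g. PSID1 <- add_features(PSID); PSID_test <- add_features(raw_test),
# where raw_test is the freshly read test.csv (hypothetical name)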
Specifying a recipe to be used for all three models, this time with the full training set:
recipe <- recipe(lnwage ~ ., data = PSID1) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_zv(all_predictors()) %>%
step_rm(ID) # ID is almost certainly uninformative
Specifying workflows for each model:
lasso_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(lasso_mod)
ridge_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(ridge_mod)
en0.5_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(en0.5_mod)
Tuning lambda in each model and again selecting the lowest-RMSE penalty, this time with folds built from the full training set:
folds <- PSID1 %>% vfold_cv(v = 5)
lasso_1se <- lasso_wf %>%
tune_grid(resamples = folds) %>%
select_best(metric = "rmse") %>%
select(penalty)
ridge_1se <- ridge_wf %>%
tune_grid(resamples = folds) %>%
select_best(metric = "rmse") %>%
select(penalty)
en0.5_1se <- en0.5_wf %>%
tune_grid(resamples = folds) %>%
select_best(metric = "rmse") %>%
select(penalty)
Finalizing the models, recipes and workflows:
# Models
lasso_mod_final <- lasso_mod %>%
finalize_model(lasso_1se)
ridge_mod_final <- ridge_mod %>%
finalize_model(ridge_1se)
en0.5_mod_final <- en0.5_mod %>%
finalize_model(en0.5_1se)
# Recipes
lasso_rec_final <- recipe %>%
finalize_recipe(lasso_1se)
ridge_rec_final <- recipe %>%
finalize_recipe(ridge_1se)
en0.5_rec_final <- recipe %>%
finalize_recipe(en0.5_1se)
# Workflows
lasso_wf_final <- lasso_wf %>%
finalize_workflow(lasso_1se)
ridge_wf_final <- ridge_wf %>%
finalize_workflow(ridge_1se)
en0.5_wf_final <- en0.5_wf %>%
finalize_workflow(en0.5_1se)
Applying the recipes to the full training set and the test set:
lasso_juice <- lasso_rec_final %>%
prep() %>%
juice()
ridge_juice <- ridge_rec_final %>%
prep() %>%
juice()
en0.5_juice <- en0.5_rec_final %>%
prep() %>%
juice()
lasso_bake <- lasso_rec_final %>%
prep() %>%
bake(new_data = PSID_test)
ridge_bake <- ridge_rec_final %>%
prep() %>%
bake(new_data = PSID_test)
en0.5_bake <- en0.5_rec_final %>%
prep() %>%
bake(new_data = PSID_test)
Fitting the models to the full training set and creating predictions for the training and test sets:
# In-sample predictions
lasso_train_pred <- lasso_mod_final %>%
fit(lnwage ~ ., data = lasso_juice) %>%
predict(new_data = lasso_juice) %>%
rename(lasso_train_pred = .pred)
ridge_train_pred <- ridge_mod_final %>%
fit(lnwage ~ ., data = ridge_juice) %>%
predict(new_data = ridge_juice) %>%
rename(ridge_train_pred = .pred)
en0.5_train_pred <- en0.5_mod_final %>%
fit(lnwage ~ ., data = en0.5_juice) %>%
predict(new_data = en0.5_juice) %>%
rename(en0.5_train_pred = .pred)
reg_train_preds <- bind_cols(PSID1, lasso_train_pred, ridge_train_pred, en0.5_train_pred) %>%
select(ID, lnwage, lasso_train_pred, ridge_train_pred, en0.5_train_pred) %>%
mutate(
avg_regularized_pred = 0.85 * lasso_train_pred + 0.05 * ridge_train_pred + 0.1 * en0.5_train_pred,
lasso_sqerror = (lnwage - lasso_train_pred)^2,
ridge_sqerror = (lnwage - ridge_train_pred)^2,
en0.5_sqerror = (lnwage - en0.5_train_pred)^2,
avg_reg_sqerror = (lnwage - avg_regularized_pred)^2
)
# MSE (mean of each squared-error column)
sapply(reg_train_preds, mean)[7:10]
## lasso_sqerror ridge_sqerror en0.5_sqerror avg_reg_sqerror
## 0.1421496 0.1430389 0.1420682 0.1421560
# Out-of-sample predictions
lasso_test_pred <- lasso_mod_final %>%
fit(lnwage ~ ., data = lasso_juice) %>%
predict(new_data = lasso_bake) %>%
rename(lasso_test_pred = .pred)
ridge_test_pred <- ridge_mod_final %>%
fit(lnwage ~ ., data = ridge_juice) %>%
predict(new_data = ridge_bake) %>%
rename(ridge_test_pred = .pred)
en0.5_test_pred <- en0.5_mod_final %>%
fit(lnwage ~ ., data = en0.5_juice) %>%
predict(new_data = en0.5_bake) %>%
rename(en0.5_test_pred = .pred)
reg_test_preds <- bind_cols(PSID_test, lasso_test_pred, ridge_test_pred, en0.5_test_pred) %>%
select(ID, lasso_test_pred, ridge_test_pred, en0.5_test_pred) %>%
mutate(avg_regularized_pred = 0.85 * lasso_test_pred + 0.05 * ridge_test_pred + 0.1 * en0.5_test_pred)
Creating the random forest model, now fit on the full training set:
set.seed(seed)
rf_fit <- ranger(
formula = lnwage ~ . -ID - edfemale -managerskill -productionf -officef,
data = PSID1,
#mtry = default (square root of number of variables rounded down)
num.trees = 1000,
importance = "impurity"
)
Predicting training and test set values:
# In sample predictions
rf_train_pred <- as_tibble(predict(rf_fit, data = PSID1)$predictions) %>%
rename(rf_train_pred = value) %>%
bind_cols(PSID1) %>%
select(ID, lnwage, rf_train_pred) %>%
mutate(rf_sqerror = (lnwage - rf_train_pred)^2)
mean(rf_train_pred$rf_sqerror)
## [1] 0.06854607
# Out of sample predictions
rf_test_pred <- as_tibble(predict(rf_fit, data = PSID_test)$predictions) %>%
rename(rf_test_pred = value) %>%
bind_cols(PSID_test) %>%
select(ID, rf_test_pred)
We repeat the calculations, this time for the full training set:
# Creating a hyperparameter grid
hyper_grid_final <- expand.grid(
eta = c(.01, .05, .1, .3),
max_depth = c(1, 3, 5, 7),
min_child_weight = c(1, 3, 5, 7),
subsample = c(.65, .8, 1),
colsample_bytree = c(.8, .9, 1),
optimal_trees = 0, # will store results here
min_rmse = 0 # will store results here
)
# Grid search over all 576 parameter combinations (computationally heavy)
for (i in 1:nrow(hyper_grid_final)){
set.seed(seed)
# Training the models
xgb_tune <- xgb.cv(
data = as.matrix(PSID1 %>% select(-c("ID", "lnwage"))),
label = as.matrix(PSID1$lnwage),
nrounds = 1000,
nfold = 5,
metrics = "rmse",
verbose = 0,
early_stopping_rounds = 10, # stop if no improvement for 10 consecutive rounds
params = list(
eta = hyper_grid_final$eta[i],
max_depth = hyper_grid_final$max_depth[i],
min_child_weight = hyper_grid_final$min_child_weight[i],
subsample = hyper_grid_final$subsample[i],
colsample_bytree = hyper_grid_final$colsample_bytree[i]
)
)
# Adding the error and tree number to the grid
hyper_grid_final$optimal_trees[i] <- which.min(xgb_tune$evaluation_log$test_rmse_mean)
hyper_grid_final$min_rmse[i] <- min(xgb_tune$evaluation_log$test_rmse_mean)
}
# Taking the parameters with the lowest rmse, and fitting the model with them
min_rmse <- which.min(hyper_grid_final$min_rmse)
set.seed(seed)
xgb_fit_final <- xgboost(
data = as.matrix(PSID1 %>% select(-c("ID", "lnwage"))),
label = as.matrix(PSID1$lnwage),
nrounds = hyper_grid_final$optimal_trees[min_rmse],
verbose = 0,
early_stopping_rounds = 10,
params = list(
eta = hyper_grid_final$eta[min_rmse],
max_depth = hyper_grid_final$max_depth[min_rmse],
min_child_weight = hyper_grid_final$min_child_weight[min_rmse],
subsample = hyper_grid_final$subsample[min_rmse],
colsample_bytree = hyper_grid_final$colsample_bytree[min_rmse]
)
)
We can now use our tuned model to generate predictions:
# Predicting in sample
xgb_train_pred <- as_tibble(predict(xgb_fit_final, as.matrix(PSID1 %>% select(-c("ID", "lnwage"))))) %>%
rename(xgb_pred = value) %>%
bind_cols(PSID1) %>%
select(ID, lnwage, xgb_pred) %>%
mutate(xgb_sqerror = (lnwage - xgb_pred)^2)
mean(xgb_train_pred$xgb_sqerror)
## [1] 0.1457015
# Predicting out of sample
xgb_test_pred <- as_tibble(predict(xgb_fit_final, as.matrix(PSID_test %>% select(-ID)))) %>%
rename(xgb_pred = value) %>%
bind_cols(PSID_test) %>%
select(ID, xgb_pred)
Averaging the predictions as before:
# In sample predictions
train_preds <- left_join(
reg_train_preds,
rf_train_pred
) %>%
left_join(
xgb_train_pred
) %>%
mutate(
avg_pred = (avg_regularized_pred + rf_train_pred + xgb_pred)/3,
avg_sqerror = (lnwage - avg_pred)^2
)
sapply(train_preds, mean)[c(7:10, 12, 14, 16)]
## lasso_sqerror ridge_sqerror en0.5_sqerror avg_reg_sqerror rf_sqerror
## 0.14214959 0.14303891 0.14206815 0.14215597 0.06854607
## xgb_sqerror avg_sqerror
## 0.14570148 0.11253873
# Out of sample predictions
test_preds <- left_join(
reg_test_preds,
rf_test_pred
) %>%
left_join(
xgb_test_pred
) %>%
mutate(
avg_pred = (avg_regularized_pred + rf_test_pred + xgb_pred)/3)
Exporting the predictions to csv:
#test_preds %>% select(ID, avg_pred) %>% rename(lnwage = avg_pred) %>%
# write.csv("Submission1.csv")
Finally, we repeat the stacking procedure, with candidates tuned on folds from the full training set:
# Specifying the general recipe
general_rec <- recipe(lnwage ~ ., data = PSID1) %>%
step_rm(ID)
### Lasso
# Regression model definition
lasso_spec <- linear_reg() %>%
set_args(penalty = tune(), mixture = 1) %>%
set_engine("glmnet") %>%
set_mode("regression")
# Regression recipe extension (normalizing the predictors; glmnet also
# standardizes internally by default)
reg_rec <- general_rec %>%
  step_normalize(all_predictors())
# Lasso workflow (adding model and recipe)
lasso_wf <- workflow() %>%
  add_model(lasso_spec) %>%
  add_recipe(reg_rec)
# Tuning the penalty, and fitting to the 5-fold cv
set.seed(seed)
lasso_res <- lasso_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(penalty(), size = 20),
control = control_stack_grid()
)
### Ridge
# Regression model definition
ridge_spec <- linear_reg() %>%
set_args(penalty = tune(), mixture = 0) %>%
set_engine("glmnet") %>%
set_mode("regression")
# Ridge workflow (adding model and recipe)
ridge_wf <- workflow() %>%
add_model(ridge_spec) %>%
add_recipe(reg_rec)
# Tuning the penalty, and fitting to the 5-fold cv
set.seed(seed)
ridge_res <- ridge_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(penalty(), size = 20),
control = control_stack_grid()
)
### Random Forest
# Random forest model definition
rf_spec <- rand_forest(
mtry = tune(),
min_n = tune(),
trees = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger")
# Random forest recipe extension
rf_rec <- general_rec %>%
step_rm(edfemale, managerskill, productionf, officef)
# Random forest workflow
rf_wf <- workflow() %>%
add_model(rf_spec) %>%
add_recipe(rf_rec)
# Tuning parameters, and fitting to the 5-fold cv
set.seed(seed)
rf_res <- rf_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(
mtry(range = c(1L, 10L)),
min_n(),
trees(range = c(500L, 1500L)),
size = 50
),
control = control_stack_grid()
)
### Gradient Boosting
# Boosting model definition
xgb_spec <- boost_tree(
mtry = tune(),
min_n = tune(),
trees = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune(),
sample_size = tune()
) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Boosting workflow
xgb_wf <- workflow() %>%
add_model(xgb_spec) %>%
add_recipe(general_rec)
# Tuning parameters, and fitting to the 5-fold cv
set.seed(seed)
xgb_res <- xgb_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(
tree_depth(),
min_n(),
loss_reduction(),
sample_size = sample_prop(range = c(0.65, 1)),
mtry(range = c(35L, 44L)),
trees(range = c(500L, 1500L)),
learn_rate(),
size = 100
),
control = control_stack_grid()
)
### Neural Network
# NN model definition
nnet_spec <- mlp(
hidden_units = tune(),
penalty = tune(),
epochs = tune()
) %>%
set_engine("nnet") %>%
set_mode("regression")
# NN recipe extension
nnet_rec <- general_rec %>%
step_normalize(all_predictors()) %>%
step_corr(all_predictors())
# NN workflow (adding model and recipe)
nnet_wf <- workflow() %>%
add_model(nnet_spec) %>%
add_recipe(nnet_rec)
# Tuning the network hyperparameters, and fitting to the 5-fold cv
set.seed(seed)
nnet_res <- nnet_wf %>%
tune_grid(
resamples = folds,
metrics = metric_set(rmse),
grid = grid_latin_hypercube(
penalty(),
hidden_units(),
epochs(),
size = 100
),
control = control_stack_grid()
)
Stacking the workflows and blending their predictions:
stack <- stacks() %>%
add_candidates(lasso_res) %>%
add_candidates(ridge_res) %>%
add_candidates(rf_res) %>%
add_candidates(xgb_res) %>%
add_candidates(nnet_res) %>%
blend_predictions() %>%
fit_members()
Generating predictions and exporting to csv:
# In sample
stack_train_pred <- PSID1 %>% bind_cols(predict(stack, new_data = PSID1)) %>%
rename(stack_pred = .pred) %>%
select(ID, lnwage, stack_pred) %>%
mutate(stack_sqerror = (lnwage - stack_pred)^2)
mean(stack_train_pred$stack_sqerror)
## [1] 0.1103744
# Out of sample
stack_test_pred <- PSID_test %>% bind_cols(predict(stack, new_data = PSID_test)) %>%
  rename(lnwage = .pred) %>%
  select(ID, lnwage)
# Depending on the required submission format, row.names = FALSE may be needed
write.csv(stack_test_pred, "Submission5.csv")