Load packages

Load packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  tidymodels,  # for data modeling
  plsmod,      # for estimating pls regression
  here         # for referencing files and folders
)

and set seed for replication

set.seed(1203)

Read data

browser <- here("05-regression-regularization/data","browser-sample.csv") %>% 
  read_csv()
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.

Sample spliting

Split the data to training and test sets, and the training set to 10 folds for cross-validation.

browser_split <- browser %>% 
  initial_split(prop = 0.75)

browser_train <- training(browser_split)
browser_test  <- testing(browser_split)

browser_folds <- browser_train %>% 
  vfold_cv(v = 5)

Lasso or ridge

Specify model recipe and workflow

Set mixture = 0 for ridge and mixture = 1 for lasso. penalty is \(\lambda\). We’ll go for ridge:

lm_spec <- linear_reg() %>% 
  set_args(penalty = tune(), mixture = 0, nlambda = 10) %>% 
  set_engine("glmnet") %>% 
  set_mode("regression")
browser_rec <- recipe(log_spend ~ .,data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())
lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)

Tune hyperparameter

lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds
  )

show best results

lm_results %>% 
  show_best(metric = "rmse")
## # A tibble: 5 x 6
##    penalty .metric .estimator  mean     n std_err
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1 7.33e- 1 rmse    standard    2.45     5   0.486
## 2 2.88e- 2 rmse    standard    4.68     5   2.23 
## 3 2.07e-10 rmse    standard    4.78     5   2.29 
## 4 3.36e- 9 rmse    standard    4.78     5   2.29 
## 5 4.63e- 8 rmse    standard    4.78     5   2.29

Set best lambda

Two options: lambda that minimizes RMSE, and the 1 standard error rule of thumb:

lambda_min <- lm_results %>% 
  select_best(metric = "rmse")

lambda_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(penalty)
  ) %>% 
  select(penalty)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(lambda_1se)
browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)
ridge_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "ridge")

PCR

Specify model recipe and workflow

Specify a linear model. Note that in this case, there are no tuning parameters.

lm_spec <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Note how the number of PCs is determined inside the recipe object.

browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_pca(all_predictors(), num_comp = tune())
lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)
lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

show best results

lm_results %>% 
  show_best(metric = "rmse")
## # A tibble: 5 x 6
##   num_comp .metric .estimator  mean     n std_err
##      <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1        1 rmse    standard    1.69     5  0.0396
## 2        3 rmse    standard    1.70     5  0.0330
## 3        6 rmse    standard    1.70     5  0.0356
## 4        5 rmse    standard    1.71     5  0.0348
## 5        4 rmse    standard    1.71     5  0.0340

Set best number of components

Two options: num_comp that minimizes RMSE, and the 1 standard error rule of thumb:

num_comp_min <- lm_results %>% 
  select_best(metric = "rmse")

num_comp_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(num_comp)
  ) %>% 
  select(num_comp)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(num_comp_1se)
browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)
pcr_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pcr")

PLS

To run tidymodels’ plsmod we’ll need to make sure we have installed the {mixOmics} package.

install.packages("BiocManager")
BiocManager::install("mixOmics")

Specify model recipe and workflow

Specify plsmod

pls_spec <- pls() %>% 
  set_args(num_comp = tune()) %>% 
  set_engine("mixOmics") %>% 
  set_mode("regression")
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())
pls_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(pls_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)
pls_results <- pls_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

show best results

pls_results %>% 
  show_best(metric = "rmse")
## # A tibble: 5 x 6
##   num_comp .metric .estimator  mean     n std_err
##      <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1        1 rmse    standard    2.28     5   0.350
## 2        2 rmse    standard    2.46     5   0.309
## 3        3 rmse    standard    3.23     5   0.883
## 4        4 rmse    standard    4.06     5   1.66 
## 5        5 rmse    standard    4.39     5   2.02

Set best number of components

Two options: num_comp that minimizes RMSE, and the 1 standard error rule of thumb:

num_comp_min <- pls_results %>% 
  select_best(metric = "rmse")

num_comp_1se <- pls_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(num_comp)
  ) %>% 
  select(num_comp)

Evaluate model on the test set

pls_wfl_final <- pls_wfl %>%
  finalize_workflow(num_comp_1se)
browser_test_results <- pls_wfl_final %>% 
  last_fit(split = browser_split)
pls_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pls")

Summary

bind_rows(
  ridge_results,
  pcr_results,
  pls_results
) %>% 
  arrange(.estimate)
## # A tibble: 3 x 4
##   .metric .estimator .estimate method
##   <chr>   <chr>          <dbl> <chr> 
## 1 rmse    standard        1.74 pcr   
## 2 rmse    standard        2.56 ridge 
## 3 rmse    standard        2.95 pls