Load packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  tidymodels,  # for data modeling
  plsmod,      # for estimating pls regression
  here         # for referencing files and folders
)

and set seed for replication

set.seed(1203)

Read data

browser <- here("05-regression-regularization/data","browser-sample.csv") %>% 
  read_csv()

## Parsed with column specification:
## cols(
##   .default = col_double()
## )

## See spec(...) for full column specifications.

Sample spliting

Split the data to training and test sets, and the training set to 10 folds for cross-validation.

browser_split <- browser %>% 
  initial_split(prop = 0.75)

browser_train <- training(browser_split)
browser_test  <- testing(browser_split)

browser_folds <- browser_train %>% 
  vfold_cv(v = 5)

Lasso or ridge

Specify model recipe and workflow

Set mixture = 0 for ridge and mixture = 1 for lasso. penalty is \(\lambda\). We’ll go for ridge:

lm_spec <- linear_reg() %>% 
  set_args(penalty = tune(), mixture = 0, nlambda = 10) %>% 
  set_engine("glmnet") %>% 
  set_mode("regression")

browser_rec <- recipe(log_spend ~ .,data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())

lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)

Tune hyperparameter

lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds
  )

show best results

lm_results %>% 
  show_best(metric = "rmse")

## # A tibble: 5 x 6
##    penalty .metric .estimator  mean     n std_err
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1 7.33e- 1 rmse    standard    2.45     5   0.486
## 2 2.88e- 2 rmse    standard    4.68     5   2.23 
## 3 2.07e-10 rmse    standard    4.78     5   2.29 
## 4 3.36e- 9 rmse    standard    4.78     5   2.29 
## 5 4.63e- 8 rmse    standard    4.78     5   2.29

Set best lambda

Two options: lambda that minimizes RMSE, and the 1 standard error rule of thumb:

lambda_min <- lm_results %>% 
  select_best(metric = "rmse")

lambda_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(penalty)
  ) %>% 
  select(penalty)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(lambda_1se)

browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)

ridge_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "ridge")

PCR

Specify model recipe and workflow

Specify a linear model. Note that in this case, there are no tuning parameters.

lm_spec <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Note how the number of PCs is determined inside the recipe object.

browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_pca(all_predictors(), num_comp = tune())

lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)

lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

show best results

lm_results %>% 
  show_best(metric = "rmse")

## # A tibble: 5 x 6
##   num_comp .metric .estimator  mean     n std_err
##      <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1        1 rmse    standard    1.69     5  0.0396
## 2        3 rmse    standard    1.70     5  0.0330
## 3        6 rmse    standard    1.70     5  0.0356
## 4        5 rmse    standard    1.71     5  0.0348
## 5        4 rmse    standard    1.71     5  0.0340

Set best number of components

Two options: num_comp that minimizes RMSE, and the 1 standard error rule of thumb:

num_comp_min <- lm_results %>% 
  select_best(metric = "rmse")

num_comp_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(num_comp)
  ) %>% 
  select(num_comp)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(num_comp_1se)

browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)

pcr_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pcr")

PLS

To run tidymodels’ plsmod we’ll need to make sure we have installed the {mixOmics} package.

install.packages("BiocManager")
BiocManager::install("mixOmics")

Specify model recipe and workflow

Specify plsmod

pls_spec <- pls() %>% 
  set_args(num_comp = tune()) %>% 
  set_engine("mixOmics") %>% 
  set_mode("regression")

browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())

pls_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(pls_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)

pls_results <- pls_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

show best results

pls_results %>% 
  show_best(metric = "rmse")

## # A tibble: 5 x 6
##   num_comp .metric .estimator  mean     n std_err
##      <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1        1 rmse    standard    2.28     5   0.350
## 2        2 rmse    standard    2.46     5   0.309
## 3        3 rmse    standard    3.23     5   0.883
## 4        4 rmse    standard    4.06     5   1.66 
## 5        5 rmse    standard    4.39     5   2.02

Set best number of components

Two options: num_comp that minimizes RMSE, and the 1 standard error rule of thumb:

num_comp_min <- pls_results %>% 
  select_best(metric = "rmse")

num_comp_1se <- pls_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(num_comp)
  ) %>% 
  select(num_comp)

Evaluate model on the test set

pls_wfl_final <- pls_wfl %>%
  finalize_workflow(num_comp_1se)

browser_test_results <- pls_wfl_final %>% 
  last_fit(split = browser_split)

pls_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pls")

Summary

bind_rows(
  ridge_results,
  pcr_results,
  pls_results
) %>% 
  arrange(.estimate)

## # A tibble: 3 x 4
##   .metric .estimator .estimate method
##   <chr>   <chr>          <dbl> <chr> 
## 1 rmse    standard        1.74 pcr   
## 2 rmse    standard        2.56 ridge 
## 3 rmse    standard        2.95 pls

Ridge/Lasso, PCR, and PLS

A Tidymodels Workflow

Itamar Caspi

(updated: 2021-04-11)

Load packages

Read data

Sample spliting

Lasso or ridge

Specify model recipe and workflow

Tune hyperparameter

Set best lambda

Evaluate model on the test set

PCR

Specify model recipe and workflow

Tune hyperparameter

Set best number of components

Evaluate model on the test set

PLS

Specify model recipe and workflow

Tune hyperparameter

Set best number of components

Evaluate model on the test set

Summary