Load packages

library(tidyverse)   # for data wrangling and visualization
library(tidymodels)  # for data modeling
library(plsmod)      # for estimating pls regression
library(here)         # for referencing files and folders
# install {mixOmics} from Bioconductor (required by the plsmod engine)
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("mixOmics")

and set a seed for replication.

set.seed(1203)

Read data

Read the browser dataset from the data folder.

browser <- here("05-regression-regularization/data","browser-sample.csv") %>% 
  read_csv()
## Rows: 1200 Columns: 201
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (201): log_spend, americansingles.com, opm.gov, netzerovoice.com, mobile...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Sample splitting

Split the data into training (75%) and test sets, and split the training set into 5 folds for cross-validation.

browser_split <- browser %>% 
  initial_split(prop = 0.75)

browser_train <- training(browser_split)
browser_test  <- testing(browser_split)

browser_folds <- browser_train %>% 
  vfold_cv(v = 5)

Lasso or ridge

Specify model recipe and workflow

Set mixture = 0 for ridge and mixture = 1 for lasso; penalty is the regularization parameter \(\lambda\). We’ll go with ridge here (a lasso version is sketched after the workflow):

lm_spec <- linear_reg() %>% 
  set_args(penalty = tune(), mixture = 0, nlambda = 10) %>%  # mixture = 0 -> ridge
  set_engine("glmnet") %>% 
  set_mode("regression")

browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>%  # center predictors
  step_scale(all_predictors()) %>%   # scale predictors to unit variance
  step_zv(all_predictors())          # drop zero-variance predictors

lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)
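
For reference, a lasso specification would only change the mixture argument. This is a sketch and is not fitted below:

lasso_spec <- linear_reg() %>% 
  set_args(penalty = tune(), mixture = 1) %>%  # mixture = 1 -> lasso
  set_engine("glmnet") %>% 
  set_mode("regression")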

Tune hyperparameter

lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds
  )
## Warning: package 'glmnet' was built under R version 4.3.3
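
By default tune_grid() builds its own penalty grid via {dials}. If you want explicit control over the candidate values, you can supply a grid yourself; the range below (on the log10 scale used by penalty()) is an arbitrary choice for illustration:

penalty_grid <- grid_regular(penalty(range = c(-5, 1)), levels = 20)
# then: lm_wfl %>% tune_grid(resamples = browser_folds, grid = penalty_grid)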

Show best results

lm_results %>% 
  show_best(metric = "rmse")
## # A tibble: 5 x 7
##    penalty .metric .estimator  mean     n std_err .config              
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 2.97e- 1 rmse    standard    2.06     5  0.0868 Preprocessor1_Model10
## 2 2.08e- 2 rmse    standard    2.31     5  0.109  Preprocessor1_Model09
## 3 4.57e-10 rmse    standard    2.31     5  0.109  Preprocessor1_Model01
## 4 1.34e- 9 rmse    standard    2.31     5  0.109  Preprocessor1_Model02
## 5 4.02e- 8 rmse    standard    2.31     5  0.109  Preprocessor1_Model03
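
To see the full tuning profile rather than just the top five rows, the resampling results can be plotted:

autoplot(lm_results)  # RMSE and R-squared as a function of the penalty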

Set best lambda

Two options: the lambda that minimizes RMSE, and the largest lambda whose RMSE is within one standard error of that minimum (the 1-standard-error rule of thumb):

lambda_min <- lm_results %>% 
  select_best(metric = "rmse")

lambda_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    desc(penalty)   # a larger penalty means a simpler model
  ) %>% 
  select(penalty)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(lambda_1se)
browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)
ridge_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "ridge")
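
If you want to inspect the fitted ridge coefficients at the chosen penalty, one option (a quick check, not part of the comparison below) is:

browser_test_results %>% 
  extract_fit_parsnip() %>%      # the glmnet fit from the finalized workflow
  tidy() %>%                     # coefficients at the finalized penalty
  arrange(desc(abs(estimate)))   # largest (absolute) coefficients first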

PCR

Specify model recipe and workflow

Specify a plain linear model. Note that in this case the model specification itself has no tuning parameters.

lm_spec <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Note how the number of principal components is tuned inside the recipe object.

browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_pca(all_predictors(), num_comp = tune())
lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)
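
As a side check, you can see how much variance the leading components capture by fixing num_comp in a copy of the recipe and tidying the PCA step. The value 5 below is arbitrary:

browser_rec %>% 
  finalize_recipe(tibble(num_comp = 5)) %>%  # plug in a fixed number of PCs
  prep() %>% 
  tidy(number = 4, type = "variance")        # step_pca is the 4th step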

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)
lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

Show best results

lm_results %>% 
  show_best(metric = "rmse")
## # A tibble: 5 x 7
##   num_comp .metric .estimator  mean     n std_err .config              
##      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1        3 rmse    standard    1.65     5  0.0233 Preprocessor03_Model1
## 2        1 rmse    standard    1.65     5  0.0221 Preprocessor01_Model1
## 3        2 rmse    standard    1.65     5  0.0224 Preprocessor02_Model1
## 4        4 rmse    standard    1.65     5  0.0223 Preprocessor04_Model1
## 5        5 rmse    standard    1.65     5  0.0220 Preprocessor05_Model1

Set best number of components

Two options: the num_comp that minimizes RMSE, and the smallest num_comp whose RMSE is within one standard error of that minimum (the 1-standard-error rule of thumb):

num_comp_min <- lm_results %>% 
  select_best(metric = "rmse")

num_comp_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    num_comp   # fewer components = simpler model
  ) %>% 
  select(num_comp)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(num_comp_1se)
browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)
pcr_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pcr")

PLS

To fit PLS with tidymodels’ {plsmod} we need the {mixOmics} package installed; this is what the BiocManager code at the top does:

install.packages("BiocManager")
BiocManager::install("mixOmics")

Specify model recipe and workflow

Specify the model using pls() from {plsmod}:

pls_spec <- pls() %>% 
  set_args(num_comp = tune()) %>% 
  set_engine("mixOmics") %>% 
  set_mode("regression")
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())
pls_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(pls_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)
pls_results <- pls_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

Show best results

pls_results %>% 
  show_best(metric = "rmse")
## # A tibble: 5 x 7
##   num_comp .metric .estimator  mean     n std_err .config              
##      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1        1 rmse    standard    2.06     5   0.170 Preprocessor1_Model01
## 2        2 rmse    standard    2.08     5   0.114 Preprocessor1_Model02
## 3        3 rmse    standard    2.15     5   0.121 Preprocessor1_Model03
## 4        4 rmse    standard    2.23     5   0.106 Preprocessor1_Model04
## 5        5 rmse    standard    2.29     5   0.111 Preprocessor1_Model05

Set best number of components

Two options: the num_comp that minimizes RMSE, and the smallest num_comp whose RMSE is within one standard error of that minimum (the 1-standard-error rule of thumb):

num_comp_min <- pls_results %>% 
  select_best(metric = "rmse")

num_comp_1se <- pls_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    num_comp   # fewer components = simpler model
  )

Evaluate model on the test set

pls_wfl_final <- pls_wfl %>%
  finalize_workflow(num_comp_1se)
browser_test_results <- pls_wfl_final %>% 
  last_fit(split = browser_split)
pls_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pls")

Summary

bind_rows(
  ridge_results,
  pcr_results,
  pls_results
) %>% 
  arrange(.estimate)
## # A tibble: 3 x 5
##   .metric .estimator .estimate .config              method
##   <chr>   <chr>          <dbl> <chr>                <chr> 
## 1 rmse    standard        2.15 Preprocessor1_Model1 pcr   
## 2 rmse    standard        4.96 Preprocessor1_Model1 ridge 
## 3 rmse    standard        5.36 Preprocessor1_Model1 pls
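
For a quick visual comparison, the same table can be plotted (a minimal ggplot sketch):

bind_rows(ridge_results, pcr_results, pls_results) %>% 
  ggplot(aes(x = fct_reorder(method, .estimate), y = .estimate)) +
  geom_col() +
  labs(x = NULL, y = "Test RMSE")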