Load packages

Load the required packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  tidymodels,  # for data modeling
  plsmod,      # for estimating pls regression
  here         # for referencing files and folders
)

and set a seed for replication:

set.seed(1203)

Read data

browser <- here("05-regression-regularization/data", "browser-all.csv") %>% 
  read_csv()
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
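
Before modeling, a quick sanity check on the data can't hurt. The only column name assumed here is log_spend, the outcome used in the recipes below:

# number of observations and variables
dim(browser)

# summary of the outcome we will model
browser %>% 
  summarise(
    mean_log_spend = mean(log_spend),
    sd_log_spend   = sd(log_spend)
  )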

Sample splitting

Split the data into training and test sets, and split the training set into 5 folds for cross-validation.

browser_split <- browser %>% 
  initial_split(prop = 0.75)

browser_train <- training(browser_split)
browser_test  <- testing(browser_split)

browser_folds <- browser_train %>% 
  vfold_cv(v = 5)
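
A quick check that the split and the folds look as expected:

# roughly 75%/25% of the rows go to training/test
nrow(browser_train)
nrow(browser_test)

# 5 analysis/assessment splits for cross-validation
browser_folds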

Lasso or ridge

Specify model, recipe, and workflow

Set mixture = 0 for ridge and mixture = 1 for lasso; penalty is the regularization parameter \(\lambda\). We'll go with ridge here (a lasso version is sketched just after the workflow below):

lm_spec <- linear_reg() %>% 
  set_args(penalty = tune(), mixture = 0, nlambda = 10) %>% 
  set_engine("glmnet") %>% 
  set_mode("regression")
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())
lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)
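
For reference, a lasso version of the same workflow only requires changing the mixture argument (a sketch, not used in what follows):

# lasso: same recipe and engine, but mixture = 1
lasso_spec <- linear_reg() %>% 
  set_args(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet") %>% 
  set_mode("regression")

lasso_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lasso_spec)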

Tune hyperparameter

lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds
  )

Show best results

lm_results %>% 
  show_best(metric = "rmse", maximize = FALSE)
## # A tibble: 5 x 6
##    penalty .metric .estimator  mean     n std_err
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1 1.71e- 1 rmse    standard    1.72     5  0.0147
## 2 6.75e- 2 rmse    standard    1.76     5  0.0173
## 3 4.90e-10 rmse    standard    1.78     5  0.0195
## 4 1.04e- 9 rmse    standard    1.78     5  0.0195
## 5 4.12e- 8 rmse    standard    1.78     5  0.0195
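
To see the entire penalty profile rather than just the five best rows, we can plot the cross-validated RMSE against \(\lambda\) (a minimal sketch based on collect_metrics()):

lm_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = penalty, y = mean)) +
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
  geom_point() +
  scale_x_log10() +
  labs(x = expression(lambda), y = "Cross-validated RMSE")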

Set best lambda

Two options: the \(\lambda\) that minimizes cross-validated RMSE, or the 1 standard error rule of thumb, i.e., the most regularized model whose RMSE is within one standard error of the minimum:

lambda_min <- lm_results %>% 
  select_best(
    metric = "rmse",
    maximize = FALSE
  )

lambda_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    maximize = FALSE,
    desc(penalty)
  ) %>% 
  select(penalty)
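
The two rules typically pick different values of \(\lambda\); printing them side by side makes the contrast explicit:

bind_rows(
  minimum = lambda_min %>% select(penalty),
  one_se  = lambda_1se,
  .id     = "rule"
)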

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(lambda_1se)
browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)
ridge_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "ridge")
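
last_fit() also keeps the test-set predictions, which we can inspect directly:

# predicted vs. actual log_spend on the test set
browser_test_results %>% 
  collect_predictions() %>% 
  select(.pred, log_spend) %>% 
  head()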

PCR

Specify model, recipe, and workflow

Specify a linear model. Note that in this case, the model specification itself has no tuning parameters.

lm_spec <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Note how the number of PCs is set as a tuning parameter inside the recipe object.

browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_pca(all_predictors(), num_comp = tune())
lm_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(lm_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)
lm_results <- lm_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

Show best results

lm_results %>% 
  show_best(metric = "rmse", maximize = FALSE)
## # A tibble: 5 x 6
##   num_comp .metric .estimator  mean     n std_err
##      <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1       10 rmse    standard    1.64     5 0.00779
## 2        9 rmse    standard    1.64     5 0.00788
## 3        8 rmse    standard    1.64     5 0.00848
## 4        7 rmse    standard    1.64     5 0.00931
## 5        6 rmse    standard    1.64     5 0.00954
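
As with ridge, the full profile over the grid is easier to judge visually; autoplot() on the tuning results gives a quick default view:

# cross-validated metrics as a function of num_comp
autoplot(lm_results)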

Set best number of components

Two options: the num_comp that minimizes RMSE, and the 1 standard error rule of thumb:

num_comp_min <- lm_results %>% 
  select_best(
    metric = "rmse",
    maximize = FALSE
  )

num_comp_1se <- lm_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    maximize = FALSE,
    num_comp
  ) %>% 
  select(num_comp)

Evaluate model on the test set

lm_wfl_final <- lm_wfl %>%
  finalize_workflow(num_comp_1se)
browser_test_results <- lm_wfl_final %>% 
  last_fit(split = browser_split)
pcr_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pcr")

PLS

To estimate PLS regressions with plsmod, we need to make sure that the {mixOmics} package (available from Bioconductor) is installed:

install.packages("BiocManager")
BiocManager::install("mixOmics")
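
Once {plsmod} and {mixOmics} are installed, we can check that the engine is registered with parsnip:

# "mixOmics" should appear among the available engines for pls()
show_engines("pls")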

Specify model, recipe, and workflow

Specify a PLS model using pls() from plsmod:

pls_spec <- pls() %>% 
  set_args(num_comp = tune()) %>% 
  set_engine("mixOmics") %>% 
  set_mode("regression")
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_zv(all_predictors())
pls_wfl <- workflow() %>% 
  add_recipe(browser_rec) %>% 
  add_model(pls_spec)

Tune hyperparameter

num_comp_grid <- expand_grid(num_comp = 1:10)
pls_results <- pls_wfl %>% 
  tune_grid(
    resamples = browser_folds,
    grid = num_comp_grid
  )

Show best results

pls_results %>% 
  show_best(metric = "rmse", maximize = FALSE)
## # A tibble: 5 x 6
##   num_comp .metric .estimator  mean     n std_err
##      <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1        2 rmse    standard    1.61     5  0.0115
## 2        1 rmse    standard    1.61     5  0.0115
## 3        3 rmse    standard    1.64     5  0.0125
## 4        4 rmse    standard    1.69     5  0.0169
## 5        5 rmse    standard    1.73     5  0.0206

Set best number of components

Two options: the num_comp that minimizes RMSE, and the 1 standard error rule of thumb:

num_comp_min <- pls_results %>% 
  select_best(
    metric = "rmse",
    maximize = FALSE
  )

num_comp_1se <- pls_results %>% 
  select_by_one_std_err(
    metric = "rmse",
    maximize = FALSE,
    num_comp
  ) %>% 
  select(num_comp)

Evaluate model on the test set

pls_wfl_final <- pls_wfl %>%
  finalize_workflow(num_comp_1se)
browser_test_results <- pls_wfl_final %>% 
  last_fit(split = browser_split)
pls_results <- browser_test_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  mutate(method = "pls")

Summary

bind_rows(ridge_results,
          pcr_results,
          pls_results)
## # A tibble: 3 x 4
##   .metric .estimator .estimate method
##   <chr>   <chr>          <dbl> <chr> 
## 1 rmse    standard        1.65 ridge 
## 2 rmse    standard        1.64 pcr   
## 3 rmse    standard        1.59 pls
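
A quick plot makes the comparison easier to read (a minimal sketch reusing the tibble above):

bind_rows(ridge_results,
          pcr_results,
          pls_results) %>% 
  ggplot(aes(x = reorder(method, .estimate), y = .estimate)) +
  geom_col(width = 0.5) +
  labs(x = "Method", y = "Test-set RMSE")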