Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
tidyverse, # for data wrangling and visualization
tidymodels, # for data modeling
plsmod, # for estimating pls regression
here # for referencing files and folders
)
and set a seed for replication.
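The exact seed used originally isn't shown, so the value below is an arbitrary placeholder; any fixed integer makes the sample split and folds reproducible.
set.seed(1203)  # arbitrary seed, chosen here for illustration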
Read data
browser <- here("05-regression-regularization/data","browser-sample.csv") %>%
read_csv()
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
Sample splitting
Split the data into training (75%) and test sets, and split the training set into 5 folds for cross-validation.
browser_split <- browser %>%
initial_split(prop = 0.75)
browser_train <- training(browser_split)
browser_test <- testing(browser_split)
browser_folds <- browser_train %>%
vfold_cv(v = 5)
Lasso or ridge
Specify model recipe and workflow
Set mixture = 0 for ridge and mixture = 1 for lasso; the penalty argument is \(\lambda\). We'll go for ridge:
lm_spec <- linear_reg() %>%
set_args(penalty = tune(), mixture = 0, nlambda = 10) %>%
set_engine("glmnet") %>%
set_mode("regression")
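For comparison, a lasso specification would differ only in the mixture argument (a sketch, not used below):
# Lasso: identical spec except mixture = 1 (pure L1 penalty)
lasso_spec <- linear_reg() %>%
  set_args(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")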
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_zv(all_predictors())
lm_wfl <- workflow() %>%
add_recipe(browser_rec) %>%
add_model(lm_spec)
Tune hyperparameter
lm_results <- lm_wfl %>%
tune_grid(
resamples = browser_folds
)
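With no grid argument, tune_grid() generates a default set of candidate penalty values, which is what produced the results below. If you prefer explicit control over the candidates, a sketch (the range and number of values here are illustrative, not the ones used above):
# Not run: a regular grid of 50 penalty values on the log10 scale
lambda_grid <- grid_regular(penalty(range = c(-5, 1)), levels = 50)

lm_wfl %>%
  tune_grid(
    resamples = browser_folds,
    grid = lambda_grid
  )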
Show best results
lm_results %>%
show_best(metric = "rmse")
## # A tibble: 5 x 6
## penalty .metric .estimator mean n std_err
## <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 7.33e- 1 rmse standard 2.45 5 0.486
## 2 2.88e- 2 rmse standard 4.68 5 2.23
## 3 2.07e-10 rmse standard 4.78 5 2.29
## 4 3.36e- 9 rmse standard 4.78 5 2.29
## 5 4.63e- 8 rmse standard 4.78 5 2.29
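To see the full RMSE profile across the tried penalty values rather than just the top five, the tuning results can be plotted (a minimal sketch):
# Plot resampled performance metrics against the penalty
autoplot(lm_results)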
Set best lambda
Two options: the \(\lambda\) that minimizes RMSE, and the largest \(\lambda\) that is within one standard error of the minimum (the 1-SE rule of thumb):
lambda_min <- lm_results %>%
select_best(metric = "rmse")
lambda_1se <- lm_results %>%
select_by_one_std_err(
metric = "rmse",
desc(penalty)  # larger penalty = simpler model, so it is sorted first
) %>%
select(penalty)
Evaluate model on the test set
lm_wfl_final <- lm_wfl %>%
finalize_workflow(lambda_1se)
browser_test_results <- lm_wfl_final %>%
last_fit(split = browser_split)
ridge_results <- browser_test_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
mutate(method = "ridge")
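last_fit() also keeps the test-set predictions, so we can inspect them directly before moving on (a sketch using the objects above):
# Predicted vs. observed log spending on the test set
browser_test_results %>%
  collect_predictions() %>%
  ggplot(aes(x = log_spend, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed")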
PCR
Specify model recipe and workflow
Specify a plain linear model. Note that in this case there are no tuning parameters in the model specification itself.
lm_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
Note how the number of PCs is determined inside the recipe object.
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_zv(all_predictors()) %>%
step_pca(all_predictors(), num_comp = tune())
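Once a value for num_comp has been chosen, the recipe can be finalized and prepped to inspect the principal-component loadings (a sketch; the value 4 is an arbitrary example, and step_pca() is the fourth step in this recipe):
browser_rec %>%
  finalize_recipe(tibble(num_comp = 4)) %>%
  prep() %>%
  tidy(number = 4)  # loadings from step_pca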
lm_wfl <- workflow() %>%
add_recipe(browser_rec) %>%
add_model(lm_spec)
Tune hyperparameter
num_comp_grid <- expand_grid(num_comp = 1:10)
lm_results <- lm_wfl %>%
tune_grid(
resamples = browser_folds,
grid = num_comp_grid
)
Show best results
lm_results %>%
show_best(metric = "rmse")
## # A tibble: 5 x 6
## num_comp .metric .estimator mean n std_err
## <int> <chr> <chr> <dbl> <int> <dbl>
## 1 1 rmse standard 1.69 5 0.0396
## 2 3 rmse standard 1.70 5 0.0330
## 3 6 rmse standard 1.70 5 0.0356
## 4 5 rmse standard 1.71 5 0.0348
## 5 4 rmse standard 1.71 5 0.0340
Set best number of components
Two options: the num_comp that minimizes RMSE, and the smallest num_comp that is within one standard error of the minimum (the 1-SE rule of thumb):
num_comp_min <- lm_results %>%
select_best(metric = "rmse")
num_comp_1se <- lm_results %>%
select_by_one_std_err(
metric = "rmse",
num_comp  # fewer components = simpler model, so it is sorted first
) %>%
select(num_comp)
Evaluate model on the test set
lm_wfl_final <- lm_wfl %>%
finalize_workflow(num_comp_1se)
browser_test_results <- lm_wfl_final %>%
last_fit(split = browser_split)
pcr_results <- browser_test_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
mutate(method = "pcr")
PLS
To run tidymodels' plsmod we'll need to make sure the {mixOmics} package, which is distributed via Bioconductor, is installed:
install.packages("BiocManager")
BiocManager::install("mixOmics")
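To avoid reinstalling on every run, the installation can be guarded (a sketch):
# Install mixOmics (via Bioconductor) only if it is missing
if (!requireNamespace("mixOmics", quietly = TRUE)) {
  install.packages("BiocManager")
  BiocManager::install("mixOmics")
}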
Specify model recipe and workflow
Specify a PLS model with pls() from plsmod:
pls_spec <- pls() %>%
set_args(num_comp = tune()) %>%
set_engine("mixOmics") %>%
set_mode("regression")
browser_rec <- recipe(log_spend ~ ., data = browser_train) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_zv(all_predictors())
pls_wfl <- workflow() %>%
add_recipe(browser_rec) %>%
add_model(pls_spec)
Tune hyperparameter
num_comp_grid <- expand_grid(num_comp = 1:10)
pls_results <- pls_wfl %>%
tune_grid(
resamples = browser_folds,
grid = num_comp_grid
)
Show best results
pls_results %>%
show_best(metric = "rmse")
## # A tibble: 5 x 6
## num_comp .metric .estimator mean n std_err
## <int> <chr> <chr> <dbl> <int> <dbl>
## 1 1 rmse standard 2.28 5 0.350
## 2 2 rmse standard 2.46 5 0.309
## 3 3 rmse standard 3.23 5 0.883
## 4 4 rmse standard 4.06 5 1.66
## 5 5 rmse standard 4.39 5 2.02
Set best number of components
Two options: the num_comp that minimizes RMSE, and the smallest num_comp that is within one standard error of the minimum (the 1-SE rule of thumb):
num_comp_min <- pls_results %>%
select_best(metric = "rmse")
num_comp_1se <- pls_results %>%
select_by_one_std_err(
metric = "rmse",
num_comp  # fewer components = simpler model, so it is sorted first
) %>%
select(num_comp)
Evaluate model on the test set
pls_wfl_final <- pls_wfl %>%
finalize_workflow(num_comp_1se)
browser_test_results <- pls_wfl_final %>%
last_fit(split = browser_split)
pls_results <- browser_test_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
mutate(method = "pls")
Summary
bind_rows(
ridge_results,
pcr_results,
pls_results
) %>%
arrange(.estimate)
## # A tibble: 3 x 4
## .metric .estimator .estimate method
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.74 pcr
## 2 rmse standard 2.56 ridge
## 3 rmse standard 2.95 pls
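In this particular split PCR comes out on top, though the ranking can shift with a different seed or split. A quick visual comparison of the three test-set RMSEs (a sketch reusing the objects created above):
bind_rows(ridge_results, pcr_results, pls_results) %>%
  ggplot(aes(x = reorder(method, .estimate), y = .estimate)) +
  geom_col() +
  labs(x = "method", y = "test set RMSE")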