Load packages
# Install pacman (a package-management helper) only when it is missing.
# requireNamespace() tests availability without attaching the package, which is
# all that is needed here because p_load() is called with an explicit namespace.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# p_load() installs any listed package that is missing and then attaches it.
pacman::p_load(
  tidyverse, # for data wrangling and visualization
  tidymodels, # for data modeling
  vip, # for variable importance
  here, # for referencing files and folders
  readxl # for reading xlsx files
)
Load the data and set seed for replication
Replace all `NULL` values with `NA`s
Train-test split
# Randomly partition the data: half of the rows go to training, the rest to
# testing.
covid_split <- initial_split(covid, prop = 0.5)

# Materialise the two partitions as data frames.
covid_train <- training(covid_split)
covid_test <- testing(covid_split)
Cross-validation splits
# Preprocessing recipe: model corona_result as a function of every other
# column in the training data.
covid_rec <-
  recipe(corona_result ~ ., data = covid_train) %>%
  # One-hot encode the nominal predictors. The role-based selector leaves the
  # outcome alone without having to exclude it by name.
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Drop constant columns first so no interactions are built from them.
  step_zv(all_predictors()) %>%
  # Add interaction terms between all pairs of predictors.
  step_interact(~ all_predictors():all_predictors()) %>%
  # Drop interaction columns that turned out to be constant.
  step_zv(all_predictors()) %>%
  # Centre and scale every remaining predictor.
  step_normalize(all_predictors())
# Evaluate candidate models on area under the ROC curve only.
roc_only <- yardstick::metric_set(yardstick::roc_auc)

# Regular grid of 20 penalty values, evenly spaced on the log10 scale
# between 10^-5 and 10^0.
penalty_grid <- dials::grid_regular(
  dials::penalty(range = c(-5, 0), trans = scales::log10_trans()),
  levels = 20
)
# Tune the workflow's penalty over the custom grid, scoring each candidate on
# every resample with ROC AUC and keeping the out-of-sample predictions.
logit_result <- tune_grid(
  logit_wfl,
  resamples = covid_folds,
  grid = penalty_grid,
  metrics = roc_only,
  control = control_grid(save_pred = TRUE)
)
## Warning: package 'glmnet' was built under R version 4.4.3
# Plot the mean resampled ROC AUC against the penalty (log10 x-axis).
ggplot(collect_metrics(logit_result), aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  labs(y = "Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number()) +
  theme_minimal()
Show best results
## # A tibble: 10 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00234 roc_auc binary 0.834 5 0.00397 Preprocessor1_Model10
## 2 0.000379 roc_auc binary 0.834 5 0.00398 Preprocessor1_Model07
## 3 0.000695 roc_auc binary 0.834 5 0.00397 Preprocessor1_Model08
## 4 0.000207 roc_auc binary 0.834 5 0.00399 Preprocessor1_Model06
## 5 0.00001 roc_auc binary 0.834 5 0.00399 Preprocessor1_Model01
## 6 0.0000183 roc_auc binary 0.834 5 0.00399 Preprocessor1_Model02
## 7 0.0000336 roc_auc binary 0.834 5 0.00399 Preprocessor1_Model03
## 8 0.0000616 roc_auc binary 0.834 5 0.00399 Preprocessor1_Model04
## 9 0.000113 roc_auc binary 0.834 5 0.00399 Preprocessor1_Model05
## 10 0.00127 roc_auc binary 0.834 5 0.00398 Preprocessor1_Model09
Two options: the lambda that maximizes `roc_auc`, and the one-standard-error rule of thumb:
# ROC curve on the training data: fit the finalised workflow, predict class
# probabilities for the same rows, attach the observed outcome, and plot.
logit_wfl_final %>%
  fit(data = covid_train) %>%
  predict(new_data = covid_train, type = "prob") %>%
  # roc_curve() needs the observed outcome next to the predicted probabilities
  bind_cols(select(covid_train, corona_result)) %>%
  # the probability column is passed unnamed, after `truth`
  roc_curve(truth = corona_result, .pred_negative) %>%
  autoplot() +
  ggtitle("Training set AUC")
# ROC curve from the predictions stored in logit_last_fit.
autoplot(
  roc_curve(
    collect_predictions(logit_last_fit),
    truth = corona_result,
    .pred_negative
  )
) +
  ggtitle("Test set AUC")
## # A tibble: 1 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 roc_auc binary 0.828 Preprocessor1_Model1