Load packages
# Make sure pacman is available, then use it to load every package the
# analysis needs (p_load installs any that are missing before attaching them).
if (!require("pacman")) {
  install.packages("pacman")
}

pacman::p_load(
  tidyverse,  # data wrangling and visualization
  tidymodels, # data modeling
  vip,        # variable importance
  here,       # referencing files and folders
  readxl      # reading xlsx files
)
## package 'vip' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\research\AppData\Local\Temp\RtmpCAU9QQ\downloaded_packages
Set seed for replication
Replace all `NULL` values with `NA`s
Train-test split
# Split `covid` into two partitions; prop = 0.5 assigns half of the rows to
# the training set and leaves the remainder for testing.
covid_split <- initial_split(covid, prop = 0.5)
covid_train <- training(covid_split)
covid_test <- testing(covid_split)
Cross-validation splits
# Plot the mean metric collected from `logit_result` against the penalty,
# with the penalty shown on a log10 axis using plain-number labels.
ggplot(
  collect_metrics(logit_result),
  aes(penalty, mean)
) +
  geom_point() +
  geom_line() +
  labs(y = "Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())
show best results
## # A tibble: 10 x 6
## penalty .metric .estimator mean n std_err
## <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 3.56e-10 roc_auc binary 0.828 5 0.00483
## 2 1.83e- 9 roc_auc binary 0.828 5 0.00483
## 3 1.40e- 8 roc_auc binary 0.828 5 0.00483
## 4 1.11e- 7 roc_auc binary 0.828 5 0.00483
## 5 1.65e- 6 roc_auc binary 0.828 5 0.00483
## 6 1.39e- 5 roc_auc binary 0.828 5 0.00483
## 7 1.01e- 4 roc_auc binary 0.828 5 0.00483
## 8 4.91e- 3 roc_auc binary 0.827 5 0.00425
## 9 1.42e- 2 roc_auc binary 0.818 5 0.00288
## 10 7.82e- 1 roc_auc binary 0.5 5 0
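Two options: the lambda that maximizes `roc_auc`, and the one-standard-error rule of thumb
(the most heavily penalized lambda whose `roc_auc` is within one standard error of the best):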
# ROC curve on the training data: fit the finalized workflow on `covid_train`,
# predict class probabilities for those same rows, and plot the curve.
logit_wfl_final %>%
  fit(data = covid_train) %>%
  predict(new_data = covid_train, type = "prob") %>%
  # Attach the observed outcome as a real column so roc_curve() can select it
  # by name. Passing `covid_train$corona_result` as `truth` relies on the
  # column selector accepting an external vector, which is fragile across
  # yardstick versions. predict() returns one row per row of `new_data`, in
  # order, so a column bind keeps predictions and outcomes aligned.
  dplyr::bind_cols(covid_train["corona_result"]) %>%
  roc_curve(corona_result, .pred_negative) %>%
  autoplot() +
  labs(title = "Training set AUC")
# ROC curve for the predictions collected from `logit_last_fit`, using
# `corona_result` as the truth column and `.pred_negative` as the class
# probability.
autoplot(
  roc_curve(
    collect_predictions(logit_last_fit),
    corona_result,
    .pred_negative
  )
) +
  labs(title = "Test set AUC")
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.831