Load packages

Load packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  tidymodels,  # for data modeling
  vip,         # for variable importance
  here,        # for referencing files and folders
  readxl       # for reading xlsx files
)

and set seed for replication

set.seed(1203)

Read data

covid_raw <- 
  here("06-classification/data", "corona_tested_individuals_ver_002.xlsx") %>% 
  read_xlsx()

Initial preprocessing

Replace all NULL values with NA’s

is.na(covid_raw[,-1]) <- covid_raw[,-1] == "NULL"
covid <- 
  covid_raw %>%
  select(-test_date, -test_indication) %>% 
  filter(corona_result != "אחר") %>% 
  drop_na() %>%
  mutate(
    corona_result = if_else(corona_result == "שלילי", "negative", "positive"),
    gender = if_else(gender == "זכר", "male", "female")
  ) %>% 
  mutate_all(as_factor)

Save processed data

write.csv(
  covid,
  file = here("06-classification/data","covid_proc.csv"),
  row.names = FALSE
  )

Split

Train-test split

covid_split <- covid       %>% initial_split(prop = 0.5)
covid_train <- covid_split %>% training()
covid_test  <- covid_split %>% testing()

Cross-validation splits

covid_folds <- covid_train %>% vfold_cv(v = 5, strata = corona_result)

Build a Model

logit_model <- 
  logistic_reg() %>%
  set_engine("glmnet") %>% 
  set_mode("classification") %>%
  set_args(penalty = tune(), mixture = 1)

Create the Recipe

covid_rec <- 
  recipe(corona_result ~ ., data = covid_train) %>% 
  step_dummy(all_nominal(), -corona_result, one_hot = TRUE) %>% 
  step_zv(all_predictors()) %>%  # Remove zero variance predictors first
  step_interact(~ all_predictors():all_predictors()) %>%
  step_zv(all_predictors()) %>%  # Remove zero variance interactions
  step_normalize(all_predictors())

Create the Workflow

logit_wfl <- 
  workflow() %>% 
  add_recipe(covid_rec) %>% 
  add_model(logit_model)

Train and Tune the Model

roc_only <- metric_set(roc_auc)

# Create a custom penalty grid with more appropriate values
penalty_grid <- grid_regular(
  penalty(range = c(-5, 0), trans = log10_trans()),
  levels = 20
)

# Modify your tuning code
logit_result <- 
  logit_wfl %>% 
  tune_grid(
    resamples = covid_folds,
    grid = penalty_grid,
    control = control_grid(save_pred = TRUE),
    metrics = roc_only
  )
## Warning: package 'glmnet' was built under R version 4.4.3

Plot Cross-validation results

logit_result %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() + 
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number()) +
  theme_minimal()

show best results

logit_result %>% 
  show_best(metric = "roc_auc", n = 10)
## # A tibble: 10 x 7
##      penalty .metric .estimator  mean     n std_err .config              
##        <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1 0.00234   roc_auc binary     0.834     5 0.00397 Preprocessor1_Model10
##  2 0.000379  roc_auc binary     0.834     5 0.00398 Preprocessor1_Model07
##  3 0.000695  roc_auc binary     0.834     5 0.00397 Preprocessor1_Model08
##  4 0.000207  roc_auc binary     0.834     5 0.00399 Preprocessor1_Model06
##  5 0.00001   roc_auc binary     0.834     5 0.00399 Preprocessor1_Model01
##  6 0.0000183 roc_auc binary     0.834     5 0.00399 Preprocessor1_Model02
##  7 0.0000336 roc_auc binary     0.834     5 0.00399 Preprocessor1_Model03
##  8 0.0000616 roc_auc binary     0.834     5 0.00399 Preprocessor1_Model04
##  9 0.000113  roc_auc binary     0.834     5 0.00399 Preprocessor1_Model05
## 10 0.00127   roc_auc binary     0.834     5 0.00398 Preprocessor1_Model09

Set best lambda

Two options: lambda that minimizes roc_auc, and the 1 standard error rule of thumb:

lambda_min <- logit_result %>% 
  select_best(
    metric = "roc_auc"
  )

lambda_1se <- logit_result %>% 
  select_by_one_std_err(
    metric = "roc_auc",
    desc(penalty)
  ) %>% 
  select(penalty)

Last fit

Final workflow

logit_wfl_final <- 
  logit_wfl %>%
  finalize_workflow(lambda_1se)

Last fit on the test set

logit_last_fit <- 
  logit_wfl_final %>% 
  last_fit(covid_split)

ROC AUC on the train and test sets

logit_wfl_final %>%
  fit(data = covid_train) %>% 
  predict(new_data = covid_train, type = "prob") %>% 
  bind_cols(covid_train %>% select(corona_result)) %>%  # Add truth column
  roc_curve(truth = corona_result, .pred_negative) %>%  # Remove 'estimate ='
  autoplot() +
  labs(title = "Training set AUC")

logit_last_fit %>% 
  collect_predictions() %>% 
  roc_curve(truth = corona_result, .pred_negative) %>% 
  autoplot() +
  labs(title = "Test set AUC")

logit_last_fit %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc")
## # A tibble: 1 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 roc_auc binary         0.828 Preprocessor1_Model1

Variable importance

logit_last_fit %>% 
  pluck(".workflow", 1) %>%   
  extract_fit_parsnip() %>%  # Replace pull_workflow_fit() with this
  vip(num_features = 10)