Load packages

Load packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  tidymodels,  # for data modeling
  vip,         # for variable importance
  here,        # for referencing files and folders
  readxl       # for reading xlsx files
)
## package 'vip' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\research\AppData\Local\Temp\RtmpCAU9QQ\downloaded_packages

and set seed for replication

set.seed(1203)

Read data

covid_raw <- 
  here("06-classification/data", "corona_tested_individuals_ver_002.xlsx") %>% 
  read_xlsx()

Initial preprocessing

Replace all NULL values with NA’s

is.na(covid_raw[,-1]) <- covid_raw[,-1] == "NULL"
covid <- covid_raw %>%
  select(-test_date, -test_indication) %>% 
  filter(corona_result != "אחר") %>% 
  drop_na() %>%
  mutate(
    corona_result = if_else(corona_result == "שלילי", "negative", "positive"),
    gender = if_else(gender == "זכר", "male", "female")
  ) %>% 
  mutate_all(as_factor)

Save processed data

write.csv(
  covid,
  file = here("06-classification/data","covid_proc.csv"),
  row.names = FALSE
  )

Split

Train-test split

covid_split <- covid       %>% initial_split(prop = 0.5)
covid_train <- covid_split %>% training()
covid_test  <- covid_split %>% testing()

Cross-validation splits

covid_folds <- covid_train %>% vfold_cv(v = 5, strata = corona_result)

Build a Model

logit_model <- 
  logistic_reg() %>%
  set_engine("glmnet") %>% 
  set_mode("classification") %>%
  set_args(penalty = tune(), mixture = 1)

Create the Recipe

covid_rec <- 
  recipe(corona_result ~ ., data = covid_train) %>% 
  step_dummy(all_nominal(), -corona_result, one_hot = TRUE) %>% 
  step_interact(~ all_predictors():all_predictors()) %>%
  step_normalize(all_predictors()) %>% 
  step_zv(all_predictors()) 

Create the Workflow

logit_wfl <- 
  workflow() %>% 
  add_recipe(covid_rec) %>% 
  add_model(logit_model)

Train and Tune the Model

roc_only <- metric_set(roc_auc)

logit_result <- 
  logit_wfl %>% 
  tune_grid(
    resamples = covid_folds,
    control = control_grid(save_pred = TRUE),
    metrics = roc_only
  )

Plot Cross-validation results

logit_result %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() + 
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())

show best results

logit_result %>% 
  show_best(metric = "roc_auc", n = 10)
## # A tibble: 10 x 6
##     penalty .metric .estimator  mean     n std_err
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
##  1 3.56e-10 roc_auc binary     0.828     5 0.00483
##  2 1.83e- 9 roc_auc binary     0.828     5 0.00483
##  3 1.40e- 8 roc_auc binary     0.828     5 0.00483
##  4 1.11e- 7 roc_auc binary     0.828     5 0.00483
##  5 1.65e- 6 roc_auc binary     0.828     5 0.00483
##  6 1.39e- 5 roc_auc binary     0.828     5 0.00483
##  7 1.01e- 4 roc_auc binary     0.828     5 0.00483
##  8 4.91e- 3 roc_auc binary     0.827     5 0.00425
##  9 1.42e- 2 roc_auc binary     0.818     5 0.00288
## 10 7.82e- 1 roc_auc binary     0.5       5 0

Set best lambda

Two options: lambda that minimizes roc_auc, and the 1 standard error rule of thumb:

lambda_min <- logit_result %>% 
  select_best(
    metric = "roc_auc"
  )

lambda_1se <- logit_result %>% 
  select_by_one_std_err(
    metric = "roc_auc",
    desc(penalty)
  ) %>% 
  select(penalty)

Last fit

Final workflow

logit_wfl_final <- 
  logit_wfl %>%
  finalize_workflow(lambda_1se)

Last fit on the test set

logit_last_fit <- 
  logit_wfl_final %>% 
  last_fit(covid_split)

ROC AUC on the train and test sets

logit_wfl_final %>%
  fit(data = covid_train) %>% 
  predict(new_data = covid_train, type = "prob") %>% 
  roc_curve(covid_train$corona_result, .pred_negative) %>% 
  autoplot() +
  labs(title = "Training set AUC")

logit_last_fit %>% 
  collect_predictions() %>% 
  roc_curve(corona_result, .pred_negative) %>% 
  autoplot() +
  labs(title = "Test set AUC")

logit_last_fit %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc")
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.831

Variable importance

logit_last_fit %>% 
  pluck(".workflow", 1) %>%   
  pull_workflow_fit() %>% 
  vip(num_features = 10)