# Load packages
pacman::p_load(tidyverse, tidymodels, rpart.plot, janitor,
skimr, magrittr, collapse, here)
# Set random seed
set.seed(42)
# Load the data
data('penguins')EC524W25: Lab 004
Decision Trees
Introduction
The lab document today should be helpful for Project 003. I’ve included some helpful code to get you started.
Though I have left out key pieces to tuning the decision trees. I’ll leave that to you after I go over this basic example.
Setup
First, setup the document, loading in packages, setting a random seed, loading the data, and adjusting column names
Check with skimr
skimr::skim(penguins)Decision Trees
Without tidymodels
0.3/ Hopefully you noticed we have some missing data. Let’s impute. By hand. Since we’re doing it by hand, let’s start simply. For categorical variables, you can use the modal class. For numeric, let’s use the median.
# Impute the median for numeric columns
penguins %<>%
mutate_if(is.numeric, ~replace_na(., median(., na.rm = TRUE))) %>%
mutate_if(is.character, ~replace_na(., mode(., na.rm = TRUE)))👨🍳+💋+🤌
1.2 Try each split of island and then calculate the split’s accuracy and gini.
# Find the islands
islands = penguins$island |> unique()
# Example: Biscoe
lapply(X = islands, FUN = function(i) {
# Split into island and not island
grp1 = penguins |> filter(island == i)
grp2 = penguins |> filter(island != i)
# Find the modal species in each group
species1 = grp1$species |> fmode()
species2 = grp2$species |> fmode()
# Calculate accuracy
acc1 = fmean(grp1$species == species1)
acc2 = fmean(grp2$species == species2)
# Calculate gini
g1 = grp1$species |> table() |> prop.table()
g2 = grp2$species |> table() |> prop.table()
gini1 = sum(g1 * (1 - g1))
gini2 = sum(g2 * (1 - g2))
gini = (gini1 + gini2)/2
df = data.frame(
island = i,
group1 = species1,
group2 = species2,
accuracy = c(acc1, acc2),
gini = gini
)
return(df)
})[[1]]
island group1 group2 accuracy gini
1 Torgersen Adelie Gentoo 1.0000000 0.3240758
2 Torgersen Adelie Gentoo 0.4246575 0.3240758
[[2]]
island group1 group2 accuracy gini
1 Biscoe Gentoo Adelie 0.7380952 0.4303974
2 Biscoe Gentoo Adelie 0.6136364 0.4303974
[[3]]
island group1 group2 accuracy gini
1 Dream Chinstrap Gentoo 0.5483871 0.4936091
2 Dream Chinstrap Gentoo 0.5636364 0.4936091
If instead you would like to visualize each split, use rpart.plot:
# Simple decision tree
tree_complex = decision_tree(
mode = "classification",
cost_complexity = 0.005,
tree_depth = 2,
min_n = 10
) %>% set_engine(
"rpart"
) %>% fit(species ~ island, data = penguins)
# Plot the tree
rpart.plot(
tree_complex$fit,
extra = 104,
box.palette = "YlGnBl",
branch.lty = 4,
shadow.col = "gray",
nn = TRUE,
cex = 1.2
)tidymodels workflow
Recipe
# Reload data
data(penguins)
# Define recipe
basic_recipe =
recipe(species ~ ., data = penguins) |>
step_impute_mode(all_nominal_predictors()) |>
step_impute_median(all_numeric_predictors())
# Prep and bake
basic_prep = basic_recipe |> prep()
basic_bake = basic_prep |> bake(new_data = NULL)Model
# Define our decision tree learner
tree_model = decision_tree(mode = "classification") |>
set_engine("rpart")Workflow
# Define our workflow
workflow = workflow() %>%
add_model(tree_model) %>%
add_recipe(basic_recipe)CV
# Create a 5-fold cross validation strategy
rsmp_cv = penguins %>% vfold_cv(v = 5)To look at the resampling splits across the data
rsmp_cv %>% tidy()Fit
# Fit the model
fit_cv = workflow %>%
fit_resamples(
resamples = rsmp_cv,
metrics = metric_set(roc_auc, accuracy)
)Model evaluation
# Collect metrics
fit_cv %>%
collect_metrics(summarize = FALSE)# A tibble: 10 × 5
id .metric .estimator .estimate .config
<chr> <chr> <chr> <dbl> <chr>
1 Fold1 accuracy multiclass 0.928 Preprocessor1_Model1
2 Fold1 roc_auc hand_till 0.935 Preprocessor1_Model1
3 Fold2 accuracy multiclass 0.957 Preprocessor1_Model1
4 Fold2 roc_auc hand_till 0.970 Preprocessor1_Model1
5 Fold3 accuracy multiclass 0.928 Preprocessor1_Model1
6 Fold3 roc_auc hand_till 0.954 Preprocessor1_Model1
7 Fold4 accuracy multiclass 0.928 Preprocessor1_Model1
8 Fold4 roc_auc hand_till 0.922 Preprocessor1_Model1
9 Fold5 accuracy multiclass 0.971 Preprocessor1_Model1
10 Fold5 roc_auc hand_till 0.982 Preprocessor1_Model1
Prediction
Once you have add some complexity to the tuning process, you will have some variation across your models (i.e. get to tuning).
# Finalize workflow
# best_model = workflow %>%
# finalize_workflow(select_best(metric = "accuracy"))# Fit the final workflow
# final_fit = best_model %>% fit(penguins)
# Examine the final fit
# final_fit