EC524W25: Lab 004
Decision Trees
Introduction
The lab document today should be helpful for Project 003. I've included some helpful code to get you started, though I have left out key pieces of tuning the decision trees. I'll leave that to you after I go over this basic example.
Setup
First, set up the document: load the packages, set a random seed, load the data, and adjust column names.
```r
# Load packages
pacman::p_load(tidyverse, tidymodels, rpart.plot, janitor,
               skimr, magrittr, collapse, here)
# Set random seed
set.seed(42)
# Load the data
data('penguins')
```
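The penguins data already has tidy, snake_case column names, but if your project data does not, janitor's clean_names() (loaded above) is one way to standardize them. A quick sketch:
```r
# Standardize column names (a no-op for penguins, whose names are already clean)
penguins = penguins |> clean_names()
```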
Check with skimr
```r
skimr::skim(penguins)
```
Decision Trees
Without tidymodels
Hopefully you noticed we have some missing data. Let's impute. By hand. Since we're doing it by hand, let's start simply. For categorical variables, you can use the modal class. For numeric variables, let's use the median.
```r
# Impute the median for numeric columns and the modal class for factor columns
penguins %<>%
  mutate_if(is.numeric, ~replace_na(., median(., na.rm = TRUE))) %>%
  mutate_if(is.factor, ~replace_na(., fmode(., na.rm = TRUE)))
```
👨🍳+💋+🤌
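If you want to double-check that the imputation worked, you can count the remaining missing values (just a sanity check; not part of the original lab code):
```r
# Remaining NAs per column (should all be zero after imputing)
penguins |> is.na() |> colSums()
```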
1.2 Try each split of island and then calculate the split’s accuracy and gini.
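As a quick reminder (my notation, not from the lab): if a group has class shares $p_k$, its Gini impurity is

$$ G = \sum_k p_k \,(1 - p_k) = 1 - \sum_k p_k^2, $$

and the code below scores a split by taking the simple (unweighted) average of the two groups' impurities.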
```r
# Find the islands
islands = penguins$island |> unique()
# Example: Biscoe
lapply(X = islands, FUN = function(i) {
  # Split into island and not island
  grp1 = penguins |> filter(island == i)
  grp2 = penguins |> filter(island != i)
  # Find the modal species in each group
  species1 = grp1$species |> fmode()
  species2 = grp2$species |> fmode()
  # Calculate accuracy
  acc1 = fmean(grp1$species == species1)
  acc2 = fmean(grp2$species == species2)
  # Calculate gini
  g1 = grp1$species |> table() |> prop.table()
  g2 = grp2$species |> table() |> prop.table()
  gini1 = sum(g1 * (1 - g1))
  gini2 = sum(g2 * (1 - g2))
  gini = (gini1 + gini2) / 2
  # Collect the results
  df = data.frame(
    island = i,
    group1 = species1,
    group2 = species2,
    accuracy = c(acc1, acc2),
    gini = gini
  )
  return(df)
})
```
```
[[1]]
     island group1 group2  accuracy      gini
1 Torgersen Adelie Gentoo 1.0000000 0.3240758
2 Torgersen Adelie Gentoo 0.4246575 0.3240758

[[2]]
  island group1 group2  accuracy      gini
1 Biscoe Gentoo Adelie 0.7380952 0.4303974
2 Biscoe Gentoo Adelie 0.6136364 0.4303974

[[3]]
  island    group1 group2  accuracy      gini
1  Dream Chinstrap Gentoo 0.5483871 0.4936091
2  Dream Chinstrap Gentoo 0.5636364 0.4936091
```
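If you would rather scan all of the candidate splits in one table, you can stack the list and sort by impurity. A small sketch, assuming you saved the lapply() output above to an object named split_list (my name, not the lab's):
```r
# Stack the per-island results and rank splits by average Gini (lower = better)
split_list |>
  bind_rows() |>
  arrange(gini)
```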
If instead you would like to visualize each split, use rpart.plot:
```r
# Simple decision tree
tree_complex = decision_tree(
  mode = "classification",
  cost_complexity = 0.005,
  tree_depth = 2,
  min_n = 10
) %>% set_engine(
  "rpart"
) %>% fit(species ~ island, data = penguins)
# Plot the tree
rpart.plot(
  tree_complex$fit,
  extra = 104,
  box.palette = "YlGnBl",
  branch.lty = 4,
  shadow.col = "gray",
  nn = TRUE,
  cex = 1.2
)
```
tidymodels workflow
Recipe
```r
# Reload data
data(penguins)
# Define recipe
basic_recipe =
  recipe(species ~ ., data = penguins) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_impute_median(all_numeric_predictors())
# Prep and bake
basic_prep = basic_recipe |> prep()
basic_bake = basic_prep |> bake(new_data = NULL)
```
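If you want to confirm the recipe behaves as expected, peek at the baked data (a check I'm adding; it isn't shown in the original output):
```r
# The baked data should have no missing values left
basic_bake |> skim()
```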
Model
```r
# Define our decision tree learner
tree_model = decision_tree(mode = "classification") |>
  set_engine("rpart")
```
Workflow
```r
# Define our workflow
workflow = workflow() %>%
  add_model(tree_model) %>%
  add_recipe(basic_recipe)
```
CV
```r
# Create a 5-fold cross-validation strategy
rsmp_cv = penguins %>% vfold_cv(v = 5)
```
To look at the resampling splits across the data:
```r
rsmp_cv %>% tidy()
```
Fit
```r
# Fit the model
fit_cv = workflow %>%
  fit_resamples(
    resamples = rsmp_cv,
    metrics = metric_set(roc_auc, accuracy)
  )
```
Model evaluation
```r
# Collect metrics
fit_cv %>%
  collect_metrics(summarize = FALSE)
```
```
# A tibble: 10 × 5
   id    .metric  .estimator .estimate .config
   <chr> <chr>    <chr>          <dbl> <chr>
 1 Fold1 accuracy multiclass     0.928 Preprocessor1_Model1
 2 Fold1 roc_auc  hand_till      0.935 Preprocessor1_Model1
 3 Fold2 accuracy multiclass     0.957 Preprocessor1_Model1
 4 Fold2 roc_auc  hand_till      0.970 Preprocessor1_Model1
 5 Fold3 accuracy multiclass     0.928 Preprocessor1_Model1
 6 Fold3 roc_auc  hand_till      0.954 Preprocessor1_Model1
 7 Fold4 accuracy multiclass     0.928 Preprocessor1_Model1
 8 Fold4 roc_auc  hand_till      0.922 Preprocessor1_Model1
 9 Fold5 accuracy multiclass     0.971 Preprocessor1_Model1
10 Fold5 roc_auc  hand_till      0.982 Preprocessor1_Model1
```
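To average each metric across the five folds, call collect_metrics() with its default summarize = TRUE:
```r
# Fold-averaged accuracy and ROC AUC
fit_cv %>% collect_metrics()
```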
Prediction
Once you add some complexity to the tuning process, you will have some variation across your models (i.e., get to tuning).
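For example, here is one way you might set up that tuning step. This is a sketch of my own (the parameters marked for tuning, the grid size, and the names tree_tune and tune_cv are my choices, not the lab's):
```r
# Mark hyperparameters for tuning
tree_tune = decision_tree(
  mode = "classification",
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) |> set_engine("rpart")
# Swap the tuned model spec into the workflow and tune over a grid
tune_cv = workflow |>
  update_model(tree_tune) |>
  tune_grid(
    resamples = rsmp_cv,
    grid = 20,
    metrics = metric_set(roc_auc, accuracy)
  )
```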
```r
# Finalize workflow (pass your tuned results to select_best())
# best_model = workflow %>%
#   finalize_workflow(select_best(metric = "accuracy"))
# Fit the final workflow
# final_fit = best_model %>% fit(penguins)
# Examine the final fit
# final_fit
```
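Once final_fit exists, prediction works like it does for any fitted workflow (a sketch, assuming you have uncommented and run the code above):
```r
# Class predictions and class probabilities from the final model
# predict(final_fit, new_data = penguins)
# predict(final_fit, new_data = penguins, type = "prob")
```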