# Guide to project 001 ----

# Setting up ----

# Load the packages: pacman's p_load() installs-if-missing and attaches each one
library(pacman)
p_load(tidyverse, caret, tidymodels, magrittr, janitor, here)

# Load the data.
# Load data (paths resolved from the project root via here())
train_df =
  here('projects', 'project-001', 'data', 'train.csv') |>
  read_csv()
# read_csv reported: 1460 rows x 81 columns (43 chr, 38 dbl)
test_df =
  here('projects', 'project-001', 'data', 'test.csv') |>
  read_csv()
# read_csv reported: 1459 rows x 80 columns — the test set has no SalePrice
I’m going to clean the names and then redefine sale price as its log (since the competition focuses on the logged price).
# Clean up names (janitor::clean_names() -> snake_case columns)
train_df %<>% clean_names()
test_df %<>% clean_names()
# Replace sale_price with its log (the competition scores on the logged price)
train_df %<>% mutate(sale_price = log(sale_price))
# Add empty sale_price to test_df so train and test share the same columns
test_df$sale_price = NA

# 04–05: Linear regression ----
Start with one simple model…
# Define a linear regression model (plain OLS via the 'lm' engine)
model_reg =
  linear_reg() |>
  set_mode('regression') |>
  set_engine('lm')
# Define a simple recipe: two numeric predictors only
recipe01 =
  recipe(sale_price ~ lot_area + gr_liv_area, data = train_df) |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Dummies
  step_dummy(all_nominal_predictors()) |>
  # Drop near-zero-variance variables and linear combinations
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors())
# Define the workflow (bundles model + preprocessing)
wf_reg01 =
  workflow() |>
  add_model(model_reg) |>
  add_recipe(recipe01)

# Add a few more variables…
# Update the recipe to add a few variables (year_built, ms_zoning)
recipe02 =
  recipe(sale_price ~ lot_area + gr_liv_area + year_built + ms_zoning, data = train_df) |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Dummies
  step_dummy(all_nominal_predictors()) |>
  # Drop near-zero-variance variables and linear combinations
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors())
# Define the workflow
wf_reg02 =
  workflow() |>
  add_model(model_reg) |>
  add_recipe(recipe02)

# Set up the cross validation splits and then fit.
# Set seed (for repeatability of the CV split)
set.seed(12345)
# 5-fold CV on the training dataset
# NOTE(review): the name 'credit_cv' looks copied from another project — this
# is housing data. Kept as-is because later code references it.
credit_cv = train_df |> vfold_cv(v = 5)
# Fit both workflows on the same resamples so RMSEs are comparable
fit_reg01 =
  wf_reg01 |>
  fit_resamples(resamples = credit_cv, metrics = metric_set(rmse))
fit_reg02 =
  wf_reg02 |>
  fit_resamples(resamples = credit_cv, metrics = metric_set(rmse))

# Finally, we can compare the two models' performances.
# Summarize CV results (mean RMSE across the 5 folds)
fit_reg01 |> collect_metrics(summarize = TRUE)
# reported: rmse mean 0.285 (std_err 0.00818)
fit_reg02 |> collect_metrics(summarize = TRUE)
# reported: rmse mean 0.215 (std_err 0.0103) — adding year_built + ms_zoning helps
06: Stepwise selection
For this step, I’ll clean the individual datasets with a recipe.
We should add this recipe to a workflow that is part of our cross validation (as I did above and will do in other problems). I’m cutting a corner here to keep things simple with caret.
I’ll also prep() the recipe.
# Full-feature recipe: every column except 'id' is a predictor
a_recipe =
  recipe(sale_price ~ ., data = train_df) |>
  # Define the role of 'id' so it is carried along but not used as a predictor
  update_role(id, new_role = 'ID') |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Dummies
  step_dummy(all_nominal_predictors()) |>
  # Drop near-zero-variance variables and linear combinations
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors())
# Prep it (estimate the preprocessing on the training data)
prepped_recipe = a_recipe |> prep()
# Clean the training data
train_clean = prepped_recipe |> bake(new_data = NULL)
# Clean the testing data
test_clean = prepped_recipe |> bake(new_data = test_df)
# Check it out: names match (should print character(0))
setdiff(names(train_clean), names(test_clean))
# reported: character(0)
Time for the stepwise (forward) model selection…
There are a few options here. We can use the y and x approach, or we can input the form (formula) and data.
Passing y and x:
# Set a seed
set.seed(42)
# Train! Forward stepwise selection (leaps), tuning nvmax (the max number of
# predictors) over every possible size; -2 excludes sale_price and id.
train_fwd_1 = caret::train(
  y = train_clean$sale_price,
  x = train_clean |> select(-sale_price, -id),
  trControl = trainControl(method = 'cv', number = 5),
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 1:(ncol(train_clean) - 2))
)

# Passing form and data:
# Set a seed (same seed as above so the CV folds match train_fwd_1)
set.seed(42)
# Train! Same model as train_fwd_1, but via the formula interface
train_fwd_2 = caret::train(
  form = sale_price ~ .,
  data = train_clean |> select(-id),
  trControl = trainControl(method = 'cv', number = 5),
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 1:(ncol(train_clean) - 2))
)

# There are a few ways to view things about the final model chosen by caret::train().
- You can pass the final object (e.g., train_fwd_1) to the summary() function, which will produce a lot of somewhat helpful output.
- You can also grab the finalModel from your trained object and then pass it to other functions like coef() to see the coefficients. You will also need to pass coef() a number referencing which stepwise model you want to examine. The index of the "best" model is stored in $bestTune (as a data.frame, so we need to unlist() it).
Example of option 2:
# First show the results of training/tuning the model
# (printing the caret object lists RMSE/Rsquared/MAE for every nvmax)
train_fwd_1
# Output follows: "Linear Regression with Forward Selection"
1460 samples
101 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 1168, 1168, 1167, 1169, 1168
Resampling results across tuning parameters:
nvmax RMSE Rsquared MAE
1 0.2303056 0.6678791 0.17379110
2 0.2039121 0.7395022 0.14791497
3 0.1867088 0.7819462 0.13256496
4 0.1816407 0.7954345 0.12319391
5 0.1759785 0.8081903 0.11899999
6 0.1728237 0.8152576 0.11313820
7 0.1644675 0.8321740 0.10847265
8 0.1622904 0.8367771 0.10678930
9 0.1585075 0.8440589 0.10395508
10 0.1568374 0.8473310 0.10218056
11 0.1567764 0.8469906 0.10119806
12 0.1554054 0.8495877 0.09974558
13 0.1544955 0.8511324 0.09906164
14 0.1522056 0.8552191 0.09734353
15 0.1511424 0.8568808 0.09641350
16 0.1494450 0.8598634 0.09555140
17 0.1489558 0.8606512 0.09616275
18 0.1487699 0.8611355 0.09613863
19 0.1483046 0.8621789 0.09600034
20 0.1477147 0.8632878 0.09593673
21 0.1467691 0.8648994 0.09548343
22 0.1462668 0.8658834 0.09532404
23 0.1452566 0.8676946 0.09431651
24 0.1440446 0.8697010 0.09374732
25 0.1441062 0.8695358 0.09398880
26 0.1443210 0.8692389 0.09399212
27 0.1442717 0.8692920 0.09426408
28 0.1443767 0.8691793 0.09433800
29 0.1451758 0.8678311 0.09482675
30 0.1452392 0.8677790 0.09463616
31 0.1456767 0.8670319 0.09506984
32 0.1457791 0.8669175 0.09525017
33 0.1461457 0.8662883 0.09537753
34 0.1464655 0.8657473 0.09552602
35 0.1464451 0.8656924 0.09544752
36 0.1467279 0.8652085 0.09596666
37 0.1467671 0.8651091 0.09605969
38 0.1468165 0.8650210 0.09599616
39 0.1469550 0.8647360 0.09608133
40 0.1468436 0.8649868 0.09587510
41 0.1472565 0.8642733 0.09619872
42 0.1476208 0.8637120 0.09662133
43 0.1472988 0.8642650 0.09637158
44 0.1474756 0.8640580 0.09651845
45 0.1474814 0.8641426 0.09643571
46 0.1474250 0.8643272 0.09625518
47 0.1474797 0.8642704 0.09623150
48 0.1472858 0.8646344 0.09615625
49 0.1473402 0.8644793 0.09621567
50 0.1471398 0.8648705 0.09633019
51 0.1469074 0.8652299 0.09600708
52 0.1467876 0.8654632 0.09604628
53 0.1467649 0.8655775 0.09611062
54 0.1468346 0.8654316 0.09612833
55 0.1469844 0.8652047 0.09626699
56 0.1469700 0.8652620 0.09616605
57 0.1470421 0.8650829 0.09611535
58 0.1472747 0.8646207 0.09604550
59 0.1469948 0.8651207 0.09595270
60 0.1470133 0.8650748 0.09611813
61 0.1470602 0.8649801 0.09604934
62 0.1477381 0.8638507 0.09632105
63 0.1475995 0.8640665 0.09623198
64 0.1475682 0.8641409 0.09612948
65 0.1478297 0.8636932 0.09620861
66 0.1476600 0.8640813 0.09606673
67 0.1476669 0.8640173 0.09588294
68 0.1473666 0.8645745 0.09577823
69 0.1472933 0.8647539 0.09569934
70 0.1471200 0.8650143 0.09568499
71 0.1470058 0.8652015 0.09569388
72 0.1468593 0.8654319 0.09573313
73 0.1469497 0.8652744 0.09578497
74 0.1469891 0.8652391 0.09577171
75 0.1469842 0.8652576 0.09575920
76 0.1466637 0.8658040 0.09561904
77 0.1471473 0.8650243 0.09584149
78 0.1472263 0.8648794 0.09586586
79 0.1472736 0.8647777 0.09593137
80 0.1472785 0.8647242 0.09590033
81 0.1472018 0.8648812 0.09583334
82 0.1472296 0.8648649 0.09582429
83 0.1472851 0.8647777 0.09589175
84 0.1477065 0.8641714 0.09587392
85 0.1476490 0.8642658 0.09577871
86 0.1476068 0.8643412 0.09569602
87 0.1476763 0.8642353 0.09574831
88 0.1477115 0.8641622 0.09584941
89 0.1477394 0.8641085 0.09583006
90 0.1477520 0.8641176 0.09580086
91 0.1477509 0.8640773 0.09576743
92 0.1476105 0.8642912 0.09566417
93 0.1476111 0.8642845 0.09570758
94 0.1475826 0.8643309 0.09569759
95 0.1475790 0.8643140 0.09572361
96 0.1475503 0.8643625 0.09570684
97 0.1475571 0.8643436 0.09574052
98 0.1475644 0.8643350 0.09575601
99 0.1475510 0.8643608 0.09574828
100 0.1475444 0.8643709 0.09574728
101 0.1475516 0.8643616 0.09575194
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was nvmax = 24.
# We can show the first model (intercept and one feature)
train_fwd_1$finalModel |> coef(1)
# reported: (Intercept) 10.584, overall_qual 0.236

# Now show the coefficients of the "best" model.
# bestTune is a 1-row data.frame, so unlist() it to get the bare index coef() needs.
train_fwd_1$finalModel |> coef(train_fwd_1$bestTune |> unlist())
7.603888e+00 -6.119700e-04 7.259903e-02
overall_cond year_built total_bsmt_sf
4.891569e-02 1.385832e-03 3.536602e-05
gr_liv_area bsmt_full_bath full_bath
2.553589e-04 5.921167e-02 3.202031e-02
fireplaces garage_cars ms_zoning_RL
4.295657e-02 7.043345e-02 9.563135e-02
ms_zoning_RM lot_config_CulDSac neighborhood_Edwards
-5.126260e-04 6.670718e-02 -7.444941e-02
neighborhood_NridgHt neighborhood_Somerst condition1_Norm
1.084886e-01 1.250205e-01 7.535564e-02
foundation_PConc bsmt_exposure_Gd bsmt_fin_type1_Unf
5.799953e-02 7.678911e-02 -4.841785e-02
central_air_Y functional_Typ sale_type_New
7.580061e-02 5.230610e-02 1.241948e-01
sale_condition_Normal
6.979126e-02
Note: The str() function can be really helpful in examining the structure of any object—as can the class() function to get a sense of which class you’re working with.
07: Lasso
We can start with the existing recipe, but we also want to standardize the variables. I’m also expanding the numeric features with a second-degree polynomial.
Also, we need to deal with some binary variables that end up with only one value in the splits.
# Update the existing recipe with normalization and polynomial expansion
recipe_lasso =
  recipe(sale_price ~ ., data = train_df) |>
  # Define the role of 'id'
  update_role(id, new_role = 'ID') |>
  # Try to take care of categorical issues
  step_string2factor(all_string_predictors()) |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Add quadratic terms
  step_poly(all_numeric_predictors(), degree = 2) |>
  # Drop zero-variance factors (binary variables that end up with only one
  # value in some CV splits), then make dummies
  step_zv(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  # Normalize variables (the lasso penalty is scale-sensitive)
  step_normalize(all_numeric_predictors()) |>
  # Drop highly correlated, near-zero-variance, and linearly dependent columns
  step_corr(all_numeric_predictors(), threshold = 0.9) |>
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors()) |>
  step_zv(all_numeric_predictors())

# Now we just need a lasso model and workflow…
# The model (make sure to set penalty to tune() and mixture = 1 for pure lasso)
model_lasso =
  linear_reg(penalty = tune(), mixture = 1) |>
  set_mode('regression') |>
  set_engine('glmnet')
# Workflow
wf_lasso =
  workflow() |>
  add_model(model_lasso) |>
  add_recipe(recipe_lasso)

# … and we can cross validate the penalty!
# Fit! Tune the penalty over a 50-value regular grid on the same CV folds
fit_lasso =
  wf_lasso |>
  tune_grid(
    resamples = credit_cv,
    metrics = metric_set(rmse),
    grid = grid_regular(penalty(), levels = 50)
  )
# How did we do? Show the 10 best penalties by CV RMSE
fit_lasso |> show_best(metric = 'rmse', n = 10)
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00222 rmse standard 0.127 5 0.00464 Preprocessor1_Model37
2 0.00139 rmse standard 0.127 5 0.00475 Preprocessor1_Model36
3 0.00356 rmse standard 0.127 5 0.00445 Preprocessor1_Model38
4 0.000869 rmse standard 0.127 5 0.00497 Preprocessor1_Model35
5 0.000543 rmse standard 0.128 5 0.00526 Preprocessor1_Model34
6 0.00569 rmse standard 0.129 5 0.00421 Preprocessor1_Model39
7 0.000339 rmse standard 0.129 5 0.00548 Preprocessor1_Model33
8 0.000212 rmse standard 0.129 5 0.00571 Preprocessor1_Model32
9 0.000133 rmse standard 0.130 5 0.00589 Preprocessor1_Model31
10 0.0000829 rmse standard 0.130 5 0.00601 Preprocessor1_Model30