# Guide to project 001 ----

# Setting up ----

# Load the packages: pacman's p_load() installs-if-missing and attaches each one
library(pacman)
p_load(tidyverse, caret, tidymodels, magrittr, janitor, here)

# Load the data.
# Load data (paths resolved from the project root via here())
train_df =
  here('projects', 'project-001', 'data', 'train.csv') |>
  read_csv()
# read_csv reported: 1460 rows x 81 columns (43 chr, 38 dbl)
test_df =
  here('projects', 'project-001', 'data', 'test.csv') |>
  read_csv()
# read_csv reported: 1459 rows x 80 columns — the test set has no SalePrice
I’m going to clean the names and then redefine sale price as its log (since the competition focuses on the logged price).
# Clean up names (janitor::clean_names() -> snake_case columns)
train_df %<>% clean_names()
test_df %<>% clean_names()
# Replace sale_price with its log (the competition scores on the logged price)
train_df %<>% mutate(sale_price = log(sale_price))
# Add empty sale_price to test_df so train and test share the same columns
test_df$sale_price = NA

# 04–05: Linear regression ----
Start with one simple model…
# Define a linear regression model (plain OLS via the 'lm' engine)
model_reg =
  linear_reg() |>
  set_mode('regression') |>
  set_engine('lm')
# Define a simple recipe: two numeric predictors only
recipe01 =
  recipe(sale_price ~ lot_area + gr_liv_area, data = train_df) |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Dummies
  step_dummy(all_nominal_predictors()) |>
  # Drop near-zero-variance variables and linear combinations
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors())
# Define the workflow (bundles model + preprocessing)
wf_reg01 =
  workflow() |>
  add_model(model_reg) |>
  add_recipe(recipe01)

# Add a few more variables…
# Update the recipe to add a few variables (year_built, ms_zoning)
recipe02 =
  recipe(sale_price ~ lot_area + gr_liv_area + year_built + ms_zoning, data = train_df) |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Dummies
  step_dummy(all_nominal_predictors()) |>
  # Drop near-zero-variance variables and linear combinations
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors())
# Define the workflow
wf_reg02 =
  workflow() |>
  add_model(model_reg) |>
  add_recipe(recipe02)

# Set up the cross validation splits and then fit.
# Set seed (for repeatability of the CV split)
set.seed(12345)
# 5-fold CV on the training dataset
# NOTE(review): the name 'credit_cv' looks copied from another project — this
# is housing data. Kept as-is because later code references it.
credit_cv = train_df |> vfold_cv(v = 5)
# Fit both workflows on the same resamples so RMSEs are comparable
fit_reg01 =
  wf_reg01 |>
  fit_resamples(resamples = credit_cv, metrics = metric_set(rmse))
fit_reg02 =
  wf_reg02 |>
  fit_resamples(resamples = credit_cv, metrics = metric_set(rmse))

# Finally, we can compare the two models' performances.
# Summarize CV results (mean RMSE across the 5 folds)
fit_reg01 |> collect_metrics(summarize = TRUE)
# reported: rmse mean 0.285 (std_err 0.00818)
fit_reg02 |> collect_metrics(summarize = TRUE)
# reported: rmse mean 0.215 (std_err 0.0103) — adding year_built + ms_zoning helps
06: Stepwise selection
For this step, I’ll clean the individual datasets with a recipe.
We should add this recipe to a workflow that is part of our cross validation (as I did above and will do in other problems). I’m cutting a corner here to keep things simple with caret.
I’ll also prep() the recipe.
# Full-feature recipe: every column except 'id' is a predictor
a_recipe =
  recipe(sale_price ~ ., data = train_df) |>
  # Define the role of 'id' so it is carried along but not used as a predictor
  update_role(id, new_role = 'ID') |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Dummies
  step_dummy(all_nominal_predictors()) |>
  # Drop near-zero-variance variables and linear combinations
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors())
# Prep it (estimate the preprocessing on the training data)
prepped_recipe = a_recipe |> prep()
# Clean the training data
train_clean = prepped_recipe |> bake(new_data = NULL)
# Clean the testing data
test_clean = prepped_recipe |> bake(new_data = test_df)
# Check it out: names match (should print character(0))
setdiff(names(train_clean), names(test_clean))
# reported: character(0)
Time for the stepwise (forward) model selection…
There are a few options here. We can use the y and x approach, or we can input the form (formula) and data.
Passing y and x:
# Set a seed
set.seed(42)
# Train! Forward stepwise selection (leaps), tuning nvmax (the max number of
# predictors) over every possible size; -2 excludes sale_price and id.
train_fwd_1 = caret::train(
  y = train_clean$sale_price,
  x = train_clean |> select(-sale_price, -id),
  trControl = trainControl(method = 'cv', number = 5),
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 1:(ncol(train_clean) - 2))
)

# Passing form and data:
# Set a seed (same seed as above so the CV folds match train_fwd_1)
set.seed(42)
# Train! Same model as train_fwd_1, but via the formula interface
train_fwd_2 = caret::train(
  form = sale_price ~ .,
  data = train_clean |> select(-id),
  trControl = trainControl(method = 'cv', number = 5),
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 1:(ncol(train_clean) - 2))
)

# There are a few ways to view things about the final model chosen by caret::train().
- You can pass the final object (e.g., train_fwd_1) to the summary() function, which will produce a lot of somewhat helpful output.
- You can also grab the finalModel from your trained object and then pass it to other functions like coef() to see the coefficients. You will also need to pass coef() a number referencing which stepwise model you want to examine. The index of the "best" model is stored in $bestTune (as a data.frame, so we need to unlist() it).
Example of option 2:
# First show the results of training/tuning the model
# (printing the caret object lists RMSE/Rsquared/MAE for every nvmax)
train_fwd_1
# Output follows: "Linear Regression with Forward Selection"
1460 samples
101 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 1168, 1168, 1167, 1169, 1168
Resampling results across tuning parameters:
nvmax RMSE Rsquared MAE
1 0.2303056 0.6678791 0.17379110
2 0.2039121 0.7395022 0.14791497
3 0.1867088 0.7819462 0.13256496
4 0.1816407 0.7954345 0.12319391
5 0.1759785 0.8081903 0.11899999
6 0.1728237 0.8152576 0.11313820
7 0.1644675 0.8321740 0.10847265
8 0.1622904 0.8367771 0.10678930
9 0.1585075 0.8440589 0.10395508
10 0.1568374 0.8473310 0.10218056
11 0.1567764 0.8469906 0.10119806
12 0.1554054 0.8495877 0.09974558
13 0.1544955 0.8511324 0.09906164
14 0.1522056 0.8552191 0.09734353
15 0.1511424 0.8568808 0.09641350
16 0.1494450 0.8598634 0.09555140
17 0.1489558 0.8606512 0.09616275
18 0.1487699 0.8611355 0.09613863
19 0.1483046 0.8621789 0.09600034
20 0.1477147 0.8632878 0.09593673
21 0.1467691 0.8648994 0.09548343
22 0.1462668 0.8658834 0.09532404
23 0.1452566 0.8676946 0.09431651
24 0.1440446 0.8697010 0.09374732
25 0.1441062 0.8695358 0.09398880
26 0.1443210 0.8692389 0.09399212
27 0.1442717 0.8692920 0.09426408
28 0.1443767 0.8691793 0.09433800
29 0.1451758 0.8678311 0.09482675
30 0.1452392 0.8677790 0.09463616
31 0.1456767 0.8670319 0.09506984
32 0.1457791 0.8669175 0.09525017
33 0.1461457 0.8662883 0.09537753
34 0.1464655 0.8657473 0.09552602
35 0.1464451 0.8656924 0.09544752
36 0.1467279 0.8652085 0.09596666
37 0.1467671 0.8651091 0.09605969
38 0.1468165 0.8650210 0.09599616
39 0.1469550 0.8647360 0.09608133
40 0.1468436 0.8649868 0.09587510
41 0.1472565 0.8642733 0.09619872
42 0.1476208 0.8637120 0.09662133
43 0.1472988 0.8642650 0.09637158
44 0.1474756 0.8640580 0.09651845
45 0.1474814 0.8641426 0.09643571
46 0.1474250 0.8643272 0.09625518
47 0.1474797 0.8642704 0.09623150
48 0.1472858 0.8646344 0.09615625
49 0.1473402 0.8644793 0.09621567
50 0.1471398 0.8648705 0.09633019
51 0.1469074 0.8652299 0.09600708
52 0.1467876 0.8654632 0.09604628
53 0.1467649 0.8655775 0.09611062
54 0.1468346 0.8654316 0.09612833
55 0.1469844 0.8652047 0.09626699
56 0.1469700 0.8652620 0.09616605
57 0.1470421 0.8650829 0.09611535
58 0.1472747 0.8646207 0.09604550
59 0.1469948 0.8651207 0.09595270
60 0.1470133 0.8650748 0.09611813
61 0.1470602 0.8649801 0.09604934
62 0.1477381 0.8638507 0.09632105
63 0.1475995 0.8640665 0.09623198
64 0.1475682 0.8641409 0.09612948
65 0.1478297 0.8636932 0.09620861
66 0.1476600 0.8640813 0.09606673
67 0.1476669 0.8640173 0.09588294
68 0.1473666 0.8645745 0.09577823
69 0.1472933 0.8647539 0.09569934
70 0.1471200 0.8650143 0.09568499
71 0.1470058 0.8652015 0.09569388
72 0.1468593 0.8654319 0.09573313
73 0.1469497 0.8652744 0.09578497
74 0.1469891 0.8652391 0.09577171
75 0.1469842 0.8652576 0.09575920
76 0.1466637 0.8658040 0.09561904
77 0.1471473 0.8650243 0.09584149
78 0.1472263 0.8648794 0.09586586
79 0.1472736 0.8647777 0.09593137
80 0.1472785 0.8647242 0.09590033
81 0.1472018 0.8648812 0.09583334
82 0.1472296 0.8648649 0.09582429
83 0.1472851 0.8647777 0.09589175
84 0.1477065 0.8641714 0.09587392
85 0.1476490 0.8642658 0.09577871
86 0.1476068 0.8643412 0.09569602
87 0.1476763 0.8642353 0.09574831
88 0.1477115 0.8641622 0.09584941
89 0.1477394 0.8641085 0.09583006
90 0.1477520 0.8641176 0.09580086
91 0.1477509 0.8640773 0.09576743
92 0.1476105 0.8642912 0.09566417
93 0.1476111 0.8642845 0.09570758
94 0.1475826 0.8643309 0.09569759
95 0.1475790 0.8643140 0.09572361
96 0.1475503 0.8643625 0.09570684
97 0.1475571 0.8643436 0.09574052
98 0.1475644 0.8643350 0.09575601
99 0.1475510 0.8643608 0.09574828
100 0.1475444 0.8643709 0.09574728
101 0.1475516 0.8643616 0.09575194
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was nvmax = 24.
# We can show the first model (intercept and one feature)
train_fwd_1$finalModel |> coef(1)
# reported: (Intercept) 10.584, overall_qual 0.236

# Now show the coefficients of the "best" model.
# bestTune is a 1-row data.frame, so unlist() it to get the bare index coef() needs.
train_fwd_1$finalModel |> coef(train_fwd_1$bestTune |> unlist())
7.603888e+00 -6.119700e-04 7.259903e-02
overall_cond year_built total_bsmt_sf
4.891569e-02 1.385832e-03 3.536602e-05
gr_liv_area bsmt_full_bath full_bath
2.553589e-04 5.921167e-02 3.202031e-02
fireplaces garage_cars ms_zoning_RL
4.295657e-02 7.043345e-02 9.563135e-02
ms_zoning_RM lot_config_CulDSac neighborhood_Edwards
-5.126260e-04 6.670718e-02 -7.444941e-02
neighborhood_NridgHt neighborhood_Somerst condition1_Norm
1.084886e-01 1.250205e-01 7.535564e-02
foundation_PConc bsmt_exposure_Gd bsmt_fin_type1_Unf
5.799953e-02 7.678911e-02 -4.841785e-02
central_air_Y functional_Typ sale_type_New
7.580061e-02 5.230610e-02 1.241948e-01
sale_condition_Normal
6.979126e-02
Note: The str() function can be really helpful in examining the structure of any object—as can the class() function to get a sense of which class you’re working with.
07: Lasso
We can start with the existing recipe, but we also want to standardize the variables. I’m also expanding the numeric features with a second-degree polynomial.
Also, we need to deal with some binary variables that end up with only one value in the splits.
# Update the existing recipe with normalization and polynomial expansion
recipe_lasso =
  recipe(sale_price ~ ., data = train_df) |>
  # Define the role of 'id'
  update_role(id, new_role = 'ID') |>
  # Try to take care of categorical issues
  step_string2factor(all_string_predictors()) |>
  # Simple imputations
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # Add quadratic terms
  step_poly(all_numeric_predictors(), degree = 2) |>
  # Drop zero-variance factors (binary variables that end up with only one
  # value in some CV splits), then make dummies
  step_zv(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  # Normalize variables (the lasso penalty is scale-sensitive)
  step_normalize(all_numeric_predictors()) |>
  # Drop highly correlated, near-zero-variance, and linearly dependent columns
  step_corr(all_numeric_predictors(), threshold = 0.9) |>
  step_nzv(all_numeric_predictors()) |>
  step_lincomb(all_numeric_predictors()) |>
  step_zv(all_numeric_predictors())

# Now we just need a lasso model and workflow…
# The model (make sure to set penalty to tune() and mixture = 1 for pure lasso)
model_lasso =
  linear_reg(penalty = tune(), mixture = 1) |>
  set_mode('regression') |>
  set_engine('glmnet')
# Workflow
wf_lasso =
  workflow() |>
  add_model(model_lasso) |>
  add_recipe(recipe_lasso)

# … and we can cross validate the penalty!
# Fit! Tune the penalty over a 50-value regular grid on the same CV folds
fit_lasso =
  wf_lasso |>
  tune_grid(
    resamples = credit_cv,
    metrics = metric_set(rmse),
    grid = grid_regular(penalty(), levels = 50)
  )
# How did we do? Show the 10 best penalties by CV RMSE
fit_lasso |> show_best(metric = 'rmse', n = 10)
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00222 rmse standard 0.127 5 0.00464 Preprocessor1_Model37
2 0.00139 rmse standard 0.127 5 0.00475 Preprocessor1_Model36
3 0.00356 rmse standard 0.127 5 0.00445 Preprocessor1_Model38
4 0.000869 rmse standard 0.127 5 0.00497 Preprocessor1_Model35
5 0.000543 rmse standard 0.128 5 0.00526 Preprocessor1_Model34
6 0.00569 rmse standard 0.129 5 0.00421 Preprocessor1_Model39
7 0.000339 rmse standard 0.129 5 0.00548 Preprocessor1_Model33
8 0.000212 rmse standard 0.129 5 0.00571 Preprocessor1_Model32
9 0.000133 rmse standard 0.130 5 0.00589 Preprocessor1_Model31
10 0.0000829 rmse standard 0.130 5 0.00601 Preprocessor1_Model30