knitr::opts_chunk$set(warning = FALSE) ## Limit warnings
knitr::opts_chunk$set(message = FALSE) ## Limit messages
knitr::opts_chunk$set(echo    = TRUE)  ## Do not limit echo
options(htmltools.dir.version = FALSE)
# setup; load packages, data; set seed
options(scipen = 999)
library(pacman)
p_load(tidyverse, modeldata, skimr, janitor, tidymodels, magrittr, data.table, here)
set.seed(12345)
source("~/Documents/scripts/colors/colors.R")

We have spent a lot of time over the last few weeks learning tidymodels. But there is one thing we only glossed over that is very important for your final projects in this class.

Once we cross-validate, fit a model within a workflow, and pick the best parameters, how do we use that model to predict onto our testing data? This set of notes walks through that last step, mimicking project000 using housing data.

Setup

Let’s load the housing data (training and testing) from Ames, Iowa. Recall that sale prices are only reported in the training dataset.

# Load data ------------------------------------------------------------------------------
# Training data
train_dt = here('lab/001-cleaning/data', 'train.csv') %>% 
  fread() %>% 
  janitor::clean_names() %>% 
  mutate(sqft = x1st_flr_sf + x2nd_flr_sf)

# Testing data
test_dt = here('lab/001-cleaning/data', 'test.csv') %>% 
  fread() %>% 
  janitor::clean_names() %>% 
  mutate(sqft = x1st_flr_sf + x2nd_flr_sf)
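
Since sale prices only appear in the training data, a quick sanity check is to compare column names across the two datasets (sale_price should be the only difference):

# Columns present in the training data but not in the testing data
setdiff(names(train_dt), names(test_dt))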

It is always a good idea to skim the data to check for missing values and other summary information. Here I avoided using any variables with missing values as predictors.

train_dt %>% skimr::skim()
Data summary
Name Piped data
Number of rows 1460
Number of columns 82
Key NULL
_______________________
Column type frequency:
character 43
numeric 39
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ms_zoning 0 1.00 2 7 0 5 0
street 0 1.00 4 4 0 2 0
alley 1369 0.06 4 4 0 2 0
lot_shape 0 1.00 3 3 0 4 0
land_contour 0 1.00 3 3 0 4 0
utilities 0 1.00 6 6 0 2 0
lot_config 0 1.00 3 7 0 5 0
land_slope 0 1.00 3 3 0 3 0
neighborhood 0 1.00 5 7 0 25 0
condition1 0 1.00 4 6 0 9 0
condition2 0 1.00 4 6 0 8 0
bldg_type 0 1.00 4 6 0 5 0
house_style 0 1.00 4 6 0 8 0
roof_style 0 1.00 3 7 0 6 0
roof_matl 0 1.00 4 7 0 8 0
exterior1st 0 1.00 5 7 0 15 0
exterior2nd 0 1.00 5 7 0 16 0
mas_vnr_type 8 0.99 4 7 0 4 0
exter_qual 0 1.00 2 2 0 4 0
exter_cond 0 1.00 2 2 0 5 0
foundation 0 1.00 4 6 0 6 0
bsmt_qual 37 0.97 2 2 0 4 0
bsmt_cond 37 0.97 2 2 0 4 0
bsmt_exposure 38 0.97 2 2 0 4 0
bsmt_fin_type1 37 0.97 3 3 0 6 0
bsmt_fin_type2 38 0.97 3 3 0 6 0
heating 0 1.00 4 5 0 6 0
heating_qc 0 1.00 2 2 0 5 0
central_air 0 1.00 1 1 0 2 0
electrical 1 1.00 3 5 0 5 0
kitchen_qual 0 1.00 2 2 0 4 0
functional 0 1.00 3 4 0 7 0
fireplace_qu 690 0.53 2 2 0 5 0
garage_type 81 0.94 6 7 0 6 0
garage_finish 81 0.94 3 3 0 3 0
garage_qual 81 0.94 2 2 0 5 0
garage_cond 81 0.94 2 2 0 5 0
paved_drive 0 1.00 1 1 0 3 0
pool_qc 1453 0.00 2 2 0 3 0
fence 1179 0.19 4 5 0 4 0
misc_feature 1406 0.04 4 4 0 4 0
sale_type 0 1.00 2 5 0 9 0
sale_condition 0 1.00 6 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460 ▇▇▇▇▇
ms_sub_class 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190 ▇▅▂▁▁
lot_frontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313 ▇▃▁▁▁
lot_area 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245 ▇▁▁▁▁
overall_qual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10 ▁▂▇▅▁
overall_cond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
year_built 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010 ▁▂▃▆▇
year_remod_add 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010 ▅▂▂▃▇
mas_vnr_area 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600 ▇▁▁▁▁
bsmt_fin_sf1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644 ▇▁▁▁▁
bsmt_fin_sf2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474 ▇▁▁▁▁
bsmt_unf_sf 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336 ▇▅▂▁▁
total_bsmt_sf 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110 ▇▃▁▁▁
x1st_flr_sf 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692 ▇▅▁▁▁
x2nd_flr_sf 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065 ▇▃▂▁▁
low_qual_fin_sf 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572 ▇▁▁▁▁
gr_liv_area 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642 ▇▇▁▁▁
bsmt_full_bath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3 ▇▆▁▁▁
bsmt_half_bath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2 ▇▁▁▁▁
full_bath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3 ▁▇▁▇▁
half_bath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
bedroom_abv_gr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8 ▁▇▂▁▁
kitchen_abv_gr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3 ▁▇▁▁▁
tot_rms_abv_grd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14 ▂▇▇▁▁
fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3 ▇▇▁▁▁
garage_yr_blt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010 ▁▁▅▅▇
garage_cars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4 ▁▃▇▂▁
garage_area 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418 ▂▇▃▁▁
wood_deck_sf 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857 ▇▂▁▁▁
open_porch_sf 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547 ▇▁▁▁▁
enclosed_porch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552 ▇▁▁▁▁
x3ssn_porch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508 ▇▁▁▁▁
screen_porch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480 ▇▁▁▁▁
pool_area 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738 ▇▁▁▁▁
misc_val 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500 ▇▁▁▁▁
mo_sold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12 ▃▆▇▃▃
yr_sold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▅
sale_price 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000 ▇▅▁▁▁
sqft 0 1.00 1509.62 521.16 334 1123.75 1458.0 1775.25 5642 ▇▇▁▁▁
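
If you only want the columns with missingness (rather than the full skim), a small dplyr/tidyr sketch like this pulls them out; it is just one of many ways to do it:

# Count NAs per column, then keep only the columns with missing values
train_dt %>% 
  summarise(across(everything(), ~ sum(is.na(.x)))) %>% 
  pivot_longer(everything(), names_to = 'variable', values_to = 'n_missing') %>% 
  filter(n_missing > 0) %>% 
  arrange(desc(n_missing))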

Set up our cross-validation splits

# 5-fold CV on the training dataset 
train_cv = train_dt %>% vfold_cv(v = 5)
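
Each fold uses 1,168 of the 1,460 training rows as the analysis set and holds out the remaining 292 as the assessment set. Printing the resamples object confirms the split sizes:

# Inspect the resamples: five folds, each split [1168/292]
train_cv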

tidymodeling

recipe

I create a very simple recipe here with little forethought; I am not trying very hard with the model formula. But I could easily change my predictors by altering the recipe.

recipe = recipe(sale_price ~ id + 
                             yr_sold + full_bath + year_built + lot_area + sqft +
                             central_air + half_bath +
                             pool_area + overall_qual + overall_cond,
                        data = train_dt) %>%
  # Keep id in the data for bookkeeping, but drop it as a predictor
  update_role(id, new_role = "id var") %>% 
  # Center and scale every numeric predictor
  step_normalize(all_predictors() & all_numeric())

recipe
## Recipe
## 
## Inputs:
## 
##       role #variables
##     id var          1
##    outcome          1
##  predictor         10
## 
## Operations:
## 
## Centering and scaling for all_predictors() & all_numeric()
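
To see exactly what the recipe will hand the model, you can prep() it (which estimates the centering and scaling parameters from the training data) and then bake() it; passing new_data = NULL returns the processed training set:

# Estimate the normalization parameters, then apply them to the training data
recipe %>% prep() %>% bake(new_data = NULL) %>% head()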

model

I’m going to give a random forest a shot for this prediction. Using 5-fold CV, I want to tune three parameters in my random forest:

  • min_n: minimum number of data points in a node that are required for the node to be split further
  • mtry: number of predictors that will be randomly sampled at each split
  • trees: number of trees contained in the ensemble

model = 
  rand_forest(
    mode = 'regression',
    engine = 'ranger',
    mtry = tune(),
    trees = tune(),
    min_n = tune()
  )

workflow

Set up my workflow with my model and recipe

wf = 
  workflow() %>%
  add_model(model) %>%
  add_recipe(recipe)

fit

Fit my model, tuning my parameters across several values. The measure of error I care about is the RMSE.

fit = wf %>% 
  tune_grid(
    train_cv,
    grid = expand_grid(mtry = c(1, 2, 3, 5, 10),
                       min_n = c(1, 3, 5, 10, 25),
                       trees = c(50, 100, 150, 300)),
    metrics = metric_set(rmse)
  )

After tuning, I plot the RMSE to check and see how well my model did for each combination of parameter values.

fit %>% 
  collect_metrics(summarize = TRUE) %>% 
  ggplot(aes(x = min_n, y = mean, group = factor(mtry), color = factor(mtry))) +
  geom_line(size = 0.7, alpha = 0.6) +
  geom_point(size = 2.5) +
  facet_wrap(~trees, nrow = 1) +
  labs(
    title = "Random forest model performance",
    subtitle = "Predicting housing prices",
    caption = "",
    x = "Min data points per node",
    y = 'RMSE',
    color = 'Number of predictors'
  ) +
  hrbrthemes::theme_ipsum() +
  scale_color_manual(values = c(grant_pal_1, grant_pal_3, grant_pal_5, 
                                grant_pal_6, grant_pal_9)) +
  scale_x_continuous(breaks = c(1, 3, 5, 10, 25)) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = 'bottom'
  )
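
The plot gives the full picture, but tune can also report the top parameter combinations directly, sorted by mean cross-validated RMSE:

# Top five parameter combinations by mean CV RMSE
fit %>% show_best(metric = "rmse", n = 5)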

finalize_workflow

Now I want to finalize my workflow and select my best model parameters. This will output the model I want to use to predict housing prices in the testing data.

final_rf = 
  wf %>% 
  finalize_workflow(select_best(fit, metric = "rmse"))
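
If you want to confirm that the tune() placeholders have been replaced with concrete values, you can pull the model specification back out of the finalized workflow (extract_spec_parsnip() requires a reasonably recent version of workflows):

# The finalized model spec: mtry, trees, and min_n are now fixed numbers
final_rf %>% extract_spec_parsnip()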

The following code chunk actually fits that model on the full training data.

# Fitting our final workflow
final = final_rf %>% fit(data = train_dt)
# Examine the final workflow
final
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~3,      x), num.trees = ~50, min.node.size = min_rows(~1, x), num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 
## 
## Type:                             Regression 
## Number of trees:                  50 
## Sample size:                      1460 
## Number of independent variables:  10 
## Mtry:                             3 
## Target node size:                 1 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       1123092792 
## R squared (OOB):                  0.8220452
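
Note that ranger reports its out-of-bag prediction error as an MSE, while we tuned on RMSE. Taking the square root puts the error back on the dollar scale of sale_price:

# Convert ranger's OOB MSE to an RMSE (in dollars)
sqrt(1123092792)
## [1] 33512.58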

predict

Using the finalized workflow, I now predict housing prices in the testing dataset with the familiar predict function. The result is a vector of \(\hat{y}\)s: the sale price my model predicts for each house.

# Predict onto the test data
yhat = final %>% predict(new_data = test_dt)
# Check out predictions
yhat %>% head()
## # A tibble: 6 × 1
##     .pred
##     <dbl>
## 1 127339.
## 2 151166.
## 3 185422.
## 4 191608.
## 5 201316.
## 6 190181.
# Add the predictions as a column (yhat is a one-column tibble, so pull out .pred)
test_dt$yhat = yhat$.pred
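
Finally, if your project expects a submission file (as Kaggle-style housing competitions typically do), a sketch along these lines would write one out. The file name and column names here are assumptions; match them to your project's template.

# Hypothetical submission file: house id plus predicted sale price
# (file and column names are assumptions -- check your project's template)
submission = test_dt %>% transmute(id, sale_price = yhat)
fwrite(submission, here('submission.csv'))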