This vignette demonstrates how to use the DALEX
package with models created with the xgboost package.
In this example we are going to use the wine
dataset from the breakDown
package. The wine quality will be predicted based on other features.
# Load the breakDown package, which provides the wine dataset used below.
library("breakDown")
head(wine)
#> fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1 7.0 0.27 0.36 20.7 0.045
#> 2 6.3 0.30 0.34 1.6 0.049
#> 3 8.1 0.28 0.40 6.9 0.050
#> 4 7.2 0.23 0.32 8.5 0.058
#> 5 7.2 0.23 0.32 8.5 0.058
#> 6 8.1 0.28 0.40 6.9 0.050
#> free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
#> 1 45 170 1.0010 3.00 0.45 8.8
#> 2 14 132 0.9940 3.30 0.49 9.5
#> 3 30 97 0.9951 3.26 0.44 10.1
#> 4 47 186 0.9956 3.19 0.40 9.9
#> 5 47 186 0.9956 3.19 0.40 9.9
#> 6 30 97 0.9951 3.26 0.44 10.1
#> quality
#> 1 6
#> 2 6
#> 3 6
#> 4 6
#> 5 6
#> 6 6
Let’s build a model. We need to prepare xgb.DMatrix
first.
library("xgboost")
# One-hot encode predictors without an intercept; xgboost needs a numeric
# matrix, not a data.frame.
# NOTE(review): "martix" is a typo for "matrix"; kept as-is because the name
# is reused by later chunks.
model_martix_train <- model.matrix(quality ~ . - 1, wine)
data_train <- xgb.DMatrix(model_martix_train, label = wine$quality)
# NOTE(review): objective "reg:linear" and the "silent" parameter are
# deprecated in current xgboost releases (use "reg:squarederror" and
# verbose = 0) — confirm against the installed xgboost version.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
objective = "reg:linear")
wine_xgb_model <- xgb.train(param, data_train, nrounds = 50)
wine_xgb_model
#> ##### xgb.Booster
#> raw: 20.1 Kb
#> call:
#> xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#> max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "reg:linear", silent = "1"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.print.evaluation(period = print_every_n)
#> # of features: 11
#> niter: 50
#> nfeatures : 11
Now we can create an explainer.
library("DALEX")
# Build a DALEX explainer around the fitted booster. No custom
# predict_function is supplied, so the default yhat method is used
# (regression output, as shown in the log below).
explainer_xgb <- explain(wine_xgb_model,
data = model_martix_train,
y = wine$quality,
label = "xgboost",
colorize = FALSE)
#> Preparation of a new explainer is initiated
#> -> model label : xgboost
#> -> data : 4898 rows 11 cols
#> -> target variable : 4898 values
#> -> predict function : yhat.default will be used ( default )
#> -> predicted values : numerical, min = 2.869188 , mean = 5.878132 , max = 8.078749
#> -> model_info : package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression ( default )
#> -> residual function : difference between y and yhat ( default )
#> -> residuals : numerical, min = -3.251447 , mean = -0.0002230403 , max = 3.005342
#> A new explainer has been created!
# Print a short summary of the explainer (label, model class, data head).
explainer_xgb
#> Model label: xgboost
#> Model class: xgb.Booster
#> Data head :
#> fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1 7.0 0.27 0.36 20.7 0.045
#> 2 6.3 0.30 0.34 1.6 0.049
#> free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
#> 1 45 170 1.001 3.0 0.45 8.8
#> 2 14 132 0.994 3.3 0.49 9.5
For continuous variables
# Take the first observation; drop = FALSE keeps it a 1-row matrix, which
# the xgboost predict method requires.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down explanation: additive contribution of each feature to the
# prediction for this single observation.
sp_xgb <- predict_parts(explainer_xgb,
new_observation = nobs,
type = "break_down")
head(sp_xgb)
#> contribution
#> xgboost: intercept 5.878
#> xgboost: residual.sugar = 20.7 0.332
#> xgboost: alcohol = 8.8 -0.045
#> xgboost: density = 1.001 -0.429
#> xgboost: volatile.acidity = 0.27 -0.297
#> xgboost: free.sulfur.dioxide = 45 -0.040
# Waterfall plot of the break-down contributions computed above.
plot(sp_xgb)
# Permutation-based variable importance: mean dropout loss per variable.
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#> variable mean_dropout_loss label
#> 1 _full_model_ 0.6295067 xgboost
#> 2 fixed.acidity 0.6391484 xgboost
#> 3 sulphates 0.6471640 xgboost
#> 4 citric.acid 0.6538835 xgboost
#> 5 total.sulfur.dioxide 0.6552513 xgboost
#> 6 chlorides 0.6691735 xgboost
# Plot the variable-importance (dropout loss) results.
plot(vd_xgb)
In this example we are going to use the HR_data
dataset from the breakDown
package. The model will predict odds that someone will leave the company.
# Load the breakDown package, which provides the HR_data dataset used below.
library("breakDown")
head(HR_data)
#> satisfaction_level last_evaluation number_project average_montly_hours
#> 1 0.38 0.53 2 157
#> 2 0.80 0.86 5 262
#> 3 0.11 0.88 7 272
#> 4 0.72 0.87 5 223
#> 5 0.37 0.52 2 159
#> 6 0.41 0.50 2 153
#> time_spend_company Work_accident left promotion_last_5years sales salary
#> 1 3 0 1 0 sales low
#> 2 6 0 1 0 sales medium
#> 3 4 0 1 0 sales medium
#> 4 5 0 1 0 sales low
#> 5 3 0 1 0 sales low
#> 6 3 0 1 0 sales low
Let’s build a model. We need to prepare xgb.DMatrix
first.
library("xgboost")
# One-hot encode predictors without an intercept (the factor columns
# sales and salary expand into the dummy columns shown below).
# NOTE(review): "martix" is a typo for "matrix"; kept for consistency
# with the later chunks that reuse the name.
model_martix_train <- model.matrix(left ~ . - 1, HR_data)
data_train <- xgb.DMatrix(model_martix_train, label = HR_data$left)
# NOTE(review): the "silent" parameter is deprecated in current xgboost
# releases (use verbose = 0) — confirm against the installed version.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
HR_xgb_model
#> ##### xgb.Booster
#> raw: 19.5 Kb
#> call:
#> xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#> max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "binary:logistic", eval_metric = "auc", silent = "1"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.print.evaluation(period = print_every_n)
#> # of features: 19
#> niter: 50
#> nfeatures : 19
Now we can create an explainer.
library("DALEX")
# Custom predict function: calls predict() and applies a logistic
# transformation to the result.
# NOTE(review): for objective = "binary:logistic", xgboost's predict()
# already returns probabilities, so applying the sigmoid here a second
# time squashes predictions into [0.5, plogis(1) = 0.731] — exactly the
# min/max printed in the log below. Confirm whether this double
# transformation is intended; if not, predict_logit should return
# predict(model, x) unchanged.
predict_logit <- function(model, x) {
raw_x <- predict(model, x)
exp(raw_x)/(1 + exp(raw_x))
}
# NOTE(review): despite its name this computes the inverse logit
# (logistic/sigmoid), not the logit; it is passed as the link function.
logit <- function(x) exp(x)/(1+exp(x))
# Build the explainer with the custom predict function and link.
explainer_xgb <- explain(HR_xgb_model,
data = model_martix_train,
y = HR_data$left,
predict_function = predict_logit,
link = logit,
label = "xgboost",
colorize = FALSE)
#> Preparation of a new explainer is initiated
#> -> model label : xgboost
#> -> data : 14999 rows 19 cols
#> -> target variable : 14999 values
#> -> predict function : predict_logit
#> -> predicted values : numerical, min = 0.5 , mean = 0.5555972 , max = 0.7310584
#> -> model_info : package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression ( default )
#> -> residual function : difference between y and yhat ( default )
#> -> residuals : numerical, min = -0.7296657 , mean = -0.3175147 , max = 0.4997965
#> A new explainer has been created!
# Print a short summary of the explainer (label, model class, data head).
explainer_xgb
#> Model label: xgboost
#> Model class: xgb.Booster
#> Data head :
#> satisfaction_level last_evaluation number_project average_montly_hours
#> 1 0.38 0.53 2 157
#> 2 0.80 0.86 5 262
#> time_spend_company Work_accident promotion_last_5years salesaccounting
#> 1 3 0 0 0
#> 2 6 0 0 0
#> saleshr salesIT salesmanagement salesmarketing salesproduct_mng salesRandD
#> 1 0 0 0 0 0 0
#> 2 0 0 0 0 0 0
#> salessales salessupport salestechnical salarylow salarymedium
#> 1 1 0 0 1 0
#> 2 1 0 0 0 1
For continuous variables
# Take the first observation; drop = FALSE keeps it a 1-row matrix, which
# the xgboost predict method requires.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down explanation: additive contribution of each feature to the
# predicted probability for this single observation.
sp_xgb <- predict_parts(explainer_xgb,
new_observation = nobs,
type = "break_down")
head(sp_xgb)
#> contribution
#> xgboost: intercept 0.556
#> xgboost: time_spend_company = 3 -0.013
#> xgboost: satisfaction_level = 0.38 0.012
#> xgboost: last_evaluation = 0.53 0.020
#> xgboost: average_montly_hours = 157 0.061
#> xgboost: salarylow = 1 0.019
# Waterfall plot of the break-down contributions computed above.
plot(sp_xgb)
# Permutation-based variable importance: mean dropout loss per variable.
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#> variable mean_dropout_loss label
#> 1 _full_model_ 0.4641699 xgboost
#> 2 salarymedium 0.4640913 xgboost
#> 3 salesaccounting 0.4641699 xgboost
#> 4 salesIT 0.4641699 xgboost
#> 5 salesmanagement 0.4641699 xgboost
#> 6 salesmarketing 0.4641699 xgboost
# Plot variable importance, then record session info for reproducibility.
plot(vd_xgb)
sessionInfo()
#> R version 3.6.3 (2020-02-29)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=Polish_Poland.1250 LC_CTYPE=Polish_Poland.1250
#> [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C
#> [5] LC_TIME=Polish_Poland.1250
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] DALEX_2.0.1 xgboost_1.0.0.2 breakDown_0.2.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.4 pillar_1.4.3 compiler_3.6.3 ingredients_2.0
#> [5] tools_3.6.3 digest_0.6.25 evaluate_0.14 lifecycle_0.2.0
#> [9] tibble_2.1.3 gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.3
#> [13] rlang_0.4.6 Matrix_1.2-18 yaml_2.2.1 xfun_0.12
#> [17] stringr_1.4.0 dplyr_1.0.0 knitr_1.28 generics_0.0.2
#> [21] vctrs_0.3.1 grid_3.6.3 tidyselect_1.1.0 glue_1.3.2
#> [25] data.table_1.12.8 R6_2.4.1 iBreakDown_1.3.1 rmarkdown_2.1
#> [29] farver_2.0.3 ggplot2_3.3.0 purrr_0.3.3 magrittr_1.5
#> [33] scales_1.1.0 htmltools_0.4.0 colorspace_1.4-1 labeling_0.3
#> [37] stringi_1.4.6 munsell_0.5.0 crayon_1.3.4