This vignette demonstrates how to use the DALEX package with models created with the xgboost package.
In this example we are going to use the wine dataset from the breakDown package. The wine quality will be predicted based on other features.
# Load the wine dataset shipped with the breakDown package.
library("breakDown")
# Inspect the first rows; `quality` is the prediction target.
head(wine)
#>   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1           7.0             0.27        0.36           20.7     0.045
#> 2           6.3             0.30        0.34            1.6     0.049
#> 3           8.1             0.28        0.40            6.9     0.050
#> 4           7.2             0.23        0.32            8.5     0.058
#> 5           7.2             0.23        0.32            8.5     0.058
#> 6           8.1             0.28        0.40            6.9     0.050
#>   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
#> 1                  45                  170  1.0010 3.00      0.45     8.8
#> 2                  14                  132  0.9940 3.30      0.49     9.5
#> 3                  30                   97  0.9951 3.26      0.44    10.1
#> 4                  47                  186  0.9956 3.19      0.40     9.9
#> 5                  47                  186  0.9956 3.19      0.40     9.9
#> 6                  30                   97  0.9951 3.26      0.44    10.1
#>   quality
#> 1       6
#> 2       6
#> 3       6
#> 4       6
#> 5       6
#> 6       6
Let’s build a model. We need to prepare xgb.DMatrix first.
library("xgboost")
# One-hot encode the predictors (no intercept column); quality is excluded
# from the matrix and supplied separately as the label.
model_martix_train <- model.matrix(quality ~ . - 1, wine)
data_train <- xgb.DMatrix(model_martix_train, label = wine$quality)
# NOTE(review): `silent` and the "reg:linear" objective are deprecated in
# recent xgboost releases (verbosity = 0 and "reg:squarederror" are the
# replacements) -- kept as-is to match the printed output below.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
              objective = "reg:linear")
# Train a gradient-boosted regression model with 50 boosting rounds.
wine_xgb_model <- xgb.train(param, data_train, nrounds = 50)
# Print the fitted booster summary.
wine_xgb_model
#> ##### xgb.Booster
#> raw: 20.1 Kb 
#> call:
#>   xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "reg:linear", silent = "1"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#> # of features: 11 
#> niter: 50
#> nfeatures : 11
Now we can create an explainer.
library("DALEX")
# Wrap the fitted booster in a DALEX explainer; the training matrix and the
# true quality values let DALEX compute predictions and residuals.
explainer_xgb <- explain(wine_xgb_model, 
                         data = model_martix_train, 
                         y = wine$quality, 
                         label = "xgboost",
                         colorize = FALSE)
#> Preparation of a new explainer is initiated
#>   -> model label       :  xgboost 
#>   -> data              :  4898  rows  11  cols 
#>   -> target variable   :  4898  values 
#>   -> predict function  :  yhat.default will be used (  default  )
#>   -> predicted values  :  numerical, min =  2.869188 , mean =  5.878132 , max =  8.078749  
#>   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -3.251447 , mean =  -0.0002230403 , max =  3.005342  
#>   A new explainer has been created!
# Print the explainer summary.
explainer_xgb
#> Model label:  xgboost 
#> Model class:  xgb.Booster 
#> Data head  :
#>   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1           7.0             0.27        0.36           20.7     0.045
#> 2           6.3             0.30        0.34            1.6     0.049
#>   free.sulfur.dioxide total.sulfur.dioxide density  pH sulphates alcohol
#> 1                  45                  170   1.001 3.0      0.45     8.8
#> 2                  14                  132   0.994 3.3      0.49     9.5
For continuous variables:
# Partial-dependence profile for the `alcohol` variable.
sv_xgb_satisfaction_level  <- model_profile(explainer_xgb, 
                                            variable = "alcohol", 
                                            type = "partial")
plot(sv_xgb_satisfaction_level)

# Single observation for local explanations; drop = FALSE keeps it a
# 1-row matrix rather than collapsing to a vector.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down explanation: additive attribution of the prediction for
# `nobs` to the individual features.
sp_xgb  <- predict_parts(explainer_xgb, 
                         new_observation = nobs,
                         type = "break_down")
head(sp_xgb)
#>                                   contribution
#> xgboost: intercept                       5.878
#> xgboost: residual.sugar = 20.7           0.332
#> xgboost: alcohol = 8.8                  -0.045
#> xgboost: density = 1.001                -0.429
#> xgboost: volatile.acidity = 0.27        -0.297
#> xgboost: free.sulfur.dioxide = 45       -0.040
plot(sp_xgb)

# Permutation-based feature importance (raw drop-out loss per variable).
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#>               variable mean_dropout_loss   label
#> 1         _full_model_         0.6295067 xgboost
#> 2        fixed.acidity         0.6391484 xgboost
#> 3            sulphates         0.6471640 xgboost
#> 4          citric.acid         0.6538835 xgboost
#> 5 total.sulfur.dioxide         0.6552513 xgboost
#> 6            chlorides         0.6691735 xgboost
plot(vd_xgb)
In this example we are going to use the HR_data dataset from the breakDown package. The model will predict odds that someone will leave the company.
# Load the HR_data dataset shipped with the breakDown package.
library("breakDown")
# Inspect the first rows; `left` (did the employee leave?) is the target.
head(HR_data)
#>   satisfaction_level last_evaluation number_project average_montly_hours
#> 1               0.38            0.53              2                  157
#> 2               0.80            0.86              5                  262
#> 3               0.11            0.88              7                  272
#> 4               0.72            0.87              5                  223
#> 5               0.37            0.52              2                  159
#> 6               0.41            0.50              2                  153
#>   time_spend_company Work_accident left promotion_last_5years sales salary
#> 1                  3             0    1                     0 sales    low
#> 2                  6             0    1                     0 sales medium
#> 3                  4             0    1                     0 sales medium
#> 4                  5             0    1                     0 sales    low
#> 5                  3             0    1                     0 sales    low
#> 6                  3             0    1                     0 sales    low
Let’s build a model. We need to prepare xgb.DMatrix first.
library("xgboost")
# One-hot encode the predictors (no intercept column); `left` is excluded
# from the matrix and supplied separately as the binary label.
model_martix_train <- model.matrix(left ~ . - 1, HR_data)
data_train <- xgb.DMatrix(model_martix_train, label = HR_data$left)
# NOTE(review): `silent` is deprecated in recent xgboost releases
# (use verbosity = 0) -- kept as-is to match the printed output below.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")
# Train a gradient-boosted binary classifier with 50 boosting rounds.
HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
# Print the fitted booster summary.
HR_xgb_model
#> ##### xgb.Booster
#> raw: 19.5 Kb 
#> call:
#>   xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "binary:logistic", eval_metric = "auc", silent = "1"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#> # of features: 19 
#> niter: 50
#> nfeatures : 19
Now we can create an explainer.
library("DALEX")
# Apply the inverse-logit (sigmoid) transform to raw model scores.
#
# @param model A fitted model with a predict() method (here an xgb.Booster).
# @param x     A numeric matrix of observations to score.
# @return Numeric vector of values in (0, 1).
#
# plogis() is the numerically stable inverse logit: the hand-rolled
# exp(x) / (1 + exp(x)) overflows to NaN for large x, while plogis()
# saturates cleanly at 1.
predict_logit <- function(model, x) {
  raw_x <- predict(model, x)
  plogis(raw_x)
}

# Link function for the explainer. NOTE(review): despite the name `logit`,
# this is the inverse logit (sigmoid), kept for backward compatibility.
logit <- function(x) plogis(x)
# Wrap the booster in a DALEX explainer, using the custom predict function
# and link defined above.
# NOTE(review): with objective = "binary:logistic", predict() on an
# xgb.Booster already returns probabilities, so predict_logit applies a
# second sigmoid -- the printed predictions below lie in [0.5, 0.731].
# Confirm this is intended (or predict margins with outputmargin = TRUE).
explainer_xgb <- explain(HR_xgb_model, 
                         data = model_martix_train, 
                         y = HR_data$left, 
                         predict_function = predict_logit,
                         link = logit,
                         label = "xgboost",
                         colorize = FALSE)
#> Preparation of a new explainer is initiated
#>   -> model label       :  xgboost 
#>   -> data              :  14999  rows  19  cols 
#>   -> target variable   :  14999  values 
#>   -> predict function  :  predict_logit 
#>   -> predicted values  :  numerical, min =  0.5 , mean =  0.5555972 , max =  0.7310584  
#>   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -0.7296657 , mean =  -0.3175147 , max =  0.4997965  
#>   A new explainer has been created!
# Print the explainer summary.
explainer_xgb
#> Model label:  xgboost 
#> Model class:  xgb.Booster 
#> Data head  :
#>   satisfaction_level last_evaluation number_project average_montly_hours
#> 1               0.38            0.53              2                  157
#> 2               0.80            0.86              5                  262
#>   time_spend_company Work_accident promotion_last_5years salesaccounting
#> 1                  3             0                     0               0
#> 2                  6             0                     0               0
#>   saleshr salesIT salesmanagement salesmarketing salesproduct_mng salesRandD
#> 1       0       0               0              0                0          0
#> 2       0       0               0              0                0          0
#>   salessales salessupport salestechnical salarylow salarymedium
#> 1          1            0              0         1            0
#> 2          1            0              0         0            1
For continuous variables:
# Partial-dependence profile for the `satisfaction_level` variable.
sv_xgb_satisfaction_level  <- model_profile(explainer_xgb, 
                          variable = "satisfaction_level",
                          type = "partial")
plot(sv_xgb_satisfaction_level)

# Single observation for local explanations; drop = FALSE keeps it a
# 1-row matrix rather than collapsing to a vector.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down explanation: additive attribution of the prediction for
# `nobs` to the individual features.
sp_xgb  <- predict_parts(explainer_xgb, 
                         new_observation = nobs,
                         type = "break_down")
head(sp_xgb)
#>                                     contribution
#> xgboost: intercept                         0.556
#> xgboost: time_spend_company = 3           -0.013
#> xgboost: satisfaction_level = 0.38         0.012
#> xgboost: last_evaluation = 0.53            0.020
#> xgboost: average_montly_hours = 157        0.061
#> xgboost: salarylow = 1                     0.019
plot(sp_xgb)

# Permutation-based feature importance (raw drop-out loss per variable).
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#>          variable mean_dropout_loss   label
#> 1    _full_model_         0.4641699 xgboost
#> 2    salarymedium         0.4640913 xgboost
#> 3 salesaccounting         0.4641699 xgboost
#> 4         salesIT         0.4641699 xgboost
#> 5 salesmanagement         0.4641699 xgboost
#> 6  salesmarketing         0.4641699 xgboost
plot(vd_xgb)

# Record package versions for reproducibility.
sessionInfo()
#> R version 3.6.3 (2020-02-29)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=Polish_Poland.1250  LC_CTYPE=Polish_Poland.1250   
#> [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C                  
#> [5] LC_TIME=Polish_Poland.1250    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] DALEX_2.0.1     xgboost_1.0.0.2 breakDown_0.2.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.4        pillar_1.4.3      compiler_3.6.3    ingredients_2.0  
#>  [5] tools_3.6.3       digest_0.6.25     evaluate_0.14     lifecycle_0.2.0  
#>  [9] tibble_2.1.3      gtable_0.3.0      lattice_0.20-38   pkgconfig_2.0.3  
#> [13] rlang_0.4.6       Matrix_1.2-18     yaml_2.2.1        xfun_0.12        
#> [17] stringr_1.4.0     dplyr_1.0.0       knitr_1.28        generics_0.0.2   
#> [21] vctrs_0.3.1       grid_3.6.3        tidyselect_1.1.0  glue_1.3.2       
#> [25] data.table_1.12.8 R6_2.4.1          iBreakDown_1.3.1  rmarkdown_2.1    
#> [29] farver_2.0.3      ggplot2_3.3.0     purrr_0.3.3       magrittr_1.5     
#> [33] scales_1.1.0      htmltools_0.4.0   colorspace_1.4-1  labeling_0.3     
#> [37] stringi_1.4.6     munsell_0.5.0     crayon_1.3.4