This vignette demonstrates how to use the DALEX package with models created with the xgboost package.
In this example we are going to use the wine dataset from the breakDown package. The wine quality will be predicted based on other features.
# Load the wine dataset shipped with the breakDown package.
library("breakDown")
# Inspect the first rows; `quality` is the prediction target.
head(wine)
#>   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1           7.0             0.27        0.36           20.7     0.045
#> 2           6.3             0.30        0.34            1.6     0.049
#> 3           8.1             0.28        0.40            6.9     0.050
#> 4           7.2             0.23        0.32            8.5     0.058
#> 5           7.2             0.23        0.32            8.5     0.058
#> 6           8.1             0.28        0.40            6.9     0.050
#>   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
#> 1                  45                  170  1.0010 3.00      0.45     8.8
#> 2                  14                  132  0.9940 3.30      0.49     9.5
#> 3                  30                   97  0.9951 3.26      0.44    10.1
#> 4                  47                  186  0.9956 3.19      0.40     9.9
#> 5                  47                  186  0.9956 3.19      0.40     9.9
#> 6                  30                   97  0.9951 3.26      0.44    10.1
#>   quality
#> 1       6
#> 2       6
#> 3       6
#> 4       6
#> 5       6
#> 6       6
Let’s build a model. We need to prepare xgb.DMatrix first.
library("xgboost")
# One-hot encode the predictors (no intercept column); quality is excluded
# from the matrix and supplied separately as the label.
model_martix_train <- model.matrix(quality ~ . - 1, wine)
data_train <- xgb.DMatrix(model_martix_train, label = wine$quality)
# NOTE(review): `silent` and the "reg:linear" objective are deprecated in
# recent xgboost releases (verbosity = 0 and "reg:squarederror" are the
# replacements) -- kept as-is to match the printed output below.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
              objective = "reg:linear")
# Train a gradient-boosted regression model with 50 boosting rounds.
wine_xgb_model <- xgb.train(param, data_train, nrounds = 50)
# Print the fitted booster summary.
wine_xgb_model
#> ##### xgb.Booster
#> raw: 20.1 Kb 
#> call:
#>   xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "reg:linear", silent = "1"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#> # of features: 11 
#> niter: 50
#> nfeatures : 11
Now we can create an explainer.
library("DALEX")
# Wrap the fitted booster in a DALEX explainer; the training matrix and the
# true quality values let DALEX compute predictions and residuals.
explainer_xgb <- explain(wine_xgb_model, 
                         data = model_martix_train, 
                         y = wine$quality, 
                         label = "xgboost",
                         colorize = FALSE)
#> Preparation of a new explainer is initiated
#>   -> model label       :  xgboost 
#>   -> data              :  4898  rows  11  cols 
#>   -> target variable   :  4898  values 
#>   -> predict function  :  yhat.default will be used (  default  )
#>   -> predicted values  :  numerical, min =  2.869188 , mean =  5.878132 , max =  8.078749  
#>   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -3.251447 , mean =  -0.0002230403 , max =  3.005342  
#>   A new explainer has been created!
# Print the explainer summary.
explainer_xgb
#> Model label:  xgboost 
#> Model class:  xgb.Booster 
#> Data head  :
#>   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1           7.0             0.27        0.36           20.7     0.045
#> 2           6.3             0.30        0.34            1.6     0.049
#>   free.sulfur.dioxide total.sulfur.dioxide density  pH sulphates alcohol
#> 1                  45                  170   1.001 3.0      0.45     8.8
#> 2                  14                  132   0.994 3.3      0.49     9.5
For continuous variables:
# Partial-dependence profile for the `alcohol` variable.
sv_xgb_satisfaction_level  <- model_profile(explainer_xgb, 
                                            variable = "alcohol", 
                                            type = "partial")
plot(sv_xgb_satisfaction_level)

# Single observation for local explanations; drop = FALSE keeps it a
# 1-row matrix rather than collapsing to a vector.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down explanation: additive attribution of the prediction for
# `nobs` to the individual features.
sp_xgb  <- predict_parts(explainer_xgb, 
                         new_observation = nobs,
                         type = "break_down")
head(sp_xgb)
#>                                   contribution
#> xgboost: intercept                       5.878
#> xgboost: residual.sugar = 20.7           0.332
#> xgboost: alcohol = 8.8                  -0.045
#> xgboost: density = 1.001                -0.429
#> xgboost: volatile.acidity = 0.27        -0.297
#> xgboost: free.sulfur.dioxide = 45       -0.040
plot(sp_xgb)

# Permutation-based feature importance (raw drop-out loss per variable).
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#>               variable mean_dropout_loss   label
#> 1         _full_model_         0.6295067 xgboost
#> 2        fixed.acidity         0.6391484 xgboost
#> 3            sulphates         0.6471640 xgboost
#> 4          citric.acid         0.6538835 xgboost
#> 5 total.sulfur.dioxide         0.6552513 xgboost
#> 6            chlorides         0.6691735 xgboost
plot(vd_xgb)
In this example we are going to use the HR_data dataset from the breakDown package. The model will predict odds that someone will leave the company.
# Load the HR_data dataset shipped with the breakDown package.
library("breakDown")
# Inspect the first rows; `left` (did the employee leave?) is the target.
head(HR_data)
#>   satisfaction_level last_evaluation number_project average_montly_hours
#> 1               0.38            0.53              2                  157
#> 2               0.80            0.86              5                  262
#> 3               0.11            0.88              7                  272
#> 4               0.72            0.87              5                  223
#> 5               0.37            0.52              2                  159
#> 6               0.41            0.50              2                  153
#>   time_spend_company Work_accident left promotion_last_5years sales salary
#> 1                  3             0    1                     0 sales    low
#> 2                  6             0    1                     0 sales medium
#> 3                  4             0    1                     0 sales medium
#> 4                  5             0    1                     0 sales    low
#> 5                  3             0    1                     0 sales    low
#> 6                  3             0    1                     0 sales    low
Let’s build a model. We need to prepare xgb.DMatrix first.
library("xgboost")
# One-hot encode the predictors (no intercept column); `left` is excluded
# from the matrix and supplied separately as the binary label.
model_martix_train <- model.matrix(left ~ . - 1, HR_data)
data_train <- xgb.DMatrix(model_martix_train, label = HR_data$left)
# NOTE(review): `silent` is deprecated in recent xgboost releases
# (use verbosity = 0) -- kept as-is to match the printed output below.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
              objective = "binary:logistic", eval_metric = "auc")
# Train a gradient-boosted binary classifier with 50 boosting rounds.
HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
# Print the fitted booster summary.
HR_xgb_model
#> ##### xgb.Booster
#> raw: 19.5 Kb 
#> call:
#>   xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "binary:logistic", eval_metric = "auc", silent = "1"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#> # of features: 19 
#> niter: 50
#> nfeatures : 19
Now we can create an explainer.
library("DALEX")
# Apply the inverse-logit (sigmoid) transform to raw model scores.
#
# @param model A fitted model with a predict() method (here an xgb.Booster).
# @param x     A numeric matrix of observations to score.
# @return Numeric vector of values in (0, 1).
#
# plogis() is the numerically stable inverse logit: the hand-rolled
# exp(x) / (1 + exp(x)) overflows to NaN for large x, while plogis()
# saturates cleanly at 1.
predict_logit <- function(model, x) {
  raw_x <- predict(model, x)
  plogis(raw_x)
}

# Link function for the explainer. NOTE(review): despite the name `logit`,
# this is the inverse logit (sigmoid), kept for backward compatibility.
logit <- function(x) plogis(x)
# Wrap the booster in a DALEX explainer, using the custom predict function
# and link defined above.
# NOTE(review): with objective = "binary:logistic", predict() on an
# xgb.Booster already returns probabilities, so predict_logit applies a
# second sigmoid -- the printed predictions below lie in [0.5, 0.731].
# Confirm this is intended (or predict margins with outputmargin = TRUE).
explainer_xgb <- explain(HR_xgb_model, 
                         data = model_martix_train, 
                         y = HR_data$left, 
                         predict_function = predict_logit,
                         link = logit,
                         label = "xgboost",
                         colorize = FALSE)
#> Preparation of a new explainer is initiated
#>   -> model label       :  xgboost 
#>   -> data              :  14999  rows  19  cols 
#>   -> target variable   :  14999  values 
#>   -> predict function  :  predict_logit 
#>   -> predicted values  :  numerical, min =  0.5 , mean =  0.5555972 , max =  0.7310584  
#>   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -0.7296657 , mean =  -0.3175147 , max =  0.4997965  
#>   A new explainer has been created!
# Print the explainer summary.
explainer_xgb
#> Model label:  xgboost 
#> Model class:  xgb.Booster 
#> Data head  :
#>   satisfaction_level last_evaluation number_project average_montly_hours
#> 1               0.38            0.53              2                  157
#> 2               0.80            0.86              5                  262
#>   time_spend_company Work_accident promotion_last_5years salesaccounting
#> 1                  3             0                     0               0
#> 2                  6             0                     0               0
#>   saleshr salesIT salesmanagement salesmarketing salesproduct_mng salesRandD
#> 1       0       0               0              0                0          0
#> 2       0       0               0              0                0          0
#>   salessales salessupport salestechnical salarylow salarymedium
#> 1          1            0              0         1            0
#> 2          1            0              0         0            1
For continuous variables:
# Partial-dependence profile for the `satisfaction_level` variable.
sv_xgb_satisfaction_level  <- model_profile(explainer_xgb, 
                          variable = "satisfaction_level",
                          type = "partial")
plot(sv_xgb_satisfaction_level)

# Single observation for local explanations; drop = FALSE keeps it a
# 1-row matrix rather than collapsing to a vector.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down explanation: additive attribution of the prediction for
# `nobs` to the individual features.
sp_xgb  <- predict_parts(explainer_xgb, 
                         new_observation = nobs,
                         type = "break_down")
head(sp_xgb)
#>                                     contribution
#> xgboost: intercept                         0.556
#> xgboost: time_spend_company = 3           -0.013
#> xgboost: satisfaction_level = 0.38         0.012
#> xgboost: last_evaluation = 0.53            0.020
#> xgboost: average_montly_hours = 157        0.061
#> xgboost: salarylow = 1                     0.019
plot(sp_xgb)

# Permutation-based feature importance (raw drop-out loss per variable).
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#>          variable mean_dropout_loss   label
#> 1    _full_model_         0.4641699 xgboost
#> 2    salarymedium         0.4640913 xgboost
#> 3 salesaccounting         0.4641699 xgboost
#> 4         salesIT         0.4641699 xgboost
#> 5 salesmanagement         0.4641699 xgboost
#> 6  salesmarketing         0.4641699 xgboost
plot(vd_xgb)

# Record package versions for reproducibility.
sessionInfo()
#> R version 3.6.3 (2020-02-29)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=Polish_Poland.1250  LC_CTYPE=Polish_Poland.1250   
#> [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C                  
#> [5] LC_TIME=Polish_Poland.1250    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] DALEX_2.0.1     xgboost_1.0.0.2 breakDown_0.2.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.4        pillar_1.4.3      compiler_3.6.3    ingredients_2.0  
#>  [5] tools_3.6.3       digest_0.6.25     evaluate_0.14     lifecycle_0.2.0  
#>  [9] tibble_2.1.3      gtable_0.3.0      lattice_0.20-38   pkgconfig_2.0.3  
#> [13] rlang_0.4.6       Matrix_1.2-18     yaml_2.2.1        xfun_0.12        
#> [17] stringr_1.4.0     dplyr_1.0.0       knitr_1.28        generics_0.0.2   
#> [21] vctrs_0.3.1       grid_3.6.3        tidyselect_1.1.0  glue_1.3.2       
#> [25] data.table_1.12.8 R6_2.4.1          iBreakDown_1.3.1  rmarkdown_2.1    
#> [29] farver_2.0.3      ggplot2_3.3.0     purrr_0.3.3       magrittr_1.5     
#> [33] scales_1.1.0      htmltools_0.4.0   colorspace_1.4-1  labeling_0.3     
#> [37] stringi_1.4.6     munsell_0.5.0     crayon_1.3.4