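Read the Star Wars characters data. The dataset originates from Kaggle (https://www.kaggle.com/jsphyg/star-wars#characters.csv); here it is loaded from a local Excel file, charactersStarWars.xlsx.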
library("openxlsx")
library("missRanger")
library("randomForest")
# Load the character table from the Excel workbook in the working directory.
charactersStarWars <- openxlsx::read.xlsx(xlsxFile = "charactersStarWars.xlsx")
Convert the character columns into factors and impute the missing values.
# Use the first column as row names, then drop it (and column 7) from the data.
rownames(charactersStarWars) <- charactersStarWars[[1]]
charactersStarWars <- charactersStarWars[, c(2:6, 8:11)]

# Colour columns may list several values separated by commas; keep only the
# first value and store the result as a factor.
charactersStarWars[c("hair_color", "skin_color")] <- lapply(
  charactersStarWars[c("hair_color", "skin_color")],
  function(values) factor(gsub(pattern = ",.*", replacement = "", x = values))
)
# Eye colour additionally uses "-" as a separator (hence the "[,-]" class).
charactersStarWars$eye_color <- factor(
  gsub(pattern = "[,-].*", replacement = "", x = charactersStarWars$eye_color)
)

# The remaining categorical columns are converted to factors unchanged.
charactersStarWars[c("gender", "homeworld", "species")] <- lapply(
  charactersStarWars[c("gender", "homeworld", "species")],
  factor
)

# Seed the RNG right before the random-forest imputation of missing values.
set.seed(1)
charactersStarWarsFilled <- missRanger(charactersStarWars)
##
## Missing value imputation by random forests
##
## Variables to impute: height, mass, hair_color, skin_color, eye_color, gender, homeworld, species
## Variables used to impute: height, mass, hair_color, skin_color, eye_color, gender, homeworld, species, Jedi
## iter 1: ........
## iter 2: ........
## iter 3: ........
## iter 4: ........
## iter 5: ........
# Preview the first six imputed rows.
head(charactersStarWarsFilled, n = 6L)
## height mass hair_color skin_color eye_color gender homeworld
## Luke Skywalker 172 77 blond fair blue male Tatooine
## C-3PO 167 75 none gold yellow male Tatooine
## R2-D2 96 32 none white red male Naboo
## Darth Vader 202 136 none white yellow male Tatooine
## Leia Organa 150 49 brown light brown female Alderaan
## Owen Lars 178 120 brown light blue male Tatooine
## species Jedi
## Luke Skywalker Human 1
## C-3PO Droid 0
## R2-D2 Droid 0
## Darth Vader Human 1
## Leia Organa Human 0
## Owen Lars Human 0
Let’s use the gbm library to create a gbm model with 250 trees, each 3 levels deep.
library("gbm")
# Fix the RNG seed: gbm subsamples rows when growing each tree, so the fit is
# only reproducible with a fixed seed.
set.seed(1)
# Boosted-tree model of the 0/1 Jedi flag on six character attributes.
# n.trees and interaction.depth are set explicitly so that the model really has
# 250 trees with depth 3; without them gbm falls back to its (smaller) defaults.
# The distribution is left unspecified, so gbm infers it from the response.
model <- gbm(
  Jedi ~ height + mass + hair_color + skin_color + eye_color + gender,
  data = charactersStarWarsFilled,
  n.trees = 250,
  interaction.depth = 3
)
## Distribution not specified, assuming bernoulli ...
Let’s wrap the gbm model in a DALEX explainer.
library("DALEX")
# Wrap the fitted model in a DALEX explainer.
# `data` holds only the six predictors that appear in the model formula; the
# target is supplied separately through `y`. Passing the whole data frame
# (which contains Jedi) makes explain() warn that a column identical to `y`
# was found in `data`.
model_explained <- explain(
  model,
  data = charactersStarWarsFilled[
    , c("height", "mass", "hair_color", "skin_color", "eye_color", "gender")
  ],
  y = charactersStarWarsFilled$Jedi
)
## Preparation of a new explainer is initiated
## -> model label : gbm ( default )
## -> data : 87 rows 9 cols
## -> target variable : 87 values
## -> data : A column identical to the target variable `y` has been found in the `data`. ( WARNING )
## -> data : It is highly recommended to pass `data` without the target variable column
## -> predict function : yhat.gbm will be used ( default )
## -> predicted values : numerical, min = 0.002511505 , mean = 0.2720332 , max = 0.9663875
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.5234405 , mean = -0.00766543 , max = 0.7765105
## -> model_info : package gbm , ver. 2.1.5 , task classification ( default )
## A new explainer has been created!
Calculate the Feature Importance explainer.
library("ingredients")
library("auditor")

# Feature importance of the explained model, scored with the 1 - AUC loss.
jedi_importance <- feature_importance(
  model_explained,
  loss_function = DALEX:::loss_one_minus_auc
)
plot(jedi_importance)

# ROC curve built from auditor's model evaluation of the same explainer.
jedi_evaluation <- model_evaluation(model_explained)
plot_roc(jedi_evaluation)
Create the modelStudio dashboard.
library(modelStudio)
# Dashboard options: only the subtitle is customised.
op <- modelStudioOptions(
  subtitle = "What makes a Jedi (or Sith)?"
)
# Build the dashboard from the explainer created earlier in this script.
# The original call referred to `mexp` and `charactersFilled`, which are never
# defined here; the objects this script actually creates are `model_explained`
# and `charactersStarWarsFilled`. Columns 1:6 are the six model predictors.
modelStudioStarWars <- modelStudio(
  model_explained,
  new_observation = charactersStarWarsFilled[, 1:6],
  options = op
)
# Display the dashboard, then save it as a standalone HTML file.
modelStudioStarWars
r2d3::save_d3_html(modelStudioStarWars, file = "index.html")