This documents narrates the analysis of Titanic survival data.

Packages that will be used in this analysis:

library(magrittr)
library(dplyr)
library(ggplot2)
library(titanic)

Wrangling

Load

Import the data provided by the titanic package

ds0 <- titanic::titanic_train
head(ds0)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q
dplyr::glimpse(ds0)
## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0...
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3...
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley ...
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "...
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 1...
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1...
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0...
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", ...
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.86...
## $ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6",...
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", ...

Inspect

To better understand the data set, let us inspect quantitative properties of each variable

explore::describe_all(ds0)
##       variable type  na na_pct unique  min   mean    max
## 1  PassengerId  int   0    0.0    891 1.00 446.00 891.00
## 2     Survived  int   0    0.0      2 0.00   0.38   1.00
## 3       Pclass  int   0    0.0      3 1.00   2.31   3.00
## 4         Name  chr   0    0.0    891   NA     NA     NA
## 5          Sex  chr   0    0.0      2   NA     NA     NA
## 6          Age  dbl 177   19.9     89 0.42  29.70  80.00
## 7        SibSp  int   0    0.0      7 0.00   0.52   8.00
## 8        Parch  int   0    0.0      7 0.00   0.38   6.00
## 9       Ticket  chr   0    0.0    681   NA     NA     NA
## 10        Fare  dbl   0    0.0    248 0.00  32.20 512.33
## 11       Cabin  chr   0    0.0    148   NA     NA     NA
## 12    Embarked  chr   0    0.0      4   NA     NA     NA

Tweek

To prepare our data for modeling, let perform routine data transformations:
* 1. Convert column names to lowercase
* 2. Select and sort columns * 3. Rename columns
* 4. Covert strings to factors
* 5. Filter out missing values

# create a new copy that will store tweeks
ds1 <- ds0
# 1. convert columns names to lowercase
names(ds1) <- tolower(names(ds1))
# ds1 %>% glimpse()
# 2. Keep only specific variables and sort columns 
ds1 <- ds1 %>% 
  dplyr::select(survived, pclass, sex, age, sibsp, parch, fare, embarked)
# ds1 %>% glimpse()
# 3. Rename variables
ds1 <- ds1 %>% 
  dplyr::rename(
    n_siblings_spouses  = sibsp
    ,n_parents_children = parch
    ,price_ticket       = fare
    ,port_embarked      = embarked
  )
# 4. Convert a character variable to a factor
ds1 <- ds1 %>% 
  dplyr::mutate(
    survived = factor(survived, levels = c(0,1), labels = c("Died", "Survived"))
    ,sex     = factor(sex, levels = c("male","female"), labels = c("Men","Women"))
    ,pclass  = factor(pclass, levels = c(1,2,3), labels = c("First","Second", "Third"))
  )
# 5. Remove observations where a) age is NA b) port_embarked is missing ("")
ds1 <- ds1 %>% 
  dplyr::filter(!is.na(age)) %>% 
  dplyr::filter(port_embarked != "")
# create an alias data set for modeling
ds_modeling <- ds1
ds_modeling %>% glimpse()
## Observations: 712
## Variables: 8
## $ survived           <fct> Died, Survived, Survived, Survived, Died, Died, ...
## $ pclass             <fct> Third, First, Third, First, Third, First, Third,...
## $ sex                <fct> Men, Women, Women, Women, Men, Men, Men, Women, ...
## $ age                <dbl> 22, 38, 26, 35, 35, 54, 2, 27, 14, 4, 58, 20, 39...
## $ n_siblings_spouses <int> 1, 1, 0, 1, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, ...
## $ n_parents_children <int> 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, ...
## $ price_ticket       <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 51.862...
## $ port_embarked      <chr> "S", "C", "S", "S", "S", "S", "S", "S", "C", "S"...

Tabulating

Summary tables to help us see observed differenced broken down by levels of predictors

#  How does survival brake down by levels of predictors?
ds_modeling %>% 
  dplyr::group_by(survived, sex) %>% 
  dplyr::summarize(
    n_people = n()
    ,mean_age = mean(age, na.rm = T)
  )
## # A tibble: 4 x 4
## # Groups:   survived [2]
##   survived sex   n_people mean_age
##   <fct>    <fct>    <int>    <dbl>
## 1 Died     Men        360     31.6
## 2 Died     Women       64     25.0
## 3 Survived Men         93     27.3
## 4 Survived Women      195     28.6

Modeling

0 - Sex

Summary

model_0 <- stats::glm(
  formula = survived ~ sex
  ,family = "binomial"
  ,data = ds_modeling
)
summary(model_0)
## 
## Call:
## stats::glm(formula = survived ~ sex, family = "binomial", data = ds_modeling)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6721  -0.6779  -0.6779   0.7534   1.7795  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.3535     0.1163  -11.64   <2e-16 ***
## sexWomen      2.4676     0.1852   13.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 960.90  on 711  degrees of freedom
## Residual deviance: 749.57  on 710  degrees of freedom
## AIC: 753.57
## 
## Number of Fisher Scoring iterations: 4
# create levels of predictors for which to generate predicted values
ds_predicted_0 <- ds_modeling %>% 
  dplyr::select(sex) %>% 
  dplyr::distinct()
# add model prediction
ds_predicted_0 <- ds_predicted_0 %>% 
  dplyr::mutate(
    log_odds = predict(object = model_0, newdata = .)
    ,probability = plogis(log_odds)
  )
# ds_predicted_0

Graph - g0

ds_predicted_0 %>% 
  ggplot(aes(x = sex, y = probability))+
  geom_bar(stat = "identity")

1 - Sex + Age

Summary (g1)

model_1 <- stats::glm(
  formula = survived ~ sex + age
  ,family = "binomial"
  ,data = ds_modeling
)
summary(model_1)
## 
## Call:
## stats::glm(formula = survived ~ sex + age, family = "binomial", 
##     data = ds_modeling)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7412  -0.6896  -0.6539   0.7572   1.9094  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.174251   0.222864  -5.269 1.37e-07 ***
## sexWomen     2.453935   0.185570  13.224  < 2e-16 ***
## age         -0.005906   0.006339  -0.932    0.352    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 960.9  on 711  degrees of freedom
## Residual deviance: 748.7  on 709  degrees of freedom
## AIC: 754.7
## 
## Number of Fisher Scoring iterations: 4
# create levels of predictors for which to generate predicted values
ds_predicted_1 <- ds_modeling %>% 
  dplyr::select(sex, age) %>% 
  dplyr::distinct()
# add model prediction
ds_predicted_1 <- ds_predicted_1 %>% 
  dplyr::mutate(
    log_odds     = predict(object = model_1, newdata = .)
    ,probability = plogis(log_odds)
  )
# ds_predicted_1

Graph (g1)

g1 <- ds_predicted_1 %>% 
  ggplot(aes(x = age, y = probability))+
  geom_point(aes(color = sex))
g1

2 - Sex + Age + Class

Summary (m2)

model_2 <- stats::glm(
  formula = survived ~ sex + age + pclass
  ,family = "binomial"
  ,data = ds_modeling
)
summary(model_2)
## 
## Call:
## stats::glm(formula = survived ~ sex + age + pclass, family = "binomial", 
##     data = ds_modeling)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7274  -0.6795  -0.3958   0.6561   2.4657  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.255302   0.359875   3.488 0.000486 ***
## sexWomen      2.513890   0.207614  12.108  < 2e-16 ***
## age          -0.037194   0.007669  -4.850 1.24e-06 ***
## pclassSecond -1.301220   0.278173  -4.678 2.90e-06 ***
## pclassThird  -2.572385   0.281508  -9.138  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 960.90  on 711  degrees of freedom
## Residual deviance: 646.69  on 707  degrees of freedom
## AIC: 656.69
## 
## Number of Fisher Scoring iterations: 5
# create levels of predictors for which to generate predicted values
ds_predicted_2 <- ds_modeling %>% 
  dplyr::select(sex, age, pclass) %>% 
  dplyr::distinct()
# add model prediction
ds_predicted_2 <- ds_predicted_2 %>% 
  dplyr::mutate(
    log_odds     = predict(object = model_2, newdata = .)
    ,probability = plogis(log_odds)
  )
# ds_predicted_2

Graph (g2)

g2 <- ds_predicted_2 %>% 
  ggplot(aes(x = age, y = probability))+
  geom_point(aes(color = sex))+
  facet_wrap("pclass")
g2

3 - Sex + Age + Class + Port

Summary (m3)

model_3 <- stats::glm(
  formula = survived ~ sex + age + pclass + port_embarked
  ,family = "binomial"
  ,data = ds_modeling
)
summary(model_3)
## 
## Call:
## stats::glm(formula = survived ~ sex + age + pclass + port_embarked, 
##     family = "binomial", data = ds_modeling)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6464  -0.6808  -0.3979   0.6367   2.4715  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.521032   0.390591   3.894 9.85e-05 ***
## sexWomen        2.515793   0.209293  12.020  < 2e-16 ***
## age            -0.036082   0.007715  -4.677 2.92e-06 ***
## pclassSecond   -1.144614   0.290678  -3.938 8.23e-05 ***
## pclassThird    -2.409565   0.291179  -8.275  < 2e-16 ***
## port_embarkedQ -0.814190   0.567903  -1.434   0.1517    
## port_embarkedS -0.493651   0.266886  -1.850   0.0644 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 960.90  on 711  degrees of freedom
## Residual deviance: 642.68  on 705  degrees of freedom
## AIC: 656.68
## 
## Number of Fisher Scoring iterations: 5
# create levels of predictors for which to generate predicted values
ds_predicted_3 <- ds_modeling %>% 
  dplyr::select(sex, age, pclass, port_embarked) %>% 
  dplyr::distinct()
# add model prediction
ds_predicted_3 <- ds_predicted_3 %>% 
  dplyr::mutate(
    log_odds     = predict(object = model_3, newdata = .)
    ,probability = plogis(log_odds)
  )
# ds_predicted_3

Graph (g3)

g3 <- ds_predicted_3 %>% 
  ggplot(aes(x = age, y = probability))+
  geom_point(aes(color = sex, shape = port_embarked))+
  facet_grid(. ~ pclass)
  # facet_grid(port_embarked ~ pclass)
g3