This documents narrates the analysis of Titanic survival data.
Packages that will be used in this analysis:
library(magrittr)
library(dplyr)
library(ggplot2)
library(titanic)
Import the data provided by the titanic package
ds0 <- titanic::titanic_train
head(ds0)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
dplyr::glimpse(ds0)
## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0...
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3...
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley ...
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "...
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 1...
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1...
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0...
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", ...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.86...
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6",...
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", ...
To better understand the data set, let us inspect quantitative properties of each variable
explore::describe_all(ds0)
## variable type na na_pct unique min mean max
## 1 PassengerId int 0 0.0 891 1.00 446.00 891.00
## 2 Survived int 0 0.0 2 0.00 0.38 1.00
## 3 Pclass int 0 0.0 3 1.00 2.31 3.00
## 4 Name chr 0 0.0 891 NA NA NA
## 5 Sex chr 0 0.0 2 NA NA NA
## 6 Age dbl 177 19.9 89 0.42 29.70 80.00
## 7 SibSp int 0 0.0 7 0.00 0.52 8.00
## 8 Parch int 0 0.0 7 0.00 0.38 6.00
## 9 Ticket chr 0 0.0 681 NA NA NA
## 10 Fare dbl 0 0.0 248 0.00 32.20 512.33
## 11 Cabin chr 0 0.0 148 NA NA NA
## 12 Embarked chr 0 0.0 4 NA NA NA
To prepare our data for modeling, let perform routine data transformations:
* 1. Convert column names to lowercase
* 2. Select and sort columns * 3. Rename columns
* 4. Covert strings to factors
* 5. Filter out missing values
# create a new copy that will store tweeks
ds1 <- ds0
# 1. convert columns names to lowercase
names(ds1) <- tolower(names(ds1))
# ds1 %>% glimpse()
# 2. Keep only specific variables and sort columns
ds1 <- ds1 %>%
dplyr::select(survived, pclass, sex, age, sibsp, parch, fare, embarked)
# ds1 %>% glimpse()
# 3. Rename variables
ds1 <- ds1 %>%
dplyr::rename(
n_siblings_spouses = sibsp
,n_parents_children = parch
,price_ticket = fare
,port_embarked = embarked
)
# 4. Convert a character variable to a factor
ds1 <- ds1 %>%
dplyr::mutate(
survived = factor(survived, levels = c(0,1), labels = c("Died", "Survived"))
,sex = factor(sex, levels = c("male","female"), labels = c("Men","Women"))
,pclass = factor(pclass, levels = c(1,2,3), labels = c("First","Second", "Third"))
)
# 5. Remove observations where a) age is NA b) port_embarked is missing ("")
ds1 <- ds1 %>%
dplyr::filter(!is.na(age)) %>%
dplyr::filter(port_embarked != "")
# create an alias data set for modeling
ds_modeling <- ds1
ds_modeling %>% glimpse()
## Observations: 712
## Variables: 8
## $ survived <fct> Died, Survived, Survived, Survived, Died, Died, ...
## $ pclass <fct> Third, First, Third, First, Third, First, Third,...
## $ sex <fct> Men, Women, Women, Women, Men, Men, Men, Women, ...
## $ age <dbl> 22, 38, 26, 35, 35, 54, 2, 27, 14, 4, 58, 20, 39...
## $ n_siblings_spouses <int> 1, 1, 0, 1, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, ...
## $ n_parents_children <int> 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, ...
## $ price_ticket <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 51.862...
## $ port_embarked <chr> "S", "C", "S", "S", "S", "S", "S", "S", "C", "S"...
Summary tables to help us see observed differenced broken down by levels of predictors
# How does survival brake down by levels of predictors?
ds_modeling %>%
dplyr::group_by(survived, sex) %>%
dplyr::summarize(
n_people = n()
,mean_age = mean(age, na.rm = T)
)
## # A tibble: 4 x 4
## # Groups: survived [2]
## survived sex n_people mean_age
## <fct> <fct> <int> <dbl>
## 1 Died Men 360 31.6
## 2 Died Women 64 25.0
## 3 Survived Men 93 27.3
## 4 Survived Women 195 28.6
model_0 <- stats::glm(
formula = survived ~ sex
,family = "binomial"
,data = ds_modeling
)
summary(model_0)
##
## Call:
## stats::glm(formula = survived ~ sex, family = "binomial", data = ds_modeling)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6721 -0.6779 -0.6779 0.7534 1.7795
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3535 0.1163 -11.64 <2e-16 ***
## sexWomen 2.4676 0.1852 13.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 960.90 on 711 degrees of freedom
## Residual deviance: 749.57 on 710 degrees of freedom
## AIC: 753.57
##
## Number of Fisher Scoring iterations: 4
# create levels of predictors for which to generate predicted values
ds_predicted_0 <- ds_modeling %>%
dplyr::select(sex) %>%
dplyr::distinct()
# add model prediction
ds_predicted_0 <- ds_predicted_0 %>%
dplyr::mutate(
log_odds = predict(object = model_0, newdata = .)
,probability = plogis(log_odds)
)
# ds_predicted_0
ds_predicted_0 %>%
ggplot(aes(x = sex, y = probability))+
geom_bar(stat = "identity")
model_1 <- stats::glm(
formula = survived ~ sex + age
,family = "binomial"
,data = ds_modeling
)
summary(model_1)
##
## Call:
## stats::glm(formula = survived ~ sex + age, family = "binomial",
## data = ds_modeling)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7412 -0.6896 -0.6539 0.7572 1.9094
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.174251 0.222864 -5.269 1.37e-07 ***
## sexWomen 2.453935 0.185570 13.224 < 2e-16 ***
## age -0.005906 0.006339 -0.932 0.352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 960.9 on 711 degrees of freedom
## Residual deviance: 748.7 on 709 degrees of freedom
## AIC: 754.7
##
## Number of Fisher Scoring iterations: 4
# create levels of predictors for which to generate predicted values
ds_predicted_1 <- ds_modeling %>%
dplyr::select(sex, age) %>%
dplyr::distinct()
# add model prediction
ds_predicted_1 <- ds_predicted_1 %>%
dplyr::mutate(
log_odds = predict(object = model_1, newdata = .)
,probability = plogis(log_odds)
)
# ds_predicted_1
g1 <- ds_predicted_1 %>%
ggplot(aes(x = age, y = probability))+
geom_point(aes(color = sex))
g1
model_2 <- stats::glm(
formula = survived ~ sex + age + pclass
,family = "binomial"
,data = ds_modeling
)
summary(model_2)
##
## Call:
## stats::glm(formula = survived ~ sex + age + pclass, family = "binomial",
## data = ds_modeling)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7274 -0.6795 -0.3958 0.6561 2.4657
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.255302 0.359875 3.488 0.000486 ***
## sexWomen 2.513890 0.207614 12.108 < 2e-16 ***
## age -0.037194 0.007669 -4.850 1.24e-06 ***
## pclassSecond -1.301220 0.278173 -4.678 2.90e-06 ***
## pclassThird -2.572385 0.281508 -9.138 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 960.90 on 711 degrees of freedom
## Residual deviance: 646.69 on 707 degrees of freedom
## AIC: 656.69
##
## Number of Fisher Scoring iterations: 5
# create levels of predictors for which to generate predicted values
ds_predicted_2 <- ds_modeling %>%
dplyr::select(sex, age, pclass) %>%
dplyr::distinct()
# add model prediction
ds_predicted_2 <- ds_predicted_2 %>%
dplyr::mutate(
log_odds = predict(object = model_2, newdata = .)
,probability = plogis(log_odds)
)
# ds_predicted_2
g2 <- ds_predicted_2 %>%
ggplot(aes(x = age, y = probability))+
geom_point(aes(color = sex))+
facet_wrap("pclass")
g2
model_3 <- stats::glm(
formula = survived ~ sex + age + pclass + port_embarked
,family = "binomial"
,data = ds_modeling
)
summary(model_3)
##
## Call:
## stats::glm(formula = survived ~ sex + age + pclass + port_embarked,
## family = "binomial", data = ds_modeling)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6464 -0.6808 -0.3979 0.6367 2.4715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.521032 0.390591 3.894 9.85e-05 ***
## sexWomen 2.515793 0.209293 12.020 < 2e-16 ***
## age -0.036082 0.007715 -4.677 2.92e-06 ***
## pclassSecond -1.144614 0.290678 -3.938 8.23e-05 ***
## pclassThird -2.409565 0.291179 -8.275 < 2e-16 ***
## port_embarkedQ -0.814190 0.567903 -1.434 0.1517
## port_embarkedS -0.493651 0.266886 -1.850 0.0644 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 960.90 on 711 degrees of freedom
## Residual deviance: 642.68 on 705 degrees of freedom
## AIC: 656.68
##
## Number of Fisher Scoring iterations: 5
# create levels of predictors for which to generate predicted values
ds_predicted_3 <- ds_modeling %>%
dplyr::select(sex, age, pclass, port_embarked) %>%
dplyr::distinct()
# add model prediction
ds_predicted_3 <- ds_predicted_3 %>%
dplyr::mutate(
log_odds = predict(object = model_3, newdata = .)
,probability = plogis(log_odds)
)
# ds_predicted_3
g3 <- ds_predicted_3 %>%
ggplot(aes(x = age, y = probability))+
geom_point(aes(color = sex, shape = port_embarked))+
facet_grid(. ~ pclass)
# facet_grid(port_embarked ~ pclass)
g3