Recap Lab 5

Leon Eyrich Jessen

A Few Meta Points…

Load Libraries

library("tidyverse")
library("fs")

Remember to load libraries first! You cannot use tools from a toolbox you have not yet “picked up”

Load Data

  • Create the data directory programmatically
dir_create(path = "data")
  • Retrieve the data directly
diabetes_data <- read_csv(file = "https://hbiostat.org/data/repo/diabetes.csv")
  • Write the data to disk
write_csv(x = diabetes_data,
          file = "data/diabetes.csv")
  • Read the data back in
diabetes_data <- read_csv(file = "data/diabetes.csv")
  • Remember to set the eval=FALSE flag in the chunk settings to avoid retrieving the data every time you hit render

Look at data

diabetes_data
# A tibble: 403 × 19
      id  chol stab.glu   hdl ratio glyhb location     age gender height weight
   <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <chr>      <dbl> <chr>   <dbl>  <dbl>
 1  1000   203       82    56  3.60  4.31 Buckingham    46 female     62    121
 2  1001   165       97    24  6.90  4.44 Buckingham    29 female     64    218
 3  1002   228       92    37  6.20  4.64 Buckingham    58 female     61    256
 4  1003    78       93    12  6.5   4.63 Buckingham    67 male       67    119
 5  1005   249       90    28  8.90  7.72 Buckingham    64 male       68    183
 6  1008   248       94    69  3.60  4.81 Buckingham    34 male       71    190
 7  1011   195       92    41  4.80  4.84 Buckingham    30 male       69    191
 8  1015   227       75    44  5.20  3.94 Buckingham    37 male       59    170
 9  1016   177       87    49  3.60  4.84 Buckingham    45 male       69    166
10  1022   263       89    40  6.60  5.78 Buckingham    55 female     63    202
# ℹ 393 more rows
# ℹ 8 more variables: frame <chr>, bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>,
#   bp.2d <dbl>, waist <dbl>, hip <dbl>, time.ppn <dbl>

Note the column types are stated, i.e. what kind of data is in each column

  • <chr>: character
  • <dbl>: double

mutate()

  • Change the height, weight, waist and hip from inches/pounds to the metric system (cm/kg), rounding to 1 decimal
diabetes_data <- diabetes_data |>
  mutate(height_cm = round(height * 2.54,
                           digits = 1),
         weight_kg = round(weight * 0.454,
                           digits = 1),
         hip_cm = round(hip * 2.54,
                        digits = 1),
         waist_cm = round(waist * 2.54,
                          digits = 1))
diabetes_data |> 
  select(matches("height|weight|hip|waist")) # Note: Selecting using a regex()
# A tibble: 403 × 8
   height weight waist   hip height_cm weight_kg hip_cm waist_cm
    <dbl>  <dbl> <dbl> <dbl>     <dbl>     <dbl>  <dbl>    <dbl>
 1     62    121    29    38      158.      54.9   96.5     73.7
 2     64    218    46    48      163.      99    122.     117. 
 3     61    256    49    57      155.     116.   145.     124. 
 4     67    119    33    38      170.      54     96.5     83.8
 5     68    183    44    41      173.      83.1  104.     112. 
 6     71    190    36    42      180.      86.3  107.      91.4
 7     69    191    46    49      175.      86.7  124.     117. 
 8     59    170    34    39      150.      77.2   99.1     86.4
 9     69    166    34    40      175.      75.4  102.      86.4
10     63    202    45    50      160       91.7  127      114. 
# ℹ 393 more rows

mutate() OR

inch_to_cm_fct <- 2.54
my_digits <- 1
diabetes_data <- diabetes_data |>
  mutate(height_cm = round(height * inch_to_cm_fct,
                           digits = my_digits),
         weight_kg = round(weight * 0.454,
                           digits = my_digits),
         hip_cm = round(hip * inch_to_cm_fct,
                        digits = my_digits),
         waist_cm = round(waist * inch_to_cm_fct,
                          digits = my_digits))
diabetes_data |> 
  select(matches("height|weight|hip|waist")) # Note: Selecting using a regex()
# A tibble: 403 × 8
   height weight waist   hip height_cm weight_kg hip_cm waist_cm
    <dbl>  <dbl> <dbl> <dbl>     <dbl>     <dbl>  <dbl>    <dbl>
 1     62    121    29    38      158.      54.9   96.5     73.7
 2     64    218    46    48      163.      99    122.     117. 
 3     61    256    49    57      155.     116.   145.     124. 
 4     67    119    33    38      170.      54     96.5     83.8
 5     68    183    44    41      173.      83.1  104.     112. 
 6     71    190    36    42      180.      86.3  107.      91.4
 7     69    191    46    49      175.      86.7  124.     117. 
 8     59    170    34    39      150.      77.2   99.1     86.4
 9     69    166    34    40      175.      75.4  102.      86.4
10     63    202    45    50      160       91.7  127      114. 
# ℹ 393 more rows

Now, if we wanted 2 decimals, we only have to change one variable! 👍

filter()

  • How many men in Buckingham are younger than 30 and taller than 1.9m?
diabetes_data |>
  filter(location == "Buckingham",
         age < 30,
         height_cm > 190)
# A tibble: 1 × 23
     id  chol stab.glu   hdl ratio glyhb location     age gender height weight
  <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <chr>      <dbl> <chr>   <dbl>  <dbl>
1 10000   185       76    58  3.20  4.83 Buckingham    23 male       76    164
# ℹ 12 more variables: frame <chr>, bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>,
#   bp.2d <dbl>, waist <dbl>, hip <dbl>, time.ppn <dbl>, height_cm <dbl>,
#   weight_kg <dbl>, hip_cm <dbl>, waist_cm <dbl>

Note, that , is equivalent to &

filter()

  • Make a scatter plot of weight versus height and colour by gender for inhabitants of Louisa above the age of 40
pl <- diabetes_data |>
  filter(location == "Louisa",
         age > 40) |> 
  ggplot(aes(x = height_cm,
             y = weight_kg,
             colour = gender)) +
  geom_point() +
  theme_minimal(base_size = 26)

filter() OR

  • Make a scatter plot of weight versus height and colour by gender for inhabitants of Louisa above the age of 40
pl <- diabetes_data |>
  filter(location == "Louisa",
         age > 40) |> 
  ggplot(aes(x = height_cm,
             y = weight_kg,
             colour = gender)) +
  geom_point(size = 5,
             alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_colour_manual(
    values = c("salmon", "cornflowerblue")) +
  theme_minimal(base_size = 26)

filter()

  • Make a scatter plot of weight versus height and colour by gender for inhabitants of Louisa above the age of 40
pl <- diabetes_data |>
  mutate(
    subject = case_when(
      location == "Louisa" & age > 40 ~ gender,
      TRUE ~ "Not incl")) |>
  ggplot(aes(x = height_cm,
             y = weight_kg,
             colour = subject,
             size = subject)) +
  geom_point(alpha = 0.5) +
  scale_size_manual(values = c(5, 5, 1)) +
  theme_minimal(base_size = 26) +
  theme(legend.position = "bottom")
  • Note how we’re creating needed attribute before plotting

  • Always modify data Before plotting!

arrange()

  • How old is the youngest person
diabetes_data |>
  arrange(age) |>
  slice(1) |> 
  select(age)
# A tibble: 1 × 1
    age
  <dbl>
1    19
  • How old is the oldest person
diabetes_data |>
  arrange(desc(age)) |>
  slice(1) |> 
  select(age)
# A tibble: 1 × 1
    age
  <dbl>
1    92

arrange() OR

  • How old is the youngest person
diabetes_data |>
  filter(age == min(age)) |> 
  select(age)
# A tibble: 2 × 1
    age
  <dbl>
1    19
2    19
  • How old is the oldest person
diabetes_data |>
  filter(age == max(age)) |> 
  select(age)
# A tibble: 1 × 1
    age
  <dbl>
1    92

arrange() OR

diabetes_data |>
  summarise(min_age = min(age),
            max_age = max(age))
# A tibble: 1 × 2
  min_age max_age
    <dbl>   <dbl>
1      19      92
diabetes_data |>
  group_by(location) |> 
  summarise(min_age = min(age),
            max_age = max(age))
# A tibble: 2 × 3
  location   min_age max_age
  <chr>        <dbl>   <dbl>
1 Buckingham      19      92
2 Louisa          19      89

OR, OR, OR, OR, OR, OR, OR, etc.

mutate()

  • Create a new variable, where you calculate the BMI
diabetes_data <- diabetes_data |> 
  mutate(BMI = weight_kg / (height_cm / 100)^2)
diabetes_data |> 
  select(BMI)
# A tibble: 403 × 1
     BMI
   <dbl>
 1  22.1
 2  37.4
 3  48.4
 4  18.6
 5  27.9
 6  26.5
 7  28.2
 8  34.4
 9  24.5
10  35.8
# ℹ 393 more rows

case_when()

  • Create a BMI_class variable
diabetes_data <- diabetes_data |> 
  mutate(BMI_class = case_when(BMI < 16.5 ~ "Severely underweight",
                               16.5 <= BMI & BMI < 18.5 ~ "Underweight",
                               18.5 <= BMI & BMI < 25.0 ~ "Normal weight",
                               25.0 <= BMI & BMI < 30.0 ~ "Overweight",
                               30.0 <= BMI & BMI < 35.0 ~ "Obesity class I",
                               35.0 <= BMI & BMI < 40.0 ~ "Obesity class II",
                               40.0 <= BMI ~ "Obesity class III"))
diabetes_data |> 
  select(BMI, BMI_class)
# A tibble: 403 × 2
     BMI BMI_class        
   <dbl> <chr>            
 1  22.1 Normal weight    
 2  37.4 Obesity class II 
 3  48.4 Obesity class III
 4  18.6 Normal weight    
 5  27.9 Overweight       
 6  26.5 Overweight       
 7  28.2 Overweight       
 8  34.4 Obesity class I  
 9  24.5 Normal weight    
10  35.8 Obesity class II 
# ℹ 393 more rows

Factor levels

pl <- diabetes_data |>
  count(BMI_class) |>
  drop_na(BMI_class) |> 
  mutate(pct = n / sum(n) * 100) |>
  ggplot(aes(x = BMI_class,
             y = pct)) +
  geom_col() +
  geom_hline(yintercept = 0) +
  theme_minimal(base_size = 26) +
  theme(axis.text.x = element_text(
    angle = 45,
    hjust = 1)) +
  labs(x = "",
       y = "Percent")

  • Columns order is nonsensical, we know there is an order

Factor levels

diabetes_data <- diabetes_data |>
  mutate(BMI_class = factor(BMI_class,
                            levels =  c("Severely underweight", "Underweight",
                                        "Normal weight", "Overweight",
                                        "Obesity class I", "Obesity class II",
                                        "Obesity class III")))

Factor levels

pl <- diabetes_data |>
  count(BMI_class) |>
  drop_na(BMI_class) |> 
  mutate(pct = n / sum(n) * 100) |>
  ggplot(aes(x = BMI_class,
             y = pct)) +
  geom_col() +
  geom_hline(yintercept = 0) +
  theme_minimal(base_size = 26) +
  theme(axis.text.x = element_text(
    angle = 45,
    hjust = 1)) +
  labs(x = "",
       y = "Percent")

  • Columns order is now sensical, following known order!

Factor levels

pl <- diabetes_data |>
  count(BMI_class) |>
  drop_na(BMI_class) |> 
  mutate(pct = n / sum(n) * 100) |>
  ggplot(aes(x = BMI_class,
             y = pct)) +
  geom_col(colour = "green",
           fill = "lightgreen",
           alpha = 0.5) +
  geom_hline(yintercept = 0) +
  theme_minimal(base_size = 26) +
  theme(axis.text.x = element_text(
    angle = 45,
    hjust = 1)) +
  labs(x = "",
       y = "Percent") +
  theme(plot.margin = unit(
    c(1, 0.5, 0.5, 1), "cm"),
    panel.grid.major.x = element_blank())

  • Columns order is now sensical, following known order!

group_by() |> summarise()

  • For each BMI_class group, calculate the average weight and associated standard deviation
diabetes_data |> 
  group_by(BMI_class) |> 
  summarise(n = n(),
            mu_weight_kg = mean(weight_kg),
            sigma_weight_kg = sd(weight_kg))
# A tibble: 8 × 4
  BMI_class                n mu_weight_kg sigma_weight_kg
  <fct>                <int>        <dbl>           <dbl>
1 Severely underweight     2         49.5            5.80
2 Underweight              7         52.2            6.43
3 Normal weight          112         65.9            9.77
4 Overweight             124         76.2            8.86
5 Obesity class I         90         89.2            9.97
6 Obesity class II        32        102.            13.5 
7 Obesity class III       30        115.            15.8 
8 <NA>                     6         NA             NA   
  • <fct> = a factor variable, i.e. a category!

Assignment

Data augmentation

  • Create a BFP (Body fat percentage) variable
  • Create a WHR (waist-to-hip ratio) variable
diabetes_data <- diabetes_data |> 
  mutate(gender_class = case_when(gender == "female" ~ 0,
                                  gender == "male" ~ 1),
         BFP = 1.39 * BMI + 0.16 * age - 10.34 * gender_class - 9,
         WHR = waist / hip)
diabetes_data
# A tibble: 403 × 28
      id  chol stab.glu   hdl ratio glyhb location     age gender height weight
   <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <chr>      <dbl> <chr>   <dbl>  <dbl>
 1  1000   203       82    56  3.60  4.31 Buckingham    46 female     62    121
 2  1001   165       97    24  6.90  4.44 Buckingham    29 female     64    218
 3  1002   228       92    37  6.20  4.64 Buckingham    58 female     61    256
 4  1003    78       93    12  6.5   4.63 Buckingham    67 male       67    119
 5  1005   249       90    28  8.90  7.72 Buckingham    64 male       68    183
 6  1008   248       94    69  3.60  4.81 Buckingham    34 male       71    190
 7  1011   195       92    41  4.80  4.84 Buckingham    30 male       69    191
 8  1015   227       75    44  5.20  3.94 Buckingham    37 male       59    170
 9  1016   177       87    49  3.60  4.84 Buckingham    45 male       69    166
10  1022   263       89    40  6.60  5.78 Buckingham    55 female     63    202
# ℹ 393 more rows
# ℹ 17 more variables: frame <chr>, bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>,
#   bp.2d <dbl>, waist <dbl>, hip <dbl>, time.ppn <dbl>, height_cm <dbl>,
#   weight_kg <dbl>, hip_cm <dbl>, waist_cm <dbl>, BMI <dbl>, BMI_class <chr>,
#   gender_class <dbl>, BFP <dbl>, WHR <dbl>

Which correlate better with BMI, WHR or BFP?

First look at the data

pl1 <- diabetes_data |> 
  ggplot(aes(x = BMI,
             y = WHR)) +
  geom_point() +
  theme_minimal(base_size = 26)
pl1

pl2 <- diabetes_data |> 
  ggplot(aes(x = BMI,
             y = BFP)) +
  geom_point() +
  theme_minimal(base_size = 26)
pl2

Well, there does seem to be a tendency…

Which correlate better with BMI, WHR or BFP?

diabetes_data |>
  select(BMI, WHR, BFP) |>
  pivot_longer(cols = c(WHR, BFP),
               names_to = "metric",
               values_to = "value")
# A tibble: 806 × 3
     BMI metric  value
   <dbl> <chr>   <dbl>
 1  22.1 WHR     0.763
 2  22.1 BFP    29.1  
 3  37.4 WHR     0.958
 4  37.4 BFP    47.7  
 5  48.4 WHR     0.860
 6  48.4 BFP    67.6  
 7  18.6 WHR     0.868
 8  18.6 BFP    17.3  
 9  27.9 WHR     1.07 
10  27.9 BFP    29.6  
# ℹ 796 more rows
corrs <- diabetes_data |>
  select(BMI, WHR, BFP) |>
  pivot_longer(cols = c(WHR, BFP),
               names_to = "metric",
               values_to = "value") |>
  drop_na() |>
  group_by(metric) |>
  summarise(PCC = cor(x = BMI,
                      y = value,
                      use = "complete.obs",
                      method = "pearson"))
corrs
# A tibble: 2 × 2
  metric   PCC
  <chr>  <dbl>
1 BFP    0.888
2 WHR    0.104

Which correlate better with BMI, WHR or BFP?

pl1 <- pl1 +
  labs(subtitle = str_c("PCC = ", corrs |> filter(metric == "WHR") |> pull(PCC) |> round(3)))
pl2 <- pl2 +
  labs(subtitle = str_c("PCC = ", corrs |> filter(metric == "BFP") |> pull(PCC) |> round(3)))
pl1 + pl2

A Better Way?

corrs <- corrs |> 
  mutate(R2 = round(PCC^2, digits = 3),
         label = str_c("R2 = ", R2))
corrs
# A tibble: 2 × 4
  metric   PCC    R2 label     
  <chr>  <dbl> <dbl> <chr>     
1 BFP    0.888 0.789 R2 = 0.789
2 WHR    0.104 0.011 R2 = 0.011

Which correlate better with BMI, WHR or BFP?

s1 <- corrs |>
  filter(metric == "WHR") |>
  pull(label)

s2 <- corrs |>
  filter(metric == "BFP") |>
  pull(label)

pl1 <- pl1 +
  labs(subtitle = s1) +
  geom_smooth(method = "lm") +
  theme_minimal()

pl2 <- pl2 +
  labs(subtitle = s2) +
  geom_smooth(method = "lm") +
  theme_minimal()
pl1 / pl2

NB! “Tim Toady” (TIMTOWTDI) and - As much as possible, prepare your data to contain what you need before plotting!

Lastly, why does BFP evidently correlate much better with BMI than WHR?