Remember to load libraries first! You cannot use tools from a toolbox you have not yet “picked up”
eval=FALSE
flag in the chunk settings to avoid retrieving the data every time you hit render
# A tibble: 403 × 19
id chol stab.glu hdl ratio glyhb location age gender height weight
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 1000 203 82 56 3.60 4.31 Buckingham 46 female 62 121
2 1001 165 97 24 6.90 4.44 Buckingham 29 female 64 218
3 1002 228 92 37 6.20 4.64 Buckingham 58 female 61 256
4 1003 78 93 12 6.5 4.63 Buckingham 67 male 67 119
5 1005 249 90 28 8.90 7.72 Buckingham 64 male 68 183
6 1008 248 94 69 3.60 4.81 Buckingham 34 male 71 190
7 1011 195 92 41 4.80 4.84 Buckingham 30 male 69 191
8 1015 227 75 44 5.20 3.94 Buckingham 37 male 59 170
9 1016 177 87 49 3.60 4.84 Buckingham 45 male 69 166
10 1022 263 89 40 6.60 5.78 Buckingham 55 female 63 202
# ℹ 393 more rows
# ℹ 8 more variables: frame <chr>, bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>,
# bp.2d <dbl>, waist <dbl>, hip <dbl>, time.ppn <dbl>
Note the column types are stated, i.e. what kind of data is in each column
<chr>
: character<dbl>
: doublemutate()
diabetes_data <- diabetes_data |>
mutate(height_cm = round(height * 2.54,
digits = 1),
weight_kg = round(weight * 0.454,
digits = 1),
hip_cm = round(hip * 2.54,
digits = 1),
waist_cm = round(waist * 2.54,
digits = 1))
diabetes_data |>
select(matches("height|weight|hip|waist")) # Note: Selecting using a regex()
# A tibble: 403 × 8
height weight waist hip height_cm weight_kg hip_cm waist_cm
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 62 121 29 38 158. 54.9 96.5 73.7
2 64 218 46 48 163. 99 122. 117.
3 61 256 49 57 155. 116. 145. 124.
4 67 119 33 38 170. 54 96.5 83.8
5 68 183 44 41 173. 83.1 104. 112.
6 71 190 36 42 180. 86.3 107. 91.4
7 69 191 46 49 175. 86.7 124. 117.
8 59 170 34 39 150. 77.2 99.1 86.4
9 69 166 34 40 175. 75.4 102. 86.4
10 63 202 45 50 160 91.7 127 114.
# ℹ 393 more rows
mutate()
ORinch_to_cm_fct <- 2.54
my_digits <- 1
diabetes_data <- diabetes_data |>
mutate(height_cm = round(height * inch_to_cm_fct,
digits = my_digits),
weight_kg = round(weight * 0.454,
digits = my_digits),
hip_cm = round(hip * inch_to_cm_fct,
digits = my_digits),
waist_cm = round(waist * inch_to_cm_fct,
digits = my_digits))
diabetes_data |>
select(matches("height|weight|hip|waist")) # Note: Selecting using a regex()
# A tibble: 403 × 8
height weight waist hip height_cm weight_kg hip_cm waist_cm
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 62 121 29 38 158. 54.9 96.5 73.7
2 64 218 46 48 163. 99 122. 117.
3 61 256 49 57 155. 116. 145. 124.
4 67 119 33 38 170. 54 96.5 83.8
5 68 183 44 41 173. 83.1 104. 112.
6 71 190 36 42 180. 86.3 107. 91.4
7 69 191 46 49 175. 86.7 124. 117.
8 59 170 34 39 150. 77.2 99.1 86.4
9 69 166 34 40 175. 75.4 102. 86.4
10 63 202 45 50 160 91.7 127 114.
# ℹ 393 more rows
Now, if we wanted 2 decimals, we only have to change one variable! 👍
filter()
# A tibble: 1 × 23
id chol stab.glu hdl ratio glyhb location age gender height weight
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 10000 185 76 58 3.20 4.83 Buckingham 23 male 76 164
# ℹ 12 more variables: frame <chr>, bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>,
# bp.2d <dbl>, waist <dbl>, hip <dbl>, time.ppn <dbl>, height_cm <dbl>,
# weight_kg <dbl>, hip_cm <dbl>, waist_cm <dbl>
Note, that ,
is equivalent to &
filter()
filter()
ORfilter()
pl <- diabetes_data |>
mutate(
subject = case_when(
location == "Louisa" & age > 40 ~ gender,
TRUE ~ "Not incl")) |>
ggplot(aes(x = height_cm,
y = weight_kg,
colour = subject,
size = subject)) +
geom_point(alpha = 0.5) +
scale_size_manual(values = c(5, 5, 1)) +
theme_minimal(base_size = 26) +
theme(legend.position = "bottom")
Note how we’re creating needed attribute before plotting
Always modify data Before plotting!
arrange()
arrange()
ORarrange()
OROR, OR, OR, OR, OR, OR, OR, etc.
mutate()
case_when()
diabetes_data <- diabetes_data |>
mutate(BMI_class = case_when(BMI < 16.5 ~ "Severely underweight",
16.5 <= BMI & BMI < 18.5 ~ "Underweight",
18.5 <= BMI & BMI < 25.0 ~ "Normal weight",
25.0 <= BMI & BMI < 30.0 ~ "Overweight",
30.0 <= BMI & BMI < 35.0 ~ "Obesity class I",
35.0 <= BMI & BMI < 40.0 ~ "Obesity class II",
40.0 <= BMI ~ "Obesity class III"))
diabetes_data |>
select(BMI, BMI_class)
# A tibble: 403 × 2
BMI BMI_class
<dbl> <chr>
1 22.1 Normal weight
2 37.4 Obesity class II
3 48.4 Obesity class III
4 18.6 Normal weight
5 27.9 Overweight
6 26.5 Overweight
7 28.2 Overweight
8 34.4 Obesity class I
9 24.5 Normal weight
10 35.8 Obesity class II
# ℹ 393 more rows
pl <- diabetes_data |>
count(BMI_class) |>
drop_na(BMI_class) |>
mutate(pct = n / sum(n) * 100) |>
ggplot(aes(x = BMI_class,
y = pct)) +
geom_col(colour = "green",
fill = "lightgreen",
alpha = 0.5) +
geom_hline(yintercept = 0) +
theme_minimal(base_size = 26) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1)) +
labs(x = "",
y = "Percent") +
theme(plot.margin = unit(
c(1, 0.5, 0.5, 1), "cm"),
panel.grid.major.x = element_blank())
group_by() |> summarise()
diabetes_data |>
group_by(BMI_class) |>
summarise(n = n(),
mu_weight_kg = mean(weight_kg),
sigma_weight_kg = sd(weight_kg))
# A tibble: 8 × 4
BMI_class n mu_weight_kg sigma_weight_kg
<fct> <int> <dbl> <dbl>
1 Severely underweight 2 49.5 5.80
2 Underweight 7 52.2 6.43
3 Normal weight 112 65.9 9.77
4 Overweight 124 76.2 8.86
5 Obesity class I 90 89.2 9.97
6 Obesity class II 32 102. 13.5
7 Obesity class III 30 115. 15.8
8 <NA> 6 NA NA
<fct>
= a factor variable, i.e. a category!diabetes_data <- diabetes_data |>
mutate(gender_class = case_when(gender == "female" ~ 0,
gender == "male" ~ 1),
BFP = 1.39 * BMI + 0.16 * age - 10.34 * gender_class - 9,
WHR = waist / hip)
diabetes_data
# A tibble: 403 × 28
id chol stab.glu hdl ratio glyhb location age gender height weight
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 1000 203 82 56 3.60 4.31 Buckingham 46 female 62 121
2 1001 165 97 24 6.90 4.44 Buckingham 29 female 64 218
3 1002 228 92 37 6.20 4.64 Buckingham 58 female 61 256
4 1003 78 93 12 6.5 4.63 Buckingham 67 male 67 119
5 1005 249 90 28 8.90 7.72 Buckingham 64 male 68 183
6 1008 248 94 69 3.60 4.81 Buckingham 34 male 71 190
7 1011 195 92 41 4.80 4.84 Buckingham 30 male 69 191
8 1015 227 75 44 5.20 3.94 Buckingham 37 male 59 170
9 1016 177 87 49 3.60 4.84 Buckingham 45 male 69 166
10 1022 263 89 40 6.60 5.78 Buckingham 55 female 63 202
# ℹ 393 more rows
# ℹ 17 more variables: frame <chr>, bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>,
# bp.2d <dbl>, waist <dbl>, hip <dbl>, time.ppn <dbl>, height_cm <dbl>,
# weight_kg <dbl>, hip_cm <dbl>, waist_cm <dbl>, BMI <dbl>, BMI_class <chr>,
# gender_class <dbl>, BFP <dbl>, WHR <dbl>
First look at the data
Well, there does seem to be a tendency…
diabetes_data |>
select(BMI, WHR, BFP) |>
pivot_longer(cols = c(WHR, BFP),
names_to = "metric",
values_to = "value")
# A tibble: 806 × 3
BMI metric value
<dbl> <chr> <dbl>
1 22.1 WHR 0.763
2 22.1 BFP 29.1
3 37.4 WHR 0.958
4 37.4 BFP 47.7
5 48.4 WHR 0.860
6 48.4 BFP 67.6
7 18.6 WHR 0.868
8 18.6 BFP 17.3
9 27.9 WHR 1.07
10 27.9 BFP 29.6
# ℹ 796 more rows
corrs <- diabetes_data |>
select(BMI, WHR, BFP) |>
pivot_longer(cols = c(WHR, BFP),
names_to = "metric",
values_to = "value") |>
drop_na() |>
group_by(metric) |>
summarise(PCC = cor(x = BMI,
y = value,
use = "complete.obs",
method = "pearson"))
corrs
# A tibble: 2 × 2
metric PCC
<chr> <dbl>
1 BFP 0.888
2 WHR 0.104
NB! “Tim Toady” (TIMTOWTDI) and - As much as possible, prepare your data to contain what you need before plotting!
R for Bio Data Science