Recap Lab 8

Leon Eyrich Jessen

Tidy PCA Assignment

About Data - I Choose

The BLOSUM (BLOcks SUbstitution Matrix)

  • A substitution matrix used extensively in protein e.g. sequence alignment

  • Contains log-odds scores indicating how likely a substition is

  • Derived from very conserved regions of protein families in the BLOCKS database

  • Using blocks of aligned sequence segments

  • Blocks are chosen such that they have less than x% similarity

  • E.g. BLOSUM62 is derived from blocks with less than 62% similarity

Note, only numbers computed from observed substitutions here! No information on physico-chemical features of amino acids!

Get Data

Download

download.file(
  url = "https://raw.githubusercontent.com/rdpstaff/AlignmentTools/master/src/data/blosum62.txt",
  destfile = "data/_raw/blosum62.txt")
bl62 <- read_table(
  file = "data/_raw/blosum62.txt",
  comment = "#")
bl62
# A tibble: 24 × 24
   A         R     N     D     C     Q     E     G     H     I     L     K     M
   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 A         4    -1    -2    -2     0    -1    -1     0    -2    -1    -1    -1
 2 R        -1     5     0    -2    -3     1     0    -2     0    -3    -2     2
 3 N        -2     0     6     1    -3     0     0     0     1    -3    -3     0
 4 D        -2    -2     1     6    -3     0     2    -1    -1    -3    -4    -1
 5 C         0    -3    -3    -3     9    -3    -4    -3    -3    -1    -1    -3
 6 Q        -1     1     0     0    -3     5     2    -2     0    -3    -2     1
 7 E        -1     0     0     2    -4     2     5    -2     0    -3    -3     1
 8 G         0    -2     0    -1    -3    -2    -2     6    -2    -4    -4    -2
 9 H        -2     0     1    -1    -3     0     0    -2     8    -3    -3    -1
10 I        -1    -3    -3    -3    -1    -3    -3    -4    -3     4     2    -3
# ℹ 14 more rows
# ℹ 11 more variables: F <dbl>, P <dbl>, S <dbl>, T <dbl>, W <dbl>, Y <dbl>,
#   V <dbl>, B <dbl>, Z <dbl>, X <dbl>, `*` <dbl>

Wrangle Data

Adjust Column Names

colnames(bl62) <- c("aa", colnames(bl62))

Subset to Data on the 20 Proteinogenic AA

std_aa = c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
           "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
bl62 <- bl62 |> 
  filter(aa %in% std_aa) |> 
  select(aa, std_aa)
bl62
# A tibble: 20 × 21
   aa        A     R     N     D     C     Q     E     G     H     I     L     K
   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 A         4    -1    -2    -2     0    -1    -1     0    -2    -1    -1    -1
 2 R        -1     5     0    -2    -3     1     0    -2     0    -3    -2     2
 3 N        -2     0     6     1    -3     0     0     0     1    -3    -3     0
 4 D        -2    -2     1     6    -3     0     2    -1    -1    -3    -4    -1
 5 C         0    -3    -3    -3     9    -3    -4    -3    -3    -1    -1    -3
 6 Q        -1     1     0     0    -3     5     2    -2     0    -3    -2     1
 7 E        -1     0     0     2    -4     2     5    -2     0    -3    -3     1
 8 G         0    -2     0    -1    -3    -2    -2     6    -2    -4    -4    -2
 9 H        -2     0     1    -1    -3     0     0    -2     8    -3    -3    -1
10 I        -1    -3    -3    -3    -1    -3    -3    -4    -3     4     2    -3
11 L        -1    -2    -3    -4    -1    -2    -3    -4    -3     2     4    -2
12 K        -1     2     0    -1    -3     1     1    -2    -1    -3    -2     5
13 M        -1    -1    -2    -3    -1     0    -2    -3    -2     1     2    -1
14 F        -2    -3    -3    -3    -2    -3    -3    -3    -1     0     0    -3
15 P        -1    -2    -2    -1    -3    -1    -1    -2    -2    -3    -3    -1
16 S         1    -1     1     0    -1     0     0     0    -1    -2    -2     0
17 T         0    -1     0    -1    -1    -1    -1    -2    -2    -1    -1    -1
18 W        -3    -3    -4    -4    -2    -2    -3    -2    -2    -3    -2    -3
19 Y        -2    -2    -2    -3    -2    -1    -2    -3     2    -1    -1    -2
20 V         0    -3    -3    -3    -1    -2    -2    -3    -3     3     1    -2
# ℹ 8 more variables: M <dbl>, F <dbl>, P <dbl>, S <dbl>, T <dbl>, W <dbl>,
#   Y <dbl>, V <dbl>

Wrangle Data

Save to File

write_tsv(x = bl62,
          file = "data/blosum62.tsv")

Visualise Data

pl <- bl62 |>
  pivot_longer(
    cols = -aa,
    names_to = "aa2",
    values_to = "log_odds_score") |> 
  ggplot(aes(
    x = factor(aa,
               levels = std_aa),
    y = factor(aa2,
               levels = rev(std_aa)),
    fill = log_odds_score,
    label = log_odds_score)) +
  geom_tile() +
  geom_text(colour = "darkgrey",
            size = 7) +
  scale_fill_gradient2(low = "blue",
                       mid = "white",
                       high = "red",
                       midpoint = 0) +
  scale_x_discrete(position = "top") +
  coord_fixed() +
  theme_minimal(base_size = 22,
                base_family = "Avenir") +
  theme(legend.position = "none",
        plot.title = element_text(
          hjust = 0.5,
          vjust = -4)) +
  labs(
    x = "",
    y = "",
    title = str_c(
    "The BLOSUM62 Visualised ",
    "as a Heatmap"))

Create a PCA object

bl62_pca <- bl62 |>
  select_if(is.numeric) |>
  prcomp(center = TRUE,
         scale = TRUE)
bl62_pca |>
  str()
List of 5
 $ sdev    : num [1:20] 2.8 1.88 1.51 1.1 1.04 ...
 $ rotation: num [1:20, 1:20] -0.022 -0.228 -0.286 -0.281 0.163 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:20] "A" "R" "N" "D" ...
  .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
 $ center  : Named num [1:20] -0.8 -1.05 -0.9 -1.3 -1.55 -0.6 -0.85 -1.7 -0.85 -1.35 ...
  ..- attr(*, "names")= chr [1:20] "A" "R" "N" "D" ...
 $ scale   : Named num [1:20] 1.47 2.01 2.29 2.36 2.7 ...
  ..- attr(*, "names")= chr [1:20] "A" "R" "N" "D" ...
 $ x       : num [1:20, 1:20] -0.115 -2.326 -3.34 -3.244 2.597 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : NULL
  .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
 - attr(*, "class")= chr "prcomp"
  • “Complicated” model object \(\rightarrow\) broom to the rescue!

Make a scree plot using broom to tidy

bl62_pca |>
  tidy("pcs")
# A tibble: 20 × 4
      PC  std.dev percent cumulative
   <dbl>    <dbl>   <dbl>      <dbl>
 1     1 2.80e+ 0 0.391        0.391
 2     2 1.88e+ 0 0.176        0.567
 3     3 1.51e+ 0 0.114        0.681
 4     4 1.10e+ 0 0.0603       0.742
 5     5 1.04e+ 0 0.0542       0.796
 6     6 8.85e- 1 0.0392       0.835
 7     7 8.71e- 1 0.0380       0.873
 8     8 8.51e- 1 0.0362       0.909
 9     9 7.61e- 1 0.0290       0.938
10    10 6.23e- 1 0.0194       0.958
11    11 5.46e- 1 0.0149       0.973
12    12 3.89e- 1 0.00755      0.980
13    13 3.55e- 1 0.0063       0.986
14    14 2.93e- 1 0.0043       0.991
15    15 2.46e- 1 0.00302      0.994
16    16 2.21e- 1 0.00244      0.996
17    17 2.16e- 1 0.00233      0.999
18    18 1.71e- 1 0.00146      1.00 
19    19 2.33e- 2 0.00003      1    
20    20 8.86e-18 0            1    
bl62_pca |>
  tidy("pcs") |> 
  mutate(percent = percent * 100) |> 
  ggplot(aes(x = PC,
             y = percent)) +
  geom_hline(yintercept = 0) +
  geom_col(colour = "black",
           alpha = 0.5) +
  theme_bw(base_size = 20) +
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_blank()) +
  labs(title = "Scree Plot of PCA of BLOSUM62")

Augment using broom

bl62_pca_aug <- bl62_pca |>
  augment(bl62)
bl62_pca_aug
# A tibble: 20 × 42
   .rownames aa        A     R     N     D     C     Q     E     G     H     I
   <chr>     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1         A         4    -1    -2    -2     0    -1    -1     0    -2    -1
 2 2         R        -1     5     0    -2    -3     1     0    -2     0    -3
 3 3         N        -2     0     6     1    -3     0     0     0     1    -3
 4 4         D        -2    -2     1     6    -3     0     2    -1    -1    -3
 5 5         C         0    -3    -3    -3     9    -3    -4    -3    -3    -1
 6 6         Q        -1     1     0     0    -3     5     2    -2     0    -3
 7 7         E        -1     0     0     2    -4     2     5    -2     0    -3
 8 8         G         0    -2     0    -1    -3    -2    -2     6    -2    -4
 9 9         H        -2     0     1    -1    -3     0     0    -2     8    -3
10 10        I        -1    -3    -3    -3    -1    -3    -3    -4    -3     4
11 11        L        -1    -2    -3    -4    -1    -2    -3    -4    -3     2
12 12        K        -1     2     0    -1    -3     1     1    -2    -1    -3
13 13        M        -1    -1    -2    -3    -1     0    -2    -3    -2     1
14 14        F        -2    -3    -3    -3    -2    -3    -3    -3    -1     0
15 15        P        -1    -2    -2    -1    -3    -1    -1    -2    -2    -3
16 16        S         1    -1     1     0    -1     0     0     0    -1    -2
17 17        T         0    -1     0    -1    -1    -1    -1    -2    -2    -1
18 18        W        -3    -3    -4    -4    -2    -2    -3    -2    -2    -3
19 19        Y        -2    -2    -2    -3    -2    -1    -2    -3     2    -1
20 20        V         0    -3    -3    -3    -1    -2    -2    -3    -3     3
# ℹ 30 more variables: L <dbl>, K <dbl>, M <dbl>, F <dbl>, P <dbl>, S <dbl>,
#   T <dbl>, W <dbl>, Y <dbl>, V <dbl>, .fittedPC1 <dbl>, .fittedPC2 <dbl>,
#   .fittedPC3 <dbl>, .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
#   .fittedPC7 <dbl>, .fittedPC8 <dbl>, .fittedPC9 <dbl>, .fittedPC10 <dbl>,
#   .fittedPC11 <dbl>, .fittedPC12 <dbl>, .fittedPC13 <dbl>, .fittedPC14 <dbl>,
#   .fittedPC15 <dbl>, .fittedPC16 <dbl>, .fittedPC17 <dbl>, .fittedPC18 <dbl>,
#   .fittedPC19 <dbl>, .fittedPC20 <dbl>

Add some chemical classes

get_chem_class <- function(x){
  chem_cols <- c(
    "A" = "Hydrophobic",
    "R" = "Basic",
    "N" = "Neutral",
    "D" = "Acidic",
    "C" = "Sulphur",
    "Q" = "Neutral",
    "E" = "Acidic",
    "G" = "Polar",
    "H" = "Basic",
    "I" = "Hydrophobic",
    "L" = "Hydrophobic",
    "K" = "Basic",
    "M" = "Sulphur",
    "F" = "Hydrophobic",
    "P" = "Hydrophobic",
    "S" = "Polar",
    "T" = "Polar",
    "W" = "Hydrophobic",
    "Y" = "Polar",
    "V" = "Hydrophobic")
  return( chem_cols[x] ) # Example of avoiding dependencies => shareable!
}
  • This is a named vector, think dictionary (Yes R can do that too!)

Add some chemical classes

  • Note how we are using our own custom function inside a dplyr pipeline here
get_chem_class(x = c("A", "R", "D"))
            A             R             D 
"Hydrophobic"       "Basic"      "Acidic" 
bl62_pca_aug <- bl62_pca_aug |> 
  mutate(chem_class = get_chem_class(aa))

bl62_pca_aug |>
  select(aa, chem_class)
# A tibble: 20 × 2
   aa    chem_class 
   <chr> <chr>      
 1 A     Hydrophobic
 2 R     Basic      
 3 N     Neutral    
 4 D     Acidic     
 5 C     Sulphur    
 6 Q     Neutral    
 7 E     Acidic     
 8 G     Polar      
 9 H     Basic      
10 I     Hydrophobic
11 L     Hydrophobic
12 K     Basic      
13 M     Sulphur    
14 F     Hydrophobic
15 P     Hydrophobic
16 S     Polar      
17 T     Polar      
18 W     Hydrophobic
19 Y     Polar      
20 V     Hydrophobic

Plot the PCA Plot

pca_plot_axes_labels <- bl62_pca |>
  tidy("eigenvalues") |>
  mutate(lbl = str_c("PC", PC, ", VE = ", round(percent*100,2), "%")) |> 
  pull("lbl")
pca_plot_axes_labels
 [1] "PC1, VE = 39.14%" "PC2, VE = 17.59%" "PC3, VE = 11.41%" "PC4, VE = 6.03%" 
 [5] "PC5, VE = 5.42%"  "PC6, VE = 3.92%"  "PC7, VE = 3.8%"   "PC8, VE = 3.62%" 
 [9] "PC9, VE = 2.9%"   "PC10, VE = 1.94%" "PC11, VE = 1.49%" "PC12, VE = 0.76%"
[13] "PC13, VE = 0.63%" "PC14, VE = 0.43%" "PC15, VE = 0.3%"  "PC16, VE = 0.24%"
[17] "PC17, VE = 0.23%" "PC18, VE = 0.15%" "PC19, VE = 0%"    "PC20, VE = 0%"   

Plot the PCA Plot

pca_plot <- bl62_pca_aug |> 
  ggplot(aes(x = .fittedPC1,
             y = .fittedPC2,
             label = aa,
             colour = chem_class,
             fill = chem_class)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  geom_point(shape = 21,
             size = 6,
             alpha = 0.5) +
  geom_text(colour = "black") +
  scale_fill_manual(
    values = c("red", "blue", "black",
               "purple", "green", "yellow")) +
  scale_colour_manual(
    values = c("red", "blue", "black",
               "purple", "green", "yellow")) +
  coord_fixed() +
  theme_bw(base_size = 20,
           base_family = "avenir") +
  labs(
    title = "PCA: Scores Plot of BLOSUM62",
    x = pluck(pca_plot_axes_labels, 1),
    y = pluck(pca_plot_axes_labels, 2),
    fill = "Chemistry",
    colour = "Chemistry",
    caption = "Up: Aromatic, down: Aliphatic, Left: Charged, Right: Hydrophobic")

That’s about it for tidyverse

  • We have been over a ton of materials - Well done!

  • Now we will have two great labs on R-packages and Shiny

  • And then the mini-symposium, with 6 talks on how R for Bio Data Science is being applied in industry

That’s about it for tidyverse

Lastly, a quick case story:

  • A well seasoned Professor was curious about the tidyverse, but reluctant! Over perhaps 6 months, I would answer the occasional over-the-coffee questions and point to some materials, still feeling some push-back. Then suddenly, one day said colleague came into my office and proclaimed: “I get it now! The penny has dropped! Once you get your head around it, this thing is a complete game changer!

  • It was easy to for me to utter: “Could not agree more!”

  • So, get your “head around it” and you WILL HAVE…

That’s about it for tidyverse