Mock_23S_R_analysis

This is a Rscript the describes the analysis steps for 23S amplicon sequencing of the Mock community 2. Note : most of it can be done using R, but some parts are in bash and must be run through the terminal (ideally on a server, to have more ressources)

Table of content

I. From raw data to ASV table

Taxonomy affiliations

Data analysis

I. From raw data to ASV table

1. Cutadapt: remove the primers

# Script in bash 
# Command launched in the folder containing all the fastq files
cd /MOCK

# Using cutadapt-3.5

# Create a directory that will store trimmed sequences
mkdir Cut_fastq

# Create a file with all the sample names
ls *_R1_001.fastq.gz | cut -d _ -f 1,2,3 > samples.txt

#Primers sequences
#23S_F: "ACAGWAAGACCCTATGAAGCTT"
#23S_R: "CCTGTTATCCCTAGAGTAACTT"

# Paired-end trim
for sample in `awk '{print $1}' samples.txt`
do      
cutadapt -g ^ACAGWAAGACCCTATGAAGCTT -G ^CCTGTTATCCCTAGAGTAACTT --max-n 0 -o ./Cut_fastq/${sample}_R1_trimmed.fastq.gz -p ./Cut_fastq/${sample}_R2_trimmed.fastq.gz ${sample}_R1_001.fastq.gz ${sample}_R2_001.fastq.gz
done

2. DADA2: from raw sequences to ASV table

library(dada2)

# Path
path <- "set/to/your/path/"

# Store forward and reverse fastq (with primers removed by cutadapt)
fnFs <- sort(list.files(paste0(path,"/Cut_fastq"), pattern="_R1_trimmed.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(paste0(path,"/Cut_fastq"), pattern="_R2_trimmed.fastq.gz", full.names = TRUE))

# Extract sample names
sample.names <- sapply(strsplit(basename(fnFs), "_R"), `[`, 1)

# Place filtered files in a subdirectory "filtered"
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names

# Filter and trim sequences
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, 
                     truncLen=c(220,200), 
                     maxN=0, maxEE=c(2,2),truncQ=2, 
                     rm.phix=TRUE,
                     compress=TRUE, 
                     multithread=TRUE)

# Learn the error rate
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)

# Infer samples
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)

# Merge pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)

# Create an ASV table
seqtab <- makeSequenceTable(mergers)

# Remove the chimera
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

# Keep ASVs that have a good length in order to discard unspecific ASVs
seqtab.nochim <- seqtab.nochim[,nchar(colnames(seqtab)) %in% 345:370]

# Store ASV table
saveRDS(seqtab.nochim, file="seqtab_nochim.rds")

II. Taxonomic affiliations

1. DADA2: Assign taxonomy using Phytool database

# Script in bash 
# Get Phytool 23S database at : https://github-carrtel.shinyapps.io/phytool_v2/

# add a "_" instead of a space between the genus and species name, otherwise DADA2 assignation will only give a genus name
cat Phytool_complete_rRNA_23S_2023-02-10.fasta | tr "[:blank:]" "_" > Phytool_23S.fa

library(dada2)

# Run DADA2 taxonomic assignation algorithme
# Note that the use of 20 threads is not possible in every machine. Here I was using Migale server
taxonomy <- assignTaxonomy(seqtab.nochim, "Phytool_23S.fa", multithread=20)

# Save taxonomy 
saveRDS(taxonomy, file = "DADA2_ASV_taxa_Phytool.rds")

2. Blast: Get sequences match against Phytool database

library(dplyr)

# Create a fasta file for all ASV sequences
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste(">seq", sep="", ASV_table$sequence)
write.table(ASV_table, "ASV_table.txt",sep="\t", quote = FALSE,  col.names=FALSE)

# In the terminal - create a fasta file using awk
# awk '{print $14"\n"$1}' ASV_table.txt > ASV_table.fa

# Script in bash 
# Using blast-2.13.0

# Store the variables
QUERY="ASV_table.fa"
OUT="Phytool_ASV_blast_all.txt"

# Launch blast script
blastn -query $QUERY -subject Phytool_23S.fa -max_target_seqs 10 -evalue 1e-6 -outfmt "6 qseqid sseqid length qlen slen astart gend sstart send evalue pident mismatch gapopen" -out $OUT -num_threads 50

3. Blast: Get sequences match against all NCBI nt database

# Script in bash 
# Store the variables
#Note that the path to the nt database here is the one of Migale server
BLASTDB="/db/outils/BLAST+taxo/nt/current/nt"
QUERY="ASV_table.fa"
OUT="ASV_blast_all_taxa.txt"

# Launch blast script
blastn -query $QUERY -db $BLASTDB -max_target_seqs 10 -evalue 1e-6 -outfmt "6 qseqid sseqid length qlen slen astart gend sstart send evalue pident mismatch gapopen gi sacc staxids sscinames scomnames stitle" -out $OUT -num_threads 60

4. Combine all taxonomy information in a single table and save

library(data.table)
library(dplyr)
library(stringr)

# Load the three tables
ASV_DADA2_taxonomy <- as.data.frame(readRDS("DADA2_ASV_taxa_Phytool.rds"))
ASV_blast_ncbi_taxonomy <- read.table("ASV_blast_all_taxa.txt", sep="\t")
ASV_blast_phytool_taxonomy <- read.table("Phytool_ASV_blast.txt", header = FALSE, sep = "\t", col.names = paste0("V",seq_len(11)), fill = TRUE)

# Add a seqid column to make the link with blast table
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste(">seq", sep="", ASV_table$sequence)
ASV_DADA2_taxonomy$seqid <- ASV_table[match(rownames(ASV_DADA2_taxonomy), rownames(ASV_table)),"sequence"]
ASV_DADA2_taxonomy$seqid <- gsub(">","", ASV_DADA2_taxonomy$seqid)

# Customize ncbi table
ASV_blast_ncbi_taxonomy <- ASV_blast_ncbi_taxonomy[,c(1:11,16)]
colnames(ASV_blast_ncbi_taxonomy) <- c("qseqid","sseqid","length","qlen","slen","astart","aend","evalue","pident","mismatch","gapopen","stitle")

# Keep only known affiliation
ASV_blast_ncbi_taxonomy <- ASV_blast_ncbi_taxonomy[grep("Uncultured",ASV_blast_ncbi_taxonomy$stitle, invert=T),]

# Keep the best hit for each sequence (i.e. the first one)
ASV_blast_ncbi_taxonomy_uniq <- setDT(ASV_blast_ncbi_taxonomy)[!duplicated(rleid(qseqid))]

# Keep only species name
for (i in 1:nrow(ASV_blast_ncbi_taxonomy_uniq)) {
  ASV_blast_ncbi_taxonomy_uniq$ncbi_blast[i] <-  paste(strsplit(ASV_blast_ncbi_taxonomy_uniq$stitle[i], " ")[[1]][1], strsplit(ASV_blast_ncbi_taxonomy_uniq$stitle[i], " ")[[1]][2], sep="_")
}

# Add ncbiBlast annotation in ASV_DADA2_taxonomy_table
ASV_DADA2_taxonomy <- merge(ASV_DADA2_taxonomy, ASV_blast_ncbi_taxonomy_uniq %>%
                select(ncbi_blast, qseqid), by.x = "seqid", by.y = "qseqid", all.x = TRUE)

# Customize phytool blast table
colnames(ASV_blast_phytool_taxonomy) <- c("qseqid","sseqid","length","qlen","slen","astart","aend","evalue","pident","mismatch","gapopen")

# Keep the best hit for each sequence (i.e. the first one)
ASV_blast_phytool_taxonomy_uniq <- setDT(ASV_blast_phytool_taxonomy)[!duplicated(rleid(qseqid))]

# Keep only the species name
ASV_blast_phytool_taxonomy_uniq$Phytool_blast <- sapply(ASV_blast_phytool_taxonomy_uniq$sseqid, function(x) str_split_i(x, ";",-1)[[1]])

# Add PhytoolBlast species annotation in ASV_DADA2_taxonomy_table

ASV_DADA2_taxonomy <- merge(ASV_DADA2_taxonomy, ASV_blast_phytool_taxonomy_uniq %>%
                select(Phytool_blast, qseqid), by.x = "seqid", by.y = "qseqid", all.x = TRUE) 


# Save the taxonomy table
saveRDS(ASV_DADA2_taxonomy, file="ASV_DADA2_taxonomy_combined.rds")

##                       Species           Phytool_blast              ncbi_blast
## seq1        Vaucheria_litorea       Vaucheria_litorea     Xanthonema_montanum
## seq2     Botryococcus_braunii    Botryococcus_braunii    Botryococcus_braunii
## seq3    Planktothrix_agardhii  Planktothrix_rubescens   Planktothrix_agardhii
## seq4      Cyanobium_rubescens       Synechococcus_sp.      Anathece_clathrata
## seq5          Pandorina_morum         Pandorina_morum       Brevundimonas_sp.
## seq6        Staurosira_venter       Staurosira_venter   Fragilaria_construens
## seq7  Haematococcus_lacustris Haematococcus_lacustris Haematococcus_lacustris
## seq8          Vischeria_magna         Vischeria_magna     Chlorella_desiccata
## seq9     Asterionella_formosa    Asterionella_formosa    Asterionella_formosa
## seq10  Microcystis_aeruginosa         Microcystis_sp.  Microcystis_aeruginosa
## seq11    Tetradesmus_obliquus    Tetradesmus_obliquus    Tetradesmus_obliquus
## seq12 Zygnema_circumcarinatum Zygnema_circumcarinatum             Zygnema_sp.
## seq13         Nitzschia_palea           Nitzschia_sp.         Nitzschia_palea
## seq14   Trichormus_variabilis          Wollea_ambigua   Trichormus_variabilis
## seq15                    <NA> Mesotaenium_caldariorum           Mougeotia_sp.
## seq16                    <NA>    Fragilaria_perminuta  Fragilaria_crotonensis
## seq17         Pandorina_morum         Pandorina_morum       Brevundimonas_sp.
## seq18     Cryptomonas_curvata         Cryptomonas_sp.     Cryptomonas_curvata
## seq19 Staurastrum_punctulatum Staurastrum_punctulatum Staurastrum_punctulatum
## seq20        Romeria_chlorina        Romeria_chlorina     Pseudanabaena_yagii

III. Data analysis

1. Prepare ASV table for the analysis

Get normalized and unormalized tables

library(DESeq2)

# Add seq name instead of ASV
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste("seq", sep="", ASV_table$sequence)
rownames(ASV_table) <- ASV_table$sequence

# Unormalized table
ASV_table_raw <- ASV_table[,-13]

# Normalization : mean
ASV_matrix <- as.matrix(ASV_table_raw)

s_num<-dim(ASV_matrix)[2]
otu.perc<-matrix(rep("NA",times=(dim(ASV_matrix)[1]*s_num)),nrow=dim(ASV_matrix)[1],ncol=s_num)
rownames(otu.perc)<-rownames(ASV_matrix)
colnames(otu.perc)=colnames(ASV_matrix) 
totals<-colSums(ASV_matrix)

for(s in c(1:s_num)){
  vec<-(ASV_matrix[,s]/totals[s])*100
  otu.perc[,s]<-vec
  rm(vec)
}

ASV_table_perc <- as.data.frame(otu.perc)

library(ggplot2)

ASV_table_raw$species <- ASV_DADA2_taxonomy[match(rownames(ASV_table_raw), ASV_DADA2_taxonomy$seqid),"Phytool_blast"]

melt_table <- melt(ASV_table_raw, id.var="species")
melt_table$value <- as.numeric(melt_table$value)

species_abund <- dcast(melt_table, species + variable ~., sum, value.var="value")

colnames(species_abund) <- c("species","variable","value")

Equimolar <- subset(species_abund, variable=="AUTEQ_S182_L001"|variable=="GENEQ_S179_L001")

gg <- ggplot(Equimolar, aes(x=species, y=value, fill=variable)) +
      geom_bar(stat= "identity", position="dodge", aes(fill=variable), width=0.7) +
      scale_fill_manual(values=c("#383838","#a9a9a9")) +
      theme(strip.text.y = element_text(size = 8, angle = 0), axis.text.x = element_text(size = 8 , angle = 90), axis.text.y = element_text(size = 8)) +
      xlab("Species") +
      ylab("Number of reads")

Group the “true” species together

library(ggplot2)

ASV_table_raw_T <- na.omit(ASV_table_raw)

ASV_table_raw_T[ASV_table_raw_T$species == "Planktothrix_suspensa" | 
                ASV_table_raw_T$species == "Planktothrix_rubescens"| 
                ASV_table_raw_T$species == "Planktothrix_agardhii"|
                ASV_table_raw_T$species == "Planktothrix_mougeotii", "True_specie"] <- "Planktothrix_rubescens"
ASV_table_raw_T[ASV_table_raw_T$species == "Bathycoccus_prasinos" | 
                ASV_table_raw_T$species == "Botryococcus_braunii", "True_specie"] <- "Botryococcus_braunii"
ASV_table_raw_T[ASV_table_raw_T$species == "Asterionella_ralfsii"|
                ASV_table_raw_T$species == "Asterionella_formosa", "True_specie"] <- "Asterionella_formosa"
ASV_table_raw_T[ASV_table_raw_T$species == "Cryptomonas_sp." |
                ASV_table_raw_T$species == "Cryptomonas_curvata", "True_specie"] <- "Cryptomonas_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Nitzschia_sp.", "True_specie"] <- "Nitzschia_palea"
ASV_table_raw_T[ASV_table_raw_T$species == "Staurastrum_punctulatum", "True_specie"] <- "Cosmarium_regnelii"
ASV_table_raw_T[ASV_table_raw_T$species == "Microcystis_sp."| 
                ASV_table_raw_T$species == "Microcystis_aeruginosa", "True_specie"] <- "Microcystis_aeruginosa"
ASV_table_raw_T[ASV_table_raw_T$species == "Zygnema_circumcarinatum" | 
                ASV_table_raw_T$species == "Zygnema_sp.", "True_specie"] <- "Zygnema_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Vischeria_magna", "True_specie"] <- "Vischeria_magna"
ASV_table_raw_T[ASV_table_raw_T$species == "Staurosira_venter", "True_specie"] <- "Staurosira_venter"
ASV_table_raw_T[ASV_table_raw_T$species == "Tetradesmus_obliquus", "True_specie"] <- "Tetradesmus_obliquus"
ASV_table_raw_T[ASV_table_raw_T$species == "Trichormus_variabilis"|
                ASV_table_raw_T$species == "Wollea_ambigua", "True_specie"] <- "Dolichospermum_flosaquae"
ASV_table_raw_T[ASV_table_raw_T$species == "Synechococcus_sp." |
                ASV_table_raw_T$species == "Cyanobium_rubescens", "True_specie"] <- "Anathece_clathrata"
ASV_table_raw_T[ASV_table_raw_T$species == "Vaucheria_litorea", "True_specie"] <- "Xanthonema_montanum"
ASV_table_raw_T[ASV_table_raw_T$species == "Mesotaenium_caldariorum", "True_specie"] <- "Mougeotia_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Fragilaria_perminuta", "True_specie"] <- "Ulnaria_acus"
ASV_table_raw_T[ASV_table_raw_T$species == "Haematococcus_lacustris", "True_specie"] <- "Haematococcus_lacustris"
ASV_table_raw_T[ASV_table_raw_T$species == "Pandorina_morum", "True_specie"] <- "Pandorina_morum"

ASV_table_raw_T <- ASV_table_raw_T[,c("AUTEQ_S182_L001","AUTEX_S184_L001","AUTGR_S183_L001","GENEQ_S179_L001","GENEX_S181_L001","GENGR_S180_L001","True_specie")]

melt_table <- melt(ASV_table_raw_T, id.var="True_specie")
melt_table$value <- as.numeric(melt_table$value)

species_abund <- dcast(melt_table, True_specie + variable ~., sum, value.var="value")

colnames(species_abund) <- c("species","variable","value")

2. Graphic for Equimolar data

Equimolar <- subset(species_abund, variable=="AUTEQ_S182_L001" | variable=="GENEQ_S179_L001")

Equimolar$species <- factor(Equimolar$species, levels=c("Dolichospermum_flosaquae","Planktothrix_rubescens","Microcystis_aeruginosa","Anathece_clathrata","Cosmarium_regnelii","Mougeotia_sp.","Zygnema_sp.","Haematococcus_lacustris","Tetradesmus_obliquus","Pandorina_morum","Botryococcus_braunii","Nitzschia_palea","Staurosira_venter","Ulnaria_acus","Asterionella_formosa","Cryptomonas_sp.","Vischeria_magna","Xanthonema_montanum"))

Equimolar <- na.omit(Equimolar)

gg1 <- ggplot(Equimolar, aes(x=species, y=value, fill=variable)) +
       geom_bar(stat= "identity", position="dodge", aes(fill=variable), width=0.7) +
       scale_fill_manual(values=c("#383838","#a9a9a9")) +
       theme(strip.text.y = element_text(size = 8, angle = 0), 
             axis.text.x = element_text(size = 8 , angle = 90), 
             axis.text.y = element_text(size = 8), 
             legend.position="none",
             panel.border = element_blank(),  
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.background = element_blank(),
             axis.line = element_line(colour = "grey")) +
      xlab("Species") +
      ylab("Number of reads")

3. Graphic for Gradient concentrations

Gradient <- subset(species_abund, variable=="AUTGR_S183_L001" | variable=="GENGR_S180_L001")
Gradient <- na.omit(Gradient)

Gradient[Gradient$species == "Botryococcus_braunii", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Cosmarium_regnelii", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Cryptomonas_sp.", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Zygnema_sp.", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Mougeotia_sp.", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Ulnaria_acus", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Nitzschia_palea", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Microcystis_aeruginosa", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Tetradesmus_obliquus", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Pandorina_morum", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Dolichospermum_flosaquae", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Haematococcus_lacustris", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Staurosira_venter", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Vischeria_magna", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Asterionella_formosa", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Xanthonema_montanum", "DNA_concentration"] <- 50
Gradient[Gradient$species == "Anathece_clathrata", "DNA_concentration"] <- 50
Gradient[Gradient$species == "Planktothrix_rubescens", "DNA_concentration"] <- 50

Gradient <- subset(Gradient, !DNA_concentration =="NA")

gg2 <- ggplot(Gradient, aes(x=DNA_concentration, y=as.numeric(value),group=variable)) +
       geom_point(aes(color=variable), size=5) +
       scale_color_manual(values=c("#383838","#a9a9a9")) +
       geom_smooth(method=lm,se=F, aes(color=variable)) +
       theme(legend.position="none",
             panel.border = element_blank(),  
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.background = element_blank(),
             axis.line = element_line(colour = "grey")) +
       xlab("DNA concentration (ng/µL) introduced in Mock 2") +
       ylab("Number of reads")

4. Graphic for Exponential concentrations

Expo <- subset(species_abund, variable=="AUTEX_S184_L001" | variable=="GENEX_S181_L001")
Expo <- na.omit(Expo)

Expo[Expo$species == "Asterionella_formosa", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Cryptomonas_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Staurosira_venter", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Vischeria_magna", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Zygnema_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Mougeotia_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Ulnaria_acus", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Anathece_clathrata", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Nitzschia_palea", "DNA_concentration"] <- 1
Expo[Expo$species == "Cosmarium_regnelii", "DNA_concentration"] <- 1
Expo[Expo$species == "Planktothrix_rubescens", "DNA_concentration"] <- 1
Expo[Expo$species == "Microcystis_aeruginosa", "DNA_concentration"] <- 1
Expo[Expo$species == "Tetradesmus_obliquus", "DNA_concentration"] <- 1
Expo[Expo$species == "Dolichospermum_flosaquae", "DNA_concentration"] <- 1
Expo[Expo$species == "Haematococcus_lacustris", "DNA_concentration"] <- 1
Expo[Expo$species == "Pandorina_morum", "DNA_concentration"] <- 1
Expo[Expo$species == "Botryococcus_braunii", "DNA_concentration"] <- 50
Expo[Expo$species == "Xanthonema_montanum", "DNA_concentration"] <- 50

Expo <- subset(Expo, !DNA_concentration =="NA")

gg3 <- ggplot(Expo, aes(x=as.factor(DNA_concentration), y=log(value + 1))) +
       geom_boxplot(aes(fill=variable)) +
       scale_fill_manual(values=c("#383838","#a9a9a9")) +
       theme(legend.position="none",
             panel.border = element_blank(),  
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.background = element_blank(),
             axis.line = element_line(colour = "grey")) +
       xlab("DNA concentration (ng/µL) introduced in Mock 2") +
       ylab("Number of reads (logx+1)")