This is a Rscript the describes the analysis steps for 23S amplicon sequencing of the Mock community 2. Note : most of it can be done using R, but some parts are in bash and must be run through the terminal (ideally on a server, to have more ressources)

Table of content

I. From raw data to ASV table

  1. Cutadapt: remove the primers
  2. DADA2: from raw sequences to ASV table
  1. Taxonomy affiliations
  1. DADA2: Assign taxonomy using Phytool database
  2. Blast: Get sequences match against Phytool database
  3. Blast: Get sequences matchagainst all NCBI nt database
  4. Combine all taxonomy information in a single table and save
  1. Data analysis
  1. Prepare ASV table for the analysis
  2. Equimolar concentrations
  3. Gradient concentrations
  4. Exponential concentrations

I. From raw data to ASV table

1. Cutadapt: remove the primers

# Script in bash 
# Command launched in the folder containing all the fastq files
cd /MOCK

# Using cutadapt-3.5

# Create a directory that will store trimmed sequences
mkdir Cut_fastq

# Create a file with all the sample names
ls *_R1_001.fastq.gz | cut -d _ -f 1,2,3 > samples.txt

#Primers sequences
#23S_F: "ACAGWAAGACCCTATGAAGCTT"
#23S_R: "CCTGTTATCCCTAGAGTAACTT"

# Paired-end trim
for sample in `awk '{print $1}' samples.txt`
do      
cutadapt -g ^ACAGWAAGACCCTATGAAGCTT -G ^CCTGTTATCCCTAGAGTAACTT --max-n 0 -o ./Cut_fastq/${sample}_R1_trimmed.fastq.gz -p ./Cut_fastq/${sample}_R2_trimmed.fastq.gz ${sample}_R1_001.fastq.gz ${sample}_R2_001.fastq.gz
done

2. DADA2: from raw sequences to ASV table

library(dada2)

# Path
path <- "set/to/your/path/"

# Store forward and reverse fastq (with primers removed by cutadapt)
fnFs <- sort(list.files(paste0(path,"/Cut_fastq"), pattern="_R1_trimmed.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(paste0(path,"/Cut_fastq"), pattern="_R2_trimmed.fastq.gz", full.names = TRUE))

# Extract sample names
sample.names <- sapply(strsplit(basename(fnFs), "_R"), `[`, 1)

# Place filtered files in a subdirectory "filtered"
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names

# Filter and trim sequences
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, 
                     truncLen=c(220,200), 
                     maxN=0, maxEE=c(2,2),truncQ=2, 
                     rm.phix=TRUE,
                     compress=TRUE, 
                     multithread=TRUE)

# Learn the error rate
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)

# Infer samples
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)

# Merge pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)

# Create an ASV table
seqtab <- makeSequenceTable(mergers)

# Remove the chimera
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

# Keep ASVs that have a good length in order to discard unspecific ASVs
seqtab.nochim <- seqtab.nochim[,nchar(colnames(seqtab)) %in% 345:370]

# Store ASV table
saveRDS(seqtab.nochim, file="seqtab_nochim.rds")

II. Taxonomic affiliations

1. DADA2: Assign taxonomy using Phytool database

# Script in bash 
# Get Phytool 23S database at : https://github-carrtel.shinyapps.io/phytool_v2/

# add a "_" instead of a space between the genus and species name, otherwise DADA2 assignation will only give a genus name
cat Phytool_complete_rRNA_23S_2023-02-10.fasta | tr "[:blank:]" "_" > Phytool_23S.fa
library(dada2)

# Run DADA2 taxonomic assignation algorithme
# Note that the use of 20 threads is not possible in every machine. Here I was using Migale server
taxonomy <- assignTaxonomy(seqtab.nochim, "Phytool_23S.fa", multithread=20)

# Save taxonomy 
saveRDS(taxonomy, file = "DADA2_ASV_taxa_Phytool.rds")

2. Blast: Get sequences match against Phytool database

library(dplyr)

# Create a fasta file for all ASV sequences
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste(">seq", sep="", ASV_table$sequence)
write.table(ASV_table, "ASV_table.txt",sep="\t", quote = FALSE,  col.names=FALSE)

# In the terminal - create a fasta file using awk
# awk '{print $14"\n"$1}' ASV_table.txt > ASV_table.fa
# Script in bash 
# Using blast-2.13.0

# Store the variables
QUERY="ASV_table.fa"
OUT="Phytool_ASV_blast_all.txt"

# Launch blast script
blastn -query $QUERY -subject Phytool_23S.fa -max_target_seqs 10 -evalue 1e-6 -outfmt "6 qseqid sseqid length qlen slen astart gend sstart send evalue pident mismatch gapopen" -out $OUT -num_threads 50

3. Blast: Get sequences match against all NCBI nt database

# Script in bash 
# Store the variables
#Note that the path to the nt database here is the one of Migale server
BLASTDB="/db/outils/BLAST+taxo/nt/current/nt"
QUERY="ASV_table.fa"
OUT="ASV_blast_all_taxa.txt"

# Launch blast script
blastn -query $QUERY -db $BLASTDB -max_target_seqs 10 -evalue 1e-6 -outfmt "6 qseqid sseqid length qlen slen astart gend sstart send evalue pident mismatch gapopen gi sacc staxids sscinames scomnames stitle" -out $OUT -num_threads 60

4. Combine all taxonomy information in a single table and save

library(data.table)
library(dplyr)
library(stringr)

# Load the three tables
ASV_DADA2_taxonomy <- as.data.frame(readRDS("DADA2_ASV_taxa_Phytool.rds"))
ASV_blast_ncbi_taxonomy <- read.table("ASV_blast_all_taxa.txt", sep="\t")
ASV_blast_phytool_taxonomy <- read.table("Phytool_ASV_blast.txt", header = FALSE, sep = "\t", col.names = paste0("V",seq_len(11)), fill = TRUE)

# Add a seqid column to make the link with blast table
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste(">seq", sep="", ASV_table$sequence)
ASV_DADA2_taxonomy$seqid <- ASV_table[match(rownames(ASV_DADA2_taxonomy), rownames(ASV_table)),"sequence"]
ASV_DADA2_taxonomy$seqid <- gsub(">","", ASV_DADA2_taxonomy$seqid)

# Customize ncbi table
ASV_blast_ncbi_taxonomy <- ASV_blast_ncbi_taxonomy[,c(1:11,16)]
colnames(ASV_blast_ncbi_taxonomy) <- c("qseqid","sseqid","length","qlen","slen","astart","aend","evalue","pident","mismatch","gapopen","stitle")

# Keep only known affiliation
ASV_blast_ncbi_taxonomy <- ASV_blast_ncbi_taxonomy[grep("Uncultured",ASV_blast_ncbi_taxonomy$stitle, invert=T),]

# Keep the best hit for each sequence (i.e. the first one)
ASV_blast_ncbi_taxonomy_uniq <- setDT(ASV_blast_ncbi_taxonomy)[!duplicated(rleid(qseqid))]

# Keep only species name
for (i in 1:nrow(ASV_blast_ncbi_taxonomy_uniq)) {
  ASV_blast_ncbi_taxonomy_uniq$ncbi_blast[i] <-  paste(strsplit(ASV_blast_ncbi_taxonomy_uniq$stitle[i], " ")[[1]][1], strsplit(ASV_blast_ncbi_taxonomy_uniq$stitle[i], " ")[[1]][2], sep="_")
}

# Add ncbiBlast annotation in ASV_DADA2_taxonomy_table
ASV_DADA2_taxonomy <- merge(ASV_DADA2_taxonomy, ASV_blast_ncbi_taxonomy_uniq %>%
                select(ncbi_blast, qseqid), by.x = "seqid", by.y = "qseqid", all.x = TRUE)

# Customize phytool blast table
colnames(ASV_blast_phytool_taxonomy) <- c("qseqid","sseqid","length","qlen","slen","astart","aend","evalue","pident","mismatch","gapopen")

# Keep the best hit for each sequence (i.e. the first one)
ASV_blast_phytool_taxonomy_uniq <- setDT(ASV_blast_phytool_taxonomy)[!duplicated(rleid(qseqid))]

# Keep only the species name
ASV_blast_phytool_taxonomy_uniq$Phytool_blast <- sapply(ASV_blast_phytool_taxonomy_uniq$sseqid, function(x) str_split_i(x, ";",-1)[[1]])

# Add PhytoolBlast species annotation in ASV_DADA2_taxonomy_table

ASV_DADA2_taxonomy <- merge(ASV_DADA2_taxonomy, ASV_blast_phytool_taxonomy_uniq %>%
                select(Phytool_blast, qseqid), by.x = "seqid", by.y = "qseqid", all.x = TRUE) 


# Save the taxonomy table
saveRDS(ASV_DADA2_taxonomy, file="ASV_DADA2_taxonomy_combined.rds")
##                       Species           Phytool_blast              ncbi_blast
## seq1        Vaucheria_litorea       Vaucheria_litorea     Xanthonema_montanum
## seq2     Botryococcus_braunii    Botryococcus_braunii    Botryococcus_braunii
## seq3    Planktothrix_agardhii  Planktothrix_rubescens   Planktothrix_agardhii
## seq4      Cyanobium_rubescens       Synechococcus_sp.      Anathece_clathrata
## seq5          Pandorina_morum         Pandorina_morum       Brevundimonas_sp.
## seq6        Staurosira_venter       Staurosira_venter   Fragilaria_construens
## seq7  Haematococcus_lacustris Haematococcus_lacustris Haematococcus_lacustris
## seq8          Vischeria_magna         Vischeria_magna     Chlorella_desiccata
## seq9     Asterionella_formosa    Asterionella_formosa    Asterionella_formosa
## seq10  Microcystis_aeruginosa         Microcystis_sp.  Microcystis_aeruginosa
## seq11    Tetradesmus_obliquus    Tetradesmus_obliquus    Tetradesmus_obliquus
## seq12 Zygnema_circumcarinatum Zygnema_circumcarinatum             Zygnema_sp.
## seq13         Nitzschia_palea           Nitzschia_sp.         Nitzschia_palea
## seq14   Trichormus_variabilis          Wollea_ambigua   Trichormus_variabilis
## seq15                    <NA> Mesotaenium_caldariorum           Mougeotia_sp.
## seq16                    <NA>    Fragilaria_perminuta  Fragilaria_crotonensis
## seq17         Pandorina_morum         Pandorina_morum       Brevundimonas_sp.
## seq18     Cryptomonas_curvata         Cryptomonas_sp.     Cryptomonas_curvata
## seq19 Staurastrum_punctulatum Staurastrum_punctulatum Staurastrum_punctulatum
## seq20        Romeria_chlorina        Romeria_chlorina     Pseudanabaena_yagii

III. Data analysis

1. Prepare ASV table for the analysis

Get normalized and unormalized tables
library(DESeq2)

# Add seq name instead of ASV
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste("seq", sep="", ASV_table$sequence)
rownames(ASV_table) <- ASV_table$sequence

# Unormalized table
ASV_table_raw <- ASV_table[,-13]

# Normalization : mean
ASV_matrix <- as.matrix(ASV_table_raw)

s_num<-dim(ASV_matrix)[2]
otu.perc<-matrix(rep("NA",times=(dim(ASV_matrix)[1]*s_num)),nrow=dim(ASV_matrix)[1],ncol=s_num)
rownames(otu.perc)<-rownames(ASV_matrix)
colnames(otu.perc)=colnames(ASV_matrix) 
totals<-colSums(ASV_matrix)

for(s in c(1:s_num)){
  vec<-(ASV_matrix[,s]/totals[s])*100
  otu.perc[,s]<-vec
  rm(vec)
}

ASV_table_perc <- as.data.frame(otu.perc)
library(ggplot2)

ASV_table_raw$species <- ASV_DADA2_taxonomy[match(rownames(ASV_table_raw), ASV_DADA2_taxonomy$seqid),"Phytool_blast"]

melt_table <- melt(ASV_table_raw, id.var="species")
melt_table$value <- as.numeric(melt_table$value)

species_abund <- dcast(melt_table, species + variable ~., sum, value.var="value")

colnames(species_abund) <- c("species","variable","value")

Equimolar <- subset(species_abund, variable=="AUTEQ_S182_L001"|variable=="GENEQ_S179_L001")

gg <- ggplot(Equimolar, aes(x=species, y=value, fill=variable)) +
      geom_bar(stat= "identity", position="dodge", aes(fill=variable), width=0.7) +
      scale_fill_manual(values=c("#383838","#a9a9a9")) +
      theme(strip.text.y = element_text(size = 8, angle = 0), axis.text.x = element_text(size = 8 , angle = 90), axis.text.y = element_text(size = 8)) +
      xlab("Species") +
      ylab("Number of reads") 
Group the “true” species together
library(ggplot2)

ASV_table_raw_T <- na.omit(ASV_table_raw)

ASV_table_raw_T[ASV_table_raw_T$species == "Planktothrix_suspensa" | 
                ASV_table_raw_T$species == "Planktothrix_rubescens"| 
                ASV_table_raw_T$species == "Planktothrix_agardhii"|
                ASV_table_raw_T$species == "Planktothrix_mougeotii", "True_specie"] <- "Planktothrix_rubescens"
ASV_table_raw_T[ASV_table_raw_T$species == "Bathycoccus_prasinos" | 
                ASV_table_raw_T$species == "Botryococcus_braunii", "True_specie"] <- "Botryococcus_braunii"
ASV_table_raw_T[ASV_table_raw_T$species == "Asterionella_ralfsii"|
                ASV_table_raw_T$species == "Asterionella_formosa", "True_specie"] <- "Asterionella_formosa"
ASV_table_raw_T[ASV_table_raw_T$species == "Cryptomonas_sp." |
                ASV_table_raw_T$species == "Cryptomonas_curvata", "True_specie"] <- "Cryptomonas_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Nitzschia_sp.", "True_specie"] <- "Nitzschia_palea"
ASV_table_raw_T[ASV_table_raw_T$species == "Staurastrum_punctulatum", "True_specie"] <- "Cosmarium_regnelii"
ASV_table_raw_T[ASV_table_raw_T$species == "Microcystis_sp."| 
                ASV_table_raw_T$species == "Microcystis_aeruginosa", "True_specie"] <- "Microcystis_aeruginosa"
ASV_table_raw_T[ASV_table_raw_T$species == "Zygnema_circumcarinatum" | 
                ASV_table_raw_T$species == "Zygnema_sp.", "True_specie"] <- "Zygnema_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Vischeria_magna", "True_specie"] <- "Vischeria_magna"
ASV_table_raw_T[ASV_table_raw_T$species == "Staurosira_venter", "True_specie"] <- "Staurosira_venter"
ASV_table_raw_T[ASV_table_raw_T$species == "Tetradesmus_obliquus", "True_specie"] <- "Tetradesmus_obliquus"
ASV_table_raw_T[ASV_table_raw_T$species == "Trichormus_variabilis"|
                ASV_table_raw_T$species == "Wollea_ambigua", "True_specie"] <- "Dolichospermum_flosaquae"
ASV_table_raw_T[ASV_table_raw_T$species == "Synechococcus_sp." |
                ASV_table_raw_T$species == "Cyanobium_rubescens", "True_specie"] <- "Anathece_clathrata"
ASV_table_raw_T[ASV_table_raw_T$species == "Vaucheria_litorea", "True_specie"] <- "Xanthonema_montanum"
ASV_table_raw_T[ASV_table_raw_T$species == "Mesotaenium_caldariorum", "True_specie"] <- "Mougeotia_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Fragilaria_perminuta", "True_specie"] <- "Ulnaria_acus"
ASV_table_raw_T[ASV_table_raw_T$species == "Haematococcus_lacustris", "True_specie"] <- "Haematococcus_lacustris"
ASV_table_raw_T[ASV_table_raw_T$species == "Pandorina_morum", "True_specie"] <- "Pandorina_morum"

ASV_table_raw_T <- ASV_table_raw_T[,c("AUTEQ_S182_L001","AUTEX_S184_L001","AUTGR_S183_L001","GENEQ_S179_L001","GENEX_S181_L001","GENGR_S180_L001","True_specie")]

melt_table <- melt(ASV_table_raw_T, id.var="True_specie")
melt_table$value <- as.numeric(melt_table$value)

species_abund <- dcast(melt_table, True_specie + variable ~., sum, value.var="value")

colnames(species_abund) <- c("species","variable","value")

2. Graphic for Equimolar data

Equimolar <- subset(species_abund, variable=="AUTEQ_S182_L001" | variable=="GENEQ_S179_L001")

Equimolar$species <- factor(Equimolar$species, levels=c("Dolichospermum_flosaquae","Planktothrix_rubescens","Microcystis_aeruginosa","Anathece_clathrata","Cosmarium_regnelii","Mougeotia_sp.","Zygnema_sp.","Haematococcus_lacustris","Tetradesmus_obliquus","Pandorina_morum","Botryococcus_braunii","Nitzschia_palea","Staurosira_venter","Ulnaria_acus","Asterionella_formosa","Cryptomonas_sp.","Vischeria_magna","Xanthonema_montanum"))

Equimolar <- na.omit(Equimolar)

gg1 <- ggplot(Equimolar, aes(x=species, y=value, fill=variable)) +
       geom_bar(stat= "identity", position="dodge", aes(fill=variable), width=0.7) +
       scale_fill_manual(values=c("#383838","#a9a9a9")) +
       theme(strip.text.y = element_text(size = 8, angle = 0), 
             axis.text.x = element_text(size = 8 , angle = 90), 
             axis.text.y = element_text(size = 8), 
             legend.position="none",
             panel.border = element_blank(),  
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.background = element_blank(),
             axis.line = element_line(colour = "grey")) +
      xlab("Species") +
      ylab("Number of reads") 

3. Graphic for Gradient concentrations

Gradient <- subset(species_abund, variable=="AUTGR_S183_L001" | variable=="GENGR_S180_L001")
Gradient <- na.omit(Gradient)

Gradient[Gradient$species == "Botryococcus_braunii", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Cosmarium_regnelii", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Cryptomonas_sp.", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Zygnema_sp.", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Mougeotia_sp.", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Ulnaria_acus", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Nitzschia_palea", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Microcystis_aeruginosa", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Tetradesmus_obliquus", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Pandorina_morum", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Dolichospermum_flosaquae", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Haematococcus_lacustris", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Staurosira_venter", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Vischeria_magna", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Asterionella_formosa", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Xanthonema_montanum", "DNA_concentration"] <- 50
Gradient[Gradient$species == "Anathece_clathrata", "DNA_concentration"] <- 50
Gradient[Gradient$species == "Planktothrix_rubescens", "DNA_concentration"] <- 50

Gradient <- subset(Gradient, !DNA_concentration =="NA")

gg2 <- ggplot(Gradient, aes(x=DNA_concentration, y=as.numeric(value),group=variable)) +
       geom_point(aes(color=variable), size=5) +
       scale_color_manual(values=c("#383838","#a9a9a9")) +
       geom_smooth(method=lm,se=F, aes(color=variable)) +
       theme(legend.position="none",
             panel.border = element_blank(),  
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.background = element_blank(),
             axis.line = element_line(colour = "grey")) +
       xlab("DNA concentration (ng/µL) introduced in Mock 2") +
       ylab("Number of reads") 

4. Graphic for Exponential concentrations

Expo <- subset(species_abund, variable=="AUTEX_S184_L001" | variable=="GENEX_S181_L001")
Expo <- na.omit(Expo)

Expo[Expo$species == "Asterionella_formosa", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Cryptomonas_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Staurosira_venter", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Vischeria_magna", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Zygnema_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Mougeotia_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Ulnaria_acus", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Anathece_clathrata", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Nitzschia_palea", "DNA_concentration"] <- 1
Expo[Expo$species == "Cosmarium_regnelii", "DNA_concentration"] <- 1
Expo[Expo$species == "Planktothrix_rubescens", "DNA_concentration"] <- 1
Expo[Expo$species == "Microcystis_aeruginosa", "DNA_concentration"] <- 1
Expo[Expo$species == "Tetradesmus_obliquus", "DNA_concentration"] <- 1
Expo[Expo$species == "Dolichospermum_flosaquae", "DNA_concentration"] <- 1
Expo[Expo$species == "Haematococcus_lacustris", "DNA_concentration"] <- 1
Expo[Expo$species == "Pandorina_morum", "DNA_concentration"] <- 1
Expo[Expo$species == "Botryococcus_braunii", "DNA_concentration"] <- 50
Expo[Expo$species == "Xanthonema_montanum", "DNA_concentration"] <- 50

Expo <- subset(Expo, !DNA_concentration =="NA")

gg3 <- ggplot(Expo, aes(x=as.factor(DNA_concentration), y=log(value + 1))) +
       geom_boxplot(aes(fill=variable)) +
       scale_fill_manual(values=c("#383838","#a9a9a9")) +
       theme(legend.position="none",
             panel.border = element_blank(),  
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.background = element_blank(),
             axis.line = element_line(colour = "grey")) +
       xlab("DNA concentration (ng/µL) introduced in Mock 2") +
       ylab("Number of reads (logx+1)")