This is a script that describes the analysis steps for 16S amplicon sequencing of the Mock community 2. Note : most of it can be done using R, but some parts are in bash and must be run through the terminal (ideally on a server, to have more ressources)
I. From raw data to ASV table
# Script in bash
# Command launched in the folder containing all the fastq files
cd /MOCK
# Using cutadapt-3.5
# Create a directory that will store trimmed sequences
mkdir Cut_fastq
# Create a file with all the sample names
ls *_R1_001.fastq.gz | cut -d _ -f 1,2,3 > samples.txt
# Primers sequences
# 23S_F: "ACAGWAAGACCCTATGAAGCTT"
# 23S_R: "CCTGTTATCCCTAGAGTAACTT"
# Paired-end trim
for sample in `awk '{print $1}' samples.txt`
do
cutadapt -g ^ACAGWAAGACCCTATGAAGCTT -G ^CCTGTTATCCCTAGAGTAACTT --max-n 0 -o ./Cut_fastq/${sample}_R1_trimmed.fastq.gz -p ./Cut_fastq/${sample}_R2_trimmed.fastq.gz ${sample}_R1_001.fastq.gz ${sample}_R2_001.fastq.gz
done
library(dada2)
# Path
path <- "/set/to/your/path"
# Store forward and reverse fastq (with primers removed by cutadapt)
fnFs <- sort(list.files(paste0(path,"/Cut_fastq"), pattern="_R1_trimmed.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(paste0(path,"/Cut_fastq"), pattern="_R2_trimmed.fastq.gz", full.names = TRUE))
# Extract sample names
sample.names <- sapply(strsplit(basename(fnFs), "_R"), `[`, 1)
# Place filtered files in a subdirectory "filtered"
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
# Filter and trim sequences
# Here the option truncQ is removed
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
truncLen=c(220,200),
maxN=0, maxEE=c(2,2),
rm.phix=TRUE,
compress=TRUE,
multithread=TRUE)
# Learn the error rate
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
# Infer samples
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)
# Merge pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
# Create an ASV table
seqtab <- makeSequenceTable(mergers)
# Remove the chimera
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
# Keep ASVs that have a good length in order to discard unspecific ASVs
seqtab.nochim <- seqtab.nochim[,nchar(colnames(seqtab)) %in% 375:385]
# Store ASV table
saveRDS(seqtab.nochim, file="seqtab_nochim.rds")
# Script in bash
# Get Phytool 16S database at : https://github-carrtel.shinyapps.io/phytool_v2/
# add a "_" instead of a space between the genus and species name, otherwise DADA2 assignation will only give a genus name
cat Phytool_complete_rRNA_16S_2023-02-15.fasta | tr "[:blank:]" "_" > Phytool_16S.fa
library(dada2)
# Run DADA2 taxonomic assignation algorithme
# Note that the use of 20 threads is not possible in every machine. Here I was using Migale server
taxonomy <- assignTaxonomy(seqtab.nochim, "Phytool_16S.fa", multithread=20)
# Save taxonomy
saveRDS(taxonomy, file = "DADA2_ASV_taxa_Phytool.rds")
library(dplyr)
# Create a fasta file for all ASV sequences
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste(">seq", sep="", ASV_table$sequence)
write.table(ASV_table, "ASV_table.txt",sep="\t", quote = FALSE, col.names=FALSE)
# In the terminal - create a fasta file using awk
# awk '{print $14"\n"$1}' ASV_table.txt > ASV_table.fa
# Script in bash
# Using blast-2.13.0
# Store the variables
QUERY="ASV_table.fa"
OUT="Phytool_ASV_blast_all.txt"
# Launch blast script
blastn -query $QUERY -subject Phytool_16S.fa -max_target_seqs 10 -evalue 1e-6 -outfmt "6 qseqid sseqid length qlen slen astart gend sstart send evalue pident mismatch gapopen" -out $OUT -num_threads 50
# Script in bash
# Store the variables
# Note that the path to the nt database here is the one of Migale server
BLASTDB="/db/outils/BLAST+taxo/nt/current/nt"
QUERY="ASV_table.fa"
OUT="ASV_blast_all_taxa.txt"
# Launch blast script
blastn -query $QUERY -db $BLASTDB -max_target_seqs 10 -evalue 1e-6 -outfmt "6 qseqid sseqid length qlen slen astart gend sstart send evalue pident mismatch gapopen gi sacc staxids sscinames scomnames stitle" -out $OUT -num_threads 60
library(data.table)
library(dplyr)
library(stringr)
# Load the three tables
ASV_DADA2_taxonomy <- as.data.frame(readRDS("DADA2_ASV_taxa_Phytool.rds"))
ASV_blast_ncbi_taxonomy <- read.table("ASV_blast_all_taxa.txt", sep="\t")
ASV_blast_phytool_taxonomy <- read.table("Phytool_ASV_blast.txt", header = FALSE, sep = "\t", col.names = paste0("V",seq_len(11)), fill = TRUE)
# Add a seqid column to make the link with blast table
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste(">seq", sep="", ASV_table$sequence)
ASV_DADA2_taxonomy$seqid <- ASV_table[match(rownames(ASV_DADA2_taxonomy), rownames(ASV_table)),"sequence"]
ASV_DADA2_taxonomy$seqid <- gsub(">","", ASV_DADA2_taxonomy$seqid)
# Customize ncbi table
ASV_blast_ncbi_taxonomy <- ASV_blast_ncbi_taxonomy[,c(1:11,16)]
colnames(ASV_blast_ncbi_taxonomy) <- c("qseqid","sseqid","length","qlen","slen","astart","aend","evalue","pident","mismatch","gapopen","stitle")
# Keep only known affiliation
ASV_blast_ncbi_taxonomy <- ASV_blast_ncbi_taxonomy[grep("Uncultured",ASV_blast_ncbi_taxonomy$stitle, invert=T),]
# Keep the best hit for each sequence (i.e. the first one)
ASV_blast_ncbi_taxonomy_uniq <- setDT(ASV_blast_ncbi_taxonomy)[!duplicated(rleid(qseqid))]
# Keep only species name
for (i in 1:nrow(ASV_blast_ncbi_taxonomy_uniq)) {
ASV_blast_ncbi_taxonomy_uniq$ncbi_blast[i] <- paste(strsplit(ASV_blast_ncbi_taxonomy_uniq$stitle[i], " ")[[1]][1], strsplit(ASV_blast_ncbi_taxonomy_uniq$stitle[i], " ")[[1]][2], sep="_")
}
# Add ncbiBlast annotation in ASV_DADA2_taxonomy_table
ASV_DADA2_taxonomy <- merge(ASV_DADA2_taxonomy, ASV_blast_ncbi_taxonomy_uniq %>%
select(ncbi_blast, qseqid), by.x = "seqid", by.y = "qseqid", all.x = TRUE)
# Customize phytool blast table
colnames(ASV_blast_phytool_taxonomy) <- c("qseqid","sseqid","length","qlen","slen","astart","aend","evalue","pident","mismatch","gapopen")
# Keep the best hit for each sequence (i.e. the first one)
ASV_blast_phytool_taxonomy_uniq <- setDT(ASV_blast_phytool_taxonomy)[!duplicated(rleid(qseqid))]
# Keep only the species name
ASV_blast_phytool_taxonomy_uniq$Phytool_blast <- sapply(ASV_blast_phytool_taxonomy_uniq$sseqid, function(x) str_split_i(x, ";",-1)[[1]])
# Add PhytoolBlast species annotation in ASV_DADA2_taxonomy_table
ASV_DADA2_taxonomy <- merge(ASV_DADA2_taxonomy, ASV_blast_phytool_taxonomy_uniq %>%
select(Phytool_blast, qseqid), by.x = "seqid", by.y = "qseqid", all.x = TRUE)
# Save the taxonomy table
saveRDS(ASV_DADA2_taxonomy, file="ASV_DADA2_taxonomy_combined.rds")
## Species Phytool_blast ncbi_blast
## seq1 Vaucheria_litorea Vaucheria_litorea <NA>
## seq2 Botryococcus_braunii Botryococcus_braunii Botryococcus_braunii
## seq3 Synechococcus_sp. Synechococcus_sp. <NA>
## seq4 Planktothrix_rubescens Planktothrix_suspensa Planktothrix_agardhii
## seq5 Planktothrix_agardhii Planktothrix_rubescens Planktothrix_rubescens
## seq6 Fragilaria_zeilleri Staurosira_venter <NA>
## seq7 Asterionella_ralfsii Asterionella_ralfsii Asterionella_formosa
## seq8 Microcystis_sp. Westiellopsis_prolifica Microcystis_aeruginosa
## seq9 Vischeria_magna Vischeria_magna Nannochloris_bacillaris
## seq10 Trichormus_variabilis Trichormus_variabilis Trichormus_variabilis
## seq11 Nitzschia_palea Nitzschia_sp. Nitzschia_palea
## seq12 Zygnema_circumcarinatum Zygnema_sp. Zygnema_circumcarinatum
## seq13 Cosmarium_botrytis Staurastrum_punctulatum Staurastrum_margaritaceum
## seq14 Ulnaria_acus Fragilaria_perminuta uncultured_bacterium
## seq15 Cryptomonas_curvata Cryptomonas_sp. Cryptomonas_curvata
## seq16 Cryptomonas_curvata Cryptomonas_curvata Cryptomonas_curvata
## seq17 Pseudanabaena_sp. Pseudanabaena_sp. Pseudanabaena_sp.
## seq18 Vaucheria_litorea Vaucheria_litorea <NA>
## seq19 <NA> Nodosilinea_sp. <NA>
## seq20 <NA> Mesotaenium_caldariorum Mougeotia_sp.
## seq21 Botryococcus_braunii Botryococcus_braunii Botryococcus_braunii
## seq22 Tetradesmus_obliquus Tetradesmus_obliquus Tetradesmus_obliquus
## seq23 Cyanobium_rubescens Synechococcus_sp. <NA>
## seq24 <NA> Bathycoccus_prasinos Brevundimonas_vesicularis
## seq25 Planktothrix_rubescens Planktothrix_suspensa Planktothrix_agardhii
library(DESeq2)
ASV_table <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
ASV_table <- ASV_table %>% mutate(sequence = 1:n())
ASV_table$sequence <- paste("seq", sep="", ASV_table$sequence)
# Add seq name instead of ASV
rownames(ASV_table) <- ASV_table$sequence
# Unormalized table
ASV_table_raw <- ASV_table[,-13]
# Normalization : mean
ASV_matrix <- as.matrix(ASV_table_raw)
s_num<-dim(ASV_matrix)[2]
otu.perc<-matrix(rep("NA",times=(dim(ASV_matrix)[1]*s_num)),nrow=dim(ASV_matrix)[1],ncol=s_num)
rownames(otu.perc)<-rownames(ASV_matrix)
colnames(otu.perc)=colnames(ASV_matrix)
totals<-colSums(ASV_matrix)
for(s in c(1:s_num)){
vec<-(ASV_matrix[,s]/totals[s])*100
otu.perc[,s]<-vec
rm(vec)
}
ASV_table_perc <- as.data.frame(otu.perc)
library(ggplot2)
ASV_table_raw_T <- na.omit(ASV_table_raw)
ASV_table_raw_T[ASV_table_raw_T$species == "Planktothrix_suspensa" |
ASV_table_raw_T$species == "Planktothrix_rubescens"|
ASV_table_raw_T$species == "Planktothrix_agardhii"|
ASV_table_raw_T$species == "Planktothrix_mougeotii", "True_specie"] <- "Planktothrix_rubescens"
ASV_table_raw_T[ASV_table_raw_T$species == "Bathycoccus_prasinos" |
ASV_table_raw_T$species == "Botryococcus_braunii", "True_specie"] <- "Botryococcus_braunii"
ASV_table_raw_T[ASV_table_raw_T$species == "Asterionella_ralfsii"|
ASV_table_raw_T$species == "Asterionella_formosa", "True_specie"] <- "Asterionella_formosa"
ASV_table_raw_T[ASV_table_raw_T$species == "Cryptomonas_sp." |
ASV_table_raw_T$species == "Cryptomonas_curvata", "True_specie"] <- "Cryptomonas_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Nitzschia_sp.", "True_specie"] <- "Nitzschia_palea"
ASV_table_raw_T[ASV_table_raw_T$species == "Staurastrum_punctulatum", "True_specie"] <- "Cosmarium_regnelii"
ASV_table_raw_T[ASV_table_raw_T$species == "Microcystis_sp."|
ASV_table_raw_T$species == "Westiellopsis_prolifica"|
ASV_table_raw_T$species == "Microcystis_aeruginosa", "True_specie"] <- "Microcystis_aeruginosa"
ASV_table_raw_T[ASV_table_raw_T$species == "Zygnema_circumcarinatum" |
ASV_table_raw_T$species == "Zygnema_sp.", "True_specie"] <- "Zygnema_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Vischeria_magna", "True_specie"] <- "Vischeria_magna"
ASV_table_raw_T[ASV_table_raw_T$species == "Staurosira_venter", "True_specie"] <- "Staurosira_venter"
ASV_table_raw_T[ASV_table_raw_T$species == "Tetradesmus_obliquus", "True_specie"] <- "Tetradesmus_obliquus"
ASV_table_raw_T[ASV_table_raw_T$species == "Trichormus_variabilis"|
ASV_table_raw_T$species == "Wollea_ambigua", "True_specie"] <- "Dolichospermum_flosaquae"
ASV_table_raw_T[ASV_table_raw_T$species == "Synechococcus_sp." |
ASV_table_raw_T$species == "Cyanobium_rubescens", "True_specie"] <- "Anathece_clathrata"
ASV_table_raw_T[ASV_table_raw_T$species == "Vaucheria_litorea", "True_specie"] <- "Xanthonema_montanum"
ASV_table_raw_T[ASV_table_raw_T$species == "Mesotaenium_caldariorum", "True_specie"] <- "Mougeotia_sp."
ASV_table_raw_T[ASV_table_raw_T$species == "Fragilaria_perminuta", "True_specie"] <- "Ulnaria_acus"
ASV_table_raw_T[ASV_table_raw_T$species == "Haematococcus_lacustris", "True_specie"] <- "Haematococcus_lacustris"
ASV_table_raw_T[ASV_table_raw_T$species == "Pandorina_morum" |
ASV_table_raw_T$species == "Bathycoccus_prasinos", "True_specie"] <- "Pandorina_morum"
ASV_table_raw_T <- ASV_table_raw_T[,c("AUTEQ_S182_L001","AUTEX_S184_L001","AUTGR_S183_L001","GENEQ_S179_L001","GENEX_S181_L001","GENGR_S180_L001","True_specie")]
melt_table <- melt(ASV_table_raw_T, id.var="True_specie")
melt_table$value <- as.numeric(melt_table$value)
species_abund <- dcast(melt_table, True_specie + variable ~., sum, value.var="value")
colnames(species_abund) <- c("species","variable","value")
Equimolar <- subset(species_abund, variable=="AUTEQ_S182_L001" | variable=="GENEQ_S179_L001")
Haem_Aut <- c("Haematococcus_lacustris","AUTEQ_S182_L001",0)
Haem_Gen <- c("Haematococcus_lacustris","GENEQ_S179_L001",0)
Equimolar <- rbind(Equimolar,Haem_Aut)
Equimolar <- rbind(Equimolar,Haem_Gen)
Equimolar$species <- factor(Equimolar$species, levels=c("Dolichospermum_flosaquae","Planktothrix_rubescens","Microcystis_aeruginosa","Anathece_clathrata","Cosmarium_regnelii","Mougeotia_sp.","Zygnema_sp.","Haematococcus_lacustris","Tetradesmus_obliquus","Pandorina_morum","Botryococcus_braunii","Nitzschia_palea","Staurosira_venter","Ulnaria_acus","Asterionella_formosa","Cryptomonas_sp.","Vischeria_magna","Xanthonema_montanum"))
Equimolar <- na.omit(Equimolar)
gg1 <- ggplot(Equimolar, aes(x=species, y=as.numeric(value), fill=variable)) +
geom_bar(stat= "identity", position="dodge", aes(fill=variable), width=0.7) +
scale_fill_manual(values=c("#383838","#a9a9a9")) +
theme(strip.text.y = element_text(size = 8, angle = 0),
axis.text.x = element_text(size = 8 , angle = 90),
axis.text.y = element_text(size = 8),
legend.position="none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "grey")) +
xlab("Species") +
ylab("Number of reads")
Gradient <- subset(species_abund, variable=="AUTGR_S183_L001"|variable=="GENGR_S180_L001")
Haem_Aut <- c("Haematococcus_lacustris","AUTGR_S183_L001",0)
Haem_Gen <- c("Haematococcus_lacustris","GENGR_S180_L001",0)
Gradient <- rbind(Gradient,Haem_Aut)
Gradient <- rbind(Gradient,Haem_Gen)
Gradient <- na.omit(Gradient)
Gradient[Gradient$species == "Haematococcus_lacustris", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Pandorina_morum", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Cryptomonas_sp.", "DNA_concentration"] <- 1
Gradient[Gradient$species == "Zygnema_sp.", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Mougeotia_sp.", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Ulnaria_acus", "DNA_concentration"] <- 10
Gradient[Gradient$species == "Staurosira_venter", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Microcystis_aeruginosa", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Tetradesmus_obliquus", "DNA_concentration"] <- 20
Gradient[Gradient$species == "Dolichospermum_flosaquae", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Nitzschia_palea", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Asterionella_formosa", "DNA_concentration"] <- 30
Gradient[Gradient$species == "Botryococcus_braunii", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Cosmarium_regnelii", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Vischeria_magna", "DNA_concentration"] <- 40
Gradient[Gradient$species == "Xanthonema_montanum", "DNA_concentration"] <- 50
Gradient[Gradient$species == "Anathece_clathrata", "DNA_concentration"] <- 50
Gradient[Gradient$species == "Planktothrix_rubescens", "DNA_concentration"] <- 50
Gradient <- subset(Gradient, !DNA_concentration =="NA")
gg2 <- ggplot(Gradient, aes(x=DNA_concentration, y=as.numeric(value),group=variable)) +
geom_point(aes(color=variable), size=5) +
scale_color_manual(values=c("#383838","#a9a9a9")) +
geom_smooth(method=lm,se=F, aes(color=variable)) +
theme(legend.position="none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "grey")) +
xlab("DNA concentration (ng/µL) introduced in Mock 2") +
ylab("Number of reads")
Expo <- subset(species_abund, variable=="AUTEX_S184_L001"|variable=="GENEX_S181_L001")
Haem_Aut <- c("Haematococcus_lacustris","AUTEX_S184_L001",0)
Haem_Gen <- c("Haematococcus_lacustris","GENEX_S181_L001",0)
Expo <- rbind(Expo,Haem_Aut)
Expo <- rbind(Expo,Haem_Gen)
Expo <- na.omit(Expo)
Expo[Expo$species == "Asterionella_formosa", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Cryptomonas_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Staurosira_venter", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Vischeria_magna", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Zygnema_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Mougeotia_sp.", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Ulnaria_acus", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Anathece_clathrata", "DNA_concentration"] <- 0.1
Expo[Expo$species == "Nitzschia_palea", "DNA_concentration"] <- 1
Expo[Expo$species == "Cosmarium_regnelii", "DNA_concentration"] <- 1
Expo[Expo$species == "Planktothrix_rubescens", "DNA_concentration"] <- 1
Expo[Expo$species == "Microcystis_aeruginosa", "DNA_concentration"] <- 1
Expo[Expo$species == "Tetradesmus_obliquus", "DNA_concentration"] <- 1
Expo[Expo$species == "Dolichospermum_flosaquae", "DNA_concentration"] <- 1
Expo[Expo$species == "Haematococcus_lacustris", "DNA_concentration"] <- 1
Expo[Expo$species == "Pandorina_morum", "DNA_concentration"] <- 1
Expo[Expo$species == "Botryococcus_braunii", "DNA_concentration"] <- 50
Expo[Expo$species == "Xanthonema_montanum", "DNA_concentration"] <- 50
Expo <- subset(Expo, !DNA_concentration =="NA")
gg3 <- ggplot(Expo, aes(x=as.factor(DNA_concentration), y=log(as.numeric(value) + 1))) +
geom_boxplot(aes(fill=variable)) +
scale_fill_manual(values=c("#383838","#a9a9a9")) +
theme(legend.position="none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "grey")) +
xlab("DNA concentration (ng/µL) introduced in Mock 2") +
ylab("Number of reads (logx+1)")