Gene counts tables were generated from bam files using Rsubread.
# Path of hard disk where company data were stored
hd_path <- '/Volumes/EXT_DISK/AML_FLT3_Sequencing/BWA_mapping/'
patients <- c('AR', 'DN', 'DNF', 'FE', 'FF', 'LS', 'MD', 'NG', 'RL_1',
'RL_2', 'SE', 'SF', 'SM', 'TR')
fc <- featureCounts(files = paste0(hd_path, patients, '.deduped.bam'),
annot.inbuilt = 'hg38',
isPairedEnd = TRUE, nthreads = 12)
df <- fc$counts
df <- data.frame(df) %>% rownames_to_column() %>% as_tibble() %>%
dplyr::rename(GeneID = rowname)
# keep only genes with at least one read
genes <- df %>% filter(!if_all(.cols = where(is.numeric), ~ .x == 0))
# translate ENSEMBL ID in Gene name
gene_name <- getSYMBOL(x = genes$GeneID, data = 'org.Hs.eg.db')
mapping_df <- tibble(GeneID = names(gene_name), gene_name)
genes <- left_join(genes, mapping_df, by = 'GeneID') %>%
relocate(GeneID, gene_name) %>%
arrange(gene_name)
write_tsv(genes, './result/rnaseq/raw_count_matrix.tsv')
# 1) countData derived from Rsubread package (BAM files -> to raw counts)
cts <- read_tsv('./result/rnaseq/raw_count_matrix.tsv')
classified <- read_tsv('./result/getitd/getitd_compact_res_custom.tsv')
classified$type[grepl('JMD\\+', classified$mutation)] <- paste0('JMD_TKD', 1:sum(grepl('JMD\\+', classified$mutation)))
classified$type[grepl('JMD$', classified$mutation)] <- paste0('JMD', 1:sum(grepl('JMD$', classified$mutation)))
classified$type[grepl('^TKD$', classified$mutation)] <- paste0('TKD', 1:sum(grepl('^TKD$', classified$mutation)))
write_tsv(classified, './result/getitd/mapping_getitd_patientID.tsv')
colnames(cts)[3:length(colnames(cts))] <- str_sub(colnames(cts)[3:length(colnames(cts))], 1,4)
colnames(cts)[3:length(colnames(cts))] <- str_remove_all(colnames(cts)[3:length(colnames(cts))], '.d')
colnames(cts)[3:length(colnames(cts))] <- str_remove_all(colnames(cts)[3:length(colnames(cts))], '\\.')
colnames(cts)[3:length(colnames(cts))] <- str_replace_all(colnames(cts)[3:length(colnames(cts))], '_', '.')
#cts <- cts %>% select(-c('RL.2'))
#rename columns according to ITD classification
v <- classified$type[match(colnames(cts)[3:length(colnames(cts))], classified$sample)]
colnames(cts)[3:length(colnames(cts))] <- v
write_tsv(cts, './result/rnaseq/count_matrix_flt3_annotated.tsv')
# 2) colData construction
colData <- data.frame(condition = rep('a', length(v)))
rownames(colData) <- v
colData$condition[grepl('^JMD', rownames(colData))] <- 'JMD'
colData$condition[grepl('^TKD', rownames(colData))] <- 'TKD'
colData$condition[grepl('^JMD_', rownames(colData))] <- 'JMD_TKD'
# create cts1 with columns same as colData
cts1 <- cts %>% column_to_rownames('GeneID')
cts1 <- cts1[, rownames(colData)]
all(rownames(colData) == colnames(cts1))
# create deseqdataset
dds <- DESeqDataSetFromMatrix(countData = cts1,
colData = colData,
design = ~ condition)
# add as features the gene_names of geneID
featureData <- data.frame(gene = cts$gene_name)
rownames(featureData) <- rownames(cts1)
mcols(dds) <- DataFrame(mcols(dds), featureData)
write_rds(dds, './result/rnaseq/dds_object.rds')
# Read input data
cts <- read_tsv('./result/deseq2/count_matrix_flt3_annotated.tsv')
dds <- readRDS('./result/deseq2/dds_object.rds')
cts$gene_name <- NULL
cts <- cts %>%
column_to_rownames('GeneID')
# Counts threshold to obtain exactly the number of sequenced genes 262
keep <- rowSums(counts(dds) >= 3950) >= 8
dds1 <- dds[keep,]
raw_counts <- counts(dds1, normalized=FALSE)
# Define groups
group <- dds$condition
y <- DGEList(counts=raw_counts,group=group)
y <- calcNormFactors(y)
# Get normalized counts
norm_counts <- data.frame(y$counts)
# Logarithmic transformation
norm_counts_log2 <- data.frame(log2(norm_counts)) %>%
rownames_to_column('geneID')
gene_IDs <- tibble(geneID = rownames(mcols(dds1)),
gene_name = mcols(dds1)$gene)
norm_counts_log2 <- inner_join(norm_counts_log2,
gene_IDs) %>%
relocate(geneID, gene_name)
# Compute z-score
mean_v <- colMeans(norm_counts_log2[3:ncol(norm_counts_log2)])
sd_v <- colSds(as.matrix(norm_counts_log2[sapply(norm_counts_log2, is.numeric)]))
names(sd_v) <- names(mean_v)
norm_counts_log2_z <- norm_counts_log2
for(patient in names(mean_v)){
norm_counts_log2_z[, patient] <- (norm_counts_log2[,patient] - mean_v[patient])/sd_v[patient]
}
write_tsv(norm_counts_log2_z, './result/rnaseq/zscore.tsv')
norm_counts_log2_z <- read_tsv('./result/rnaseq/zscore.tsv')
norm_counts_log2_z <- norm_counts_log2_z %>% dplyr::select(-c('geneID'))
norm_counts_log2_z <- norm_counts_log2_z %>% column_to_rownames('gene_name')
norm_counts_log2_z_t <- t(norm_counts_log2_z)
z_dist = dist(norm_counts_log2_z_t, method = 'euclidean')
z_hc = hclust(z_dist, method = 'complete')
z_plot <- ggdendrogram(z_hc, theme_dendro = TRUE) +
theme(axis.line.y = element_line(colour = 'darkgrey'),
axis.ticks.y = element_line())
z_plot
Figure 5D Hierarchical clustering of patients according to their expression profile of 262 genes.
# Read DeSeq2 object
dds <- readRDS('./result/rnaseq/dds_object.rds')
# threshold to obtain exactly the number of sequenced genes 262
keep <- rowSums(counts(dds) >= 3950) >= 8
dds1 <- dds[keep,]
rld <- rlog(dds, blind = TRUE)
p <- plotPCA1(rld)
PCA <- p +
theme_classic() +
scale_color_brewer(palette = 'Dark2') +
theme(legend.title = element_blank())
write_rds(PCA, './result/rnaseq/PCA.rds')
PCA_plot <- read_rds('./result/rnaseq/PCA.rds')
PCA_plot
Figure S6C Principal Component Analysis (PCA) of patients’ expression profile.