Generic variant callers can’t identify medium-sized insertions, like FLT3 ITDs. As such, we used the specialized algorithm getITD v.1.5.16 (Blätte et al., 2019) to localize ITDs in each patient from command line. Reads mapping on FLT3 genomic region between JMD and TKD domain (28033888 – 28034214) were extracted from bam files and converted in fastq format with samtools v.1.16.1. getITD was run with default parameters, but with a custom reference sequence without introns (‘caatttaggtatgaaagccagctacagatggtacaggtgaccggctcctcagataatgagtacttctacgttgattt cagagaatatgaatatgatctcaaatgggagtttccaagagaaaatttagagtttgggaaggtactaggatcaggtgctt ttggaaaagtgatgaacgcaacagcttatggaattagcaaaacaggagtctcaatccaggttgccgtcaaaatgctgaaag’) that was annotated according to getITD annotation file.
The bash code to run get itd is available in
getitd_cycle.sh
The result of this analysis is reported in Table S4.
# Vector of patients ID
patients <- c('AR', 'DN', 'DNF', 'FE', 'FF', 'LS', 'MD', 'NG', 'RL.1',
'RL.2', 'SE', 'SF', 'SM', 'TR')
# JMD and TKD domain protein sequence of FLT3
JMD <- 'QFRYESQLQMVQVTGSSDNEYFYVDFREYEYDLKWEFPRENLEF'
TKD <- 'GKVLGSGAFGKVMNATAYGISKTGVSIQVAVKMLK'
for(p_idx in c(1:length(patients))){
patient <- patients[p_idx]
print(patient)
# 1) Read each patient getitd file
itd_file <- read_tsv(paste0('input/getitd_analysis/flt3_itd_analysis/getitd/',
patient,
'_getitd/itds_collapsed-is-same_is-similar_is-close_is-same_trailing_hc.tsv')) %>%
select(sample, length, coverage, vaf, ar, seq, sense, start_protein_as, insertion_site_protein_as, insertion_site_domain)
itd_file$translated_seq <- ''
# 2) Translate the ITDs found
for(i in c(1:length(itd_file$seq))){
# A) create all possible reading frames on both '+' and '-'
strings <- DNAStringSet(itd_file$seq[i])
dna3_subseqs <- lapply(1:3, function(pos)
subseq(c(strings, reverseComplement(strings)), start=pos))
# B) translate
# translation of 'dna3_subseqs' produces a list of length 3 representing
# each ORF for forward and reverse complement
frames <- lapply(dna3_subseqs, translate)
# get translated sequences
sequences <- unlist(lapply(frames, function(x){unlist(as.character(x))}))
# C) align on protein sequence of JMD and TKD domain
JMD_scores <- unlist(lapply(frames, function(set){
lapply(as.character(set), function(string){pairwiseAlignment(pattern = unlist(string),
subject = JMD,
type = 'local', scoreOnly = TRUE)})}))
TKD_scores <- unlist(lapply(frames, function(set){
lapply(as.character(set), function(string){pairwiseAlignment(pattern = unlist(string),
subject = TKD,
type = 'local', scoreOnly = TRUE)})}))
if(max(JMD_scores) > max(TKD_scores)){
print('Using JMD')
best_string <- sequences[JMD_scores == max(JMD_scores)]
}else{
print('Using TKD')
best_string <- sequences[TKD_scores == max(TKD_scores)]
}
itd_file$translated_seq[i] <- best_string
}
if(p_idx == 1){
global_itd_file <- itd_file %>% mutate_at('start_protein_as', as.character) %>% mutate_at('insertion_site_protein_as', as.character)
}else{
itd_file <- itd_file %>% mutate_at('start_protein_as', as.character) %>% mutate_at('insertion_site_protein_as', as.character)
global_itd_file <- bind_rows(global_itd_file, itd_file)
}
}
global_itd_file_custom <- global_itd_file
global_itd_file_c <- global_itd_file %>% filter(insertion_site_protein_as != '.')
global_itd_file_c$domain[grepl('JMD', global_itd_file_c$insertion_site_domain)] <- 'JMD'
global_itd_file_c$domain[grepl('TKD', global_itd_file_c$insertion_site_domain)] <- 'TKD'
global_itd_file_summary <- global_itd_file_c %>%
filter(!is.na(domain)) %>%
group_by(sample) %>%
summarize(mutation = paste0(unique(domain), collapse = '+'),
t_seq = paste0(translated_seq, collapse = ';'),
itd = n())
write_tsv(global_itd_file, 'result/getitd/getitd_all_res_custom.tsv')
write_tsv(global_itd_file_summary, 'result/getitd/getitd_compact_res_custom.tsv')
global_itd_file_summary <- read_tsv('result/getitd/getitd_compact_res_custom.tsv', show_col_types = FALSE)
global_itd_file_summary %>% dplyr::count(mutation)
## # A tibble: 3 × 2
## mutation n
## <chr> <int>
## 1 JMD 8
## 2 JMD+TKD 4
## 3 TKD 2
As shown in the table, patients carried multiples ITDs and we classified as follows:
8 patients as FLT3 ITD-JMD (JMD1-8).
4 patients as FLT3 ITD-JMD+TKD (JMD_TKD1-4).
2 as FLT3 ITD-TKD (TKD1-2).
Available clinical data were coded into discrete variables. In particular:
Vital status was represented as 1 (deceased), 0 (alive)
Recurrency was represented as 0 (no recurrency), 1 (recurrency), 2 (resistant patient)
# Read coded clinical data
clinical_coded <- read_tsv('./input/clinical_data/clinical_data_coded.txt')
# clustering according vital status, recurrency
clinical_coded <- clinical_coded %>% filter(!is.na(vital_status))
clinical_tb <- clinical_coded %>%
dplyr::select(FLT3_subtype, vital_status, recurrency) %>%
column_to_rownames('FLT3_subtype')
model_d = dist(clinical_tb, method = 'euclidean')
model_hc = hclust(model_d, method = 'complete')
#plot(model_hc, hang = -1)
model_plot <- ggdendrogram(model_hc, theme_dendro = TRUE) +
theme(axis.line.y = element_line(colour = 'darkgrey'),
axis.ticks.y = element_line())
model_plot