FLT3-ITD Annotation

Generic variant callers can’t identify medium-sized insertions, like FLT3 ITDs. As such, we used the specialized algorithm getITD v.1.5.16 (Blätte et al., 2019) to localize ITDs in each patient from command line. Reads mapping on FLT3 genomic region between JMD and TKD domain (28033888 – 28034214) were extracted from bam files and converted in fastq format with samtools v.1.16.1. getITD was run with default parameters, but with a custom reference sequence without introns (‘caatttaggtatgaaagccagctacagatggtacaggtgaccggctcctcagataatgagtacttctacgttgattt cagagaatatgaatatgatctcaaatgggagtttccaagagaaaatttagagtttgggaaggtactaggatcaggtgctt ttggaaaagtgatgaacgcaacagcttatggaattagcaaaacaggagtctcaatccaggttgccgtcaaaatgctgaaag’) that was annotated according to getITD annotation file.

The bash code to run get itd is available in getitd_cycle.sh

The result of this analysis is reported in Table S4.

getITD output processing

# Vector of patients ID
patients <- c('AR', 'DN', 'DNF', 'FE', 'FF', 'LS', 'MD', 'NG', 'RL.1',
              'RL.2', 'SE', 'SF', 'SM', 'TR')

# JMD and TKD domain protein sequence of FLT3 
JMD <- 'QFRYESQLQMVQVTGSSDNEYFYVDFREYEYDLKWEFPRENLEF'
TKD <- 'GKVLGSGAFGKVMNATAYGISKTGVSIQVAVKMLK'

for(p_idx in c(1:length(patients))){
  patient <- patients[p_idx]
  print(patient)
  
  # 1) Read each patient getitd file
  itd_file <- read_tsv(paste0('input/getitd_analysis/flt3_itd_analysis/getitd/', 
                              patient, 
                              '_getitd/itds_collapsed-is-same_is-similar_is-close_is-same_trailing_hc.tsv')) %>% 
    select(sample, length, coverage, vaf, ar, seq, sense, start_protein_as, insertion_site_protein_as, insertion_site_domain)
  
  itd_file$translated_seq <- ''
  
  # 2) Translate the ITDs found
  for(i in c(1:length(itd_file$seq))){
    
    # A) create all possible reading frames on both '+' and '-'
    strings <- DNAStringSet(itd_file$seq[i])
    dna3_subseqs <- lapply(1:3, function(pos) 
      subseq(c(strings, reverseComplement(strings)), start=pos))
    
    # B) translate
    # translation of 'dna3_subseqs' produces a list of length 3 representing 
    # each ORF for forward and reverse complement 
    frames <- lapply(dna3_subseqs, translate)
    
    # get translated sequences 
    sequences <- unlist(lapply(frames, function(x){unlist(as.character(x))}))
    
    # C) align on protein sequence of JMD and TKD domain
    JMD_scores <- unlist(lapply(frames, function(set){
      lapply(as.character(set), function(string){pairwiseAlignment(pattern = unlist(string),
                                                                   subject = JMD,
                                                                   type = 'local', scoreOnly = TRUE)})}))
    
    TKD_scores <- unlist(lapply(frames, function(set){
      lapply(as.character(set), function(string){pairwiseAlignment(pattern = unlist(string),
                                                                   subject = TKD,
                                                                   type = 'local', scoreOnly = TRUE)})}))
    
    if(max(JMD_scores) > max(TKD_scores)){
      print('Using JMD')
      best_string <- sequences[JMD_scores == max(JMD_scores)]
    }else{
      print('Using TKD')
      best_string <- sequences[TKD_scores == max(TKD_scores)]
    }
    
    itd_file$translated_seq[i] <- best_string
  }
  
  if(p_idx == 1){
    global_itd_file <- itd_file  %>% mutate_at('start_protein_as', as.character) %>% mutate_at('insertion_site_protein_as', as.character)
  }else{
    itd_file <- itd_file %>% mutate_at('start_protein_as', as.character) %>% mutate_at('insertion_site_protein_as', as.character)
    global_itd_file <- bind_rows(global_itd_file, itd_file)
  }
}

global_itd_file_custom <- global_itd_file
global_itd_file_c <- global_itd_file %>% filter(insertion_site_protein_as != '.')

global_itd_file_c$domain[grepl('JMD', global_itd_file_c$insertion_site_domain)] <- 'JMD'
global_itd_file_c$domain[grepl('TKD', global_itd_file_c$insertion_site_domain)] <- 'TKD'

global_itd_file_summary <- global_itd_file_c %>% 
  filter(!is.na(domain)) %>% 
  group_by(sample) %>% 
  summarize(mutation = paste0(unique(domain), collapse = '+'),
            t_seq = paste0(translated_seq, collapse = ';'),
            itd = n())
                                                                              
write_tsv(global_itd_file, 'result/getitd/getitd_all_res_custom.tsv')
write_tsv(global_itd_file_summary, 'result/getitd/getitd_compact_res_custom.tsv')

getITD processing result overview

global_itd_file_summary <- read_tsv('result/getitd/getitd_compact_res_custom.tsv', show_col_types = FALSE)
global_itd_file_summary %>% dplyr::count(mutation)
## # A tibble: 3 × 2
##   mutation     n
##   <chr>    <int>
## 1 JMD          8
## 2 JMD+TKD      4
## 3 TKD          2

As shown in the table, patients carried multiples ITDs and we classified as follows:

  • 8 patients as FLT3 ITD-JMD (JMD1-8).

  • 4 patients as FLT3 ITD-JMD+TKD (JMD_TKD1-4).

  • 2 as FLT3 ITD-TKD (TKD1-2).

Clinical data clustering

Available clinical data were coded into discrete variables. In particular:

# Read coded clinical data
clinical_coded <- read_tsv('./input/clinical_data/clinical_data_coded.txt')

# clustering according vital status, recurrency
clinical_coded <- clinical_coded %>% filter(!is.na(vital_status))

clinical_tb <- clinical_coded %>% 
  dplyr::select(FLT3_subtype, vital_status, recurrency) %>% 
  column_to_rownames('FLT3_subtype')
model_d = dist(clinical_tb, method = 'euclidean')
model_hc = hclust(model_d, method = 'complete')
#plot(model_hc, hang = -1)

model_plot <- ggdendrogram(model_hc, theme_dendro = TRUE) +
  theme(axis.line.y = element_line(colour = 'darkgrey'),
        axis.ticks.y = element_line())

model_plot