# Define a vector of cell lines
celllines <- c('JMD', 'TKD')
The chunk of code below generates the normalized data (see ‘Data normalization’ paragraph of Methods).
# 1) Normalize raw data
for (cellline in celllines){
raw_path <- paste0("./input/RAW_", cellline, "_MD.csv")
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
norm<-normalization(raw_path)
write.csv(norm, norm_path)
}
The chunck of code below generates the Figure S3.
The plots represent the Hill curves of raw and normalized data for each
analite. The plot_normalization
function is a custom
function in “0.libraries.R” file.
# 2) Plot raw data against normalized data
for (cellline in celllines){
raw_path <- paste0("./input/RAW_", cellline, "_MD.csv")
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
plot_normalization(raw_path, norm_path, cellline = cellline)
}
for(cellline in c('JMD', 'TKD')){
# Read normalized data
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
normalizeddata<-CNOlist(norm_path)
# Convert the MIDAS format in a dataframe
norm_df <- from_midas_to_df(normalizeddata)
# we want the conditions to be the variables
# so we put the conditions as columns
norm_df_transpose <- t(norm_df)
norm_tib <- as_tibble(norm_df_transpose)
norm_tib$analytes <- rownames(norm_df_transpose)
norm_tib <- norm_tib %>% relocate(analytes)
if(cellline == 'JMD'){
df_merge <- norm_tib
}else{
df_merge <- inner_join(df_merge, norm_tib, by = c('analytes'), suffix = c('.JMD', '.TKD'))
}
}
df_merge <- df_merge %>%
relocate(analytes, 'CTRL.JMD') %>%
column_to_rownames("analytes")
# we want the conditions to be the variables
# so we put the conditions as columns
df_merge_transpose <- t(df_merge)
# convert in tibble
norm_tib <- as_tibble(df_merge_transpose)
norm_tib$analytes <- rownames(df_merge_transpose)
norm_tib <- norm_tib %>% relocate(analytes)
#create new column for shape attribute, empty string
norm_tib$shape <- ''
#create index for conditions to label with different shape
#look for names in column with grepl
#assigne variable to map shape to posistions found
idx_IGF1 <- which(grepl('IGF1', norm_tib$analytes))
norm_tib$shape[idx_IGF1] <- 'IGF1'
idx_TNFa <- which(grepl('TNF', norm_tib$analytes))
norm_tib$shape[idx_TNFa] <- 'TNFa'
idx_CTRL <- which(grepl('CTRL', norm_tib$analytes))
norm_tib$shape[idx_CTRL] <- 'CTRL'
# calculate PCA
col_num <- ncol(norm_tib) #calculate number of columns
#in prcomp col_num-1 because the last column is text, it wants only mumbers
norm_tib <- norm_tib %>%
mutate(cellline = ifelse(grepl('JMD', analytes), 'JMD', 'TKD'))
norm_tib <- norm_tib %>%
mutate(analytes = str_remove_all(analytes, '.JMD|.TKD|IGF1R\\+|IGF1R|\\+IGF1R|TNF\\+|\\+TNF|TNF'))
pca <- prcomp(norm_tib[,2:(col_num-1)],
center = TRUE,
scale. = TRUE)
pca.plot <- autoplot(pca,
data = norm_tib,
label = FALSE) +
theme_classic() +
geom_hline(yintercept = 0)+
geom_vline(xintercept = 0)+
geom_point(size = 5,aes(shape=shape, col = cellline)) +
scale_color_manual(values = c('#36437F','#E1BD6A'), labels = c('JMD', 'TKD')) +
geom_text_repel(aes(label = norm_tib$analytes))
pca.plot
Figure 2C Principal Component Analysis (PCA) of FLT3
ITD-JMD and FLT3 ITD-JMD cells treated upon different perturbations.
The preprocessing step consists of three phases:
(i) compression, in which unmeasured and untargeted proteins, as well as linear cascades of undesignated nodes, are removed.
(ii) expansion, in which the remaining nodes are connected to the upstream regulators with every possible combination of OR/AND Boolean operators. AND operators will be nodes in the network!
(iii) imputation, in which the software integrates the scaffold model with regulations function inferred without bias from the training dataset.
# set cellline
pkn_path <- './input/PKN_3008.sif'
for(cellline in celllines){
# read experimental data and prior knowledge network
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
CNOlist <- CNOlist(norm_path)
pkn <- readSIF(pkn_path)
# Compression and Expansion operation according to experimental data
model<-preprocessing(data=CNOlist, model=pkn, expansion = TRUE)
# Imputation: integrate data-driven edges with CNOfeeder
BTable <- makeBTables(CNOlist=CNOlist, k=2, measErr=c(0.1, 0))
Lrank <- linksRanking(CNOlist=CNOlist, measErr=c(0.1, 0), savefile=FALSE)
# integrate inferred edges and prior knowledge
modelIntegr <- mapBTables2model(BTable=BTable,model=model,allInter=TRUE)
modelIntegrWeight <- weighting(modelIntegr=modelIntegr, PKNmodel=model,
CNOlist=CNOlist, integrFac=10)
modelIntegrWeight
#save expanded model and remove incoming edges in inputs
writeSIF(modelIntegrWeight, paste0('./input/integrated_PKN_', cellline, '.sif'))
}
The analysis of pre-processing step results reveal that: - JMD model has 206 nodes and xxx edges; - TKD model has 208 nodes and xxx edges;
preprocessed_JMD <- read_tsv(paste0('./input/integrated_PKN_JMD.sif'),
col_names = c('source','interaction','target'))
preprocessed_TKD <- read_tsv(paste0('./input/integrated_PKN_TKD.sif'),
col_names = c('source','interaction','target'))
#write_tsv(preprocessed_TKD, '../input/integrated_PKN_TKD_new.tsv', col_names = FALSE)
TKD_proteins <- unique(c(preprocessed_TKD$source,preprocessed_TKD$target))
JMD_proteins <- unique(c(preprocessed_JMD$source,preprocessed_JMD$target))
cat(paste0('JMD model has ', length(JMD_proteins), ' nodes and ', nrow(preprocessed_JMD), ' edges\n',
'TKD model has ', length(TKD_proteins), ' nodes and ', nrow(preprocessed_TKD), ' edges.\n',
'The differences between the nodes of the two models are just in logic gates: ',
paste0(setdiff(TKD_proteins, JMD_proteins), collapse = ';')))
## JMD model has 206 nodes and 756 edges
## TKD model has 208 nodes and 782 edges.
## The differences between the nodes of the two models are just in logic gates: and177;and178
We performed 1000 runs of optimization using CellNOptR which creates context-specific Boolean models (i) by filtering out interactions not relevant to the system and (ii) by selecting the Boolean operators (i.e., AND/OR) that best integrate inputs acting on the same node. CellNOptR exploits a genetic algorithm that minimizes the difference (mean squared error, mse) between experimental data and the values simulated from the Boolean model. In each optimization run, we selected the best model (lowest mse).
N.B This code runs for several hours, so don’t run it.
for (cellline in c('JMD', 'TKD')){
# read experimental data
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
CNOlist <- CNOlist(norm_path)
# read prior knowledge network integrated with data-driven edges
modelT0<- readSIF(paste0('input/integrated_PKN_', cellline, '.sif'))
# ### --- MULTIPLE TRAINING --- ###
#
ncol = length(modelT0$reacID)
#
# inizialize dataframe multiple run
optModels <- data.frame(matrix(ncol = ncol, nrow = 0))
# list with simulation matrix at t0 and t1 for each optimal model
matrix_sim <- list()
# create a vector with all possible expanded gates
labels <- modelT0$reacID
names(optModels) <- labels
#
nrun = 1
while (nrun <= 1000){
print(nrun)
train <- gaBinaryT1(CNOlist = CNOlist,
model = modelT0,
verbose = FALSE)
simList = prep4sim(modelT0)
indexList = indexFinder(CNOlist, modelT0)
bString <- train$bString
modelCut <- cutModel(modelT0, bString)
simListCut <- cutSimList(simList, bString)
# t0
Sim0 <- simulatorT0(CNOlist=CNOlist,
model=modelCut,
simList=simListCut,
indexLis =indexList)
simRes0 <- as.matrix(Sim0[,indexList$signals,drop=FALSE])
#simRes0 = Sim0
# t1
Sim <- simulatorT1(CNOlist=CNOlist, model=modelCut, simList=simListCut, indexList=indexList)
simRes <- as.matrix(Sim[,indexList$signals,drop=FALSE])
#simRes = Sim
simResults <- list(t0=simRes0, t1=simRes)
c <- cutAndPlot(model = modelT0,
bStrings = list(train$bString)[1], #lista di vettori,
CNOlist = CNOlist,
plotPDF = FALSE,
plotParams = list(margin = 0.3, #margins of boxes
width = 20, #dimension of single boxes
height = 8,
cmap_scale = 1,
cex =1, #fontsize of labels
ymin = NULL,
maxrow = 7)) #max number of row per single plot
# Take t0 and t1 from the cutAndPlot list
t0 <- rbind(c$simResults[[1]]$t0, c$simResults[[2]]$t0)
t1 <- rbind(c$simResults[[1]]$t1, c$simResults[[2]]$t1)
# If some of the cells in the matrix is 0, perform another run of optimization
# without updating the counts n_run, because we want to discard that!
if(sum(is.na(simResults$t0)) > 0 | sum(is.na(t1)) > 0){
next
}else{
optModels[nrun,]<- train$bString
matrix_sim[[nrun]] <- list()
matrix_sim[[nrun]] <- simResults
matrix_sim[[nrun]]$t1 <- t1
nrun <- nrun + 1
}
}
# saving workspace in order to work with results in local
if(cellline == 'TKD'){
save.image(file = 'TKD_workspace.Rdata')
}else{
save.image(file = 'JMD_workspace.Rdata')
}
}
The best models were chosen according to their mean squared error (mse). The first step was retrieving the mse from optimized object produced in the server. This is a time consuming operation.
for(cellline in celllines){
# Read integrated PKN with experimental data
pknmodel <- readSIF(paste0('./input/integrated_PKN_', cellline, '.sif'))
# Read the normalized Luminex data
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
CNOlist <- CNOlist(norm_path)
# Load the 1000 run optimization workspace
# $optModels contains 1000 models
load(paste0('./server/', cellline, '_workspace.RData'))
# The aim is taking for each optimized model the mean squared error
mse <- c()
# Create a matrix_sim list that for each optimized model annotates: ....
matrix_sim <- list()
# Loop on each optimized model
for (i in c(1:length(optModels[,1]))){
print(i)
# To take the mse, we exploit cutAndPlot function
# hacking the arguments and putting as bString the binary string
# representing the optimized model in server run
c <- cutAndPlot(model = modelT0,
bStrings = list(as.vector(unlist(unlist(optModels[i,]))))[1],
CNOlist = CNOlist,
plotPDF = FALSE,
tag = cellline)
# The mse is for each species between simulated and experimental
# so to obtain an mse for the model, I sum up the mse
mse <- c(mse, sum(c$mse))
}
write_rds(x = mse, paste0('./results/', cellline, '_mse.RDS'))
}
Then, we added the mse to the optimized models and sorted them in ascending order according to mse and selected the first 100 models. It is time consuming.
for(cellline in celllines){
# Read integrated PKN with experimental data
pknmodel <- readSIF(paste0('./input/integrated_PKN_', cellline, '.sif'))
# Read the normalized Luminex data
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
CNOlist <- CNOlist(norm_path)
# Load 1000 run workspace
load(paste0('./server/', cellline, '_workspace.RData'))
# Read mse vector
mse <- read_rds(paste0('./results/', cellline, '_mse.RDS'))
optModels_t <- tibble(optModels)
optModels_t$mse <- mse
# arrange according to mse and select the first 100 models
best_100_models <- optModels_t %>% arrange(mse)
best_100_models <- best_100_models[1:100,]
best_100_models$mse <- NULL
mse_thr <- c()
matrix_sim_thr <- list()
# Loop on each optimized model
for (i in c(1:nrow(best_100_models[,1]))){
print(i)
c <- cutAndPlot(model = modelT0,
bStrings = list(as.vector(unlist(unlist(best_100_models[i,]))))[1],
CNOlist = CNOlist,
plotPDF = FALSE,
tag = cellline)
mse_thr <- c(mse_thr, sum(c$mse))
# Reconstruct a matrix of T0 (before any treatment) and T1 (i condition)
# simulated nodes states
t0 <- rbind(c$simResults[[1]]$t0, c$simResults[[2]]$t0)
t1 <- rbind(c$simResults[[1]]$t1, c$simResults[[2]]$t1)
# Compute the difference between the two conditions (simulated fold-change)
sim <- t1 - t0
if(sum(is.na(t0)) > 0 | sum(is.na(t1)) > 0){
next
}else{
att <- t1 - t0
matrix_sim_thr[[i]] <- att
}
}
write_rds(matrix_sim_thr, paste0('./results/', cellline, '_best_model_matrix.RDS'))
}
Once the 100 best model were selected, we calculated the average state of each protein in the 100 best models. This procedure enables quantitative prediction even using Boolean models, which are discrete by nature. These averaged values were compared with the training data to evaluate the goodness of fit.
for(cellline in celllines){
# Read the normalized Luminex data
norm_path <- paste0("./input/MD_", cellline, '_NORM.csv')
CNOlist <- CNOlist(norm_path)
# Read the list with 100 optimized models
matrix_sim <- read_rds(paste0('./results/', cellline, '_best_model_matrix.RDS'))
# Create a matrix of experimental signals
exp <- getSignals(CNOlist)$'90' - getSignals(CNOlist)$'0'
# Create a matrix of simulated data
sims <- Filter(Negate(is.null), matrix_sim)
average_sim <- Reduce("+", sims) / length(sims)
colnames(average_sim) <- colnames(exp)
# Create a matrix of the difference between average simulated data in 100
# optimized models and the exerimental data
diff <- abs(average_sim - exp)
colnames(diff) <- colnames(exp)
# Set as rownames the conditions and as column names the analytes
# Get condition names from CNOlist object
cond <- CNOlist@cues
row.names(cond) <- LETTERS[1:nrow(cond)]
colnames(cond)[ colnames(cond) %in% colnames(CNOlist@inhibitors)] <- paste0( colnames(cond)[ colnames(cond) %in% colnames(CNOlist@inhibitors)], 'i')
# Create the conditions (rows) names
for(i in c(1:nrow(cond))){
rownames(cond)[i] <- paste0(colnames(cond)[cond[i,] == 1], collapse = '+')
}
conditions <- rownames(cond)
colnames(average_sim) <- colnames(exp)
rownames(exp) <- conditions
rownames(average_sim) <- conditions
rownames(diff) <- conditions
# Put the three matrices in a list
matrices <- list(exp, average_sim, abs(diff))
write_rds(matrices, file = paste0('./results/', cellline, '_matrices_heatmap.RDS'))
cat(paste0(cellline, ':', (max(diff)), ' '))
}
Visualization in an heatmap
for(cellline in celllines){
# Read the RDS object of experimental, simulated and errors
matrices <- read_rds(paste0('./results/', cellline, '_matrices_heatmap.RDS'))
# Since max error is in TKD cell line, set this value for max error color scale
TKDe <- 1.65 #max(diff)
# Create an empty list for the three heatmaps
plot_list <- list()
plot_title <- c('Experimental in ', 'Simulated in ', 'Difference ')
paletteLength <-1000
myColor <- colorRampPalette(c("darkblue", "lightyellow", "red3"))(paletteLength)
for (i in c(1:length(matrices))){
if(cellline == 'JMD' & i == 3){
# for the error matrix select only two colors
myColor <- colorRampPalette(c("lightyellow", "red3"))(paletteLength)
myBreaks <- c(seq(min(matrices[[i]]), TKDe, length.out=paletteLength))
# Set errors < 0.5 as white, because are acceptable
# get the number of values < 0.5 and subtract it to the length of colors
n <- length(myColor)-sum(myBreaks<=0.5)
# concatenate a vector of colors with
# white of length of numbers < 0.5 and the vector of length of remaining numbers
# from first color to the len-n one
myNewColor <- c(rep("#FFFFE0", sum(myBreaks<=0.5)), myColor[1:n])
myColor <- myNewColor
}else{
myBreaks <- c(seq(min(matrices[[i]]), 0, length.out=ceiling(paletteLength/2) + 1),
seq(max(matrices[[i]])/paletteLength, max(matrices[[i]]), length.out=floor(paletteLength/2)))
if(cellline == 'TKD' & i == 3){
myColor <- colorRampPalette(c("lightyellow", "red3"))(paletteLength)
myBreaks <- c(seq(min(matrices[[i]]), max(matrices[[i]]), length.out=paletteLength))
n <- length(myColor)-sum(myBreaks<=0.5)
myNewColor <- c(rep("#FFFFE0", sum(myBreaks<=0.5)), myColor[1:n])
myColor <- myNewColor
}
}
p <- pheatmap(as.matrix(matrices[[i]]),
color = myColor,
cellwidth = 20,
cellheight = 20,
cluster_rows = FALSE,
cluster_cols = FALSE,
border_color = 'white',
breaks = unique(myBreaks),
fontsize = 13,
width = 5,
heigth = 30,
labels_col = colnames(matrices[[i]]),
main = paste0(plot_title[i], cellline),
angle_col = c('45'))
plot_list[[i]] <- p[[4]]
}
# Save the plot list for creating the grid
write_rds(plot_list, paste0('./results/', cellline, '_plot_list.RDS'))
# Save the whole environment image for the simulations
save.image(paste0('./results/', cellline, '_subfamily_threshold.RData'))
}
Visualize FLT3-ITD JMD and TKD together
plot_list_JMD <- read_rds(paste0('./results/JMD_plot_list.RDS'))
plot_list_TKD <- read_rds(paste0('./results/TKD_plot_list.RDS'))
plot_list_total <- c(plot_list_JMD, plot_list_TKD)
grid.arrange(grobs = plot_list_total, ncol = 3, nrow = 2)
Figure 3A. Color-coded representations of the experimental activity modulation (T90 – T0) of sentinel proteins used to train the two Boolean models (left panel) and the average prediction of protein activities in the family of 100 best models (central panel). The protein activity modulation ranges from -1 to 1 and is represented with a gradient from blue (inhibited) to red (activated). The absolute value of the difference between experimental and simulated protein activity modulation (right panel) is reported as a gradient from light yellow (error < 0.5) to red (1.85). The fit between simulated and experimental data was generally higher in the FLT3 ITD-JMD model, which has been more extensively characterized by the scientific community than the FLT3ITD-TKD system.