Note: The path of all datasets can be adjusted to
access the Data within the Git repository structure,
but the file names must remain the same.
import rpy2%load_ext rpy2.ipython
The rpy2.ipython extension is already loaded. To reload it, use:
%reload_ext rpy2.ipython
1 Fisher-Pitman Permutation Test
for Comparison of Total SNVs, Indels, and SVs Across Tumor Types
%%Rlibrary(coin)set.seed(1234)snv_indel_sv =read.csv('/IPMNPDAC_WGS/Data/snvindelsvTotalnb4StatTest.csv')snv_indel_sv = snv_indel_sv[,c('tumourType', 'total_indels', 'total_SNVs', 'total_SV')]#X = as.numeric(tmbdf$snv_indel_TMB)A =as.factor(snv_indel_sv$tumourType)Xsnv =as.numeric(snv_indel_sv$total_SNVs)Xindel =as.numeric(snv_indel_sv$total_indels)Xsv =as.numeric(snv_indel_sv$total_SV)snvtest =oneway_test(Xsnv~A, data=snv_indel_sv, distribution =approximate(nresample =9999)) indeltest =oneway_test(Xindel~A, data=snv_indel_sv, distribution =approximate(nresample =9999))svtest =oneway_test(Xsv~A, data=snv_indel_sv, distribution =approximate(nresample =9999))print(snvtest)print(indeltest)print(svtest)
Approximative Two-Sample Fisher-Pitman Permutation Test
data: Xsnv by A (invasive, nonInvasive)
Z = 2.8447, p-value = 0.0038
alternative hypothesis: true mu is not equal to 0
Approximative Two-Sample Fisher-Pitman Permutation Test
data: Xindel by A (invasive, nonInvasive)
Z = 2.5685, p-value = 0.009101
alternative hypothesis: true mu is not equal to 0
Approximative Two-Sample Fisher-Pitman Permutation Test
data: Xsv by A (invasive, nonInvasive)
Z = 3.3662, p-value = 3e-04
alternative hypothesis: true mu is not equal to 0
2 Fig.1.B. Mutational Landscape of
IPMNs and PDACs
import pandas as pdimport matplotlib.pyplot as pltimport matplotlib.pyplot as pltfrom matplotlib import rcParamsrcParams['font.family'] ='Arial'# Affects all text elementsimport shutupshutup.please()datapath ='/IPMNPDAC_WGS/Data/'## 0) preparation samplelegenddfSample = pd.read_csv(datapath +'tumorTypetemplate.csv')## 1) preparation SNVplotSNV = pd.read_csv(datapath +'41samplesSNVNumberBarPlot.csv')## 2) preparation indels plotIndel = pd.read_csv(datapath +'total41NumberIntersectIndels.csv')plotIndel['typeTumorSample'] = [x[3:] for x in plotIndel.typeTumosample]## 3) preparation SVs plotBrass = pd.read_csv(datapath +'total41NumberBrass.csv')plotBrass = plotBrass.rename(columns = {'tandem-duplication':'tand-dupl', 'translocation':'trans'})plotBrass['typeTumorSample'] = [x[3:] for x in plotBrass.typeTumorSample]## 4) SBS96 preparation plotsbsig = pd.read_csv(datapath +'41sbsigReorder.csv')plotsbsig['typeTumorSample'] = [x[3:] for x in plotsbsig.typeTumorSample]## 5) ID signature plotidsig = pd.read_csv(datapath +'41idsigReorder.csv')plotidsig['typeTumorSample'] = [x[3:] for x in plotidsig.typeTumorSample]## 6) SV signatureplotsvsig = pd.read_csv(datapath +'41samplesSVsigNbBarPlot.csv')## 7) CN signature read in and preparation plotCNsig = pd.read_csv(datapath +'41cnsigReorder.csv')plotCNsig['typeTumorSample'] = [x[3:] for x in plotCNsig.typeTumorSample]## 8) plot fig, axes = plt.subplots(1, 8, figsize=(30, 15), dpi=144, gridspec_kw={'width_ratios': [0.2,1,1,1,1,1,1,1]},sharey=True) #adjuct te widthplt.subplots_adjust(wspace =0.15,bottom=0.15, right=0.98, top=0.97,left=0.07) #hspacedfSample.plot(ax=axes[0], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, color={'Invasive': 'maroon', 'Noninvasive': 'orange','Pancreatitis': 'gray', 'IPMN_LGD': 'mediumslateblue','IPMN_HGD': 'darkslateblue','Invasivex':'white', 'Pathology':'white','IPMN_HGD_PDAC': 'lightcoral', 'PDAC': 'maroon'}, width=1.0)axes[0].set_ylabel('Case tumour type',fontsize=16,weight='bold')axes[0].tick_params(axis='y', labelsize=14)axes[0].set_xlim(0, 3)axes[0].set_xticklabels(['Invasive', 'Pathology'], rotation=90, horizontalalignment='right',fontsize=16)axes[0].set_frame_on(False)axes[0].legend(loc="upper center")handles, labels = axes[0].get_legend_handles_labels()axes[0].legend(handles, labels, loc='center', bbox_to_anchor=(15,-0.12), ncol=9,frameon=False,fontsize=16)#############plotSNV.plot(ax=axes[1], kind ='barh', color ='skyblue', edgecolor ="black", width =1.0)axes[1].set_xlabel('a. Number of SNVs',fontsize=16,weight='bold')axes[1].tick_params(axis='x', labelsize=14)axes[1].set_xlim(0, 8000)axes[1].legend(fontsize=14, frameon=False)#########################plotIndel.plot(ax=axes[2], x ='typeTumosample', kind ='barh', stacked =True, mark_right =True, edgecolor ="black", width=1.0)axes[2].set_xlabel('b. Number of Ins/Dels',fontsize=16,weight='bold')axes[2].tick_params(axis='x', labelsize=14)axes[2].set_xlim(0, 800)axes[2].legend(fontsize=14,frameon=False)#########################plotBrass.plot(ax=axes[3], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, edgecolor ="black", width=1.0)axes[3].set_xlabel('c. Number of SVs',fontsize=16, weight='bold')axes[3].tick_params(axis='x', labelsize=14)axes[3].set_xlim(0, 250)axes[3].legend(fontsize=14,frameon=False)########################plotsbsig.plot(ax=axes[4], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, edgecolor ="black", width=1.0)axes[4].set_xlabel('d. Number of SBS96',fontsize=16, weight='bold')axes[4].tick_params(axis='x', labelsize=14)axes[4].set_xlim(0, 8000)axes[4].legend(fontsize=14, frameon=False)#######################plotidsig.plot(ax=axes[5], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, edgecolor ="black", width=1.0)axes[5].set_xlabel('e. Number of ID83',fontsize=16, weight='bold')axes[5].tick_params(axis='x', labelsize=14)axes[5].set_xlim(0, 800)axes[5].legend(fontsize=14, frameon=False)#########################plotsvsig.plot(ax=axes[6], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, edgecolor ="black", width=1.0)axes[6].set_xlabel('f. Number of SV32',fontsize=16, weight='bold')axes[6].tick_params(axis='x', labelsize=14)axes[6].set_xlim(0, 350)axes[6].legend(fontsize=14, frameon=False)#######################plotCNsig.plot(ax=axes[7], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, edgecolor ="black", width=1.0)axes[7].set_xlabel('g. Number of CN48',fontsize=16, weight='bold')axes[7].tick_params(axis='x', labelsize=14)axes[7].set_xlim(0, 350)axes[7].legend(fontsize=14, frameon=False)from IPython.core.interactiveshell import InteractiveShellInteractiveShell.ast_node_interactivity ="all"plt.show();
png
3 Fig.1C. Heatmap and Bar Chart:
Genomic Analysis of Driver Genes
import osimport pandas as pdimport comuta #changed line 996 from new_cats = side_cats - paired_cats to new_cats = list(side_cats - paired_cats) #changed line 1002 from missing_categories = paired_cats - side_cats to missing_categories = list(paired_cats - side_cats)import matplotlib.pyplot as pltfrom matplotlib import rcParamsrcParams['font.family'] ='Arial'# Affects all text elementsdriverheatmappath ='/IPMNPDAC_WGS/Data/'# 1) Data processing# 1a) read in snv-indel-brass-cnv datadrivermutsDf = pd.read_csv(driverheatmappath +'41sampleDriverAllmutsDirectHeatmap.csv')drivermutsDf = drivermutsDf.query('value != "Silent"') # 1b) read in TMB datatmbDf = pd.read_csv(driverheatmappath +'tmbdf4driverheatmapPlot.csv')sampleTumourTMBorderList =list(tmbDf ['sample'])# 1.c) TMB data for top bartmbdf = pd.read_csv(driverheatmappath +'41sampleTMBdirectHeadmap.csv')# 1d) read in purity datacellularityDf = pd.read_csv(driverheatmappath +'41samplesCellularity4alldriverMutsHeatmap.csv')# 1e) read in bar datasidebarDf = pd.read_csv(driverheatmappath +'41sample_side_bar_data4alldrivermutsheatmap.csv')category_order =list(sidebarDf.category)sampleOrderForlabel = sampleTumourTMBorderList###################################################################mut_mapping = {'deletion_3UTRexon': 'lightgreen','deletion_exon': 'deepskyblue','deletion_nonExon': 'lime','inversion_3UTRexon': 'darkkhaki','inversion_nonExon': 'wheat','tandem-duplication_nonExon':'thistle','translocation_5UTRexon': 'turquoise','translocation_nonExon': 'dodgerblue','Loss':'violet','Invasive': 'maroon', 'Non-invasive': 'orange','Pancreatitis': 'gray', 'IPMN_LGD': 'mediumslateblue','IPMN_HGD': 'darkslateblue','IPMN_HGD_PDAC': 'lightcoral', 'PDAC': 'maroon','downstream': 'green', '3UTR': 'blue', 'nonsense': 'red','ess_splice_Del': "purple", 'frameshift_Del': "teal",'frameshift_Ins': "slategray", 'upstream': 'lightblue','missense': 'cyan',#'Silent': 'orange','0.0_0.0': {'facecolor': 'grey', 'alpha': 0.05},'Absent': {'facecolor': 'grey', 'alpha': 0.05}}################################################################### 2) plot# 2a) plot heatmaptoy_comut = comuta.CoMut()# add indicator, can add this after toy_comut.add_categorical_data, but plot on topindicator_kwargs = {'color': 'red', 'marker': 'o','linewidth': 1, 'markersize': 5}toy_comut.add_categorical_data(drivermutsDf, name='mut type', mapping=mut_mapping, category_order=category_order)# 2b) plot sidebarside_mapping = {'mutnb': 'pink'}bar_kwargs = {'height': 0.8}toy_comut.add_side_bar_data(sidebarDf, paired_name ='mut type', name='MutFq', mapping =side_mapping, position ='right', bar_kwargs=bar_kwargs)## 2c) plot cellularitycat_mapping = {'Absent': {'facecolor': 'red'}}value_range = (0, 1)toy_comut.add_continuous_data(cellularityDf, name ='Purity', mapping ='gray_r', cat_mapping = cat_mapping, value_range = value_range)# 2d) plot tmb barbar_mapping = {'Nonsynonymous': 'green', 'Synonymous': 'pink'}bar_kwargs = {'width': 0.8, 'edgecolor': 'green'}toy_comut.add_bar_data(tmbDf, name ='Mutation burden', mapping = bar_mapping, bar_kwargs = bar_kwargs)#ylabel = 'Muts/Mb')toy_comut.plot_comut(figsize=(16, 12), x_padding=0.04,hspace =0.03, wspace =0.03, y_padding=0.04, tri_padding=0.03)toy_comut.axes['Mutation burden'].set_ylabel('TMB', rotation ='horizontal', ha ='right', va ='center', y =0.3)toy_comut.axes['Mutation burden'].set_ylim(ymin=0, ymax=10)toy_comut.axes['Mutation burden'].axvline(0, color='black')#toy_comut.axes['mut type'].set_xlabel('xt', rotation = 45)toy_comut.axes['mut type'].set_xticklabels(labels=sampleOrderForlabel,rotation =45,horizontalalignment='right')custom_rcParams = {'font.size': 12}rcParams.update(custom_rcParams)toy_comut.add_unified_legend(bbox_to_anchor = (1.8, -4))#toy_comut.add_unified_legend()plt.subplots_adjust(bottom=0.001, right=0.9, top=0.99,left=0.02);plt.savefig(driverheatmappath +'driverHeatMap.png', dpi=300, bbox_inches='tight')
png
4 Fisher-Pitman Permutation Test
for Comparison of SV Numbers Between Trees with Single and Multiple
Branches
%%Rlibrary("coin")library(stringr)set.seed(123)df =read.csv('/IPMNPDAC_WGS/Data/SV_multiple_single_branches.csv')X =as.numeric(df$Total)A =as.factor(df$samples)z =oneway_test(X~A, data=df, distribution =approximate(nresample =9999))z
Approximative Two-Sample Fisher-Pitman Permutation Test
data: X by A (multipleBarnches, singleBranch)
Z = 3.0075, p-value = 0.0024
alternative hypothesis: true mu is not equal to 0
5 Fig.2B. Boxplot of SV Counts in
Trees with Single versus Multiple Branches
11 Permutation test for Comparison
of TMB Across Tumour Types
import rpy2%load_ext rpy2.ipython
%%Rlibrary(coin)tmbdf =read.csv('/IPMNPDAC_WGS/Data/tmbplotPermutest.csv')X =as.numeric(tmbdf$snv_indel_TMB)A =as.factor(tmbdf$tumorStage)tmbtest =oneway_test(X~A, data=tmbdf, distribution =approximate(nresample =9999))tmbtest
Approximative Two-Sample Fisher-Pitman Permutation Test
data: X by A (IPMN, PDAC)
Z = -3.5961, p-value = 2e-04
alternative hypothesis: true mu is not equal to 0
Loading required package: survival
12 Fig.S1B1-a. Boxplots of TMB
import osfrom glob import globimport pandas as pdimport matplotlib.pyplot as pltfrom matplotlib import rcParamsimport shutupshutup.please()rcParams['font.family'] ='Arial'## TMB bar plot datatmbDf = pd.read_csv('/IPMNPDAC_WGS/Data/tmbplotPermutest.csv')[['tumorStage', 'snv_indel_TMB']]ipmn = tmbDf.query('tumorStage=="IPMN"')pdac = tmbDf.query('tumorStage=="PDAC"')## boxplotdata = [ipmn.snv_indel_TMB, pdac.snv_indel_TMB]fig = plt.figure(figsize =(5,5))ax = fig.add_subplot(111)bp = ax.boxplot(data, patch_artist =True, notch ='True', vert =90)colors = ['lightgreen', 'pink']for patch, color inzip(bp['boxes'], colors): patch.set_facecolor(color)# changing color and linewidth of# whiskersfor whisker in bp['whiskers']: whisker.set(color ='blue', linewidth =1.5, linestyle =":")# changing color and linewidth of# capsfor cap in bp['caps']: cap.set(color ='purple', linewidth =2)# changing color and linewidth of# mediansfor median in bp['medians']: median.set(color ='red', linewidth =3)# changing style of fliersfor flier in bp['fliers']: flier.set(marker ='D', color ='black', alpha =0.5) #plt.title("TMB profile of IPMN-PDAC", fontsize=14, weight='bold')plt.ylabel('Tumor mutation burden', fontsize=14)# Removing top axes and right axes# ticksax.get_xaxis().tick_bottom()ax.get_yaxis().tick_left()# show plotimport matplotlib.patches as mpatchesipmn_patch = mpatches.Patch(color='lightgreen', label='IPMN')pdac_patch = mpatches.Patch(color='pink', label='PDAC')plt.legend(handles=[ipmn_patch, pdac_patch], loc='best', frameon=False)plt.xticks([])plt.show()
png
%%writefile fishertestDriver.R%%Rlibrary(coin)data <- data.frame( tumourSample = factor(rep(c("IPMN", "PDAC"), times = c(22, 19))), KRAS_snvIndel = factor(c(rep(c("positive", "negative"), times = c(10, 12)), rep(c("positive", "negative"), times = c(12, 7)))), GNAS_snvIndel = factor(c(rep(c("positive", "negative"), times = c(8, 14)), rep(c("positive", "negative"), times = c(1, 18)))), LRP1B_snvIndel = factor(c(rep(c("positive", "negative"), times = c(0, 22)), rep(c("positive", "negative"), times = c(7, 12)))), TP53_snvIndel = factor(c(rep(c("positive", "negative"), times = c(2, 20)), rep(c("positive", "negative"), times = c(13, 6)))), U2AF1_los = factor(c(rep(c("positive", "negative"), times = c(4, 18)), rep(c("positive", "negative"), times = c(13, 6)))), RNF43_los = factor(c(rep(c("positive", "negative"), times = c(4, 18)), rep(c("positive", "negative"), times = c(12, 7)))), los_12q21_31 = factor(c(rep(c("positive", "negative"), times = c(1, 21)), rep(c("positive", "negative"), times = c(10, 9)))), los_21q22_3 = factor(c(rep(c("positive", "negative"), times = c(4, 18)), rep(c("positive", "negative"), times = c(13, 6)))), los_17q22 = factor(c(rep(c("positive", "negative"), times = c(4, 18)), rep(c("positive", "negative"), times = c(12, 7)))), los_6q27 = factor(c(rep(c("positive", "negative"), times = c(8, 14)), rep(c("positive", "negative"), times = c(15, 4)))), gain_8q24_3 = factor(c(rep(c("positive", "negative"), times = c(6, 16)), rep(c("positive", "negative"), times = c(16, 3)))), gain_14q32 = factor(c(rep(c("positive", "negative"), times = c(15, 7)), rep(c("positive", "negative"), times = c(19, 0)))), gain_17p11 = factor(c(rep(c("positive", "negative"), times = c(19,3)), rep(c("positive", "negative"), times = c(9, 10)))), gain_8q23 = factor(c(rep(c("positive", "negative"), times = c(6, 16)), rep(c("positive", "negative"), times = c(12, 7)))), gain_11p11 = factor(c(rep(c("positive", "negative"), times = c(3, 19)), rep(c("positive", "negative"), times = c(9, 10)))))# save data backwrite.csv(data,'/IPMNPDAC_WGS/Data/allIPMNdata4fisherTest.csv', row.names=FALSE)# Function to run Fisher's Exact Test for each outcome and extract p-valuerun_fisher_test <- function(outcome) { test_result <- independence_test(reformulate("tumourSample", outcome), data = data, distribution ="exact") p_value <- pvalue(test_result)return(p_value)}outcome_columns <- names(data)[-1]p_values <- sapply(outcome_columns, run_fisher_test)adjusted_p_values <- p.adjust(p_values, method ="fdr")# Create a data frame with the resultsresults_df <- data.frame( Outcome = outcome_columns, P_Value = p_values, Adjusted_P_Value = adjusted_p_values)#write.csv(results_df,'/outpath/allIPMNdata4fisherTestP_values.csv', row.names=FALSE)print(results_df)
Writing fishertestDriver.R
13 Fisher’s Exact Test with FDR
Correction for Driver Mutation Differences Across Tumor Types
import rpy2%load_ext rpy2.ipython
%%Rlibrary(coin)data <-data.frame(tumourSample =factor(rep(c("IPMN", "PDAC"), times =c(22, 19))),KRAS_snvIndel =factor(c(rep(c("positive", "negative"), times =c(10, 12)),rep(c("positive", "negative"), times =c(12, 7)))),GNAS_snvIndel =factor(c(rep(c("positive", "negative"), times =c(8, 14)),rep(c("positive", "negative"), times =c(1, 18)))),LRP1B_snvIndel =factor(c(rep(c("positive", "negative"), times =c(0, 22)),rep(c("positive", "negative"), times =c(7, 12)))),TP53_snvIndel =factor(c(rep(c("positive", "negative"), times =c(2, 20)),rep(c("positive", "negative"), times =c(13, 6)))), U2AF1_los =factor(c(rep(c("positive", "negative"), times =c(4, 18)),rep(c("positive", "negative"), times =c(13, 6)))),RNF43_los =factor(c(rep(c("positive", "negative"), times =c(4, 18)),rep(c("positive", "negative"), times =c(12, 7)))),los_12q21_31 =factor(c(rep(c("positive", "negative"), times =c(1, 21)),rep(c("positive", "negative"), times =c(10, 9)))),los_21q22_3 =factor(c(rep(c("positive", "negative"), times =c(4, 18)),rep(c("positive", "negative"), times =c(13, 6)))),los_17q22 =factor(c(rep(c("positive", "negative"), times =c(4, 18)),rep(c("positive", "negative"), times =c(12, 7)))),los_6q27 =factor(c(rep(c("positive", "negative"), times =c(8, 14)),rep(c("positive", "negative"), times =c(15, 4)))),gain_8q24_3 =factor(c(rep(c("positive", "negative"), times =c(6, 16)),rep(c("positive", "negative"), times =c(16, 3)))),gain_14q32 =factor(c(rep(c("positive", "negative"), times =c(15, 7)),rep(c("positive", "negative"), times =c(19, 0)))), gain_17p11 =factor(c(rep(c("positive", "negative"), times =c(19,3)),rep(c("positive", "negative"), times =c(9, 10)))),gain_8q23 =factor(c(rep(c("positive", "negative"), times =c(6, 16)),rep(c("positive", "negative"), times =c(12, 7)))),gain_11p11 =factor(c(rep(c("positive", "negative"), times =c(3, 19)),rep(c("positive", "negative"), times =c(9, 10)))))# save data backwrite.csv(data,'/IPMNPDAC_WGS/Data/allIPMNdata4fisherTest.csv', row.names=FALSE)# Function to run Fisher's Exact Test for each outcome and extract p-valuerun_fisher_test <-function(outcome) { test_result <-independence_test(reformulate("tumourSample", outcome), data = data, distribution ="exact") p_value <-pvalue(test_result)return(p_value)}outcome_columns <-names(data)[-1]p_values <-sapply(outcome_columns, run_fisher_test)adjusted_p_values <-p.adjust(p_values, method ="fdr")# Create a data frame with the resultsresults_df <-data.frame(Outcome = outcome_columns,P_Value = p_values,Adjusted_P_Value = adjusted_p_values)#write.csv(results_df,'/outpath/allIPMNdata4fisherTestP_values.csv', row.names=FALSE)print(results_df)
14 Figure S2A. Stacked Bar Charts
Showing the Proportion of SBS96, ID83, SV32, and CN48 Signatures in Each
IPMN and IPMN-Derived PDAC
import pandas as pdimport matplotlib.pyplot as pltfrom natsort import natsort_keygenfrom matplotlib import rcParamsrcParams['font.family'] ='Arial'sfilepath ='/IPMNPDAC_WGS/Data/'## 1a) input SBS96sbs96all = pd.read_csv(sfilepath +'s41SBSsignatureCount.csv')sbsplotdf = sbs96all[['typeTumorSample', 'SBS1','SBS2','SBS5','SBS13', 'SBS17a','SBS17b', 'SBS28', 'SBS40']]sbsplotdf = sbsplotdf.sort_values(by="typeTumorSample", key=natsort_keygen())sbsplotdf['typeTumorSample'] = [x[3:] for x in sbsplotdf.typeTumorSample]sbsplotdfb = sbsplotdf.drop(['typeTumorSample'],axis=1)sbsplotdf_total = sbsplotdfb.sum(axis=1)sbsplotdf2rate = sbsplotdf[sbsplotdf.columns[1:]].div(sbsplotdf_total, 0) *100sbsplotdf2rate['typeTumorSample'] = sbsplotdf.typeTumorSample# 1b) input ID83id83all = pd.read_csv(sfilepath +'s41indelSignatureCount.csv') idplotdf = id83all[['typeTumorSample', 'ID1','ID2','ID5','ID6','ID8','ID9','ID14']]idplotdf = idplotdf.sort_values(by="typeTumorSample", key=natsort_keygen())idplotdf['typeTumorSample'] = [x[3:] for x in idplotdf.typeTumorSample]idplotdfb = idplotdf.drop(['typeTumorSample'],axis=1)idplotdf_total = idplotdfb.sum(axis=1)idplotdf2rate = idplotdf[idplotdf.columns[1:]].div(idplotdf_total, 0) *100idplotdf2rate['typeTumorSample'] = idplotdf.typeTumorSample# 1c input SV32svall = pd.read_csv(sfilepath +'s41SVsignatureCount.csv')svplotdf = svall[['typeTumorSample', 'SV2','SV4','SV5','SV7', 'SV9', 'SV10']]svplotdf = svplotdf.sort_values(by="typeTumorSample", key=natsort_keygen())svplotdf['typeTumorSample'] = [x[3:] for x in svplotdf.typeTumorSample]svplotdfb = svplotdf.drop(['typeTumorSample'],axis=1)svplotdf_total = svplotdfb.sum(axis=1)svplotdf2rate = svplotdf[svplotdf.columns[1:]].div(svplotdf_total, 0) *100svplotdf2rate['typeTumorSample'] = svplotdf.typeTumorSample# 1d) input CN48 cnall = pd.read_csv(sfilepath +'s41CNsignature.csv')cnplotdf = cnall[['typeTumorSample', 'CN1','CN9','CN24','CNV48B']]cnplotdf = cnplotdf.sort_values(by="typeTumorSample", key=natsort_keygen())cnplotdf['typeTumorSample'] = [x[3:] for x in cnplotdf.typeTumorSample]cnplotdfb = cnplotdf.drop(['typeTumorSample'],axis=1)cnplotdf_total = cnplotdfb.sum(axis=1)cnplotdf2rate = cnplotdf[cnplotdf.columns[1:]].div(cnplotdf_total, 0) *100cnplotdf2rate['typeTumorSample'] = cnplotdf.typeTumorSample# 2) set up plotfig, axes = plt.subplots(1, 4, figsize=(20, 10), sharey=True)colorSBS = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray']colorID = ['blue', 'orange', 'olive', 'cyan', 'lime', 'brown','fuchsia']colorSV = ['blue', 'orange', 'olive', 'cyan', 'lime', 'brown']colorCN = ['blue', 'green', 'gray', 'orange']# 3a) plot sbs sigssbsplotdf2rate.plot(ax=axes[0], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, legend=True, color=colorSBS, width=1.0)axes[0].legend(bbox_to_anchor=(0.26, -0.1),fontsize=12)axes[0].set_xlabel('Proportion of SBS96 in Each Sample', fontsize=14,weight='bold')# 3b) plot Id sigsidplotdf2rate.plot(ax=axes[1], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, legend=True, color=colorID, width=1.0)axes[1].legend(bbox_to_anchor=(0.22, -0.1),fontsize=12)axes[1].set_xlabel('Proportion of ID83 in Each Sample',fontsize=14,weight='bold')# 3c plot SV sigssvplotdf2rate.plot(ax=axes[2], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, legend=True, color=colorSV, width=1.0)axes[2].set_xlabel('Proportion of SV32 in Each Sample',fontsize=14,weight='bold');axes[2].legend(bbox_to_anchor=(0.23, -0.1),fontsize=12)# 3d) plot CN sigscnplotdf2rate.plot(ax=axes[3], x ='typeTumorSample', kind ='barh', stacked =True, mark_right =True, legend=True, color=colorCN, width=1.0)axes[3].set_xlabel('Proportion of CN48 in Each Sample',fontsize=14,weight='bold');axes[3].legend(bbox_to_anchor=(0.26, -0.1), fontsize=12)# 4) spacingfig.tight_layout(pad=2.0)plt.show();
png
15 Fisher’s Exact Test for
Comparison of SV Signatures Across Tumor Types
import rpy2%load_ext rpy2.ipython
%%Rlibrary(coin)data <-data.frame(tumourSample =factor(rep(c("IPMN", "PDAC"), times =c(22, 19))),SV2 =factor(c(rep(c("positive", "negative"), times =c(22, 0)),rep(c("positive", "negative"), times =c(17, 2)))),SV4 =factor(c(rep(c("positive", "negative"), times =c(0, 22)),rep(c("positive", "negative"), times =c(7, 12)))),SV5 =factor(c(rep(c("positive", "negative"), times =c(5, 17)),rep(c("positive", "negative"), times =c(5, 14)))),SV7 =factor(c(rep(c("positive", "negative"), times =c(4, 18)),rep(c("positive", "negative"), times =c(9, 10)))), SV9 =factor(c(rep(c("positive", "negative"), times =c(1, 21)),rep(c("positive", "negative"), times =c(5, 14)))) )# save data back#write.csv(data,'/outpath/SVSigfisherTest.csv', row.names=FALSE)# Function to run Fisher's Exact Test for each outcome and extract p-valuerun_fisher_test <-function(outcome) { test_result <-independence_test(reformulate("tumourSample", outcome), data = data, distribution ="exact") p_value <-pvalue(test_result)return(p_value)}outcome_columns <-names(data)[-1]p_values <-sapply(outcome_columns, run_fisher_test)adjusted_p_values <-p.adjust(p_values, method ="fdr")# Create a data frame with the resultsresults_df <-data.frame(Outcome = outcome_columns,P_Value = p_values,Adjusted_P_Value = adjusted_p_values)print(results_df)
18 Tumor Microenvironment
Composition Converges Along Evolutionary Trajectories
18.1 Fig S10. Cell Type Proportions
Along Evolutionary Trajectories: Comparing Single MRCA and Multiple
Independent Clones
import osimport numpy as npimport pandas as pdimport matplotlib.pyplot as plttreeSample = ['case6_S7','case6_S9','case7_S1','case7_S4','case7_S5','case9_S2','case9_S3','case9_S4','case10_S2','case10_S5','case12_S1','case12_S2','case12_S3','case12_S5','case13_S3','case13_S4','case13_S5','case16_S2','case16_S4','case2_S2','case2_S4','case2_S10','case4_S1','case4_S2','case4_S3','case4_S4', 'case4_S5','case15_S1','case15_S3', 'case15_S4','case15_S10','case15_S11',]colorx =['lightgreen']*19+['pink']*13# 1) import datasetos.chdir(r'/mnt/b/1_3July23VersionDataPlotCode/NC_revision/pathwayCelltype')cellTypes = pd.read_csv('ESTIMATE_EPIC_scoresb.csv')treeCells = cellTypes.query('Samples==@treeSample')treeCells = treeCells.rename(columns = {'B_cell':'B cell', 'Cancer associated_fibroblast':'Fibroblast','T_cell_CD4+' : 'CD4+','T_cell_CD8+':'CD8+','Endothelial_cell': 'Endothelial cell','NK_cell':'NK cell','uncharacterized_cell':'Uncharacterized'})treeCells = treeCells.set_index('Samples')treeCells = treeCells.reindex(treeSample) # Apply custom sortcellnames =list(treeCells)lbs =list(treeCells.index)binx = np.arange(len(lbs))# 2) Create a figure and axis objectsfig, axes = plt.subplots(nrows=len(cellnames), ncols=1, figsize=(12, 20))# 3) Iterate over the data and create subplots with bar chartsfor i, ax inenumerate(axes):# Get the data for the current subplot subplot_df = treeCells[[cellnames[i]]]# Set the title for the subplot#ax.set_title(f"Subplot {i+1}")# Create the bar chart for the current subplot ax.bar(subplot_df.index, subplot_df[cellnames[i]]*100, width =0.3, color=colorx) ax.set_xticklabels([]) ax.set_xticks([]) ax.set_xlim(-0.5, binx.size-0.5)# Set the labels for the x-axis and y-axis#ax.set_xlabel('X-axis') ax.set_ylabel(cellnames[i], fontsize=14)ax.set_xticks(range(len(lbs))) ax.set_xticklabels(lbs, rotation=45, ha='right') #plt.xlim([-0.5,binx.size-0.5])# Adjust the spacing between subplots#plt.tight_layout()# Show the plotplt.show()
png
18.2 T-Test for Comparison of Cell
Type Proportions: Single MRCA vs. Multiple Independent Clones
The p value of B cell T-test is: 0.3176
The p value of Fibroblast T-test is: 0.2652
The p value of CD4 T-test is: 0.2327
The p value of CD8 T-test is: 0.0005
The p value of Endothelial cell T-test is: 0.3024
The p value of Macrophage T-test is: 0.6782
The p value of NK cell T-test is: 0.5311
The p value of Uncharacterized T-test is: 0.6141
18.3 Boxplot for Comparison of Cell
Type Proportions: Single MRCA vs. Multiple Independent Clones
19 Cellularity Comparison Between
Single-Branch and Multiple-Branch Trees
import pandas as pdfrom scipy.stats import ttest_inddatapath ='/mnt/b/1_3July23VersionDataPlotCode/NC_revision/excelSubmited/'purityData = pd.read_csv(datapath +'573583_0_data_set_10134107_spmdqf.csv')purityData = purityData[['sampleName', 'cellularity']]multipleBranch = ['case10','case3','case16','case2','case9','case7','case6']singleBranch = ['case4','case12','case13', 'case15']sampleNames =list(purityData.sampleName)multipleBranchSample = [x for x in sampleNames if x.split('_')[0] in multipleBranch]singleBranchSample = [x for x in sampleNames if x.split('_')[0] in singleBranch]multipleBranchPurity = purityData.query('sampleName==@multipleBranchSample')singleBranchPurity = purityData.query('sampleName==@singleBranchSample')mean_multipleBranchPurity = multipleBranchPurity['cellularity'].mean()mean_singleBranchPurity = singleBranchPurity['cellularity'].mean()multipleBranchPurityV =list(multipleBranchPurity.cellularity)singleBranchPurityV =list(singleBranchPurity.cellularity)t_stat, p_value = ttest_ind(multipleBranchPurityV , singleBranchPurityV)
p_value
0.7736972351901088
mean_multipleBranchPurity
0.3155518181818182
mean_singleBranchPurity
0.33191588235294117
20 ID83 and CNV48b Signatures
Across Tumor Types
20.1 ID83 Activity Profiles Across
Tumor Types
import pandas as pdimport matplotlib.pyplot as pltdatapath ='/mnt/b/1_3July23VersionDataPlotCode/SupplementaryInformation/FigureS2/'id83 = pd.read_csv(datapath +'intersect41_tumorlabel_id83.csv')id83 = id83[['typeTumorSample','ID1','ID2','ID5','ID6','ID8','ID9','ID14']]id83 = id83.sort_values(by='typeTumorSample')id83['typeTumorSample'] = [x[3:] for x in id83.typeTumorSample]id83 = id83.set_index('typeTumorSample')# Create a figure and axis objectsfig, axes = plt.subplots(nrows=len(list(id83)), ncols=1, figsize=(14, 14), sharex=True)# Iterate over the data and create subplots with bar chartsfor i, ax inenumerate(axes):# Get the data for the current subplot subplot_df = id83[[list(id83)[i]]]# Set the title for the subplot#ax.set_title(f"Subplot {i+1}")# Create the bar chart for the current subplot ax.bar(subplot_df.index, subplot_df[list(id83)[i]], width =0.3) ax.set_xticklabels([])#ax.set_xticks([]) ax.set_xlim(-0.5, len(subplot_df)-0.5)# Set the labels for the x-axis and y-axis#ax.set_xlabel('X-axis') ax.set_ylabel(list(id83)[i])colorOrder = ['blue']*22+ ['red']*19#22 low grade and 19 high gradefor xtick, color inzip(axes[6].get_xticklabels(), colorOrder): xtick.set_color(color)axes[6].set_xticks(range(len(list(subplot_df.index)))) axes[6].set_xticklabels(list(subplot_df.index), rotation=-30, ha='left') #plt.xticks(rotation=-30, ha='left', fontsize=12, rotation_mode='anchor')#plt.xlim([-0.5,binx.size-0.5])# Adjust the spacing between subplots#plt.tight_layout()# Show the plotplt.show()
png
20.2 Fig S2C. Correlation Analysis
of Activity Profiles Between ID1, ID2, and CNV48b