# tools for handling files
import sys
import os
# pandas/numpy for handling data
import pandas as pd
import numpy as np
# seaborn/matplotlib for graphing
import matplotlib.pyplot as plt
import seaborn as sns
# statistics
from statistics import mean
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
# regex
import re
# for reading individual telomere length data from files
from ast import literal_eval
# for grabbing individual cells
import more_itertools
# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import zscore
from scipy.stats import ks_2samp
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload
# setting darkgrid style for seaborn figures
sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})
...
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
all_patients_df['telo data'] = all_patients_df['telo data'].map(literal_eval)
fig = plt.figure(figsize=(10,4))
ax = sns.set(font_scale = 1.5)
ax = sns.boxenplot(x='timepoint',y='telo means', data=all_patients_df,
linewidth=1)
ax = sns.swarmplot(x='timepoint',y='telo means', data=all_patients_df, size=8,
linewidth=1)
# ax.set_title("Mean Telomere Length (TeloFISH) per timepoint")
ax.set_ylabel('Mean Telomere Length')
ax.set_xlabel('timepoint')
plt.savefig('../graphs/paper figures/main figs/all patient Mean telomere length means teloFISH.png',
dpi=400)
lin_reg_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
lin_reg_df = lin_reg_df.drop(13)
lin_reg_df['constant'] = 1
lin_reg_df.corr()
x_names = [['1 non irrad'], ['1 non irrad', '2 irrad @ 4 Gy'], ['1 non irrad', '2 irrad @ 4 Gy', '3 B']]
y_name = '4 C'
for x_name in x_names:
x = lin_reg_df[x_name].values.reshape(-1, len(x_name))
y = lin_reg_df['4 C'].values.reshape(-1, 1)
regression = LinearRegression().fit(x, y)
print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
# more indepth stats
# target = lin_reg_df['4 C']
# linear_m = sm.OLS(endog=target, exog=lin_reg_df[['1 non irrad', '2 irrad @ 4 Gy', 'constant']], missing='drop')
# results = linear_m.fit()
# results.summary()
exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
# graphing individual telomeres per individual per timepoint, personal telomere length dynamics as fxn of radiotherapy
patient_ids = list(exploded_telos_all_patients_df['patient id'].unique())
trp.histogram_plot_groups(x='individual telomeres',
data=exploded_telos_all_patients_df,
groupby='patient id',
iterable=[patient_ids[0]],
# iterable=patient_ids,
n_bins=45,
znorm=False)
# one way ANOVA between individual telos .. change to repeated measures ANOVA
trp.telos_scipy_anova_post_hoc_tests(df=exploded_telos_all_patients_df)
test_df = exploded_telos_all_patients_df.copy()
for col in test_df.columns:
test_df.rename({col: col.replace(' ', '_')}, axis=1, inplace=True)
test_df = test_df[test_df['patient_id'] != 13].copy()
test_df.groupby(['timepoint']).agg('mean').reset_index()
from statsmodels.stats.anova import AnovaRM
RM_ANOVA_results = AnovaRM(test_df,
'individual_telomeres',
'patient_id',
within=['timepoint'],
aggregate_func=np.mean).fit()
print(RM_ANOVA_results)
# graphing individual telomeres per timepoint, overall cohort telomere length dynamics as fxn of radiotherapy
patient_ids = list(exploded_telos_all_patients_df['patient id'].unique())
trp.histogram_plot_groups(x='individual telomeres',
data=exploded_telos_all_patients_df,
groupby='timepoint',
n_bins=45,
znorm=False)
z_norm = trp.z_norm_individual_telos(exploded_telos_df=exploded_telos_all_patients_df)
# z-norming distributions of individual telomeres per timepoint to enable statistical analysis between
# shapes of overall cohort telomere length dynamics
time_points = list(z_norm['timepoint'].unique())
trp.histogram_plot_groups(x='z-norm_individual_telos',
data=z_norm,
groupby='timepoint',
n_bins=45,
znorm=True)
# we see a diff. in shape between all timepoints, as well we see a sig diff between irrad @ 4 Gy & 4 C shapes,
# though mean was the same (ANOVA)
test = ks_2samp
test_name = 'Kolmogorov-Smirnov'
timept_pairs, row = trp.eval_make_test_comparisons(df=z_norm, target='z-norm_individual_telos',
test=test, test_name=test_name,)
# iterate for between patients as well? would just be loop using above fxn, passing df per patient
KS_stats_df = pd.DataFrame(row, columns=[test_name, 'timepoint 1', 'timepoint 2', 'p value'])
trp.df_to_png(df=KS_stats_df, path='../graphs/paper figures/supp figs/KS test between overall shapes of individ telo dist.png')
melted_all_patients_df = pd.melt(
all_patients_df,
id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
var_name='relative Q',
value_name='Q freq counts')
melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')
# ax = sns.set(font_scale=1)
fig = plt.figure(figsize=(10,4))
sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})
palette ={"Q1":"#fdff38","Q2-3":"#d0fefe","Q4":"#ffbacd"}
ax = sns.boxenplot(x='timepoint', y='Q freq counts', hue='relative Q', data=melted_all_patients_df, palette=palette,
linewidth=2, saturation=5, color="black", )
ax = sns.stripplot(x='timepoint', y='Q freq counts', hue='relative Q', data=melted_all_patients_df, palette=palette,
linewidth=1, color="black", dodge=True, )
ax=fig.gca()
# ax.set_title('Changes in Distribution Individual Telos Relative to Pre-Rad Therapy Time point', fontsize=14)
ax.set_xlabel('timepoint', fontsize=14)
ax.set_ylabel('Counts of Individual Telomeres', fontsize=14)
ax.tick_params(labelsize=14)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, fontsize='small')
# plt.savefig('../graphs/telomere length/examining changes in individual telomere lengths per quartile.png', dpi=400)
all_qPCR_df = pd.read_csv('../data/qPCR telo data/all_qPCR_df.csv')
fig = plt.figure(figsize=(10,4))
ax = sns.set(font_scale = 1.5)
ax= sns.boxenplot(x='timepoint', y='telo means qPCR', data=all_qPCR_df,
linewidth=1)
ax= sns.swarmplot(x='timepoint', y='telo means qPCR', data=all_qPCR_df,
linewidth=1, size=8)
# ax.set_title("Mean Telomere Length (qPCR) Per Timepoint")
ax.set_ylabel('Mean Telomere Length')
ax.set_xlabel('timepoint')
# plt.savefig('../graphs/telomere length/all patient telomere length means qPCR.png', dpi=400)
pivot_qPCR_df = all_qPCR_df.pivot(index='patient id', columns='timepoint', values='telo means qPCR')
pivot_qPCR_df['constant'] = 1
pivot_qPCR_df.corr()
x_names = [['1 non irrad'], ['1 non irrad', '3 B']]
y_name = '4 C'
for x_name in x_names:
x = pivot_qPCR_df[x_name].values.reshape(-1, len(x_name))
y = pivot_qPCR_df['4 C'].values.reshape(-1, 1)
regression = LinearRegression().fit(x, y)
print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
# more indepth stats
# target = pivot_qPCR_df['4 C']
# linear_m = sm.OLS(endog=target, exog=pivot_qPCR_df[['1 non irrad', 'constant']], missing='drop')
# results = linear_m.fit()
# print(results.summary())
# linear_m2 = sm.OLS(endog=target, exog=pivot_qPCR_df[['1 non irrad', '3 B', 'constant']], missing='drop')
# results2 = linear_m2.fit()
# print(results2.summary())
sns.lmplot(x='1 non irrad', y='4 C', data=pivot_qPCR_df, fit_reg=True)
# conducting one-way ANOVA for mean telomere length
# need change to repeated measures
df = all_qPCR_df
g_1 = df[df['timepoint'] == '1 non irrad']['telo means qPCR']
g_2 = df[df['timepoint'] == '3 B']['telo means qPCR']
g_3 = df[df['timepoint'] == '4 C']['telo means qPCR']
stats.f_oneway(g_1, g_2, g_3)
all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')
general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False, adjust_clonality=True))])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
melt_aberrations = pd.melt(cleaned_chr_df, id_vars=['patient id', 'timepoint'],
var_name='aberration type', value_name='count per cell')
melt_aberrations['count per cell'] = melt_aberrations['count per cell'].astype('int64')
melt_aberrations['aberration type'] = melt_aberrations['aberration type'].astype('str')
# melt_aberrations_chr_only = melt_aberrations[~melt_aberrations['aberration type'].isin(['# sub-telo SCEs', 'tricentrics',
# '# dicentrics', '# translocations',
# '# sat associations', 'cell number'])].copy()
# ax = sns.set(font_scale=2)
# ax = sns.catplot(y='aberration type', x='count per cell', hue='chromosome',
# col='timepoint', col_wrap=2,
# data=melt_aberrations_chr_only, kind='bar', height=7, aspect=1.5, orient="h",)
# ax.set_ylabels('')
# ax.set_xlabels('average count per cell')
ax = sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})
ax = sns.set(font_scale=1.35)
ax = sns.catplot(y='aberration type', x='count per cell',
col='timepoint', col_wrap=2,
data=melt_aberrations[melt_aberrations['aberration type'] != '# terminal SCEs'],
kind='bar', height=3.75, aspect=1.5, orient="h",)
ax.set_ylabels('')
ax.set_xlabels('average count per patient')
# ax.savefig('../graphs/chromosome aberr/all patients rearrangements.png', dpi=400)
plt.figure(figsize=(6,6))
ax=sns.set(font_scale=1.5)
ax = sns.lineplot(x='timepoint', y='count per cell', data=melt_aberrations[melt_aberrations['aberration type'] != '# terminal SCEs'],
hue='aberration type',
# palette=sns.color_palette("terrain", melt_aberrations['aberration type'].nunique()),
)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
trp.chr_scipy_anova_post_hoc_tests(df=melt_aberrations[melt_aberrations['aberration type'] != '# terminal SCEs'],
post_hoc='tukeyHSD')
# pivoting out aberrations for linear regression
group_chr = cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean').reset_index()
pivot_chr = group_chr.pivot(index='patient id', columns='timepoint', values='# inversions')
row = []
aberr_types = [col for col in group_chr.columns if col != 'patient id' and col != 'timepoint']
for aberr in aberr_types:
pivot_chr = group_chr.pivot(index='patient id', columns='timepoint', values=aberr)
x_name2 = ['1 non irrad']
x_name3 = ['2 irrad @ 4 Gy', '1 non irrad']
y_name = '4 C'
# print(f'ABERRATION TYPE | {aberr}')
for x_name in [x_name2, x_name3]:
x = pivot_chr[x_name].values.reshape(-1, len(x_name))
y = pivot_chr['4 C'].values.reshape(-1, 1)
regression = LinearRegression().fit(x, y)
# print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
# print('\n')
row.append(['Linear Regression', aberr, x_name, y_name, f'{regression.score(x, y):.4f}'])
LM_aberr_r2 = pd.DataFrame(data=row, columns=['Model', 'Aberration type', 'Variables', 'Target', 'R2 score'])
LM_aberr_r2
group_cleaned_chr_abber_df = group_chr[group_chr['timepoint'] != '2 irrad @ 4 Gy']
all_qPCR_chr_aberr = all_qPCR_df.merge(group_chr, on=['patient id', 'timepoint'])
df = (all_patients_df[all_patients_df['timepoint'] != '2 irrad @ 4 Gy']
[['patient id', 'timepoint', 'telo means', 'Q1', 'Q2-3', 'Q4']])
all_merge = all_qPCR_chr_aberr.merge(df, on=['patient id', 'timepoint'])
all_merge.corr()
sns.lineplot(x='timepoint', y='count per cell', hue='aberration type',
data=melt_aberrations[melt_aberrations['aberration type'].isin(['# inversions', '# dicentrics', 'excess chr fragments'])])
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, fontsize='small')