In [1]:
# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np

# seaborn/matplotlib for graphing
import matplotlib.pyplot as plt
import seaborn as sns

# statistics
from statistics import mean 
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

# for reading individual telomere length data from files
from ast import literal_eval

# for grabbing individual cells
import more_itertools

# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp

# machine learning 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, auc, accuracy_score, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from bayes_opt import BayesianOptimization

import imgkit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# setting darkgrid style for seaborn figures
# sns.set(font='helvetica')
sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})

import xgboost
import shap

# sns.set(font='helvetica')
/usr/local/lib/python3.7/site-packages/statsmodels/compat/pandas.py:49: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)
In [2]:
# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload 

import telomere_methods_rad_patient as trp

Machine Learning (ML) - predicting mean telomere length post-therapy (4 C) using pre-therapy individual telos

Loading/merging data for ML

In [115]:
exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
In [116]:
# cleaning & combing data; retaining features of interest
telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, all_patients_df=all_patients_df)
print(telo_data.shape)
telo_data.head()
(128800, 4)
Out[116]:
patient id timepoint individual telomeres 4 C telo means
0 1 1 non irrad 92.916201 99.346299
1 1 2 irrad @ 4 Gy 79.189944 99.346299
2 1 1 non irrad 152.043172 99.346299
3 1 2 irrad @ 4 Gy 53.849162 99.346299
4 1 1 non irrad 128.815642 99.346299

Train/test split

In [117]:
telo_test = telo_data.copy()
train_set, test_set = train_test_split(telo_test, test_size=0.2, shuffle=True, stratify=telo_test[['patient id', 'timepoint']])

Initializing cleaning/model pipelines

In [118]:
clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)), 
                               ('dummies', trp.make_dummies(drop_first=True)),
                               ('cleaner', trp.clean_data(drop_patient_id=True))
                              ])
In [119]:
model = XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.2,
                     objective ='reg:squarederror', random_state=1)

xgb_pipe = Pipeline([('XGB', model)
                    ])

full_pipe = Pipeline([('clean_process', clean_process_pipe),
                      ('model', model)
                     ])

Cleaning data with pipeline

In [120]:
train_clean = train_set.copy()
test_clean = test_set.copy()

train_clean = clean_process_pipe.fit_transform(train_clean)
test_clean = clean_process_pipe.fit_transform(test_clean)

Random GridSearch for pipeline/model params

In [121]:
# #XGBoost params for random grid search
# param_grid = {'XGB__max_depth': [3, 7, 9],
#               'XGB__learning_rate': [0.05, 0.1]}

# X_train = train_clean[['individual telomeres', 'timepoint_1']].copy()
# y_train = train_clean['4 C telo means'].copy()

# folds = KFold(5, shuffle=True, random_state=0)
# df_results, best_estimator = trp.grid_search(X_train, y_train, xgb_pipe, param_grid, 
#                                           scoring='neg_mean_absolute_error', cv=folds, n_iter=2)
# print(best_estimator)

Bayesian Optimization

In [122]:
# target = '4 C telo means'
# features = [col for col in train_clean if col != target and col != 'patient id']

# X_train = train_clean[features].copy()
# y_train = train_clean[target].copy()
In [123]:
# pbounds = {
#     'learning_rate': (0.1, 0.2),
#     'n_estimators': (100, 200),
#     'max_depth': (3, 30),
#     'subsample': (.80, 1.0),  
#     'colsample': (1.0, 1.0),  # Change for datasets with lots of features
#     'gamma': (0, 1)}

# def xgboost_hyper_param(learning_rate, n_estimators, max_depth,
#                         subsample, colsample, gamma):
 
#     max_depth = int(max_depth)
#     n_estimators = int(n_estimators)
 
#     clf = XGBRegressor(max_depth=max_depth,
#                        learning_rate=learning_rate,
#                        n_estimators=n_estimators,
#                        gamma=gamma, objective='reg:squarederror')
    
#     return np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'))
 
# optimizer = BayesianOptimization(
#     f=xgboost_hyper_param,
#     pbounds=pbounds,
#     random_state=1,)

# optimizer.maximize(init_points=10, n_iter=5)
In [124]:
# bayesian optimized model
# bo_model = XGBRegressor(n_estimators=260, max_depth=17, learning_rate=0.25,
#                        objective ='reg:squarederror', random_state=0, subsample=.9,
#                        gamma=1.7)

Visualizing model objective

In [125]:
trp.plot_individ_telos_ML_objective(df=test_set, target='4 C telo means')

Cross validation; MAE & R2 score

In [134]:
# predicting mean telo telo post-therapy (4 C) using individual telos
fit_xgb_model, telo_row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean,
                                                    model=model, cv=5)
MAE per CV fold: 
[3.14668999 3.19372532 3.22121307 3.1875709  3.17420765] 

MEAN of MAE all folds: 3.184681387108867
STD of MAE all folds: 0.02440389889021047

MAE of predict_y_test & y_test: 3.121088495862735
R2 between predict_y_test & y_test: 0.8872186341899334
In [135]:
y_predict_list, y_true_list = trp.predict_target_4C_compare_actual(telo_data=telo_test, test_set=test_set, 
                                     model=fit_xgb_model, target='4 C telo means',
                                     clean_process_pipe=clean_process_pipe, verbose=False)
MAE predicted vs. actual 4 C telo means: 2.8854748075410948
R2 predicted vs. actual 4 C telo means: 0.9318397162379264
In [128]:
plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true_list, y=y_predict_list, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual mean telomere lengths post-therapy', fontsize=18)
ax.set_ylabel('Predicted mean telomere lengths post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
ax.set_title('Fig. 1', fontsize=22, weight='bold')
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C mean telos vs predicted.png', 
            dpi=400,
            bbox_inches = "tight"
           )

Dataframe for model metrics

In [129]:
stats_df = trp.make_stats_df(stats_list=telo_row)
stats_df
Out[129]:
Model Features Target Average MAE of CV folds Std dev of MAE of CV folds MAE predicted vs. test values R2 predicted vs. test values
0 XGBoost [individual telomeres, timepoint_2 irrad @ 4 Gy] 4 C telo means 3.1847 0.0244 3.1211 0.8872
In [130]:
path=f'../graphs/paper figures/supp figs/ML model metrics target_4C mean telos | features_individ telos.png'
trp.df_to_png(df=stats_df, path=path)
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
In [131]:
## Trialing model interpretability functions
In [136]:
shap.initjs()

X = test_clean.drop('4 C telo means', axis=1).sample(9000, random_state=2)

# # explain the model's predictions using SHAP
explainer = shap.TreeExplainer(fit_xgb_model)
shap_values = explainer.shap_values(X)

# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Out[136]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [112]:
# change to jupyter theme light mode for viz
shap.force_plot(explainer.expected_value, shap_values, X)
shap.force_plot is slow for many thousands of rows, try subsampling your data.
Out[112]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

Testing if linear regression can predict 4C mean telos using individual telos

In [ ]:
lr_model = LinearRegression(normalize=True)
lr_pipe = Pipeline([('lr_model', lr_model)
                   ])
In [ ]:
# predicting mean telo telo post-therapy (4 C) using individual telos
fit_lr_model, row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean,
                                              model=lr_model, cv=5)
In [ ]:
lr_y_predict_list, y_true_list = trp.predict_target_4C_compare_actual(telo_data=telo_test, test_set=test_set, 
                                     model=fit_lr_model, target='4 C telo means',
                                     clean_process_pipe=clean_process_pipe, verbose=False)
In [26]:
plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true_list, y=lr_y_predict_list, scatter_kws={'s':200, 'edgecolor':'k'},)
ax.set_xlabel('Actual mean telomere lengths post-therapy', fontsize=18)
ax.set_ylabel('Predicted mean telomere lengths post-therapy', fontsize=18)
ax.tick_params(labelsize=14)

Machine Learning - exploring predictions of #s of short telomeres post-therapy

Loading/merging data for ML

In [58]:
exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
In [59]:
# cleaning & combing data; retaining features of interest
quartile_telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, 
                                      all_patients_df=all_patients_df,
                                      prediction_objective='4 C # short telos from individual telos')
quartile_telo_data.head()
Out[59]:
patient id timepoint individual telomeres 4 C # short telos
0 1 1 non irrad 92.916201 372
1 1 2 irrad @ 4 Gy 79.189944 372
2 1 1 non irrad 152.043172 372
3 1 2 irrad @ 4 Gy 53.849162 372
4 1 1 non irrad 128.815642 372

Train/test split

In [60]:
quartile_telo_test = quartile_telo_data.copy()

y = quartile_telo_test[['4 C # short telos']]
X = quartile_telo_test.drop(['4 C # short telos'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

q_train_set = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
q_test_set = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

Initializing cleaning/model pipelines

In [61]:
q_clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)), 
                               ('dummies', trp.make_dummies(drop_first=True)),
                               ('cleaner', trp.clean_data(drop_patient_id=True))
                              ])
In [62]:
# initialize XGBoost model & pipeline for hyperparams gridsearch
q_model = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.2,
                       objective='reg:squarederror', random_state=0, gamma=3)

q_xgb_pipe = Pipeline([('XGB', model)
                      ])

q_full_pipe = Pipeline([('clean_process', q_clean_process_pipe),
                        ('model', q_model)
                       ])

Cleaning/modeling data with pipeline

In [63]:
q_train_clean = q_train_set.copy()
q_test_clean = q_test_set.copy()

q_train_clean = q_clean_process_pipe.fit_transform(q_train_clean)
q_test_clean = q_clean_process_pipe.fit_transform(q_test_clean)

Visualizing model objective

In [64]:
q_test_set.sort_values(by='patient id', ascending=True, inplace=True)
q_test_set.reset_index(drop=True, inplace=True)
In [65]:
temp = q_test_set.copy()
temp['4 C # short telos'] = temp['4 C # short telos'] / 10
trp.plot_individ_telos_ML_objective(df=temp, target='4 C # short telos')

Cross validation; MAE & R2 score when predicting # of short telomeres post-therapy (4 C)

In [66]:
# score model by cross validation, 5 folds, on X/y_train data 
# fit model on train data
# w/ model, predict y_test from X_test; score model by MAE/R2 - return model

q_fit_xgb_model, short_telos_row = trp.cv_score_fit_mae_test(train_set=q_train_clean, test_set=q_test_clean,
                                                             model=q_model, cv=5, 
                                                             target='4 C # short telos')
MAE per CV fold: 
[237.85922455 232.5570504  231.49848481 235.13496811 234.79888259] 

MEAN of MAE all folds: 234.3697220903006
STD of MAE all folds: 2.212518464814333

MAE of predict_y_test & y_test: 232.40543270348022
R2 between predict_y_test & y_test: 0.8119670308841592
In [67]:
q_y_xgb_predict, q_y_true = trp.predict_target_4C_compare_actual(telo_data=quartile_telo_data, test_set=q_test_set, 
                                                                 model=q_fit_xgb_model, target='4 C # short telos',
                                                                 clean_process_pipe=q_clean_process_pipe, verbose=False)
MAE predicted vs. actual 4 C # short telos: 210.40122767857142
R2 predicted vs. actual 4 C # short telos: 0.8709452787950632
In [77]:
plt.figure(figsize=(6,6))
ax = sns.regplot(x=q_y_true, y=q_y_xgb_predict, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual # short telos post-therapy', fontsize=18)
ax.set_ylabel('Predicted # short telos post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
ax.set_title('Fig. 2', weight='bold', fontsize=20)
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C # short telomeres vs predicted.png', 
            dpi=400,
           bbox_inches = "tight"
           )

Dataframe for model metrics

In [38]:
telos_short_telos_rows = telo_row + short_telos_row
stats_df = trp.make_stats_df(stats_list=telos_short_telos_rows)
stats_df
Out[38]:
Model Features Target Average MAE of CV folds Std dev of MAE of CV folds MAE predicted vs. test values R2 predicted vs. test values
0 XGBoost [individual telomeres, timepoint_2 irrad @ 4 Gy] 4 C telo means 3.2077 0.0540 3.2009 0.8872
1 XGBoost [individual telomeres, timepoint_2 irrad @ 4 Gy] 4 C # short telos 232.6429 1.9894 235.1353 0.8110
In [39]:
path=f'../graphs/paper figures/supp figs/ML models predicting #1 4 C telo means and #2 # short telos.png'
trp.df_to_png(df=stats_df, path=path)
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               

Machine Learning - exploring predictions of chromosome rearrangements

Loading data & general cleaning

In [3]:
all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')

general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner())])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)

Train/test split

In [4]:
chr_train, chr_test = train_test_split(cleaned_chr_df, test_size=0.2, shuffle=True, 
                                       stratify=cleaned_chr_df[['patient id', 'timepoint']])

Initializing cleaning/model pipelines

In [5]:
features = ['# inversions']
target = '# inversions'

make_new_features_target = Pipeline([('make features', trp.make_chr_features(combine_inversions=False, bool_features=False,
                                                                             features=features)),
                                     ('make target merge', trp.make_target_merge(target=target, features=features))
                                    ])
In [6]:
# initialize XGBoost model & pipeline for hyperparams gridsearch
chr_model = XGBRegressor(n_estimators=200, max_depth=15, learning_rate=0.1,
                         objective='reg:squarederror',
                         random_state=0,
                        )

chr_xgb_pipe = Pipeline([('XGB', chr_model)
                        ])

chr_full_pipe = Pipeline([('make ftr target', make_new_features_target),
                          ('model', chr_model)
                         ])

Cleaning/modeling data with pipeline

In [7]:
cleaned_chr_train = chr_train.copy()
cleaned_chr_test = chr_test.copy()

cleaned_chr_train = make_new_features_target.fit_transform(cleaned_chr_train)
cleaned_chr_test = make_new_features_target.fit_transform(cleaned_chr_test)

Cross validation; MAE & R2 score when predicting # of chr aberrations post-therapy (4 C)

In [8]:
bo_model = XGBRegressor(n_estimators=300, 
                        max_depth=10, 
                        learning_rate=0.2,
                        objective='reg:squarederror',
#                         colsample=.8897,
#                         gamma=3.185,
                        random_state=0,
                        )
In [46]:
# score model by cross validation, 5 folds, on X/y_train data 
# fit model on train data; w/ model, predict y_test from X_test; score model by MAE/R2 - return model
chr_fit_xgb_model, row = trp.cv_score_fit_mae_test(train_set=cleaned_chr_train, test_set=cleaned_chr_test,
                                              model=chr_model, cv=5, target='4 C # inversions')
MAE per CV fold: 
[0.12516719 0.14348651 0.10829034 0.10515421 0.05156103] 

MEAN of MAE all folds: 0.10673185718461517
STD of MAE all folds: 0.030787999325940507

MAE of predict_y_test & y_test: 0.2513865844479629
R2 between predict_y_test & y_test: -0.16296240759924197
In [47]:
chr_y_predict, y_true, = trp.chr_aberr_predict_target_4C_compare_actual(cleaned_unsplit_chr_data=cleaned_chr_df, 
                                                                        cleaned_test_set=cleaned_chr_test, 
                                                                        model=chr_fit_xgb_model, target='4 C # inversions',
                                                                        clean_process_pipe=make_new_features_target, verbose=False)
In [48]:
plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true, y=chr_y_predict, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual # inversions post-therapy', fontsize=18)
ax.set_ylabel('Predicted # inversions post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
# ax.set_title('Fig. 2', weight='bold', fontsize=20)

Looping through all chr aberration types for XGBoost model fitting, creating dataframes for graphing, displaying model metrics

In [49]:
features_list = [['# inversions'], ['# terminal inversions'], ['# translocations'], ['# dicentrics'],
                 ['# inversions', '# terminal inversions', '# translocations', '# dicentrics']]

           
target1_list = ['# inversions', '# terminal inversions', '# translocations', '# dicentrics',
                'aberration index']

target2_list = ['4 C # inversions', '4 C # terminal inversions', '4 C # translocations', '4 C # dicentrics',
                '4 C aberration index']

stats_list = []
stats_list, graphing_dict = trp.script_load_clean_data_ml_pipeline_loop_aberrations(features_list=features_list,
                                                                                    target1_list=target1_list,
                                                                                    target2_list=target2_list,
                                                                                    stats_list=stats_list,
                                                                                    verbose=False)

Dataframes & graphing model metrics & performance

In [50]:
graphing_df = trp.make_graphing_df(graphing_dict=graphing_dict)
stats_df = trp.make_stats_df(stats_list=stats_list)
display(stats_df)
path=f'../graphs/paper figures/supp figs/ML models predicting chr aberration types.png'
trp.df_to_png(df=stats_df, path=path)
Model Features Target Average MAE of CV folds Std dev of MAE of CV folds MAE predicted vs. test values R2 predicted vs. test values
0 XGBoost [# inversions, timepoint_2 irrad @ 4 Gy] 4 C # inversions 0.0696 0.0372 0.3293 -0.0388
1 XGBoost [# terminal inversions, timepoint_2 irrad @ 4 Gy] 4 C # terminal inversions 0.0770 0.0362 0.1779 -0.1141
2 XGBoost [# translocations, timepoint_2 irrad @ 4 Gy] 4 C # translocations 0.0273 0.0059 0.0937 -0.0161
3 XGBoost [# dicentrics, timepoint_2 irrad @ 4 Gy] 4 C # dicentrics 0.1116 0.0274 0.2258 -0.0165
4 XGBoost [# inversions, # terminal inversions, # transl... 4 C aberration index 0.2283 0.0529 0.2935 -0.1993
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
In [51]:
# saving df of model metrics for all models
all_rows = telo_row + short_telos_row + stats_list
stats_df = trp.make_stats_df(stats_list=all_rows)

path=f'../graphs/paper figures/supp figs/ML models for all telo and chr aberr endpoints.png'
trp.df_to_png(df=stats_df, path=path)
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
In [52]:
ax = sns.set(font_scale=1.1)
ax = sns.lmplot(x='actual values', y='predicted values', col='aberration type', hue='aberration type', 
           sharex=False, sharey=False, col_wrap=3, data=graphing_df, height=3.5)
plt.tight_layout(pad=.6)

Clustering telomere data

Mean telomere length (teloFISH)

In [22]:
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
means = all_patients_df.drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis=1)
In [23]:
clustered_telos = trp.cluster_data_return_df(means, target='telo means', cut_off_n=2,
                                             metric='correlation', method='single',
                                             y_size=7)
0 Cluster number 2 has 11 elements
1 Cluster number 1 has 3 elements
In [24]:
trp.graph_cluster_groups(clustered_telos, target='telo means', hue='telo means cluster groups')
In [25]:
trp.graph_clusters_per_patient(clustered_telos, target='telo means',
                               y_dimen=1, x_dimen=2,
                               fsize=(12,3.5))
telo means CLUSTER 1 | patient IDs: [7, 10, 12]
telo means CLUSTER 2 | patient IDs: [1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 16]

Short telomeres

In [27]:
all_patients_df = pd.read_csv('../compiled patient data csv files/all_patients_df.csv')
short_telos = all_patients_df.drop(['telo data', 'Q2-3', 'Q4', 'telo means'], axis=1)
short_telos.rename(columns={'Q1': '# short telomeres'}, inplace=True)
In [28]:
clustered_short_telos = trp.cluster_data_return_df(short_telos, target='# short telomeres', cut_off_n=2)
0 Cluster number 1 has 11 elements
1 Cluster number 2 has 3 elements
In [29]:
clustered_short_telos['# short telomeres cluster groups'] = (clustered_short_telos['# short telomeres cluster groups']
                                                             .apply(lambda row: trp.swap_short_telos_group_number(row)))
In [30]:
trp.graph_cluster_groups(clustered_short_telos, target='# short telomeres', hue='# short telomeres cluster groups')
In [31]:
trp.graph_clusters_per_patient(clustered_short_telos, target='# short telomeres',
                               y_dimen=1, x_dimen=2, fsize=(12, 3.5))
# short telomeres CLUSTER 1 | patient IDs: [7, 10, 12]
# short telomeres CLUSTER 2 | patient IDs: [1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 16]

Clustering analysis by hierarchical clustering of longitudinal changes in mean telomere length and number of short telomeres in the patients yielded 4 groups in each case, distinguishable by differential responses to radiation therapy. When patients were clustered by mean telomere length or number of short telomeres, the clustered groups differed by only one patient, suggesting strong agreement between mean telomere length and number of short telomeres for data clustering.

Combining cluster groups of means + short telos

In [33]:
y_list = ['telo means', '# short telomeres']
hue_list = ['telo means cluster groups', '# short telomeres cluster groups']
df_list = [clustered_telos, clustered_short_telos]
ylim_dict = {'telo means': (75, 150),
             '# short telomeres': (200, 4500)}

trp.plot_multiple_types_clusters(y_list=y_list, hue_list=hue_list,
                                 df_list=df_list, ylim_dict=ylim_dict)

Mean telomere length (qPCR)

In [64]:
all_qPCR_df = pd.read_csv('../qPCR telo data/all_qPCR_df.csv')
means_qPCR = all_qPCR_df.drop(['SEM'], axis=1)
means_qPCR['timepoint'] = means_qPCR['timepoint'].apply(lambda row: trp.encode_timepts(row))
means_qPCR.head()
Out[64]:
patient id timepoint telo means qPCR
0 1 1 2.182028
1 1 3 1.964348
2 1 4 1.749799
3 2 1 1.669445
4 2 3 1.780092
In [65]:
clustered_qPCR = trp.cluster_data_return_df(means_qPCR, target='telo means qPCR', cut_off_n=2,
                                            y_size=9, x_size=9)
0 Cluster number 1 has 7 elements
1 Cluster number 2 has 7 elements
In [66]:
trp.graph_cluster_groups(clustered_qPCR, target='telo means qPCR', hue='telo means qPCR cluster groups')

Clustering chromosome rearrangement data

In [34]:
all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')

general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False))])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
mean_chr = cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean').reset_index()

Testing cluster functions on inversions

In [68]:
clustered_invs = trp.cluster_data_return_df(mean_chr, target='# inversions', cut_off_n=2,
                                            y_size=5, x_size=10)
0 Cluster number 1 has 11 elements
1 Cluster number 2 has 3 elements
In [69]:
trp.graph_cluster_groups(clustered_invs, target='# inversions', hue='# inversions cluster groups')
In [70]:
trp.graph_clusters_per_patient(clustered_invs, target='# inversions', y_dimen=1, x_dimen=2, fsize=(12,3.5))
# inversions CLUSTER 1 | patient IDs: [1, 2, 5, 7, 8, 9, 10, 11, 14, 15, 16]
# inversions CLUSTER 2 | patient IDs: [3, 6, 12]

Looping through all clustered groups graphs

In [71]:
# chr_aberr = ['# inversions', '# terminal inversions', '# translocations', '# dicentrics']

# for target in chr_aberr:
#     if target == '# inversions':
#         n = 2
#     else:
#         n = 1
#     clustered = trp.cluster_data_return_df(mean_chr, target=target, cut_off_n=n)
#     trp.graph_cluster_groups(clustered, target=target, hue=f'{target} cluster groups')
#     trp.graph_clusters_per_patient(clustered, target=target, y_dimen=1, x_dimen=2, fsize=(12,3.5))

Graphing all aberration cluster groups at once

In [35]:
mean_chr['aberration index'] = (mean_chr['# inversions'] + mean_chr['# terminal inversions'] +
                                mean_chr['# translocations'] + mean_chr['# dicentrics'])
In [36]:
# initializing list of aberrations to cluster by
y_list = ['# inversions', 
          '# terminal inversions', 
          '# translocations', 
          '# dicentrics', 
          'aberration index']

# initializing list of dfs bearing clustered groups
inv_df = trp.cluster_data_return_df(mean_chr, target='# inversions', cut_off_n=2, verbose=False)
term_inv_df = trp.cluster_data_return_df(mean_chr, target='# terminal inversions', cut_off_n=1, verbose=False)
trans_df = trp.cluster_data_return_df(mean_chr, target='# translocations', cut_off_n=1, verbose=False)
dicent_df = trp.cluster_data_return_df(mean_chr, target='# dicentrics', cut_off_n=1, verbose=False)
aberr_index_df = trp.cluster_data_return_df(mean_chr, target='aberration index', cut_off_n=1, verbose=False)

df_list = [inv_df, 
           term_inv_df,
           trans_df, 
           dicent_df,
           aberr_index_df]

# hues to graph by
hue_list = ['# inversions cluster groups', 
            '# terminal inversions cluster groups', 
            '# translocations cluster groups', 
            '# dicentrics cluster groups', 
            'aberration index cluster groups']

# dimensions for each aberration
ylim_dict = {'# inversions': (.1, 1.4),
             '# terminal inversions': (.1, .4),
             '# translocations': (0, .35),
             '# dicentrics': (0, 1),
             'aberration index': (.2, 2.2)}
In [37]:
trp.plot_multiple_types_clusters(y_list=y_list, hue_list=hue_list,
                                 df_list=df_list, ylim_dict=ylim_dict)
In [ ]: