# tools for handling files
import sys
import os
# pandas/numpy for handling data
import pandas as pd
import numpy as np
# seaborn/matplotlib for graphing
import matplotlib.pyplot as plt
import seaborn as sns
# statistics
from statistics import mean
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
# for reading individual telomere length data from files
from ast import literal_eval
# for grabbing individual cells
import more_itertools
# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp
# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, auc, accuracy_score, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from bayes_opt import BayesianOptimization
import imgkit
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# setting darkgrid style for seaborn figures
# sns.set(font='helvetica')
sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})
import xgboost
import shap
# sns.set(font='helvetica')
# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload
import telomere_methods_rad_patient as trp
exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
# cleaning & combing data; retaining features of interest
telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, all_patients_df=all_patients_df)
print(telo_data.shape)
telo_data.head()
telo_test = telo_data.copy()
train_set, test_set = train_test_split(telo_test, test_size=0.2, shuffle=True, stratify=telo_test[['patient id', 'timepoint']])
clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)),
('dummies', trp.make_dummies(drop_first=True)),
('cleaner', trp.clean_data(drop_patient_id=True))
])
model = XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.2,
objective ='reg:squarederror', random_state=1)
xgb_pipe = Pipeline([('XGB', model)
])
full_pipe = Pipeline([('clean_process', clean_process_pipe),
('model', model)
])
train_clean = train_set.copy()
test_clean = test_set.copy()
train_clean = clean_process_pipe.fit_transform(train_clean)
test_clean = clean_process_pipe.fit_transform(test_clean)
# #XGBoost params for random grid search
# param_grid = {'XGB__max_depth': [3, 7, 9],
# 'XGB__learning_rate': [0.05, 0.1]}
# X_train = train_clean[['individual telomeres', 'timepoint_1']].copy()
# y_train = train_clean['4 C telo means'].copy()
# folds = KFold(5, shuffle=True, random_state=0)
# df_results, best_estimator = trp.grid_search(X_train, y_train, xgb_pipe, param_grid,
# scoring='neg_mean_absolute_error', cv=folds, n_iter=2)
# print(best_estimator)
# target = '4 C telo means'
# features = [col for col in train_clean if col != target and col != 'patient id']
# X_train = train_clean[features].copy()
# y_train = train_clean[target].copy()
# pbounds = {
# 'learning_rate': (0.1, 0.2),
# 'n_estimators': (100, 200),
# 'max_depth': (3, 30),
# 'subsample': (.80, 1.0),
# 'colsample': (1.0, 1.0), # Change for datasets with lots of features
# 'gamma': (0, 1)}
# def xgboost_hyper_param(learning_rate, n_estimators, max_depth,
# subsample, colsample, gamma):
# max_depth = int(max_depth)
# n_estimators = int(n_estimators)
# clf = XGBRegressor(max_depth=max_depth,
# learning_rate=learning_rate,
# n_estimators=n_estimators,
# gamma=gamma, objective='reg:squarederror')
# return np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'))
# optimizer = BayesianOptimization(
# f=xgboost_hyper_param,
# pbounds=pbounds,
# random_state=1,)
# optimizer.maximize(init_points=10, n_iter=5)
# bayesian optimized model
# bo_model = XGBRegressor(n_estimators=260, max_depth=17, learning_rate=0.25,
# objective ='reg:squarederror', random_state=0, subsample=.9,
# gamma=1.7)
trp.plot_individ_telos_ML_objective(df=test_set, target='4 C telo means')
# predicting mean telo telo post-therapy (4 C) using individual telos
fit_xgb_model, telo_row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean,
model=model, cv=5)
y_predict_list, y_true_list = trp.predict_target_4C_compare_actual(telo_data=telo_test, test_set=test_set,
model=fit_xgb_model, target='4 C telo means',
clean_process_pipe=clean_process_pipe, verbose=False)
plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true_list, y=y_predict_list, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual mean telomere lengths post-therapy', fontsize=18)
ax.set_ylabel('Predicted mean telomere lengths post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
ax.set_title('Fig. 1', fontsize=22, weight='bold')
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C mean telos vs predicted.png',
dpi=400,
bbox_inches = "tight"
)
stats_df = trp.make_stats_df(stats_list=telo_row)
stats_df
path=f'../graphs/paper figures/supp figs/ML model metrics target_4C mean telos | features_individ telos.png'
trp.df_to_png(df=stats_df, path=path)
## Trialing model interpretability functions
shap.initjs()
X = test_clean.drop('4 C telo means', axis=1).sample(9000, random_state=2)
# # explain the model's predictions using SHAP
explainer = shap.TreeExplainer(fit_xgb_model)
shap_values = explainer.shap_values(X)
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])
# change to jupyter theme light mode for viz
shap.force_plot(explainer.expected_value, shap_values, X)
lr_model = LinearRegression(normalize=True)
lr_pipe = Pipeline([('lr_model', lr_model)
])
# predicting mean telo telo post-therapy (4 C) using individual telos
fit_lr_model, row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean,
model=lr_model, cv=5)
lr_y_predict_list, y_true_list = trp.predict_target_4C_compare_actual(telo_data=telo_test, test_set=test_set,
model=fit_lr_model, target='4 C telo means',
clean_process_pipe=clean_process_pipe, verbose=False)
plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true_list, y=lr_y_predict_list, scatter_kws={'s':200, 'edgecolor':'k'},)
ax.set_xlabel('Actual mean telomere lengths post-therapy', fontsize=18)
ax.set_ylabel('Predicted mean telomere lengths post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
# cleaning & combing data; retaining features of interest
quartile_telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df,
all_patients_df=all_patients_df,
prediction_objective='4 C # short telos from individual telos')
quartile_telo_data.head()
quartile_telo_test = quartile_telo_data.copy()
y = quartile_telo_test[['4 C # short telos']]
X = quartile_telo_test.drop(['4 C # short telos'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)
q_train_set = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
q_test_set = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
q_clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)),
('dummies', trp.make_dummies(drop_first=True)),
('cleaner', trp.clean_data(drop_patient_id=True))
])
# initialize XGBoost model & pipeline for hyperparams gridsearch
q_model = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.2,
objective='reg:squarederror', random_state=0, gamma=3)
q_xgb_pipe = Pipeline([('XGB', model)
])
q_full_pipe = Pipeline([('clean_process', q_clean_process_pipe),
('model', q_model)
])
q_train_clean = q_train_set.copy()
q_test_clean = q_test_set.copy()
q_train_clean = q_clean_process_pipe.fit_transform(q_train_clean)
q_test_clean = q_clean_process_pipe.fit_transform(q_test_clean)
q_test_set.sort_values(by='patient id', ascending=True, inplace=True)
q_test_set.reset_index(drop=True, inplace=True)
temp = q_test_set.copy()
temp['4 C # short telos'] = temp['4 C # short telos'] / 10
trp.plot_individ_telos_ML_objective(df=temp, target='4 C # short telos')
# score model by cross validation, 5 folds, on X/y_train data
# fit model on train data
# w/ model, predict y_test from X_test; score model by MAE/R2 - return model
q_fit_xgb_model, short_telos_row = trp.cv_score_fit_mae_test(train_set=q_train_clean, test_set=q_test_clean,
model=q_model, cv=5,
target='4 C # short telos')
q_y_xgb_predict, q_y_true = trp.predict_target_4C_compare_actual(telo_data=quartile_telo_data, test_set=q_test_set,
model=q_fit_xgb_model, target='4 C # short telos',
clean_process_pipe=q_clean_process_pipe, verbose=False)
plt.figure(figsize=(6,6))
ax = sns.regplot(x=q_y_true, y=q_y_xgb_predict, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual # short telos post-therapy', fontsize=18)
ax.set_ylabel('Predicted # short telos post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
ax.set_title('Fig. 2', weight='bold', fontsize=20)
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C # short telomeres vs predicted.png',
dpi=400,
bbox_inches = "tight"
)
telos_short_telos_rows = telo_row + short_telos_row
stats_df = trp.make_stats_df(stats_list=telos_short_telos_rows)
stats_df
path=f'../graphs/paper figures/supp figs/ML models predicting #1 4 C telo means and #2 # short telos.png'
trp.df_to_png(df=stats_df, path=path)
all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')
general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner())])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
chr_train, chr_test = train_test_split(cleaned_chr_df, test_size=0.2, shuffle=True,
stratify=cleaned_chr_df[['patient id', 'timepoint']])
features = ['# inversions']
target = '# inversions'
make_new_features_target = Pipeline([('make features', trp.make_chr_features(combine_inversions=False, bool_features=False,
features=features)),
('make target merge', trp.make_target_merge(target=target, features=features))
])
# initialize XGBoost model & pipeline for hyperparams gridsearch
chr_model = XGBRegressor(n_estimators=200, max_depth=15, learning_rate=0.1,
objective='reg:squarederror',
random_state=0,
)
chr_xgb_pipe = Pipeline([('XGB', chr_model)
])
chr_full_pipe = Pipeline([('make ftr target', make_new_features_target),
('model', chr_model)
])
cleaned_chr_train = chr_train.copy()
cleaned_chr_test = chr_test.copy()
cleaned_chr_train = make_new_features_target.fit_transform(cleaned_chr_train)
cleaned_chr_test = make_new_features_target.fit_transform(cleaned_chr_test)
bo_model = XGBRegressor(n_estimators=300,
max_depth=10,
learning_rate=0.2,
objective='reg:squarederror',
# colsample=.8897,
# gamma=3.185,
random_state=0,
)
# score model by cross validation, 5 folds, on X/y_train data
# fit model on train data; w/ model, predict y_test from X_test; score model by MAE/R2 - return model
chr_fit_xgb_model, row = trp.cv_score_fit_mae_test(train_set=cleaned_chr_train, test_set=cleaned_chr_test,
model=chr_model, cv=5, target='4 C # inversions')
chr_y_predict, y_true, = trp.chr_aberr_predict_target_4C_compare_actual(cleaned_unsplit_chr_data=cleaned_chr_df,
cleaned_test_set=cleaned_chr_test,
model=chr_fit_xgb_model, target='4 C # inversions',
clean_process_pipe=make_new_features_target, verbose=False)
plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true, y=chr_y_predict, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual # inversions post-therapy', fontsize=18)
ax.set_ylabel('Predicted # inversions post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
# ax.set_title('Fig. 2', weight='bold', fontsize=20)
features_list = [['# inversions'], ['# terminal inversions'], ['# translocations'], ['# dicentrics'],
['# inversions', '# terminal inversions', '# translocations', '# dicentrics']]
target1_list = ['# inversions', '# terminal inversions', '# translocations', '# dicentrics',
'aberration index']
target2_list = ['4 C # inversions', '4 C # terminal inversions', '4 C # translocations', '4 C # dicentrics',
'4 C aberration index']
stats_list = []
stats_list, graphing_dict = trp.script_load_clean_data_ml_pipeline_loop_aberrations(features_list=features_list,
target1_list=target1_list,
target2_list=target2_list,
stats_list=stats_list,
verbose=False)
graphing_df = trp.make_graphing_df(graphing_dict=graphing_dict)
stats_df = trp.make_stats_df(stats_list=stats_list)
display(stats_df)
path=f'../graphs/paper figures/supp figs/ML models predicting chr aberration types.png'
trp.df_to_png(df=stats_df, path=path)
# saving df of model metrics for all models
all_rows = telo_row + short_telos_row + stats_list
stats_df = trp.make_stats_df(stats_list=all_rows)
path=f'../graphs/paper figures/supp figs/ML models for all telo and chr aberr endpoints.png'
trp.df_to_png(df=stats_df, path=path)
ax = sns.set(font_scale=1.1)
ax = sns.lmplot(x='actual values', y='predicted values', col='aberration type', hue='aberration type',
sharex=False, sharey=False, col_wrap=3, data=graphing_df, height=3.5)
plt.tight_layout(pad=.6)
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
means = all_patients_df.drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis=1)
clustered_telos = trp.cluster_data_return_df(means, target='telo means', cut_off_n=2,
metric='correlation', method='single',
y_size=7)
trp.graph_cluster_groups(clustered_telos, target='telo means', hue='telo means cluster groups')
trp.graph_clusters_per_patient(clustered_telos, target='telo means',
y_dimen=1, x_dimen=2,
fsize=(12,3.5))
all_patients_df = pd.read_csv('../compiled patient data csv files/all_patients_df.csv')
short_telos = all_patients_df.drop(['telo data', 'Q2-3', 'Q4', 'telo means'], axis=1)
short_telos.rename(columns={'Q1': '# short telomeres'}, inplace=True)
clustered_short_telos = trp.cluster_data_return_df(short_telos, target='# short telomeres', cut_off_n=2)
clustered_short_telos['# short telomeres cluster groups'] = (clustered_short_telos['# short telomeres cluster groups']
.apply(lambda row: trp.swap_short_telos_group_number(row)))
trp.graph_cluster_groups(clustered_short_telos, target='# short telomeres', hue='# short telomeres cluster groups')
trp.graph_clusters_per_patient(clustered_short_telos, target='# short telomeres',
y_dimen=1, x_dimen=2, fsize=(12, 3.5))
Clustering analysis by hierarchical clustering of longitudinal changes in mean telomere length and number of short telomeres in the patients yielded 4 groups in each case, distinguishable by differential responses to radiation therapy. When patients were clustered by mean telomere length or number of short telomeres, the clustered groups differed by only one patient, suggesting strong agreement between mean telomere length and number of short telomeres for data clustering.
y_list = ['telo means', '# short telomeres']
hue_list = ['telo means cluster groups', '# short telomeres cluster groups']
df_list = [clustered_telos, clustered_short_telos]
ylim_dict = {'telo means': (75, 150),
'# short telomeres': (200, 4500)}
trp.plot_multiple_types_clusters(y_list=y_list, hue_list=hue_list,
df_list=df_list, ylim_dict=ylim_dict)
all_qPCR_df = pd.read_csv('../qPCR telo data/all_qPCR_df.csv')
means_qPCR = all_qPCR_df.drop(['SEM'], axis=1)
means_qPCR['timepoint'] = means_qPCR['timepoint'].apply(lambda row: trp.encode_timepts(row))
means_qPCR.head()
clustered_qPCR = trp.cluster_data_return_df(means_qPCR, target='telo means qPCR', cut_off_n=2,
y_size=9, x_size=9)
trp.graph_cluster_groups(clustered_qPCR, target='telo means qPCR', hue='telo means qPCR cluster groups')
all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')
general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False))])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
mean_chr = cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean').reset_index()
clustered_invs = trp.cluster_data_return_df(mean_chr, target='# inversions', cut_off_n=2,
y_size=5, x_size=10)
trp.graph_cluster_groups(clustered_invs, target='# inversions', hue='# inversions cluster groups')
trp.graph_clusters_per_patient(clustered_invs, target='# inversions', y_dimen=1, x_dimen=2, fsize=(12,3.5))
# chr_aberr = ['# inversions', '# terminal inversions', '# translocations', '# dicentrics']
# for target in chr_aberr:
# if target == '# inversions':
# n = 2
# else:
# n = 1
# clustered = trp.cluster_data_return_df(mean_chr, target=target, cut_off_n=n)
# trp.graph_cluster_groups(clustered, target=target, hue=f'{target} cluster groups')
# trp.graph_clusters_per_patient(clustered, target=target, y_dimen=1, x_dimen=2, fsize=(12,3.5))
mean_chr['aberration index'] = (mean_chr['# inversions'] + mean_chr['# terminal inversions'] +
mean_chr['# translocations'] + mean_chr['# dicentrics'])
# initializing list of aberrations to cluster by
y_list = ['# inversions',
'# terminal inversions',
'# translocations',
'# dicentrics',
'aberration index']
# initializing list of dfs bearing clustered groups
inv_df = trp.cluster_data_return_df(mean_chr, target='# inversions', cut_off_n=2, verbose=False)
term_inv_df = trp.cluster_data_return_df(mean_chr, target='# terminal inversions', cut_off_n=1, verbose=False)
trans_df = trp.cluster_data_return_df(mean_chr, target='# translocations', cut_off_n=1, verbose=False)
dicent_df = trp.cluster_data_return_df(mean_chr, target='# dicentrics', cut_off_n=1, verbose=False)
aberr_index_df = trp.cluster_data_return_df(mean_chr, target='aberration index', cut_off_n=1, verbose=False)
df_list = [inv_df,
term_inv_df,
trans_df,
dicent_df,
aberr_index_df]
# hues to graph by
hue_list = ['# inversions cluster groups',
'# terminal inversions cluster groups',
'# translocations cluster groups',
'# dicentrics cluster groups',
'aberration index cluster groups']
# dimensions for each aberration
ylim_dict = {'# inversions': (.1, 1.4),
'# terminal inversions': (.1, .4),
'# translocations': (0, .35),
'# dicentrics': (0, 1),
'aberration index': (.2, 2.2)}
trp.plot_multiple_types_clusters(y_list=y_list, hue_list=hue_list,
df_list=df_list, ylim_dict=ylim_dict)