# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

# for reading individual telomere length data from files
from ast import literal_eval

# for grabbing individual cells
import more_itertools

# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp

# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload

/usr/local/lib/python3.7/site-packages/statsmodels/compat/pandas.py:49: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)

Loading Telomere Length Data from TeloFISH¶

Extracting telomere length data output from ImageJ from all radiation therapy patients¶

all_patients_dict = trp.generate_dictionary_from_TeloLength_data('../data/raw patient teloFISH data/')

SW9A non irrad.xlsx data extraction in progress..
BJ1 for SW9_.xlsx data extraction in progress..
SW11A non irrad.xlsx data extraction in progress..
BJ1 for SW15_.xlsx data extraction in progress..
SW6A non irrad.xlsx data extraction in progress..
SW6A irrad @ 4 Gy.xlsx data extraction in progress..
SW8B.xlsx data extraction in progress..
SW14A irrad @ 4 Gy.xlsx data extraction in progress..
SW8A irrad @ 4 Gy.xlsx data extraction in progress..
SW5A irrad @ 4 Gy.xlsx data extraction in progress..
SW8C.xlsx data extraction in progress..
SW1A non irrad.xlsx data extraction in progress..
BJ1 for SW11_.xlsx data extraction in progress..
SW16A non irrad.xlsx data extraction in progress..
BJ1 for SW13_.xlsx data extraction in progress..
BJ-hTERT for SW9_.xlsx data extraction in progress..
BJ1 for SW14_.xlsx data extraction in progress..
SW9B.xlsx data extraction in progress..
BJ1 for SW8_.xlsx data extraction in progress..
SW_1_ok_3_C_.xlsx data extraction in progress..

/usr/local/lib/python3.7/site-packages/scipy/stats/stats.py:2275: RuntimeWarning: Mean of empty slice.
  mns = a.mean(axis=axis)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:154: RuntimeWarning: invalid value encountered in true_divide
  ret, rcount, out=ret, casting='unsafe', subok=False)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:217: RuntimeWarning: Degrees of freedom <= 0 for slice
  keepdims=keepdims)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:186: RuntimeWarning: invalid value encountered in true_divide
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:207: RuntimeWarning: invalid value encountered in true_divide
  ret, rcount, out=ret, casting='unsafe', subok=False)

SW3A irrad @ 4 Gy.xlsx data extraction in progress..
SW11A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW16_.xlsx data extraction in progress..
BJ1 for SW12_.xlsx data extraction in progress..
SW8A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW8_.xlsx data extraction in progress..
SW10A non irrad.xlsx data extraction in progress..
SW12A irrad @ 4 Gy.xlsx data extraction in progress..
SW9C.xlsx data extraction in progress..
BJ1 for SW10_.xlsx data extraction in progress..
SW7A non irrad.xlsx data extraction in progress..
SW1A irrad @ 4 Gy.xlsx data extraction in progress..
SW13A irrad @ 4 Gy.xlsx data extraction in progress..
SW1B.xlsx data extraction in progress..
BJ-hTERT for SW6_.xlsx data extraction in progress..
SW13B.xlsx data extraction in progress..
BJ1 for SW2_.xlsx data extraction in progress..
SW2A non irrad.xlsx data extraction in progress..
SW5C.xlsx data extraction in progress..
SW15C.xlsx data extraction in progress..
SW7C.xlsx data extraction in progress..
SW11B.xlsx data extraction in progress..
SW3B.xlsx data extraction in progress..
BJ-hTERT for SW15_.xlsx data extraction in progress..
SW15A non irrad.xlsx data extraction in progress..
SW12A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW11_.xlsx data extraction in progress..
SW3C.xlsx data extraction in progress..
SW11C.xlsx data extraction in progress..
SW7B.xlsx data extraction in progress..
SW15B.xlsx data extraction in progress..
BJ1 for SW6_.xlsx data extraction in progress..
BJ-hTERT for SW2_.xlsx data extraction in progress..
SW5B.xlsx data extraction in progress..
SW5A non irrad.xlsx data extraction in progress..
SW1C.xlsx data extraction in progress..
BJ-hTERT for SW13_.xlsx data extraction in progress..
SW10A irrad @ 4 Gy.xlsx data extraction in progress..
SW2A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW1_.xlsx data extraction in progress..
SW10B.xlsx data extraction in progress..
BJ-hTERT for SW5_.xlsx data extraction in progress..
SW2B.xlsx data extraction in progress..
SW13A non irrad.xlsx data extraction in progress..
SW14C.xlsx data extraction in progress..
SW6C.xlsx data extraction in progress..
SW9A irrad @ 4 Gy.xlsx data extraction in progress..
SW16A irrad @ 4 Gy.xlsx data extraction in progress..
BJ-hTERT for SW14_.xlsx data extraction in progress..
BJ-hTERT for SW16_.xlsx data extraction in progress..
SW16C.xlsx data extraction in progress..
BJ1 for SW3_.xlsx data extraction in progress..
SW12B.xlsx data extraction in progress..
BJ-hTERT for SW7_.xlsx data extraction in progress..
SW12C.xlsx data extraction in progress..
SW16B.xlsx data extraction in progress..
BJ-hTERT for SW3_.xlsx data extraction in progress..
BJ1 for SW7_.xlsx data extraction in progress..
BJ-hTERT for SW12_.xlsx data extraction in progress..
SW3A non irrad.xlsx data extraction in progress..
SW15A irrad @ 4 Gy.xlsx data extraction in progress..
SW7A irrad @ 4 Gy.xlsx data extraction in progress..
BJ-hTERT for SW10_.xlsx data extraction in progress..
SW6B.xlsx data extraction in progress..
SW14B.xlsx data extraction in progress..
BJ-hTERT for SW1_.xlsx data extraction in progress..
SW14A non irrad.xlsx data extraction in progress..
BJ1 for SW5_.xlsx data extraction in progress..
SW2C.xlsx data extraction in progress..
SW10C.xlsx data extraction in progress..
completed file collection

Making dataframe from dict w/ all patients telomere length data, contains telo means & individual telos as list¶

all_patients_df = trp.generate_dataframe_from_dict(all_patients_dict)

# don't need telo means per cell @ this time
all_patients_df = all_patients_df.drop(['cell data'], axis=1)
print(all_patients_df.shape)

(59, 7)

Saving all patients telomere length data for later retrieval¶

# changing telo data to list in prep for saving to csv
all_patients_df['telo data'] = all_patients_df['telo data'].apply(lambda row: row.tolist())
all_patients_df.to_csv('../data/compiled patient data csv files/all_patients_df.csv', index=False)

Generating all patients telo df containing telo counts per quartile melted into tidy data format¶

melted_all_patients_df = pd.melt(
    all_patients_df,
    id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
    var_name='relative Q',
    value_name='Q freq counts')

melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')
melted_all_patients_df.head(4)

Saving melted all patients df to csv¶

melted_all_patients_df.to_csv('../data/compiled patient data csv files/melted_all_patients_df.csv', index=False)

Pivoted Dataframe w/ timepoints as columns, and telomere length means for each patient timepoint in rows¶

pivot_patients_telo_means_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
pivot_patients_telo_means_df = pivot_patients_telo_means_df.drop(13)

Saving pivoted telo means df to file¶

pivot_patients_telo_means_df.to_csv('../data/compiled patient data csv files/pivot_patients_telo_means_df.csv', index=False)

Exploding individual telomere length measurements from contained list into dataframe (i.e row per individual telomere) while retaining related column info¶

# can imagine the lists containing the individual telos per patient exploding to the right; maintains the index relationship
explode_telos_raw = all_patients_df['telo data'].apply(pd.Series)

print(explode_telos_raw.shape)
explode_telos_raw.head(4)

(59, 4600)

exploded_telos_all_patients_df = (explode_telos_raw
                                  
    # we'll merge the exploded telos df w/ our original all patients df on the index!
    .merge(all_patients_df, right_index = True, left_index = True)
    .drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis = 1)
    .melt(id_vars = ['patient id', 'timepoint', 'telo means'], value_name = "individual telomeres") 
    .drop("variable", axis = 1)
    .dropna())

exploded_telos_all_patients_df.head(4)

Saving exploded telomere df for later retrieval¶

exploded_telos_all_patients_df.to_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv', index=False)

Loading Telomere Length Data from qPCR - Aidan & Lynn¶

Extracting average telomere length data by qPCR data from Aidan & Lynn¶

all_qPCR_df = pd.read_excel('../data/qPCR telo data/REVISED Tel +Alb (both plates) 2019-08-05 13 Quantification Cq Results.xlsx',
                            sheet_name=1,
                           skiprows=1,
                            usecols=[24, 25, 26,]
                           )

all_qPCR_df.columns = ['Sample', 'telo means qPCR', 'SEM']
all_qPCR_df['Sample'] = all_qPCR_df['Sample'].astype('str')

all_qPCR_df.dropna(axis=0, inplace=True)
all_qPCR_df.drop([24, 47, 48], axis=0, inplace=True)

all_qPCR_df['Sample'] = all_qPCR_df['Sample'].apply(lambda x: trp.change_sample_ID(x))
all_qPCR_df['timepoint'] = all_qPCR_df['Sample'].apply(lambda x: trp.make_timepoint_col(x))
all_qPCR_df['patient id'] = all_qPCR_df['Sample'].apply(lambda x: trp.make_patient_ID(x))

all_qPCR_df['patient id'] = all_qPCR_df['patient id'].astype('str')
all_qPCR_df.drop(['Sample'], axis=1, inplace=True)

cols = ['patient id', 'timepoint', 'telo means qPCR', 'SEM']
all_qPCR_df = all_qPCR_df.reindex(columns=cols)
all_qPCR_df.head(4)

all_qPCR_df.to_csv('../data/qPCR telo data/all_qPCR_df.csv', index=False)

Loading Chromosome Aberration Data from Subtelo-dGH¶

all_chr_aberr_df = trp.make_dataframe_chr_aberr_data('../data/dGH scoresheets/')

<DirEntry 'SW14_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW3_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW8_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW13_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW9_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW5_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW12_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW15_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW2_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW10_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW7_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW1_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW16_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW11_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW6_timepoints_subtelodGH.xlsx'>

all_chr_aberr_df.to_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv', index=False)

	patient id	timepoint	telo data	telo means	relative Q	Q freq counts
0	1	1 non irrad	[70.74301669292741, 91.86033510872664, 59.1284...	84.796483	Q1	1195.0
1	1	2 irrad @ 4 Gy	[91.86033510872664, 119.31165592077517, 99.251...	90.975826	Q1	724.0
2	1	3 B	[191.10982054566642, 141.48603338585482, 114.0...	116.779989	Q1	231.0
3	1	4 C	[86.58100550477684, 139.3729078012595, 99.2504...	99.346299	Q1	372.0

	0	1	2	3	4	5	6	7	8	9	...	4590	4591	4592	4593	4594	4595	4596	4597	4598	4599
0	70.743017	91.860335	59.128492	65.463687	51.737430	52.793296	59.128492	69.687151	70.743017	144.653631	...	70.743017	153.099028	161.547486	89.748603	166.826815	71.798883	181.607122	49.625202	71.798883	102.418994
1	91.860335	119.311656	99.251397	144.653631	108.754190	100.307262	106.641392	107.697247	162.601726	103.474860	...	84.468429	69.687151	101.363128	147.819751	168.938547	88.692737	88.691850	48.569832	54.905028	97.139665
2	191.109821	141.486033	114.033519	156.266594	126.703910	185.832402	115.089385	84.469274	150.988827	71.798165	...	73.910614	141.484619	158.379888	74.966480	86.581006	93.971127	111.921788	89.748603	105.586592	97.011747
3	86.581006	139.372908	99.250404	85.525140	101.363128	109.810056	87.636871	111.921788	134.094972	76.022346	...	96.083799	107.698324	93.972067	59.127900	99.251397	76.021586	76.022346	59.127900	129.871508	172.106145

	patient id	timepoint	telo means	individual telomeres
0	1	1 non irrad	84.796483	70.743017
1	1	2 irrad @ 4 Gy	90.975826	91.860335
2	1	3 B	116.779989	191.109821
3	1	4 C	99.346299	86.581006

	patient id	timepoint	telo means qPCR	SEM
0	1	1 non irrad	2.182028	0.083233
1	1	3 B	1.964348	0.051942
2	1	4 C	1.749799	0.068195
3	2	1 non irrad	1.669445	0.034776