In [1]:
# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

# for reading individual telomere length data from files
from ast import literal_eval

# for grabbing individual cells
import more_itertools

# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp

# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload 
/usr/local/lib/python3.7/site-packages/statsmodels/compat/pandas.py:49: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)

 

...

 


Loading Telomere Length Data from TeloFISH


Extracting telomere length data output from ImageJ from all radiation therapy patients

In [2]:
all_patients_dict = trp.generate_dictionary_from_TeloLength_data('../data/raw patient teloFISH data/')
SW9A non irrad.xlsx data extraction in progress..
BJ1 for SW9_.xlsx data extraction in progress..
SW11A non irrad.xlsx data extraction in progress..
BJ1 for SW15_.xlsx data extraction in progress..
SW6A non irrad.xlsx data extraction in progress..
SW6A irrad @ 4 Gy.xlsx data extraction in progress..
SW8B.xlsx data extraction in progress..
SW14A irrad @ 4 Gy.xlsx data extraction in progress..
SW8A irrad @ 4 Gy.xlsx data extraction in progress..
SW5A irrad @ 4 Gy.xlsx data extraction in progress..
SW8C.xlsx data extraction in progress..
SW1A non irrad.xlsx data extraction in progress..
BJ1 for SW11_.xlsx data extraction in progress..
SW16A non irrad.xlsx data extraction in progress..
BJ1 for SW13_.xlsx data extraction in progress..
BJ-hTERT for SW9_.xlsx data extraction in progress..
BJ1 for SW14_.xlsx data extraction in progress..
SW9B.xlsx data extraction in progress..
BJ1 for SW8_.xlsx data extraction in progress..
SW_1_ok_3_C_.xlsx data extraction in progress..
/usr/local/lib/python3.7/site-packages/scipy/stats/stats.py:2275: RuntimeWarning: Mean of empty slice.
  mns = a.mean(axis=axis)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:154: RuntimeWarning: invalid value encountered in true_divide
  ret, rcount, out=ret, casting='unsafe', subok=False)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:217: RuntimeWarning: Degrees of freedom <= 0 for slice
  keepdims=keepdims)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:186: RuntimeWarning: invalid value encountered in true_divide
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/usr/local/lib/python3.7/site-packages/numpy/core/_methods.py:207: RuntimeWarning: invalid value encountered in true_divide
  ret, rcount, out=ret, casting='unsafe', subok=False)
SW3A irrad @ 4 Gy.xlsx data extraction in progress..
SW11A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW16_.xlsx data extraction in progress..
BJ1 for SW12_.xlsx data extraction in progress..
SW8A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW8_.xlsx data extraction in progress..
SW10A non irrad.xlsx data extraction in progress..
SW12A irrad @ 4 Gy.xlsx data extraction in progress..
SW9C.xlsx data extraction in progress..
BJ1 for SW10_.xlsx data extraction in progress..
SW7A non irrad.xlsx data extraction in progress..
SW1A irrad @ 4 Gy.xlsx data extraction in progress..
SW13A irrad @ 4 Gy.xlsx data extraction in progress..
SW1B.xlsx data extraction in progress..
BJ-hTERT for SW6_.xlsx data extraction in progress..
SW13B.xlsx data extraction in progress..
BJ1 for SW2_.xlsx data extraction in progress..
SW2A non irrad.xlsx data extraction in progress..
SW5C.xlsx data extraction in progress..
SW15C.xlsx data extraction in progress..
SW7C.xlsx data extraction in progress..
SW11B.xlsx data extraction in progress..
SW3B.xlsx data extraction in progress..
BJ-hTERT for SW15_.xlsx data extraction in progress..
SW15A non irrad.xlsx data extraction in progress..
SW12A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW11_.xlsx data extraction in progress..
SW3C.xlsx data extraction in progress..
SW11C.xlsx data extraction in progress..
SW7B.xlsx data extraction in progress..
SW15B.xlsx data extraction in progress..
BJ1 for SW6_.xlsx data extraction in progress..
BJ-hTERT for SW2_.xlsx data extraction in progress..
SW5B.xlsx data extraction in progress..
SW5A non irrad.xlsx data extraction in progress..
SW1C.xlsx data extraction in progress..
BJ-hTERT for SW13_.xlsx data extraction in progress..
SW10A irrad @ 4 Gy.xlsx data extraction in progress..
SW2A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW1_.xlsx data extraction in progress..
SW10B.xlsx data extraction in progress..
BJ-hTERT for SW5_.xlsx data extraction in progress..
SW2B.xlsx data extraction in progress..
SW13A non irrad.xlsx data extraction in progress..
SW14C.xlsx data extraction in progress..
SW6C.xlsx data extraction in progress..
SW9A irrad @ 4 Gy.xlsx data extraction in progress..
SW16A irrad @ 4 Gy.xlsx data extraction in progress..
BJ-hTERT for SW14_.xlsx data extraction in progress..
BJ-hTERT for SW16_.xlsx data extraction in progress..
SW16C.xlsx data extraction in progress..
BJ1 for SW3_.xlsx data extraction in progress..
SW12B.xlsx data extraction in progress..
BJ-hTERT for SW7_.xlsx data extraction in progress..
SW12C.xlsx data extraction in progress..
SW16B.xlsx data extraction in progress..
BJ-hTERT for SW3_.xlsx data extraction in progress..
BJ1 for SW7_.xlsx data extraction in progress..
BJ-hTERT for SW12_.xlsx data extraction in progress..
SW3A non irrad.xlsx data extraction in progress..
SW15A irrad @ 4 Gy.xlsx data extraction in progress..
SW7A irrad @ 4 Gy.xlsx data extraction in progress..
BJ-hTERT for SW10_.xlsx data extraction in progress..
SW6B.xlsx data extraction in progress..
SW14B.xlsx data extraction in progress..
BJ-hTERT for SW1_.xlsx data extraction in progress..
SW14A non irrad.xlsx data extraction in progress..
BJ1 for SW5_.xlsx data extraction in progress..
SW2C.xlsx data extraction in progress..
SW10C.xlsx data extraction in progress..
completed file collection

Making dataframe from dict w/ all patients telomere length data, contains telo means & individual telos as list

In [3]:
all_patients_df = trp.generate_dataframe_from_dict(all_patients_dict)

# don't need telo means per cell @ this time
all_patients_df = all_patients_df.drop(['cell data'], axis=1)
print(all_patients_df.shape)
(59, 7)

Saving all patients telomere length data for later retrieval

In [4]:
# changing telo data to list in prep for saving to csv
all_patients_df['telo data'] = all_patients_df['telo data'].apply(lambda row: row.tolist())
all_patients_df.to_csv('../data/compiled patient data csv files/all_patients_df.csv', index=False)

Generating all patients telo df containing telo counts per quartile melted into tidy data format

In [5]:
melted_all_patients_df = pd.melt(
    all_patients_df,
    id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
    var_name='relative Q',
    value_name='Q freq counts')

melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')
melted_all_patients_df.head(4)
Out[5]:
patient id timepoint telo data telo means relative Q Q freq counts
0 1 1 non irrad [70.74301669292741, 91.86033510872664, 59.1284... 84.796483 Q1 1195.0
1 1 2 irrad @ 4 Gy [91.86033510872664, 119.31165592077517, 99.251... 90.975826 Q1 724.0
2 1 3 B [191.10982054566642, 141.48603338585482, 114.0... 116.779989 Q1 231.0
3 1 4 C [86.58100550477684, 139.3729078012595, 99.2504... 99.346299 Q1 372.0

Saving melted all patients df to csv

In [6]:
melted_all_patients_df.to_csv('../data/compiled patient data csv files/melted_all_patients_df.csv', index=False)

Pivoted Dataframe w/ timepoints as columns, and telomere length means for each patient timepoint in rows

In [7]:
pivot_patients_telo_means_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
pivot_patients_telo_means_df = pivot_patients_telo_means_df.drop(13)

Saving pivoted telo means df to file

In [8]:
pivot_patients_telo_means_df.to_csv('../data/compiled patient data csv files/pivot_patients_telo_means_df.csv', index=False)
In [9]:
# can imagine the lists containing the individual telos per patient exploding to the right; maintains the index relationship
explode_telos_raw = all_patients_df['telo data'].apply(pd.Series)

print(explode_telos_raw.shape)
explode_telos_raw.head(4)
(59, 4600)
Out[9]:
0 1 2 3 4 5 6 7 8 9 ... 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599
0 70.743017 91.860335 59.128492 65.463687 51.737430 52.793296 59.128492 69.687151 70.743017 144.653631 ... 70.743017 153.099028 161.547486 89.748603 166.826815 71.798883 181.607122 49.625202 71.798883 102.418994
1 91.860335 119.311656 99.251397 144.653631 108.754190 100.307262 106.641392 107.697247 162.601726 103.474860 ... 84.468429 69.687151 101.363128 147.819751 168.938547 88.692737 88.691850 48.569832 54.905028 97.139665
2 191.109821 141.486033 114.033519 156.266594 126.703910 185.832402 115.089385 84.469274 150.988827 71.798165 ... 73.910614 141.484619 158.379888 74.966480 86.581006 93.971127 111.921788 89.748603 105.586592 97.011747
3 86.581006 139.372908 99.250404 85.525140 101.363128 109.810056 87.636871 111.921788 134.094972 76.022346 ... 96.083799 107.698324 93.972067 59.127900 99.251397 76.021586 76.022346 59.127900 129.871508 172.106145

4 rows × 4600 columns

In [10]:
exploded_telos_all_patients_df = (explode_telos_raw
                                  
    # we'll merge the exploded telos df w/ our original all patients df on the index!
    .merge(all_patients_df, right_index = True, left_index = True)
    .drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis = 1)
    .melt(id_vars = ['patient id', 'timepoint', 'telo means'], value_name = "individual telomeres") 
    .drop("variable", axis = 1)
    .dropna())

exploded_telos_all_patients_df.head(4)
Out[10]:
patient id timepoint telo means individual telomeres
0 1 1 non irrad 84.796483 70.743017
1 1 2 irrad @ 4 Gy 90.975826 91.860335
2 1 3 B 116.779989 191.109821
3 1 4 C 99.346299 86.581006

Saving exploded telomere df for later retrieval

In [11]:
exploded_telos_all_patients_df.to_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv', index=False)

Loading Telomere Length Data from qPCR - Aidan & Lynn


Extracting average telomere length data by qPCR data from Aidan & Lynn

In [12]:
all_qPCR_df = pd.read_excel('../data/qPCR telo data/REVISED Tel +Alb (both plates) 2019-08-05 13 Quantification Cq Results.xlsx',
                            sheet_name=1,
                           skiprows=1,
                            usecols=[24, 25, 26,]
                           )

all_qPCR_df.columns = ['Sample', 'telo means qPCR', 'SEM']
all_qPCR_df['Sample'] = all_qPCR_df['Sample'].astype('str')
In [13]:
all_qPCR_df.dropna(axis=0, inplace=True)
all_qPCR_df.drop([24, 47, 48], axis=0, inplace=True)
In [14]:
all_qPCR_df['Sample'] = all_qPCR_df['Sample'].apply(lambda x: trp.change_sample_ID(x))
all_qPCR_df['timepoint'] = all_qPCR_df['Sample'].apply(lambda x: trp.make_timepoint_col(x))
all_qPCR_df['patient id'] = all_qPCR_df['Sample'].apply(lambda x: trp.make_patient_ID(x))

all_qPCR_df['patient id'] = all_qPCR_df['patient id'].astype('str')
all_qPCR_df.drop(['Sample'], axis=1, inplace=True)
In [15]:
cols = ['patient id', 'timepoint', 'telo means qPCR', 'SEM']
all_qPCR_df = all_qPCR_df.reindex(columns=cols)
all_qPCR_df.head(4)
Out[15]:
patient id timepoint telo means qPCR SEM
0 1 1 non irrad 2.182028 0.083233
1 1 3 B 1.964348 0.051942
2 1 4 C 1.749799 0.068195
3 2 1 non irrad 1.669445 0.034776
In [16]:
all_qPCR_df.to_csv('../data/qPCR telo data/all_qPCR_df.csv', index=False)

Loading Chromosome Aberration Data from Subtelo-dGH


In [17]:
all_chr_aberr_df = trp.make_dataframe_chr_aberr_data('../data/dGH scoresheets/')
<DirEntry 'SW14_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW3_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW8_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW13_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW9_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW5_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW12_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW15_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW2_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW10_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW7_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW1_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW16_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW11_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW6_timepoints_subtelodGH.xlsx'>
In [18]:
all_chr_aberr_df.to_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv', index=False)