From 2d50faeff9681f8692faf702c5c383e53073209f Mon Sep 17 00:00:00 2001 From: Mike Dijkhof Date: Thu, 1 Jul 2021 15:22:36 +0200 Subject: [PATCH] the rest of the scripts --- DatesParser.py | 54 ++++++ DemoParser.py | 136 ++++++++++++++ FinalDF_Parser.py | 217 ++++++++++++++++++++++ PAParser.py | 415 +++++++++++++++++++++++++++++++++++++++++++ ScatterBoxplotter.py | 73 ++++++++ 5 files changed, 895 insertions(+) create mode 100644 DatesParser.py create mode 100644 DemoParser.py create mode 100644 FinalDF_Parser.py create mode 100644 PAParser.py create mode 100644 ScatterBoxplotter.py diff --git a/DatesParser.py b/DatesParser.py new file mode 100644 index 0000000..41e71ae --- /dev/null +++ b/DatesParser.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Apr 2 10:57:40 2021 + +@author: Dijkhofmf +""" + +import os +import pandas as pd + +pd.options.mode.chained_assignment = None # default='warn' + +Path = r'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles' + +os.chdir(Path) + + +FilenameOutc = 'SurgAdmComp.csv' +FilenameSACM = 'DataSACM.csv' +FilenameComplet = 'Complete.csv' + + +DFComp = pd.DataFrame(pd.read_csv(FilenameOutc)) +DFComp = DFComp.set_index('Study ID') +DFSACM = pd.DataFrame(pd.read_csv(FilenameSACM)) +DFSACM = DFSACM.set_index('Study ID') +DFComplet = pd.DataFrame(pd.read_csv(FilenameComplet)) +DFComplet = DFComplet.set_index('Study ID') + + +Startdate = pd.to_datetime(DFSACM['Start date Fitbit']).dt.date +Enddate = pd.to_datetime(DFSACM['End date Fitbit']).dt.date + +DFDates = pd.DataFrame() +DFDates['Study ID'] = DFComp.index +DFDates = DFDates.set_index('Study ID') +DFDates['Start'] = Startdate +DFDates['Surgery'] = pd.to_datetime(DFComp['Date of surgery']).dt.date +DFDates['Preop'] = DFDates['Surgery'] - DFDates['Start'] +DFDates['Discharge'] = pd.to_datetime(DFComp['Date of hospital discharge']).dt.date +DFDates['LOS'] = DFDates['Discharge'] - DFDates['Surgery'] +DFDates['St2Dis'] = DFDates['Discharge'] - DFDates['Start'] +DFDates['First Comp'] = pd.to_datetime(DFComp['Date first complication at home']).dt.date +DFDates['T2C'] = DFDates['First Comp'] - DFDates['Discharge'] +DFDates['First Read'] = pd.to_datetime(DFComp['Date (first) readmission']).dt.date +DFDates['T2R'] = DFDates['First Read'] - DFDates['Discharge'] +DFDates['Sec Read'] = pd.to_datetime(DFComp['Date second readmission']).dt.date +DFDates['T2SR'] = DFDates['Sec Read'] - DFDates['Discharge'] +DFDates['End'] = Enddate +DFDates['Length'] = DFDates['End'] - DFDates['Start'] + +DFDates = DFDates[DFComplet['Has patient completed study?']=='Yes'] + +DFDates.to_csv('Dates.csv') \ No newline at end of file diff --git a/DemoParser.py b/DemoParser.py new file mode 100644 index 0000000..d4a757e --- /dev/null +++ b/DemoParser.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Mar 8 10:38:31 2021 + +@author: Dijkhofmf +""" + +# Import stuff +import os +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import seaborn as sns + + +pd.options.mode.chained_assignment = None # default='warn' + +#%% Define filenames and path + + +FilenameComplete = 'Complete.csv' +FilenameDemo = 'DemoData.csv' +Filename_T0 = 'FinalDF_T0.csv' + + +Path = 'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles' + +# Set path +os.chdir(Path) + +DFComplete = pd.DataFrame(pd.read_csv(FilenameComplete)) + +DFDemo = pd.DataFrame(pd.read_csv(FilenameDemo)) +DFDemo['Complete data'] = DFComplete['Has patient completed study?'] +DFDemo = DFDemo.drop(DFDemo[DFDemo['Complete data'] !='Yes'].index) + 
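+# The block below recodes the raw questionnaire export: the ASA class is parsed to a number,
+# 'Checked'/'Unchecked' answers become 1/0, administrative and device-use columns are dropped,
+# and the comorbidity block is collapsed into a single Charlson-style 'Comorb' score by
+# weighting columns by position (iloc 20:26 x2, 26 x3, 26:28 x6 -- position 26 is hit by both
+# of the last two slices) and then summing columns 10:29.
+# A name-based weighting would make the intended weights explicit; a minimal sketch with
+# placeholder column names (the real names live in DemoData.csv):
+#     CCIWeights = {'comorbidity A': 2, 'comorbidity B': 3, 'comorbidity C': 6}
+#     DFDemo['Comorb'] = sum(DFDemo[col] * w for col, w in CCIWeights.items())
+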
+DFDemo['ASA-classification'] = DFDemo['ASA-classification'].str.replace('ASA ', '').astype('float64') +DFDemo = DFDemo.replace('Unchecked', 0) +DFDemo = DFDemo.replace('Checked', 1) +Dropcols = ['Year of birth', 'Subject ID Connecare', 'Subject ID Connecare (version 2.0)','Date subject signed consent', 'Nationality', 'Language', 'Former occupation', + 'Does the patient have a smartphone that they use?', 'How many days a week is the smartphone used?', + 'Does the patient have a tablet that they use?','How many days a week is the tablet used?','Does the patient have a computer/pc that they use?', + 'How many days a week is the computer/pc used?','Smart device at home', 'Smart device at inclusion? (check all that apply) (choice=Fitbit)', + 'Smart device at inclusion? (check all that apply) (choice=Weight scale)','Indication Surgery', 'Comments', 'Complete?', 'Complete data'] +DFDemo = DFDemo.drop(Dropcols, axis=1) +DFDemo = DFDemo.set_index('Study ID') + +# Calculate CCI score +DFDemo.iloc[:,20:26] = DFDemo.iloc[:,20:26]*2 +DFDemo.iloc[:,26] = DFDemo.iloc[:,26]*3 +DFDemo.iloc[:,26:28] = DFDemo.iloc[:,26:28]*6 + +ColMask = DFDemo.columns[10:29] +DFDemo['Comorb'] = DFDemo[ColMask].sum(axis=1) +DFDemo = DFDemo.drop(ColMask, axis=1) + +#%% + +DF_T0 = pd.DataFrame(pd.read_csv(Filename_T0)) +DF_T0 = DF_T0.set_index('Study ID') + +DFDemo['Type'] = DF_T0['Pt Type'] + + #%% code variables + +DFDemo['Gender'] = DFDemo['Gender'].replace('Female', 0) +DFDemo['Gender'] = DFDemo['Gender'].replace('Male', 1) + +Housing = pd.get_dummies(DFDemo['Housing'], drop_first=True) +Education = pd.get_dummies(DFDemo['Education'], drop_first=True) +Smoking = pd.get_dummies(DFDemo['Smoking'], drop_first=True) +Med_Dif = pd.get_dummies(DFDemo['Difficulty preparing medication?'], drop_first=True) +Loc_Tu = pd.get_dummies(DFDemo['Location tumour'], drop_first=True) +Prim_Mal = pd.get_dummies(DFDemo['Primary Malignancy'], drop_first=True) + +DFDemo['Recurrent disease?'] = DFDemo['Recurrent disease?'].replace('No', 0) +DFDemo['Recurrent disease?'] = DFDemo['Recurrent disease?'].replace('Yes', 1) + +DFDemo = DFDemo.drop(['Marital State', 'Housing', 'Education', 'Tumour Stage', 'Smoking', 'Difficulty preparing medication?', + 'Location tumour', 'Primary Malignancy'], axis=1) + + +#%% +DFDemo = pd.concat([DFDemo, Housing, Education, Smoking, Med_Dif, Loc_Tu, Prim_Mal], axis=1) + +#%% Create Neoadjuvant therapy variable + +for i,r in DFDemo.iterrows(): + if (DFDemo.loc[i,'Neo-adjuvant therapy (choice=Chemotherapy)'] == 1) & (DFDemo.loc[i,'Neo-adjuvant therapy (choice=Radiotherapy)'] == 1): + DFDemo.loc[i,'Neo'] = 1 + elif DFDemo.loc[i, 'Neo-adjuvant therapy (choice=Chemotherapy)'] == 1: + DFDemo.loc[i,'Neo'] = 2 + elif DFDemo.loc[i,'Neo-adjuvant therapy (choice=Immunotherapy)'] == 1: + DFDemo.loc[i,'Neo'] = 3 + elif DFDemo.loc[i,'Neo-adjuvant therapy (choice=Radiotherapy)'] == 1: + DFDemo.loc[i,'Neo'] = 4 + elif DFDemo.loc[i,'Neo-adjuvant therapy (choice=Targeted Therapy)'] == 1: + DFDemo.loc[i,'Neo'] = 5 + elif DFDemo.loc[i,'Neo-adjuvant therapy (choice=None)'] == 1: + DFDemo.loc[i,'Neo'] = 0 + +Neo = pd.get_dummies(DFDemo['Neo'], drop_first=True) + +NeoDrop = ['Neo-adjuvant therapy (choice=Chemotherapy)','Neo-adjuvant therapy (choice=Chemotherapy)','Neo-adjuvant therapy (choice=Immunotherapy)', + 'Neo-adjuvant therapy (choice=Radiotherapy)', 'Neo-adjuvant therapy (choice=None)', 'Neo-adjuvant therapy (choice=Targeted Therapy)', 'Neo'] + +DFDemo = DFDemo.drop(NeoDrop, axis=1) + +DFDemo = pd.concat([DFDemo, Neo], axis=1) + +#%% 
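+# Age distribution of the included patients; the next cell compares the complication and
+# no-complication groups per baseline variable (independent t-test and Mann-Whitney U).
+# Optional sanity check of the group sizes used in that comparison:
+print(DFDemo['Type'].value_counts(dropna=False))
+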
+plt.figure() +sns.displot(DFDemo['Age (years)']) + +#%% + +DemoComp = DFDemo[DFDemo['Type'] != 'Healthy'] +DemoComp = DemoComp.drop('Type', axis=1) +DemoNoComp = DFDemo[DFDemo['Type'] == 'Healthy'] +DemoNoComp = DemoNoComp.drop('Type', axis=1) + +from scipy import stats + +#outcome = pd.DataFrame(index=['stat', 'p-value']) +outcomeT = stats.ttest_ind(DemoNoComp, DemoComp, nan_policy='omit') + +OutcomeT = outcomeT[1].tolist() + +OutcomeMW = [] +for column in DemoComp: + print(column) + outcomeMW = stats.mannwhitneyu(DemoNoComp[column], DemoComp[column]) + OutcomeMW.append(outcomeMW[1]) + + +#DFDemo.to_csv('FinalDemo.csv') \ No newline at end of file diff --git a/FinalDF_Parser.py b/FinalDF_Parser.py new file mode 100644 index 0000000..a785292 --- /dev/null +++ b/FinalDF_Parser.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Feb 25 11:13:35 2021 + +@author: Dijkhofmf +""" +# Import stuff +import os +import pandas as pd +import numpy as np +#import seaborn as sns +#import matplotlib.pyplot as plt + +pd.options.mode.chained_assignment = None # default='warn' + +#%% Define filenames and path + +Filename_T0 = 'BaselineT0.csv' +Filename_T1 = 'DischargeT1.csv' +#Filename_T2 = 'FollowUpT2Data.csv' +FilenameOutc = 'SurgAdmComp.csv' +FilenameComplete = 'Complete.csv' + +Path = 'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles' + +# Set path +os.chdir(Path) + +DFT0 = pd.DataFrame(pd.read_csv(Filename_T0)) +DFT1 = pd.DataFrame(pd.read_csv(Filename_T1)) +#DFT2 = pd.DataFrame(pd.read_csv(Filename_T2)) +DFComplete = pd.DataFrame(pd.read_csv(FilenameComplete)) +DFCompl = pd.DataFrame(pd.read_csv(FilenameOutc)) + +#%% + +DFT0['Complete'] = DFComplete['Has patient completed study?'] +DFT0 = DFT0.drop(DFT0[DFT0['Complete'] !='Yes'].index) +DFT0 = DFT0.astype('str') +DFT0 = DFT0.set_index(['Study ID']) + +DFT1['Complete'] = DFComplete['Has patient completed study?'] +DFT1 = DFT1.drop(DFT1[DFT1['Complete'] !='Yes'].index) +DFT1 = DFT1.astype('str') +DFT1 = DFT1.set_index(['Study ID']) + +# DFT2['Complete data'] = DFComplete['Has patient completed study?'] +# DFT2 = DFT2.drop(DFT2[DFT2['Complete data'] !='Yes'].index) +# DFT2 = DFT2.astype('str') +# DFT2 = DFT2.set_index(['Study ID']) + +DFCompl['Complete'] = DFComplete['Has patient completed study?'] +DFCompl = DFCompl.drop(DFCompl[DFCompl['Complete'] !='Yes'].index) +DFCompl = DFCompl.set_index(['Study ID']) + +#%% +DFT0 = DFT0.apply(lambda x: x.str.replace(',','.'), axis=1) +DFT1 = DFT1.apply(lambda x: x.str.replace(',','.'), axis=1) +#DFT2 = DFT2.apply(lambda x: x.str.replace(',','.'), axis=1) + +#%% +FinalDF_T0 = pd.DataFrame() +FinalDF_T0[['BMI','GFI', 'HADS_A', 'HADS_D', 'ADL', 'iADL']] = DFT0[['BMI', 'Groningen Frailty Index', 'Anxiety - Hospital Anxiety Depression Scale', 'Depression - Hospital Anxiety Depression Scale', 'ADL', 'iADL']].astype('float64') + +FinalDF_T1 = pd.DataFrame() +FinalDF_T1[['GFI', 'HADS_A', 'HADS_D', 'ADL', 'iADL']] = DFT1[['Groningen Frailty Index', 'Anxiety - Hospital Anxiety Depression Scale', 'Depression - Hospital Anxiety Depression Scale', 'ADL', 'iADL']].astype('float64') + +FinalDF_T2 = pd.DataFrame() +#FinalDF_T2[['GFI', 'HADS_A', 'HADS_D', 'ADL', 'iADL']] = DFT2[['Groningen Frailty Index', 'Anxiety - Hospital Anxiety Depression Scale', 'Depression - Hospital Anxiety Depression Scale', 'ADL', 'iADL']].astype('float64') + +#%% TUG_T0 +FinalDF_T0['TUG1'] = DFT0['Timed to Up&Go - attempt 1 (sec)'].astype('float64').fillna(0) +FinalDF_T0['TUG2'] = DFT0['Timed to Up&Go - attempt 2 
(sec)'].astype('float64').fillna(0) + +for i, r in FinalDF_T0.iterrows(): + if FinalDF_T0.loc[i,'TUG1'] != 0 and FinalDF_T0.loc[i,'TUG2'] != 0: + FinalDF_T0.loc[i,'TUGTot'] = (FinalDF_T0.loc[i,'TUG1']+FinalDF_T0.loc[i,'TUG2'])/2 + else: + FinalDF_T0.loc[i,'TUGTot'] = (FinalDF_T0.loc[i,'TUG1']+FinalDF_T0.loc[i,'TUG2'])/1 + +FinalDF_T0['TUG1'] = FinalDF_T0['TUG1'].replace(0, np.nan) +FinalDF_T0['TUG2'] = FinalDF_T0['TUG2'].replace(0, np.nan) +FinalDF_T0['TUGTot'] = FinalDF_T0['TUGTot'].replace(0, np.nan) + +FinalDF_T0 = FinalDF_T0.drop(['TUG1', 'TUG2'], axis=1) + +# TUG_T1 Asuming that all missing data were due to physical disabilties --> NaNs to 30 seconds +FinalDF_T1['TUG1'] = DFT1['Timed to Up&Go - attempt 1 (sec)'].astype('float64').fillna(30) +FinalDF_T1['TUG2'] = DFT1['Timed to Up&Go - attempt 2 (sec)'].astype('float64').fillna(30) + +FinalDF_T1['TUGTot'] = (FinalDF_T1['TUG1']+FinalDF_T1['TUG2'])/2 + +FinalDF_T1 = FinalDF_T1.drop(['TUG1', 'TUG2'], axis=1) + + +# TUG_T2 +#FinalDF_T2['TUG1'] = DFT2['Timed to Up&Go - attempt 1 (sec)'].astype('float64').fillna(0) +#FinalDF_T2['TUG2'] = DFT2['Timed to Up&Go - attempt 2 (sec)'].astype('float64').fillna(0) + +# for i, r in FinalDF_T2.iterrows(): +# if FinalDF_T2.loc[i,'TUG1'] != 0 and FinalDF_T2.loc[i,'TUG2'] != 0: +# FinalDF_T2.loc[i,'TUGTot'] = (FinalDF_T2.loc[i,'TUG1']+FinalDF_T2.loc[i,'TUG2'])/2 +# else: +# FinalDF_T2.loc[i,'TUGTot'] = (FinalDF_T2.loc[i,'TUG1']+FinalDF_T2.loc[i,'TUG2'])/1 + +# FinalDF_T2['TUG1'] = FinalDF_T2['TUG1'].replace(0, np.nan) +# FinalDF_T2['TUG2'] = FinalDF_T2['TUG2'].replace(0, np.nan) +# FinalDF_T2['TUGTot'] = FinalDF_T2['TUGTot'].replace(0, np.nan) + +#%% +FinalDF_T0[['HGSR1','HGSR2','HGSR3']] = DFT0[['Handgrip Strength test Attempt 1 rigth','Handgrip Strength test Attempt 2 rigth','Handgrip Strength test Attempt 3 right']].astype('float64') +FinalDF_T0['HGSRAvg'] = (FinalDF_T0['HGSR1'] + FinalDF_T0['HGSR2'] + FinalDF_T0['HGSR3'])/3 +FinalDF_T0[['HGSL1','HGSL2','HGSL3']] = DFT0[['Handgrip Strength test Attempt 1 left', 'Handgrip Strength test Attempt 2 left', 'Handgrip Strength test Attempt 3 left']].astype('float64') +FinalDF_T0['HGSLAvg'] = (FinalDF_T0['HGSL1'] + FinalDF_T0['HGSL2'] + FinalDF_T0['HGSL3'])/3 +FinalDF_T0['Dominance'] = DFT0['Hand dominance'] + + +FinalDF_T1[['HGSR1','HGSR2','HGSR3']] = DFT1[['Handgrip Strength test Attempt 1 rigth','Handgrip Strength test Attempt 2 rigth','Handgrip Strength test Attempt 3 right']].astype('float64') +FinalDF_T1['HGSRAvg'] = (FinalDF_T1['HGSR1'] + FinalDF_T1['HGSR2'] + FinalDF_T1['HGSR3'])/3 +FinalDF_T1[['HGSL1','HGSL2','HGSL3']] = DFT1[['Handgrip Strength test Attempt 1 left', 'Handgrip Strength test Attempt 2 left', 'Handgrip Strength test Attempt 3 left']].astype('float64') +FinalDF_T1['HGSLAvg'] = (FinalDF_T1['HGSL1'] + FinalDF_T1['HGSL2'] + FinalDF_T1['HGSL3'])/3 + +for i, r in DFT1.iterrows(): + if DFT1.loc[i,'Handgrip Strength Test'] == 'No': + FinalDF_T1.loc[i,['HGSR1','HGSR2','HGSR3','HGSRAvg','HGSL1','HGSL2','HGSL3','HGSLAvg']] = 0 + +for index, rows in FinalDF_T1.iterrows(): + + if FinalDF_T0.loc[index, 'Dominance'] == 'Rigth': + FinalDF_T0.loc[index, 'HGSDom'] = FinalDF_T0.loc[index,'HGSRAvg'] + FinalDF_T1.loc[index, 'HGSDom'] = FinalDF_T1.loc[index,'HGSRAvg'] + elif FinalDF_T0.loc[index, 'Dominance'] == 'Left': + FinalDF_T0.loc[index, 'HGSDom'] = FinalDF_T0.loc[index,'HGSLAvg'] + FinalDF_T1.loc[index, 'HGSDom'] = FinalDF_T1.loc[index,'HGSLAvg'] + else: + FinalDF_T0.loc[index, 'HGSDom'] = 
(FinalDF_T0.loc[index,'HGSRAvg']+FinalDF_T0.loc[index,'HGSLAvg'])/2 + FinalDF_T1.loc[index, 'HGSDom'] = (FinalDF_T1.loc[index,'HGSRAvg']+FinalDF_T1.loc[index,'HGSLAvg'])/2 + +FinalDF_T0 = FinalDF_T0.drop(['HGSR1', 'HGSR2', 'HGSR3', 'HGSRAvg','HGSL1', 'HGSL2', 'HGSL3', 'HGSLAvg', 'Dominance'], axis=1) +FinalDF_T1 = FinalDF_T1.drop(['HGSR1', 'HGSR2', 'HGSR3', 'HGSRAvg','HGSL1', 'HGSL2', 'HGSL3', 'HGSLAvg'], axis=1) + +# FinalDF_T2[['HGSR1','HGSR2','HGSR3']] = DFT2[['Handgrip Strength test Attempt 1 right','Handgrip Strength test Attempt 2 right','Handgrip Strength test Attempt 3 right']].astype('float64') +# FinalDF_T2['HGSRAvg'] = (FinalDF_T2['HGSR1'] + FinalDF_T2['HGSR2'] + FinalDF_T2['HGSR3'])/3 +# FinalDF_T2[['HGSL1','HGSL2','HGSL3']] = DFT2[['Handgrip Strength test Attempt 1 left', 'Handgrip Strength test Attempt 2 left', 'Handgrip Strength test Attempt 3 left']].astype('float64') +# FinalDF_T2['HGSLAvg'] = (FinalDF_T2['HGSL1'] + FinalDF_T2['HGSL2'] + FinalDF_T2['HGSL3'])/3 + +# if FinalDF_T0['Dominance'] == 'Right': +# FinalDF_T2['HSGDom'] = FinalDF_T0['HGSRAvg'] +# elif FinalDF_T0['Dominance'] == 'Left': +# FinalDF_T2['HSGDom'] = FinalDF_T0['HGSLAvg'] +# else: +# FinalDF_T2['HGSDom'] = (FinalDF_T0['HGSRAvg']+FinalDF_T0['HGSLAvg'])/2 + +#%% +EORTCCols = DFT0.columns[15:59].tolist() +EORTCScoresT0 = DFT0[EORTCCols] +#EORTCScoresT2 = DFT2[EORTCCols] + + +#%% +os.chdir('I:\Mike Dijkhof\Python') + +import EORTC as eor +import SQUASH as sq + +NewEORTCScoresT0 = eor.EORTCCalculator(EORTCScoresT0, EORTCCols) +#NewEORTCScoresT2 = eor.EORTCCalculator(EORTCScoresT2, EORTCCols) + +EORTCT0 = eor.EORTCScore(EORTCScoresT0) +#EORTCT2 = eor.EORTCScore(EORTCScoresT2) + +os.chdir(Path) + +#%% plaatjes + +# for index, row in EORTC.iterrows(): +# plt.figure(figsize=(20,8)) +# plt.title('EORTC preoperative outcomes pt ' + str(index)) +# sns.barplot(x=EORTC.columns, y=EORTC.loc[index,:]) + +#%% + +SQUASHScoresT0 = sq.SQUASHParse(DFT0) +#SQUASHScoresT2 = sq.SQUASHParse(DFT2) + +ColsToDrop = ['SQUASH baseline afgenomen?', 'Woon werkverkeer?', 'Werk?', 'Huishoudelijk werk?'] + +SQUASHScoresT0 = SQUASHScoresT0.drop(ColsToDrop, axis=1) +SQUASHScoresT0 = SQUASHScoresT0.astype('float64') + +#SQUASHScoresT2 = SQUASHScoresT2.drop(ColsToDrop, axis=1) +#SQUASHScoresT2 = SQUASHScoresT2.astype('float64') + +SQUASHT0 = sq.SQUASHScore(SQUASHScoresT0) +#SQUASHT2 = sq.SQUASHScore(SQUASHScoresT2) + +#%% +FinalDF_T0['Pt Type'] = DFCompl.loc[:,'Complications at home during monitoring ? '].values +FinalDF_T1['Pt Type'] = DFCompl.loc[:,'Complications at home during monitoring ? '].values +#FinalDF_T2['Pt Type'] = DFCompl.loc[:,'Complications at home during monitoring ? 
'].values + + +FinalDF_T0['Pt Type'] = FinalDF_T0['Pt Type'].str.replace('Yes', 'Complication') +FinalDF_T0['Pt Type'] = FinalDF_T0['Pt Type'].str.replace('No', 'Healthy') + +FinalDF_T1['Pt Type'] = FinalDF_T1['Pt Type'].str.replace('Yes', 'Complication') +FinalDF_T1['Pt Type'] = FinalDF_T1['Pt Type'].str.replace('No', 'Healthy') + +#FinalDF_T2['Pt Type'] = FinalDF_T2['Pt Type'].str.replace('Yes', 'Complication') +#FinalDF_T2['Pt Type'] = FinalDF_T2['Pt Type'].str.replace('No', 'Healthy') + + +#%% Save FinalDF to .csv file + +FinalDF_T0.to_csv('FinalDF_T0.csv') +FinalDF_T1.to_csv('FinalDF_T1.csv') +#FinalDF_T2.to_csv('FinalDF_T2.csv') \ No newline at end of file diff --git a/PAParser.py b/PAParser.py new file mode 100644 index 0000000..58fb8e3 --- /dev/null +++ b/PAParser.py @@ -0,0 +1,415 @@ +# -*- coding: utf-8 -*- +""" +Script for parsing the Fitbit data into graphs. +@author M.F. Dijkhof +""" +# Import stuff +import os +import pandas as pd +import seaborn as sns +import numpy as np +import matplotlib.pyplot as plt + +# Disable copy overwrite warning +pd.options.mode.chained_assignment = None # default='warn' + +#%% Define filenames and path + +FilenameComp = 'SurgeryAndAdmission2.csv' #Surg and Adm + Complications +FilenamePA = 'PA_Data.csv' +FilenameSteps = 'StepData.csv' +FilenameComplete = 'Complete.csv' +FilenameOutcome = 'Complications.csv' + +Path = 'I:\Mike Dijkhof\Connecare MGP\Data' + +# Set path +os.chdir(Path) + +#%% Create DF from files + +DFComp = pd.DataFrame(pd.read_csv(FilenameComp)) +DFPA = pd.DataFrame(pd.read_csv(FilenamePA)) +DFSteps = pd.DataFrame(pd.read_csv(FilenameSteps)) +DFComplete = pd.DataFrame(pd.read_csv(FilenameComplete)) +DFOutcome = pd.DataFrame(pd.read_csv(FilenameOutcome)) + +DFComp = DFComp.set_index('Study ID') +DFPA = DFPA.set_index('Study ID') +DFSteps = DFSteps.set_index('Study ID') +DFComplete = DFComplete.set_index('Study ID') +DFOutcome = DFOutcome.set_index('Study ID') + +#%% +# Clear all uncomplete cases +CompleteCheck= DFComplete['Has patient completed study?'] == 'Yes' + +DFComp = DFComp[CompleteCheck] +DFPA = DFPA[CompleteCheck] +DFOutcome = DFOutcome[CompleteCheck] +DFSteps = DFSteps[CompleteCheck] + + +# Transpose PA data into the right format +NewDF= pd.DataFrame(DFPA.iloc[0]).transpose() + +counter = range(1, len(DFPA)) + +for i in counter: + NewRow = DFPA.iloc[i].transpose() + NewDF = NewDF.append(NewRow) + +NewDF = NewDF.drop(['Complete?'], axis=1) + +# Do the same for Step data +NewStepDF = pd.DataFrame(DFSteps.iloc[0]).transpose() + +counter = range(1, len(DFSteps)) + +for i in counter: + NewRow = DFSteps.iloc[i].transpose() + NewStepDF = NewStepDF.append(NewRow) + +NewStepDF = NewStepDF.drop(['Complete?'], axis=1) + +#%% Create DF with important dates + +DFDates = DFComp [['Date of surgery','Date of hospital discharge', +'Date first complication at home', 'Date (first) readmission', +'Date discharge after first readmission', 'Date second readmission', +'Date discharge second readmission']] + +for i in DFDates: + DFDates[i] = pd.to_datetime(DFDates[i]).dt.date + +DFDates['LOS'] = DFDates['Date of hospital discharge'] - DFDates['Date of surgery'] #LOS = Length of stay +DFDates['TTC'] = DFDates['Date first complication at home'] - DFDates['Date of surgery'] #TTC = Time to complication +DFDates['TTR'] = DFDates['Date (first) readmission'] - DFDates['Date of surgery'] #TTR = Time to readmission +DFDates['TT2R'] = DFDates['Date second readmission'] - DFDates['Date of surgery'] #TT2R = Time to second readmission + +#%% Create 
coordinates from the dates for the plots + +AXVcoord = pd.DataFrame(columns= ['LOS', 'TTC', 'TTR', 'TT2R']) + +for rows, index in DFDates.iterrows(): + AXVcoord.loc[rows, 'LOS'] = DFDates['LOS'].loc[rows].days + AXVcoord.loc[rows, 'TTC'] = DFDates['TTC'].loc[rows].days + AXVcoord.loc[rows, 'TTR'] = DFDates['TTR'].loc[rows].days + AXVcoord.loc[rows, 'TT2R'] = DFDates['TT2R'].loc[rows].days + +AXVcomb = AXVcoord.values.tolist() +AXVArray = np.array(AXVcomb) + + +#%% Create DFs for each PA level + +NoActDF = NewDF.loc[:, :'No activity After Surgery: 90'] +LowActDF = NewDF.loc[:, 'Low activity Before Surgery: -1 ':'Low activity After Surgery: 90'] +MedActDF = NewDF.loc[:, 'Medium activity Before Surgery: -1':'Medium activity After Surgery: 90'] +HighActDF = NewDF.loc[:, 'High activity Before Surgery: -1 ':'High activity After Surgery: 90'] + +def MakeStepDF(NewDF): + StepDF = NewDF.iloc[:,321:427] + StepDF = StepDF.drop('Days Fitbit prescribed after surgery', axis=1) + StepDF = StepDF.replace(' ', '') + StepDF = StepDF.replace('N.A.', np.nan) + StepDF = StepDF.replace('N.A. ', np.nan) + StepDF = StepDF.replace('NA.', np.nan) + StepDF = StepDF.replace('n.a.', np.nan) + StepDF = StepDF.replace('N.A', np.nan) + StepDF = StepDF.replace('NaN', np.nan) + StepDF = StepDF.astype('float64') + return StepDF + +StepDF = MakeStepDF(NewStepDF) + +#%% Day -14 to surgery were in the wrong order so we have to flip the first 14 days + +def DayFlipper(DF): + ListCol = DF.columns.tolist() + ListCol[0:14] = ListCol[0:14][::-1] + DF = DF[ListCol] + return(DF) + +NoActDF = DayFlipper(NoActDF) +print(NoActDF.columns) +LowActDF = DayFlipper(LowActDF) +print(LowActDF.columns) +MedActDF = DayFlipper(MedActDF) +print(MedActDF.columns) +HighActDF = DayFlipper(HighActDF) +print(HighActDF.columns) +StepDF = DayFlipper(StepDF) +print(StepDF.columns) + +#%% +OldColumns = LowActDF.columns +NewColumns = range(-14, 91) + +LowActDF.columns = NewColumns +MedActDF.columns = NewColumns +HighActDF.columns = NewColumns +StepDF.columns = NewColumns + +# Set NaN to zeroes in order to calculate the total amount of activity +LowActDFZeroes = LowActDF.fillna(0) +MedActDFZeroes = MedActDF.fillna(0) +HighActDFZeroes = HighActDF.fillna(0) +StepDFZeroes = StepDF.fillna(0) + +TotActDF = LowActDF + MedActDF + HighActDF +TotActDFZeroes = LowActDFZeroes + MedActDFZeroes + HighActDFZeroes + +# Remove pts that reported less than threshold PA days +Threshold = 200 + +NaNCount = LowActDF.isnull().sum(axis=1) # Count days without data per patient +NaNRowDrop = (LowActDF.isnull().sum(axis=1)) < Threshold + +NoActDFClean = NoActDF[NaNRowDrop] +LowActDFClean = LowActDFZeroes[NaNRowDrop] +MedActDFClean = MedActDFZeroes[NaNRowDrop] +HighActDFClean = HighActDFZeroes[NaNRowDrop] +TotActDFClean = TotActDFZeroes[NaNRowDrop] + +#%% + +# NoActDFClean['Group'] = 'Complication' +# LowActDFClean['Group'] = 'Complication' +# MedActDFClean['Group'] = 'Complication' +# HighActDFClean['Group'] = 'Complication' +# TotActDFClean['Group'] = 'Complication' +# StepDF['Group'] = 'Complication' + +def Grouper(DF): + DF['Group'] = 'Complication' + DF['Group'] = DF['Group'].where(DFOutcome['Complications at home during monitoring ? 
'] == 'Yes', other='No Comp')
+    return DF
+
+NoActDFClean = Grouper(NoActDFClean)
+LowActDFClean = Grouper(LowActDFClean)
+MedActDFClean = Grouper(MedActDFClean)
+HighActDFClean = Grouper(HighActDFClean)
+TotActDFClean = Grouper(TotActDFClean)
+StepDF = Grouper(StepDF)
+
+# #%% Divide Comps, Non-comps and Unknown-Comps
+
+# LowActComp = LowActDFClean.loc[NewDF['Complications at Home'] == 'Yes']
+# MedActComp = MedActDFClean.loc[NewDF['Complications at Home'] == 'Yes']
+# HighActComp = HighActDFClean.loc[NewDF['Complications at Home'] == 'Yes']
+# TotActComp = TotActDFClean.loc[NewDF['Complications at Home'] == 'Yes']
+
+# LowActNoComp = LowActDFClean.loc[NewDF['Complications at Home'] == 'No']
+# MedActNoComp = MedActDFClean.loc[NewDF['Complications at Home'] == 'No']
+# HighActNoComp = HighActDFClean.loc[NewDF['Complications at Home'] == 'No']
+# TotActNoComp = TotActDFClean.loc[NewDF['Complications at Home'] == 'No']
+
+# LowActUnk = LowActDFClean.loc[(NewDF['Complications at Home'] != 'Yes') & (NewDF['Complications at Home'] != 'No')]
+# MedActUnk = MedActDFClean.loc[(NewDF['Complications at Home'] != 'Yes') & (NewDF['Complications at Home'] != 'No')]
+# HighActUnk = HighActDFClean.loc[(NewDF['Complications at Home'] != 'Yes') & (NewDF['Complications at Home'] != 'No')]
+# TotActUnk = TotActDFClean.loc[(NewDF['Complications at Home'] != 'Yes') & (NewDF['Complications at Home'] != 'No')]
+
+#%% Plot comps, non-comps and unknown patient data with event-dates
+
+colors = ['k','c','r', 'r'] # k=discharge, c=complication, r=readmissions
+
+def PAPlotter(Low, Med, High, Tot, Step, AXV):
+
+    # Strip the 'Group' label added by Grouper so only the numeric day columns are plotted
+    Low, Med, High, Tot, Step = [DF.drop('Group', axis=1, errors='ignore') for DF in (Low, Med, High, Tot, Step)]
+
+    for index, row in Tot.iterrows():
+
+        counter = index-1
+        fig, ax1 = plt.subplots(figsize=(20,8))
+
+        ax1.plot(Low.loc[index], 'b:')
+        ax1.plot(Med.loc[index], 'r:')
+        ax1.plot(High.loc[index], 'y:')
+        ax1.plot(Tot.loc[index])
+        ax1.set_ylabel('Minutes of PA')
+        ax1.set_xlabel('Days')
+        plt.ylim(0,1440)
+
+        plt.vlines(x=0, ymin=0, ymax=1440, linestyle='dashed')
+        plt.vlines(AXV[counter], ymin= 0, ymax= 1440, colors=colors, linestyle='dotted')
+
+        ax2 = ax1.twinx()
+        ax2.plot(Step.loc[index], 'k')
+        ax2.set_ylabel('Steps per day')
+
+        plt.title('PA levels comp pt' + str(index))
+        plt.ylim(0,25000)
+
+PAPlotter(LowActDFClean, MedActDFClean, HighActDFClean, TotActDFClean, StepDF, AXVcomb)
+#PAPlotter(LowActNoComp, MedActNoComp, HighActNoComp,TotActNoComp, StepDF, AXVcomb, 'No Complication')
+#PAPlotter(LowActUnk, MedActUnk, HighActUnk, TotActUnk, StepDF, AXVcomb, 'Unknown Complication')
+
+
+#%% Calculate differences between comp PA and no comp PA
+
+def PAStats(DF, group):
+    MeanTotPA = DF.mean().mean()
+    StdTotPA = DF.std().std()
+    PreMean = DF.loc[:,-14:-1].mean().mean()
+    PreStd = DF.loc[:,-14:-1].std().std()
+    Post30Mean = DF.loc[:,0:30].mean().mean()
+    Post30Std = DF.loc[:,0:30].std().std()
+    Post60Mean = DF.loc[:,0:60].mean().mean()
+    Post60Std = DF.loc[:,0:60].std().std()
+    Post90Mean = DF.loc[:,0:90].mean().mean()
+    Post90Std = DF.loc[:,0:90].std().std()
+
+    print('Stats '+ group + ':', '\n')
+    print('Total Mean min PA ='+ str(MeanTotPA),'Std=' + str(StdTotPA))
+    print('Preoperative Mean min PA =' + str(PreMean), 'Std=' + str(PreStd))
+    print('30 days Postop. Mean min PA =' + str(Post30Mean), 'Std=' + str(Post30Std))
+    print('60 days Postop. Mean min PA =' + str(Post60Mean), 'Std=' + str(Post60Std))
+    print('90 days Postop. Mean min PA =' + str(Post90Mean), 'Std=' + str(Post90Std),'\n')
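+
+#%% Rebuild the per-group total-activity frames used by the statistics below
+# PAStats, RollingAvAct and Trendliner below refer to TotActComp, TotActNoComp and TotActUnk,
+# which are only created in the commented-out cell above. This is a minimal reconstruction:
+# it assumes the 'Complications at home during monitoring ? ' column of DFOutcome carries the
+# same yes/no information as the old 'Complications at Home' column, and it drops the 'Group'
+# label added by Grouper so the frames stay purely numeric.
+OutcomeCol = DFOutcome['Complications at home during monitoring ? '].reindex(TotActDFClean.index)
+TotActComp = TotActDFClean[OutcomeCol == 'Yes'].drop('Group', axis=1)
+TotActNoComp = TotActDFClean[OutcomeCol == 'No'].drop('Group', axis=1)
+TotActUnk = TotActDFClean[~OutcomeCol.isin(['Yes', 'No'])].drop('Group', axis=1)
+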
+PAStats(TotActComp, 'complication')
+PAStats(TotActNoComp, 'no complication')
+PAStats(TotActUnk, 'unknown')
+
+
+#%% Plot histogram number of missing values
+CountDF = pd.DataFrame(NaNCount)
+CountDF['Complication'] = DFOutcome['Complications at home during monitoring ? ']
+CountDF.columns = ['Count', 'Complication']
+
+sns.displot(CountDF, x='Count', bins=[10, 20, 30, 40, 50, 60, 70, 80, 90], hue='Complication')
+sns.color_palette('colorblind')
+
+#%%
+
+def RollingAvAct(DF, windowsize):
+    AvDF = pd.DataFrame()
+
+    for index, row in DF.iterrows():
+        AvDF = AvDF.append(row.rolling(windowsize, min_periods=1).mean())
+    return(AvDF)
+
+AvTotActComp = pd.DataFrame(RollingAvAct(TotActComp, 3))
+AvTotActNoComp = pd.DataFrame(RollingAvAct(TotActNoComp, 3))
+
+
+#%%
+def Trendliner(DF, Dates, group):
+    newPASlopePre = pd.DataFrame(columns=['Slope', 'Int', 'Group'])
+    newPASlopeLOS = pd.DataFrame(columns=['Slope', 'Int', 'Group'])
+    newPASlopePost = pd.DataFrame(columns=['Slope', 'Int', 'Group'])
+
+    for index, row in DF.iterrows():
+
+        counter = index-1
+        DisDay = int(AXVArray[counter,0])
+        DisDay2 = int(DisDay+15)
+        DisDay3 = int(DisDay2-1)
+
+        # Calculate trendline pre-op
+        Xpre = DF.columns[0:15]
+        Ypre = DF.loc[index,-14:0]
+        z_pre = np.polyfit(Xpre, Ypre, 1)
+        p_pre = np.poly1d(z_pre)
+        newPASlopePre.loc[index,'Slope'] = z_pre[0]
+        newPASlopePre.loc[index,'Int'] = z_pre[1]
+        newPASlopePre.loc[index, 'Group'] = group
+
+        # Calculate trendline LOS
+        Xlos = DF.columns[14:DisDay2]
+        Ylos = DF.loc[index,0:DisDay]
+        z_los = np.polyfit(Xlos, Ylos, 1)
+        p_los = np.poly1d(z_los)
+        newPASlopeLOS.loc[index,'Slope'] = z_los[0]
+        newPASlopeLOS.loc[index,'Int'] = z_los[1]
+        newPASlopeLOS.loc[index, 'Group'] = group
+
+        # Calculate trendline post-op
+        Xpost = DF.columns[DisDay3:]
+        Ypost = DF.loc[index,DisDay:]
+        z_post = np.polyfit(Xpost, Ypost, 1)
+        p_post = np.poly1d(z_post)
+        newPASlopePost.loc[index,'Slope'] = z_post[0]
+        newPASlopePost.loc[index,'Int'] = z_post[1]
+        newPASlopePost.loc[index, 'Group'] = group
+
+        # Plot figures
+        plt.figure(figsize=(24,8))
+        plt.plot(DF.loc[index])
+        plt.plot(Xpost,p_post(Xpost),'r--')
+        plt.plot(Xpre, p_pre(Xpre), 'b--')
+        plt.plot(Xlos, p_los(Xlos), 'k--')
+        plt.vlines(x=0, ymin=0, ymax=1440, linestyle='dashed')
+        plt.vlines(Dates[counter], ymin= 0, ymax= 1440, colors=colors, linestyle='dotted')
+        plt.xlim(-14,105)
+        plt.ylim(0,1440)
+        plt.ylabel('Minutes of PA')
+        plt.xlabel('Days')
+        plt.title('Mov Avg PA levels pt' + str(index) + '_' + group)
+
+    d = {'Pre': newPASlopePre, 'LOS':newPASlopeLOS, 'Post': newPASlopePost}
+
+    return(d)
+
+TrendDictComp = Trendliner(AvTotActComp, AXVcomb, 'complication')
+TrendDictNoComp = Trendliner(AvTotActNoComp, AXVcomb, 'no complication')
+
+
+#%%
+
+# def SlopeStats(SlopeDict, group):
+#     MeanSlopePre, MeanIntPre = SlopeDict['Pre'].mean()
+#     StdSlopePre, StdIntPre = SlopeDict['Pre'].std()
+#     MeanSlopeLOS, MeanIntLOS = SlopeDict['LOS'].mean()
+#     StdSlopeLOS, StdIntLOS = SlopeDict['LOS'].std()
+#     MeanSlopePost, MeanIntPost = SlopeDict['Post'].mean()
+#     StdSlopePost, StdIntPost = SlopeDict['Post'].std()
+
+#     print('Stats '+ group + ':', '\n')
+#     print('Mean slope PA Pre-op = '+ str(MeanSlopePre),'Std= ' + str(StdSlopePre))
+#     print('Mean slope PA hospitalization = '+ str(MeanSlopeLOS),'Std= ' + str(StdSlopeLOS))
+#     print('Mean slope PA Post-op = '+ str(MeanSlopePost),'Std= ' + str(StdSlopePost))
+#     print('Mean intercept PA Pre-op = '+ str(MeanIntPre),'Std= ' + str(StdIntPre))
+#     print('Mean intercept PA hospitalization = '+ str(MeanIntLOS),'Std= ' + str(StdIntLOS))
+#     print('Mean intercept PA Post-op = '+ str(MeanIntPost),'Std= ' + str(StdIntPost), '\n')
+
+#     return(MeanSlopePre, StdSlopePre, MeanSlopeLOS, StdSlopeLOS, MeanSlopePost, StdSlopePost)
+
+# MeanSlopePreComp, StdSlopePreComp, MeanSlopeLOSComp, StdSlopLOSComp, MeanSlopePostComp, StdSlopeComp, = SlopeStats(TrendDictComp, 'complications')
+# MeanSlopePreNoComp, StdSlopePreNoComp, MeanSlopeLOSNoComp, StdSlopLOSNoComp, MeanSlopePostNoComp, StdSlopeNoComp = SlopeStats(TrendDictNoComp, 'no complications')
+
+
+#%%
+
+# SlopeIntPreComp = pd.DataFrame(TrendDictComp['Pre'])
+# SlopeIntPreComp['Period'] = 'Pre'
+# SlopeIntPreNoComp= pd.DataFrame(TrendDictNoComp['Pre'])
+# SlopeIntPreNoComp['Period'] = 'Pre'
+# SlopeIntLOSComp = pd.DataFrame(TrendDictComp['LOS'])
+# SlopeIntLOSComp['Period'] = 'LOS'
+# SlopeIntLOSNoComp= pd.DataFrame(TrendDictNoComp['LOS'])
+# SlopeIntLOSNoComp['Period'] = 'LOS'
+
+# SlopeIntPostComp = pd.DataFrame(TrendDictComp['Post'])
+# SlopeIntPostComp['Period'] = 'Post'
+# SlopeIntPostNoComp= pd.DataFrame(TrendDictNoComp['Post'])
+# SlopeIntPostNoComp['Period'] = 'Post'
+
+# Slope = pd.DataFrame()
+# Slope = Slope.append([SlopeIntPreComp, SlopeIntPreNoComp, SlopeIntLOSComp, SlopeIntLOSNoComp, SlopeIntPostComp, SlopeIntPostNoComp])
+# Slope['Slope'] = Slope['Slope'].astype('float64')
+# Slope['Int'] = Slope['Int'].astype('float64')
+
+#%%
+# plt.figure(figsize=(12,8))
+# sns.set_theme(style="darkgrid")
+# sns.violinplot(x=Slope['Period'], y=Slope['Slope'],hue=Slope['Group'], palette="muted", split=True)
+# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
+
+# plt.figure(figsize=(12,8))
+# sns.set_theme(style="darkgrid")
+# sns.violinplot(x=Slope['Period'], y=Slope['Int'],hue=Slope['Group'], palette="muted", split=True)
+# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
+
+#%%
+#fig, axes = plt.subplots(1,2, sharey=True)
+#sns.violinplot(data=newPASlopeComp['Intercept'], ax=axes[0], color='b')
+#sns.violinplot(data=newPASlopeNoComp['Intercept'], ax=axes[1], color='r')
\ No newline at end of file
diff --git a/ScatterBoxplotter.py b/ScatterBoxplotter.py
new file mode 100644
index 0000000..2d6ac9f
--- /dev/null
+++ b/ScatterBoxplotter.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri May 14 09:18:32 2021
+
+@author: Dijkhofmf
+"""
+
+# Import stuff
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn import preprocessing
+
+#%% Import data and path
+
+Path = r'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles'
+
+# Set path
+os.chdir(Path)
+
+#%% Create DF
+FinalDF = pd.DataFrame(pd.read_csv('FinalDataset.csv'))
+
+X = pd.DataFrame(FinalDF)
+
+cols = X.drop('Pt Type', axis=1)
+
+
+ID = X['Study ID']
+y = X['Pt Type']
+y = y.replace('Healthy', 'No-complication')
+X = X.drop(['Pt Type', 'Study ID'], axis=1)
+
+#%%
+X1 = pd.DataFrame(preprocessing.scale(X), columns=X.columns)
+
+X1['Pt Type'] = y
+X1 = X1.set_index(ID)
+
+#%%
+
+X1.columns = ['Age (years)', 'Gender', 'Daily alcohol use', 'Medication',
+              'ASA-classification', 'Recurrent disease?', 'Comorb',
+              'Independent, with others', 'Smokes cigarettes/cigar', 'BMI', 'GFI',
+              'HADS_A', 'HADS Depression', 'ADL', 'iADL', 'TUG', 'Handgrip strength',
+              'Avg. Steps/day', 'Avg. MVPA/day', 'Pt Type']
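+
+# Optional sanity check: how many patients fall in each group used for the swarm/box plots below
+print(X1['Pt Type'].value_counts(dropna=False))
+
+# If the figures should also be written to disk, a plt.savefig(...) call can be added at the
+# end of the plotting loop below, e.g. (illustrative filename pattern):
+#     plt.savefig('SwarmBoxplot_' + name.replace('/', '_') + '.png', bbox_inches='tight')
+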
+
+plots = X1.columns
+
+#%%
+import matplotlib.pylab as pylab
+
+params = {'legend.fontsize': 'x-large',
+          'axes.labelsize': 'x-large',
+          'axes.titlesize':'x-large',
+          'xtick.labelsize':'x-large',
+          'ytick.labelsize':'x-large'}
+
+pylab.rcParams.update(params)
+
+# Skip the first column and the categorical 'Pt Type' grouping column, which cannot be drawn
+# as a numeric y-variable
+plots = plots[1:].drop('Pt Type')
+
+for name in plots:
+    plt.figure(dpi=720)
+    sns.boxplot(x='Pt Type', y=name, data=X1, boxprops=dict(alpha=0.5))
+    sns.swarmplot(x='Pt Type', y=name, data=X1)
+    plt.title('Swarm-boxplot ' + name)
+
\ No newline at end of file