# -*- coding: utf-8 -*- """ Created on Wed May 19 09:23:49 2021 @author: Dijkhofmf """ # Import stuff import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats #from pca import pca from sklearn import preprocessing from sklearn.model_selection import train_test_split, KFold, LeaveOneOut from sklearn.cross_decomposition import PLSRegression from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score from sklearn.metrics import r2_score, accuracy_score, plot_confusion_matrix, confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_fscore_support, classification_report, mean_squared_error #%% Import data and path Path = 'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles' # Set path os.chdir(Path) #%% Create DF FinalDF = pd.DataFrame(pd.read_csv('FinalDataset.csv')) X = FinalDF.drop(['Pt Type'], axis=1) X = X.drop(['Study ID'], axis=1) # #%% # plt.figure(dpi=720, figsize=(20,20)) # heatmap = sns.heatmap(X.corr(), vmin=-1, vmax=1, annot=True) # plt.title('Pearson correlation heatmap') # plt.figure(dpi=720, figsize=(20,20)) # heatmap = sns.heatmap(X.corr(method='spearman'), vmin=-1, vmax=1, annot=True) # plt.title('Spearman correlation heatmap') #%% cols = pd.DataFrame(X).columns y = FinalDF['Pt Type'] y = y.replace('Complication', int(1)) y = y.replace('Healthy', int(0)) X = X.dropna() y = y.loc[X.index] y = y.astype(int) #X = X.reset_index() #y = y.reset_index() #y = y.drop('index', axis=1) #X = X.drop('Study ID', axis=1) X = preprocessing.scale(X) #%% #PLS = PLSRegression(n_components=2, scale=True) # # Or reduce the data towards 2 PCs # model = pca(n_components=8) # # Fit transform # results = model.fit_transform(X, y, col_labels=cols) # # Plot explained variance # fig, ax = model.plot() # # Scatter first 2 PCs # fig, ax = model.scatter() # # Make biplot with the number of features # fig, ax = model.biplot(n_feat=8) #%% def base_pls(X_train, y_train, X_test, n_components): # Define PLS model and fit it to the train set PLS = PLSRegression(n_components=n_components) PLS.fit(X_train, y_train) # Prediction on train set y_c = PLS.predict(X_train) # Prediction on test set y_p = PLS.predict(X_test) return(y_c, y_p) #%% cval = LeaveOneOut() n_components = range(1,19) X = pd.DataFrame(X) yu = pd.DataFrame(y) rmse_c = np.zeros(len(n_components)).astype('float32') rmse_cv = np.zeros(len(n_components)).astype('float32') R2_cv = np.zeros(len(n_components)).astype('float32') Q2_cv = np.zeros(len(n_components)).astype('float32') PRESScv = np.zeros(len(n_components)).astype('float32') RSS_cv = np.zeros(len(n_components)).astype('float32') WoldR = np.zeros(len(n_components)).astype('float32') for i in n_components: rmsec = [] rmsecv = [] PRESS = [] RSS = [] R2 = [] Q2 = [] for train, test in cval.split(X): PLS = PLSRegression(n_components=i, scale=False) PLS.fit_transform(X.iloc[train], y.iloc[train]) Y_pred_train = PLS.predict(X.iloc[train]) Y_pred_test = PLS.predict(X.iloc[test]) rmsec.append(np.sqrt(mean_squared_error(y.iloc[train], Y_pred_train[:,0]))) rmsecv.append(np.sqrt(mean_squared_error(y.iloc[test], Y_pred_test[:,0]))) y_value = y.iloc[test] y_value = y_value.iloc[0] RSS.append((Y_pred_test - y_value)**2) PRESS.append(mean_squared_error(y.iloc[test], Y_pred_test[:,0])) R2Y = r2_score(y.iloc[train], Y_pred_train) rmse_c[i-1] = np.mean(np.array(rmsec)) rmse_cv[i-1] = np.mean(np.array(rmsecv)) RSS_cv[i-1] = np.mean(np.array(RSS)) PRESScv[i-1] = np.mean(np.array(PRESS)) R2_cv[i-1] = np.mean(np.array(R2Y)) if i == 1: Q2_cv[0] = 0 else: Q2_cv[i-1] = (1 - (PRESScv[i-1]/RSS_cv[i-2])) if i == 1: WoldR[i-1] = 0 elif i > 1: WoldR[i-1] = (PRESScv[i-1]/PRESScv[i-2]) with plt.style.context(('seaborn')): plt.plot(rmse_c, 'o-r', label = "RMSE Calibration") plt.plot(rmse_cv, 'o-b', label = "RMSE Cross Validation") plt.ylabel('RMSE') plt.xlabel('Number of LV included') plt.legend() plt.show() with plt.style.context(('seaborn')): plt.plot(R2_cv, 'o-r', label = "R2_score") plt.plot(Q2_cv, 'o-b', label = "Q2_score") plt.ylabel('Score') plt.xlabel('Number of LV included') plt.legend() plt.show() with plt.style.context(('seaborn')): plt.plot(WoldR, 'o-r', label = "Wold R") plt.plot(PRESScv, 'o-b', label = "PRESS") plt.ylabel('Score') plt.xlabel('Number of LV included') plt.legend() plt.show() #%% Select n_components based on LOOCV: PLS = PLSRegression(n_components=6, scale=True) PLS.fit_transform(X, y) Y_pred = PLS.predict(X) bin_pred = (PLS.predict(X)[:,0] > 0.5).astype('uint8') plt.figure(dpi=720) plt.title('Confusion Matrix PLS-DA') cf_rf = confusion_matrix(y, bin_pred) sns.heatmap(cf_rf, annot=True, cmap='Blues') plt.xlabel('Predicted outcomes') plt.ylabel('True outcomes') plt.show() print(classification_report(y, bin_pred)) scores_PLS = precision_recall_fscore_support(y, bin_pred) #%% def vip(model): t = model.x_scores_ w = model.x_weights_ q = model.y_loadings_ p, h = w.shape vips = np.zeros((p,)) s = np.diag(t.T @ t @ q.T @ q).reshape(h, -1) total_s = np.sum(s) for i in range(p): weight = np.array([ (w[i,j] / np.linalg.norm(w[:,j]))**2 for j in range(h) ]) vips[i] = np.sqrt(p*(s.T @ weight)/total_s) return vips VIP = vip(PLS) #%% cols = ['Age (years)', 'Gender', 'Daily alcohol use', 'Medication', 'ASA-classification', 'Recurrent disease?', 'Comorb', 'Independent, with others', 'Smokes cigarettes/sigar', 'BMI', 'GFI', 'HADS_A', 'HADS Depression', 'ADL', 'iADL', 'TUG', 'Handgrip strength', 'Avg. Steps/day', 'Avg. MVPA/day'] VIP = pd.DataFrame(VIP, columns=['VIP']) VIP['labels'] = cols VIP = VIP.sort_values('VIP', ascending=False) #%% BCT = VIP['VIP'].describe()[6] + (1.5*( VIP['VIP'].describe()[6]- VIP['VIP'].describe()[4])) #%% VIP = VIP[VIP['VIP']>1] #%% import matplotlib.pylab as pylab params = {'legend.fontsize': 'x-large', 'axes.labelsize': 'x-large', 'axes.titlesize':'x-large', 'xtick.labelsize':'x-large', 'ytick.labelsize':'x-large'} pylab.rcParams.update(params) plt.figure(figsize=(12,7), dpi=720) sns.set_style('whitegrid') plt.bar(x=VIP['labels'], height=VIP['VIP']) plt.title('VIP scores') plt.yticks(fontsize='x-large') plt.xticks(rotation=-90, fontsize='x-large') #%% Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() fpr, tpr, _ = roc_curve(y, bin_pred) roc_auc = auc(fpr, tpr) # Compute micro-average ROC curve and ROC area fpr_m, tpr_m, _ = roc_curve(y.ravel(), bin_pred.ravel()) roc_auc_m = auc(fpr_m, tpr_m) plt.figure(dpi=720) lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC-Curve PLS-DA') plt.legend(loc="lower right") plt.show() plt.figure(dpi=720) lw = 2 plt.plot(fpr_m, tpr_m, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc_m) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show()