commit bb4d0014b64da93ef0d1538b8a148fbfa1ce6792 Author: Mike Dijkhof Date: Thu Jul 1 15:20:19 2021 +0200 test diff --git a/PLS.py b/PLS.py new file mode 100644 index 0000000..4605db7 --- /dev/null +++ b/PLS.py @@ -0,0 +1,306 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed May 19 09:23:49 2021 + +@author: Dijkhofmf +""" + +# Import stuff + +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats +#from pca import pca +from sklearn import preprocessing +from sklearn.model_selection import train_test_split, KFold, LeaveOneOut +from sklearn.cross_decomposition import PLSRegression +from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score + +from sklearn.metrics import r2_score, accuracy_score, plot_confusion_matrix, confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_fscore_support, classification_report, mean_squared_error + +#%% Import data and path + +Path = 'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles' + +# Set path +os.chdir(Path) + +#%% Create DF +FinalDF = pd.DataFrame(pd.read_csv('FinalDataset.csv')) + +X = FinalDF.drop(['Pt Type'], axis=1) +X = X.drop(['Study ID'], axis=1) + +# #%% + +# plt.figure(dpi=720, figsize=(20,20)) +# heatmap = sns.heatmap(X.corr(), vmin=-1, vmax=1, annot=True) +# plt.title('Pearson correlation heatmap') + +# plt.figure(dpi=720, figsize=(20,20)) +# heatmap = sns.heatmap(X.corr(method='spearman'), vmin=-1, vmax=1, annot=True) +# plt.title('Spearman correlation heatmap') + + +#%% +cols = pd.DataFrame(X).columns +y = FinalDF['Pt Type'] + +y = y.replace('Complication', int(1)) +y = y.replace('Healthy', int(0)) + +X = X.dropna() + +y = y.loc[X.index] +y = y.astype(int) + +#X = X.reset_index() +#y = y.reset_index() +#y = y.drop('index', axis=1) +#X = X.drop('Study ID', axis=1) + +X = preprocessing.scale(X) + +#%% + +#PLS = PLSRegression(n_components=2, scale=True) + +# # Or reduce the data towards 2 PCs +# model = pca(n_components=8) +# # Fit transform +# results = model.fit_transform(X, y, col_labels=cols) +# # Plot explained variance +# fig, ax = model.plot() +# # Scatter first 2 PCs +# fig, ax = model.scatter() +# # Make biplot with the number of features +# fig, ax = model.biplot(n_feat=8) + +#%% + +def base_pls(X_train, y_train, X_test, n_components): + + # Define PLS model and fit it to the train set + PLS = PLSRegression(n_components=n_components) + PLS.fit(X_train, y_train) + + # Prediction on train set + y_c = PLS.predict(X_train) + + # Prediction on test set + y_p = PLS.predict(X_test) + + return(y_c, y_p) + +#%% + +cval = LeaveOneOut() +n_components = range(1,19) + +X = pd.DataFrame(X) +yu = pd.DataFrame(y) + +rmse_c = np.zeros(len(n_components)).astype('float32') +rmse_cv = np.zeros(len(n_components)).astype('float32') +R2_cv = np.zeros(len(n_components)).astype('float32') +Q2_cv = np.zeros(len(n_components)).astype('float32') +PRESScv = np.zeros(len(n_components)).astype('float32') +RSS_cv = np.zeros(len(n_components)).astype('float32') +WoldR = np.zeros(len(n_components)).astype('float32') + +for i in n_components: + + rmsec = [] + rmsecv = [] + PRESS = [] + RSS = [] + R2 = [] + Q2 = [] + + for train, test in cval.split(X): + + PLS = PLSRegression(n_components=i, scale=False) + PLS.fit_transform(X.iloc[train], y.iloc[train]) + + Y_pred_train = PLS.predict(X.iloc[train]) + Y_pred_test = PLS.predict(X.iloc[test]) + + rmsec.append(np.sqrt(mean_squared_error(y.iloc[train], Y_pred_train[:,0]))) + rmsecv.append(np.sqrt(mean_squared_error(y.iloc[test], Y_pred_test[:,0]))) + + y_value = y.iloc[test] + y_value = y_value.iloc[0] + + RSS.append((Y_pred_test - y_value)**2) + PRESS.append(mean_squared_error(y.iloc[test], Y_pred_test[:,0])) + R2Y = r2_score(y.iloc[train], Y_pred_train) + + + rmse_c[i-1] = np.mean(np.array(rmsec)) + rmse_cv[i-1] = np.mean(np.array(rmsecv)) + RSS_cv[i-1] = np.mean(np.array(RSS)) + PRESScv[i-1] = np.mean(np.array(PRESS)) + R2_cv[i-1] = np.mean(np.array(R2Y)) + if i == 1: + Q2_cv[0] = 0 + else: + Q2_cv[i-1] = (1 - (PRESScv[i-1]/RSS_cv[i-2])) + + + if i == 1: + WoldR[i-1] = 0 + elif i > 1: + WoldR[i-1] = (PRESScv[i-1]/PRESScv[i-2]) + + + +with plt.style.context(('seaborn')): + plt.plot(rmse_c, 'o-r', label = "RMSE Calibration") + plt.plot(rmse_cv, 'o-b', label = "RMSE Cross Validation") + plt.ylabel('RMSE') + plt.xlabel('Number of LV included') +plt.legend() +plt.show() + +with plt.style.context(('seaborn')): + plt.plot(R2_cv, 'o-r', label = "R2_score") + plt.plot(Q2_cv, 'o-b', label = "Q2_score") + plt.ylabel('Score') + plt.xlabel('Number of LV included') +plt.legend() +plt.show() + +with plt.style.context(('seaborn')): + plt.plot(WoldR, 'o-r', label = "Wold R") + plt.plot(PRESScv, 'o-b', label = "PRESS") + plt.ylabel('Score') + plt.xlabel('Number of LV included') +plt.legend() +plt.show() + +#%% Select n_components based on LOOCV: + +PLS = PLSRegression(n_components=6, scale=True) + + +PLS.fit_transform(X, y) + +Y_pred = PLS.predict(X) +bin_pred = (PLS.predict(X)[:,0] > 0.5).astype('uint8') + + +plt.figure(dpi=720) +plt.title('Confusion Matrix PLS-DA') +cf_rf = confusion_matrix(y, bin_pred) +sns.heatmap(cf_rf, annot=True, cmap='Blues') +plt.xlabel('Predicted outcomes') +plt.ylabel('True outcomes') +plt.show() + +print(classification_report(y, bin_pred)) +scores_PLS = precision_recall_fscore_support(y, bin_pred) + +#%% + +def vip(model): + + t = model.x_scores_ + w = model.x_weights_ + q = model.y_loadings_ + p, h = w.shape + + vips = np.zeros((p,)) + + s = np.diag(t.T @ t @ q.T @ q).reshape(h, -1) + total_s = np.sum(s) + + for i in range(p): + + weight = np.array([ (w[i,j] / np.linalg.norm(w[:,j]))**2 for j in range(h) ]) + vips[i] = np.sqrt(p*(s.T @ weight)/total_s) + + return vips + +VIP = vip(PLS) + +#%% + +cols = ['Age (years)', 'Gender', 'Daily alcohol use', 'Medication', + 'ASA-classification', 'Recurrent disease?', 'Comorb', + 'Independent, with others', 'Smokes cigarettes/sigar', 'BMI', 'GFI', + 'HADS_A', 'HADS Depression', 'ADL', 'iADL', 'TUG', 'Handgrip strength', + 'Avg. Steps/day', 'Avg. MVPA/day'] + +VIP = pd.DataFrame(VIP, columns=['VIP']) +VIP['labels'] = cols +VIP = VIP.sort_values('VIP', ascending=False) + + +#%% + +BCT = VIP['VIP'].describe()[6] + (1.5*( VIP['VIP'].describe()[6]- VIP['VIP'].describe()[4])) + +#%% + +VIP = VIP[VIP['VIP']>1] + +#%% +import matplotlib.pylab as pylab + +params = {'legend.fontsize': 'x-large', + 'axes.labelsize': 'x-large', + 'axes.titlesize':'x-large', + 'xtick.labelsize':'x-large', + 'ytick.labelsize':'x-large'} + +pylab.rcParams.update(params) + + +plt.figure(figsize=(12,7), dpi=720) +sns.set_style('whitegrid') +plt.bar(x=VIP['labels'], height=VIP['VIP']) +plt.title('VIP scores') +plt.yticks(fontsize='x-large') +plt.xticks(rotation=-90, fontsize='x-large') + +#%% Compute ROC curve and ROC area for each class + +fpr = dict() +tpr = dict() +roc_auc = dict() + +fpr, tpr, _ = roc_curve(y, bin_pred) +roc_auc = auc(fpr, tpr) + +# Compute micro-average ROC curve and ROC area +fpr_m, tpr_m, _ = roc_curve(y.ravel(), bin_pred.ravel()) +roc_auc_m = auc(fpr_m, tpr_m) + +plt.figure(dpi=720) +lw = 2 +plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) +plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC-Curve PLS-DA') +plt.legend(loc="lower right") +plt.show() + +plt.figure(dpi=720) +lw = 2 +plt.plot(fpr_m, tpr_m, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc_m) +plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('Receiver operating characteristic example') +plt.legend(loc="lower right") +plt.show()