test
This commit is contained in:
commit
bb4d0014b6
306
PLS.py
Normal file
306
PLS.py
Normal file
@ -0,0 +1,306 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 19 09:23:49 2021
|
||||||
|
|
||||||
|
@author: Dijkhofmf
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Import stuff
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from scipy import stats
|
||||||
|
#from pca import pca
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
|
||||||
|
from sklearn.cross_decomposition import PLSRegression
|
||||||
|
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
|
||||||
|
|
||||||
|
from sklearn.metrics import r2_score, accuracy_score, plot_confusion_matrix, confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_fscore_support, classification_report, mean_squared_error
|
||||||
|
|
||||||
|
#%% Import data and path
|
||||||
|
|
||||||
|
Path = 'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles'
|
||||||
|
|
||||||
|
# Set path
|
||||||
|
os.chdir(Path)
|
||||||
|
|
||||||
|
#%% Create DF
|
||||||
|
FinalDF = pd.DataFrame(pd.read_csv('FinalDataset.csv'))
|
||||||
|
|
||||||
|
X = FinalDF.drop(['Pt Type'], axis=1)
|
||||||
|
X = X.drop(['Study ID'], axis=1)
|
||||||
|
|
||||||
|
# #%%
|
||||||
|
|
||||||
|
# plt.figure(dpi=720, figsize=(20,20))
|
||||||
|
# heatmap = sns.heatmap(X.corr(), vmin=-1, vmax=1, annot=True)
|
||||||
|
# plt.title('Pearson correlation heatmap')
|
||||||
|
|
||||||
|
# plt.figure(dpi=720, figsize=(20,20))
|
||||||
|
# heatmap = sns.heatmap(X.corr(method='spearman'), vmin=-1, vmax=1, annot=True)
|
||||||
|
# plt.title('Spearman correlation heatmap')
|
||||||
|
|
||||||
|
|
||||||
|
#%%
|
||||||
|
cols = pd.DataFrame(X).columns
|
||||||
|
y = FinalDF['Pt Type']
|
||||||
|
|
||||||
|
y = y.replace('Complication', int(1))
|
||||||
|
y = y.replace('Healthy', int(0))
|
||||||
|
|
||||||
|
X = X.dropna()
|
||||||
|
|
||||||
|
y = y.loc[X.index]
|
||||||
|
y = y.astype(int)
|
||||||
|
|
||||||
|
#X = X.reset_index()
|
||||||
|
#y = y.reset_index()
|
||||||
|
#y = y.drop('index', axis=1)
|
||||||
|
#X = X.drop('Study ID', axis=1)
|
||||||
|
|
||||||
|
X = preprocessing.scale(X)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
#PLS = PLSRegression(n_components=2, scale=True)
|
||||||
|
|
||||||
|
# # Or reduce the data towards 2 PCs
|
||||||
|
# model = pca(n_components=8)
|
||||||
|
# # Fit transform
|
||||||
|
# results = model.fit_transform(X, y, col_labels=cols)
|
||||||
|
# # Plot explained variance
|
||||||
|
# fig, ax = model.plot()
|
||||||
|
# # Scatter first 2 PCs
|
||||||
|
# fig, ax = model.scatter()
|
||||||
|
# # Make biplot with the number of features
|
||||||
|
# fig, ax = model.biplot(n_feat=8)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
def base_pls(X_train, y_train, X_test, n_components):
|
||||||
|
|
||||||
|
# Define PLS model and fit it to the train set
|
||||||
|
PLS = PLSRegression(n_components=n_components)
|
||||||
|
PLS.fit(X_train, y_train)
|
||||||
|
|
||||||
|
# Prediction on train set
|
||||||
|
y_c = PLS.predict(X_train)
|
||||||
|
|
||||||
|
# Prediction on test set
|
||||||
|
y_p = PLS.predict(X_test)
|
||||||
|
|
||||||
|
return(y_c, y_p)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
cval = LeaveOneOut()
|
||||||
|
n_components = range(1,19)
|
||||||
|
|
||||||
|
X = pd.DataFrame(X)
|
||||||
|
yu = pd.DataFrame(y)
|
||||||
|
|
||||||
|
rmse_c = np.zeros(len(n_components)).astype('float32')
|
||||||
|
rmse_cv = np.zeros(len(n_components)).astype('float32')
|
||||||
|
R2_cv = np.zeros(len(n_components)).astype('float32')
|
||||||
|
Q2_cv = np.zeros(len(n_components)).astype('float32')
|
||||||
|
PRESScv = np.zeros(len(n_components)).astype('float32')
|
||||||
|
RSS_cv = np.zeros(len(n_components)).astype('float32')
|
||||||
|
WoldR = np.zeros(len(n_components)).astype('float32')
|
||||||
|
|
||||||
|
for i in n_components:
|
||||||
|
|
||||||
|
rmsec = []
|
||||||
|
rmsecv = []
|
||||||
|
PRESS = []
|
||||||
|
RSS = []
|
||||||
|
R2 = []
|
||||||
|
Q2 = []
|
||||||
|
|
||||||
|
for train, test in cval.split(X):
|
||||||
|
|
||||||
|
PLS = PLSRegression(n_components=i, scale=False)
|
||||||
|
PLS.fit_transform(X.iloc[train], y.iloc[train])
|
||||||
|
|
||||||
|
Y_pred_train = PLS.predict(X.iloc[train])
|
||||||
|
Y_pred_test = PLS.predict(X.iloc[test])
|
||||||
|
|
||||||
|
rmsec.append(np.sqrt(mean_squared_error(y.iloc[train], Y_pred_train[:,0])))
|
||||||
|
rmsecv.append(np.sqrt(mean_squared_error(y.iloc[test], Y_pred_test[:,0])))
|
||||||
|
|
||||||
|
y_value = y.iloc[test]
|
||||||
|
y_value = y_value.iloc[0]
|
||||||
|
|
||||||
|
RSS.append((Y_pred_test - y_value)**2)
|
||||||
|
PRESS.append(mean_squared_error(y.iloc[test], Y_pred_test[:,0]))
|
||||||
|
R2Y = r2_score(y.iloc[train], Y_pred_train)
|
||||||
|
|
||||||
|
|
||||||
|
rmse_c[i-1] = np.mean(np.array(rmsec))
|
||||||
|
rmse_cv[i-1] = np.mean(np.array(rmsecv))
|
||||||
|
RSS_cv[i-1] = np.mean(np.array(RSS))
|
||||||
|
PRESScv[i-1] = np.mean(np.array(PRESS))
|
||||||
|
R2_cv[i-1] = np.mean(np.array(R2Y))
|
||||||
|
if i == 1:
|
||||||
|
Q2_cv[0] = 0
|
||||||
|
else:
|
||||||
|
Q2_cv[i-1] = (1 - (PRESScv[i-1]/RSS_cv[i-2]))
|
||||||
|
|
||||||
|
|
||||||
|
if i == 1:
|
||||||
|
WoldR[i-1] = 0
|
||||||
|
elif i > 1:
|
||||||
|
WoldR[i-1] = (PRESScv[i-1]/PRESScv[i-2])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
with plt.style.context(('seaborn')):
|
||||||
|
plt.plot(rmse_c, 'o-r', label = "RMSE Calibration")
|
||||||
|
plt.plot(rmse_cv, 'o-b', label = "RMSE Cross Validation")
|
||||||
|
plt.ylabel('RMSE')
|
||||||
|
plt.xlabel('Number of LV included')
|
||||||
|
plt.legend()
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
with plt.style.context(('seaborn')):
|
||||||
|
plt.plot(R2_cv, 'o-r', label = "R2_score")
|
||||||
|
plt.plot(Q2_cv, 'o-b', label = "Q2_score")
|
||||||
|
plt.ylabel('Score')
|
||||||
|
plt.xlabel('Number of LV included')
|
||||||
|
plt.legend()
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
with plt.style.context(('seaborn')):
|
||||||
|
plt.plot(WoldR, 'o-r', label = "Wold R")
|
||||||
|
plt.plot(PRESScv, 'o-b', label = "PRESS")
|
||||||
|
plt.ylabel('Score')
|
||||||
|
plt.xlabel('Number of LV included')
|
||||||
|
plt.legend()
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
#%% Select n_components based on LOOCV:
|
||||||
|
|
||||||
|
PLS = PLSRegression(n_components=6, scale=True)
|
||||||
|
|
||||||
|
|
||||||
|
PLS.fit_transform(X, y)
|
||||||
|
|
||||||
|
Y_pred = PLS.predict(X)
|
||||||
|
bin_pred = (PLS.predict(X)[:,0] > 0.5).astype('uint8')
|
||||||
|
|
||||||
|
|
||||||
|
plt.figure(dpi=720)
|
||||||
|
plt.title('Confusion Matrix PLS-DA')
|
||||||
|
cf_rf = confusion_matrix(y, bin_pred)
|
||||||
|
sns.heatmap(cf_rf, annot=True, cmap='Blues')
|
||||||
|
plt.xlabel('Predicted outcomes')
|
||||||
|
plt.ylabel('True outcomes')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
print(classification_report(y, bin_pred))
|
||||||
|
scores_PLS = precision_recall_fscore_support(y, bin_pred)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
def vip(model):
|
||||||
|
|
||||||
|
t = model.x_scores_
|
||||||
|
w = model.x_weights_
|
||||||
|
q = model.y_loadings_
|
||||||
|
p, h = w.shape
|
||||||
|
|
||||||
|
vips = np.zeros((p,))
|
||||||
|
|
||||||
|
s = np.diag(t.T @ t @ q.T @ q).reshape(h, -1)
|
||||||
|
total_s = np.sum(s)
|
||||||
|
|
||||||
|
for i in range(p):
|
||||||
|
|
||||||
|
weight = np.array([ (w[i,j] / np.linalg.norm(w[:,j]))**2 for j in range(h) ])
|
||||||
|
vips[i] = np.sqrt(p*(s.T @ weight)/total_s)
|
||||||
|
|
||||||
|
return vips
|
||||||
|
|
||||||
|
VIP = vip(PLS)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
cols = ['Age (years)', 'Gender', 'Daily alcohol use', 'Medication',
|
||||||
|
'ASA-classification', 'Recurrent disease?', 'Comorb',
|
||||||
|
'Independent, with others', 'Smokes cigarettes/sigar', 'BMI', 'GFI',
|
||||||
|
'HADS_A', 'HADS Depression', 'ADL', 'iADL', 'TUG', 'Handgrip strength',
|
||||||
|
'Avg. Steps/day', 'Avg. MVPA/day']
|
||||||
|
|
||||||
|
VIP = pd.DataFrame(VIP, columns=['VIP'])
|
||||||
|
VIP['labels'] = cols
|
||||||
|
VIP = VIP.sort_values('VIP', ascending=False)
|
||||||
|
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
BCT = VIP['VIP'].describe()[6] + (1.5*( VIP['VIP'].describe()[6]- VIP['VIP'].describe()[4]))
|
||||||
|
|
||||||
|
#%%
|
||||||
|
|
||||||
|
VIP = VIP[VIP['VIP']>1]
|
||||||
|
|
||||||
|
#%%
|
||||||
|
import matplotlib.pylab as pylab
|
||||||
|
|
||||||
|
params = {'legend.fontsize': 'x-large',
|
||||||
|
'axes.labelsize': 'x-large',
|
||||||
|
'axes.titlesize':'x-large',
|
||||||
|
'xtick.labelsize':'x-large',
|
||||||
|
'ytick.labelsize':'x-large'}
|
||||||
|
|
||||||
|
pylab.rcParams.update(params)
|
||||||
|
|
||||||
|
|
||||||
|
plt.figure(figsize=(12,7), dpi=720)
|
||||||
|
sns.set_style('whitegrid')
|
||||||
|
plt.bar(x=VIP['labels'], height=VIP['VIP'])
|
||||||
|
plt.title('VIP scores')
|
||||||
|
plt.yticks(fontsize='x-large')
|
||||||
|
plt.xticks(rotation=-90, fontsize='x-large')
|
||||||
|
|
||||||
|
#%% Compute ROC curve and ROC area for each class
|
||||||
|
|
||||||
|
fpr = dict()
|
||||||
|
tpr = dict()
|
||||||
|
roc_auc = dict()
|
||||||
|
|
||||||
|
fpr, tpr, _ = roc_curve(y, bin_pred)
|
||||||
|
roc_auc = auc(fpr, tpr)
|
||||||
|
|
||||||
|
# Compute micro-average ROC curve and ROC area
|
||||||
|
fpr_m, tpr_m, _ = roc_curve(y.ravel(), bin_pred.ravel())
|
||||||
|
roc_auc_m = auc(fpr_m, tpr_m)
|
||||||
|
|
||||||
|
plt.figure(dpi=720)
|
||||||
|
lw = 2
|
||||||
|
plt.plot(fpr, tpr, color='darkorange',
|
||||||
|
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
|
||||||
|
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
|
||||||
|
plt.xlim([0.0, 1.0])
|
||||||
|
plt.ylim([0.0, 1.05])
|
||||||
|
plt.xlabel('False Positive Rate')
|
||||||
|
plt.ylabel('True Positive Rate')
|
||||||
|
plt.title('ROC-Curve PLS-DA')
|
||||||
|
plt.legend(loc="lower right")
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
plt.figure(dpi=720)
|
||||||
|
lw = 2
|
||||||
|
plt.plot(fpr_m, tpr_m, color='darkorange',
|
||||||
|
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc_m)
|
||||||
|
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
|
||||||
|
plt.xlim([0.0, 1.0])
|
||||||
|
plt.ylim([0.0, 1.05])
|
||||||
|
plt.xlabel('False Positive Rate')
|
||||||
|
plt.ylabel('True Positive Rate')
|
||||||
|
plt.title('Receiver operating characteristic example')
|
||||||
|
plt.legend(loc="lower right")
|
||||||
|
plt.show()
|
Loading…
x
Reference in New Issue
Block a user