"""Sentence-based dialect identification.

For each sentence (01-10), this script builds a balanced dataset over the three
dialect regions, one-hot encodes the categorical features, trains an AdaBoost
classifier, writes cross-validation and test F1 scores to comparison.csv,
pickles the fitted model per sentence, and finally prints a confusion matrix
accumulated over all sentences.
"""
import os
import sys
import configparser
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from matplotlib import pyplot

# database
import pypyodbc

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, confusion_matrix

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util

configFile = currDir + '\\config.ini'

# load ini file
config = configparser.ConfigParser()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']

sentenceNumMax = 10
classifierList = []
LE_X_decode = []  # number of binarized columns per original feature
LE_y = preprocessing.LabelEncoder()
LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])

testset_X = []
testset_y = []
testset_userID = []
result_y_test = []
result_y_prediction = []

fout = open("comparison.csv", "w")
for sentenceNum in range(1, sentenceNumMax + 1):
    sentenceNumStr = format(sentenceNum, '02')
    fileSentence = dirFeature + '\\' + sentenceNumStr + '.csv'

    ## load combined data
    fileCSV = fileSentence
    idxRegion = 1
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
    sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))

    ## make a balanced dataset: draw the same number of samples from each region
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    XIndex = np.arange(idxRegion + 1, len(header))
    yIndex = 1       # region
    userIDindex = 0  # userID

    ## categorical values into numbers
    X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
    y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
    userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]

    #X = np.zeros((X_.shape), 'int')
    for Xindex in XIndex:
        x = X_[:, Xindex - 2]

        ## Levenshtein distance (alternative encoding, currently unused)
        #word_count = Counter(x)
        #frequent_word = max(word_count)
        #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)

        # one-hot encoding
        le_x = preprocessing.LabelBinarizer()
        le_x.fit(np.unique(x))
        x_ = le_x.transform(x)
        LE_X_decode.append(x_.shape[1])
        if Xindex == idxRegion + 1:
            X = x_
        else:
            X = np.c_[X, x_]
    y = LE_y.transform(y_)

    ## split into train vs test set
    #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size=0.2, random_state=0)
    # each region should be split equally, so split per region and concatenate
    lenG = dataG.shape[0]
    lenL = dataL.shape[0]
    lenO = dataO.shape[0]
    indexG = np.arange(0, lenG)
    indexL = np.arange(lenG, lenG + lenL)
    indexO = np.arange(lenG + lenL, lenG + lenL + lenO)
    XG_train, XG_test, yG_train, yG_test = train_test_split(X[indexG, :], y[indexG], test_size=0.2, random_state=0)
    XL_train, XL_test, yL_train, yL_test = train_test_split(X[indexL, :], y[indexL], test_size=0.2, random_state=0)
    XO_train, XO_test, yO_train, yO_test = train_test_split(X[indexO, :], y[indexO], test_size=0.2, random_state=0)
    X_train = np.r_[XG_train, XL_train, XO_train]
    X_test = np.r_[XG_test, XL_test, XO_test]
    y_train = np.r_[yG_train, yL_train, yO_train]
    y_test = np.r_[yG_test, yL_test, yO_test]

    ## classifier comparison (kept for reference)
    #names = ["Nearest Neighbors",
    #         "Linear SVM",
    #         "Poly SVM",
    #         "RBF SVM",
    #         "Decision Tree",
    #         "Random Forest 2",
    #         "Random Forest 3",
    #         "Random Forest 4",
    #         "AdaBoost",
    #         #"Naive Bayes",
    #         "Linear Discriminant Analysis",
    #         #"Quadratic Discriminant Analysis"
    #         ]
    #classifiers = [
    #    KNeighborsClassifier(3),
    #    SVC(kernel="linear", C=0.025),
    #    SVC(kernel="poly", C=0.025),
    #    SVC(gamma=2, C=1),
    #    DecisionTreeClassifier(max_depth=4),
    #    RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    #    AdaBoostClassifier(),
    #    #GaussianNB(),
    #    LinearDiscriminantAnalysis(),
    #    #QuadraticDiscriminantAnalysis()
    #    ]
    #for name, model in zip(names, classifiers):
    #    scores = cross_val_score(model, X, y, cv=10, scoring='f1_micro')
    #    fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
    #    print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))

    # quasi-optimal model
    model = AdaBoostClassifier()

    # cross validation on the training set
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
    ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)

    modelfit = model.fit(X_train, y_train)

    # F1 on the test set
    y_prediction = modelfit.predict(X_test)
    f1score = f1_score(y_test, y_prediction, average='micro')
    fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))

    ## keep test data for later evaluation
    testset_X.append(X_test)
    testset_y.append(y_test)
    testset_userID.append(userID_)
    result_y_test = result_y_test + list(y_test)
    result_y_prediction = result_y_prediction + list(y_prediction)

    # save the fitted model for this sentence
    fileClassifier = dirFeature + '\\' + sentenceNumStr + '.mdl'
    with open(fileClassifier, 'wb') as fileModel:
        pickle.dump(modelfit, fileModel)

fout.close()

### confusion matrix over all sentences
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label,
                                   labels=['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
print(confusionMatrix)

### make userID list
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
#    userid = testset_userID[sentenceNum]
#    userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)