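"""Sentence-based dialect identification.

For each of 10 sentences, build a balanced dataset over three Dutch dialect
regions (Groningen_and_Drenthe, Limburg, Oost_Overijsel-Gelderland), one-hot
encode the word features, evaluate an AdaBoost classifier with 10-fold
cross-validation, write the scores to comparison.csv, pickle each fitted
model, and finally print a confusion matrix over the pooled test sets.
"""
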
import os
import sys
import configparser
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing

# database
import pypyodbc

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(currDir)  # make the project-local modules importable

from dataIO import readFile
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util

# load the config file and read the feature directory
configFile = currDir + '\\config.ini'
config = configparser.ConfigParser()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
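
# The config file is assumed to look roughly like this; only the `dirFeature`
# key of the `sentence_based` section is read here:
#
#   [sentence_based]
#   dirFeature = C:\path\to\feature\csvs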

sentenceNumMax = 10
classifierList = []
LE_X_decode = []  # number of one-hot columns produced per original feature

# fixed label encoding for the three dialect regions
LE_y = preprocessing.LabelEncoder()
LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])

# accumulators for the per-sentence test sets and predictions
testset_X = []
testset_y = []
testset_userID = []
result_y_test = []
result_y_prediction = []

fout = open("comparison.csv", "w")
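
# For each sentence: build a balanced dataset over the three regions, one-hot
# encode the word features, cross-validate the classifier, and pickle the
# fitted model.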
for sentenceNum in range(1, sentenceNumMax + 1):
    sentenceNumStr = format(sentenceNum, '02')  # zero-padded: '01', ..., '10'
    fileSentence = os.path.join(dirFeature, sentenceNumStr + '.csv')

    ## load combined data
    fileCSV = fileSentence
    idxRegion = 1  # column index of the region label in the CSV
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
    sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))
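
    # Assumed row layout, inferred from the index constants below: column 0
    # holds the userID, column 1 the region label, and columns 2 onwards the
    # word features for this sentence.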

    ## make a balanced dataset: the same number of samples per region
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    XIndex = np.arange(idxRegion + 1, len(header))  # feature columns
    yIndex = 1       # region
    userIDindex = 0  # userID

    ## turn categorical values into numbers
    X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
    y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
    userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]

    #X = np.zeros(X_.shape, 'int')  # preallocation for the Levenshtein variant below
    for Xindex in XIndex:
        x = X_[:, Xindex-2]

        ## alternative encoding: Levenshtein distance to the most frequent word
        #word_count = Counter(x)
        #frequent_word = max(word_count)
        #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)

        # one-hot encoding
        le_x = preprocessing.LabelBinarizer()
        le_x.fit(np.unique(x))
        x_ = le_x.transform(x)
        LE_X_decode.append(x_.shape[1])
        if Xindex == idxRegion+1:
            X = x_
        else:
            X = np.c_[X, x_]

    y = LE_y.transform(y_)
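
    # Note: LabelBinarizer yields one 0/1 column per value when a feature has
    # three or more distinct values, but a single column for binary features:
    #   preprocessing.LabelBinarizer().fit_transform(['a', 'b', 'a'])
    #   # -> array([[0], [1], [0]])
    # LE_X_decode records the per-feature column counts, presumably so the
    # concatenated matrix can be decoded later.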

    ## split into train and test sets
    #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size=0.2, random_state=0)

    # each region's data should be split in equal proportion
    lenG = dataG.shape[0]
    lenL = dataL.shape[0]
    lenO = dataO.shape[0]
    indexG = np.arange(0, lenG)
    indexL = np.arange(lenG, lenG+lenL)
    indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
    XG_train, XG_test, yG_train, yG_test = train_test_split(X[indexG, :], y[indexG], test_size=0.2, random_state=0)
    XL_train, XL_test, yL_train, yL_test = train_test_split(X[indexL, :], y[indexL], test_size=0.2, random_state=0)
    XO_train, XO_test, yO_train, yO_test = train_test_split(X[indexO, :], y[indexO], test_size=0.2, random_state=0)
    X_train = np.r_[XG_train, XL_train, XO_train]
    X_test = np.r_[XG_test, XL_test, XO_test]
    y_train = np.r_[yG_train, yL_train, yO_train]
    y_test = np.r_[yG_test, yL_test, yO_test]
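
    # The same per-region balance could also be obtained in a single call with
    # the stratify option:
    #   train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)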

    ## classifier comparison (kept for reference)
    #names = ["Nearest Neighbors",
    #         "Linear SVM",
    #         "Poly SVM",
    #         "RBF SVM",
    #         "Decision Tree",
    #         "Random Forest 2",
    #         "Random Forest 3",
    #         "Random Forest 4",
    #         "AdaBoost",
    #         #"Naive Bayes",
    #         "Linear Discriminant Analysis",
    #         #"Quadratic Discriminant Analysis"
    #         ]
    #classifiers = [
    #    KNeighborsClassifier(3),
    #    SVC(kernel="linear", C=0.025),
    #    SVC(kernel="poly", C=0.025),
    #    SVC(gamma=2, C=1),
    #    DecisionTreeClassifier(max_depth=4),
    #    RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    #    AdaBoostClassifier(),
    #    #GaussianNB(),
    #    LinearDiscriminantAnalysis(),
    #    #QuadraticDiscriminantAnalysis()
    #    ]
    #for name, model in zip(names, classifiers):
    #    scores = cross_val_score(model, X, y, cv=10, scoring='f1_micro')
    #    fout = open("comparison.csv", "a")
    #    fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
    #    print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))

    # quasi-optimal model found in the comparison above
    model = AdaBoostClassifier()

    # 10-fold cross-validation on the training set
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
    ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
    modelfit = model.fit(X_train, y_train)

    # f1 on the held-out test data
    y_prediction = modelfit.predict(X_test)
    f1score = f1_score(y_test, y_prediction, average='micro')

    # one row per sentence: number, CV mean with 95% CI bounds, test f1
    fout.write("{0},{1},{2},{3},{4}\n".format(sentenceNum, ci_mean, ci_low, ci_high, f1score))
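
    # `util.mean_confidence_interval` is project-local; it is assumed to return
    # the mean plus a t-based confidence interval, roughly like this sketch:
    #   def mean_confidence_interval(data, confidence=0.95):
    #       a = np.asarray(data, dtype=float)
    #       m, se = a.mean(), scipy.stats.sem(a)
    #       h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, len(a) - 1)
    #       return m, m - h, m + h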

    ## keep this sentence's test set and predictions for the pooled evaluation
    testset_X.append(X_test)
    testset_y.append(y_test)
    testset_userID.append(userID_)
    result_y_test = result_y_test + list(y_test)
    result_y_prediction = result_y_prediction + list(y_prediction)

    # save the fitted model next to the feature file
    fileClassifier = os.path.join(dirFeature, sentenceNumStr + '.mdl')
    pickle.dump(modelfit, open(fileClassifier, 'wb'))

fout.close()
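
# A pickled model can later be restored for prediction, e.g.:
#   modelfit = pickle.load(open(fileClassifier, 'rb'))
#   y_prediction = modelfit.predict(X_test)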


### confusion matrix over the pooled test sets of all sentences
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label,
                                   labels=['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
print(confusionMatrix)


### make userID list (kept for reference)
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
#    userid = testset_userID[sentenceNum]
#    userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)