accent_classification/accent_classification/sentence_based.py

import os
import sys
import configparser
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from collections import Counter  # used by the commented-out Levenshtein encoding below
# classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pickle
# project-local modules live under currDir
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(currDir)  # currDir is absolute, so joining it onto another path is a no-op
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util

# load settings from the init file
configFile = os.path.join(currDir, 'config.ini')
config = configparser.ConfigParser()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
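# config.ini is expected to contain at least the following (assumed layout,
# reconstructed from the keys read above; the path is a placeholder):
#   [sentence_based]
#   dirFeature = C:\path\to\per-sentence\feature\csvs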
sentenceNumMax = 10
classifierList = []
LE_X_decode = []  # number of one-hot columns each encoded feature expands to
LE_y = preprocessing.LabelEncoder()
LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])
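# LabelEncoder sorts its classes, so the mapping is fixed:
# Groningen_and_Drenthe -> 0, Limburg -> 1, Oost_Overijsel-Gelderland -> 2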
# accumulated over all sentences
testset_X = []
testset_y = []
testset_userID = []
result_y_test = []
result_y_prediction = []
fout = open("comparison.csv", "w")
for sentenceNum in range(1, sentenceNumMax+1):
    sentenceNumStr = format(sentenceNum, '02')  # zero-padded: 1 -> '01', 10 -> '10'
    fileSentence = os.path.join(dirFeature, sentenceNumStr + '.csv')
    ## load combined data and group samples by region
    idxRegion = 1  # column holding the region label
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileSentence, idxRegion)
    sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))
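    # dataManipulation.extractRandomSample is project-local; from its use here
    # it is assumed to draw sampleNumMax rows at random without replacement and
    # return (sampled rows, their indices). Capping every region at the size of
    # the smallest one keeps the three classes balanced.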
    ## make a balanced dataset
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
    XIndex = np.arange(idxRegion+1, len(header))  # feature columns start right after the region column
    yIndex = 1       # region
    userIDindex = 0  # userID
    ## stack the three regions; categorical values are encoded as numbers below
    X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
    y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
    userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]
    #X = np.zeros((X_.shape), 'int')
    for Xindex in XIndex:
        x = X_[:, Xindex-2]  # X_ columns start at header index idxRegion+1, hence the offset
        ## Levenshtein distance (alternative encoding, kept for reference)
        #word_count = Counter(x)
        #frequent_word = max(word_count)
        #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)
        # one-hot encoding
        le_x = preprocessing.LabelBinarizer()
        le_x.fit(np.unique(x))
        x_ = le_x.transform(x)
        LE_X_decode.append(x_.shape[1])  # remember how many columns this feature expands to
        if Xindex == idxRegion+1:
            X = x_
        else:
            X = np.c_[X, x_]
    y = LE_y.transform(y_)
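    # Note on the encoding loop above: with three or more observed values,
    # LabelBinarizer yields one 0/1 column per value (e.g. 'a' -> [1,0,0],
    # 'b' -> [0,1,0], 'c' -> [0,0,1]); with exactly two values it yields a
    # single 0/1 column, which is what LE_X_decode records via x_.shape[1].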
    ## split into train and test sets
    #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)
    # each region's data should be split equally, so split per region and recombine
    lenG = dataG.shape[0]
    lenL = dataL.shape[0]
    lenO = dataO.shape[0]
    indexG = np.arange(0, lenG)
    indexL = np.arange(lenG, lenG+lenL)
    indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
    [XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size=0.2, random_state=0)
    [XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size=0.2, random_state=0)
    [XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size=0.2, random_state=0)
    X_train = np.r_[XG_train, XL_train, XO_train]
    X_test = np.r_[XG_test, XL_test, XO_test]
    y_train = np.r_[yG_train, yL_train, yO_train]
    y_test = np.r_[yG_test, yL_test, yO_test]
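    # Splitting each region separately keeps train and test sets balanced across
    # the three classes; train_test_split(X, y, test_size=0.2, random_state=0,
    # stratify=y) would achieve the same effect in a single call.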
    ## comparison of candidate classifiers (kept for reference)
    #names = ["Nearest Neighbors",
    #         "Linear SVM",
    #         "Poly SVM",
    #         "RBF SVM",
    #         "Decision Tree",
    #         "Random Forest 2",
    #         "Random Forest 3",
    #         "Random Forest 4",
    #         "AdaBoost",
    #         #"Naive Bayes",
    #         "Linear Discriminant Analysis",
    #         #"Quadratic Discriminant Analysis"
    #         ]
    #classifiers = [
    #    KNeighborsClassifier(3),
    #    SVC(kernel="linear", C=0.025),
    #    SVC(kernel="poly", C=0.025),
    #    SVC(gamma=2, C=1),
    #    DecisionTreeClassifier(max_depth=4),
    #    RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    #    AdaBoostClassifier(),
    #    #GaussianNB(),
    #    LinearDiscriminantAnalysis(),
    #    #QuadraticDiscriminantAnalysis()
    #    ]
    #for name, model in zip(names, classifiers):
    #    scores = cross_val_score(model, X, y, cv=10, scoring='f1_micro')
    #    fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
    #    print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))
    # quasi-optimal model chosen from the comparison above
    model = AdaBoostClassifier()
    # 10-fold cross validation on the training set
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
    ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
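    # util.mean_confidence_interval is project-local; from its use here it is
    # assumed to return (mean, lower bound, upper bound) of the 95% confidence
    # interval over the ten fold scores.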
    modelfit = model.fit(X_train, y_train)
    # f1 on the held-out test data
    y_prediction = modelfit.predict(X_test)
    f1score = f1_score(y_test, y_prediction, average='micro')
    fout.write("{0},{1},{2},{3},{4}\n".format(sentenceNum, ci_mean, ci_low, ci_high, f1score))  # one row per sentence
    ## keep the test data for evaluation across sentences
    testset_X.append(X_test)
    testset_y.append(y_test)
    testset_userID.append(userID_)
    result_y_test = result_y_test + list(y_test)
    result_y_prediction = result_y_prediction + list(y_prediction)
    # save the fitted model for this sentence
    fileClassifier = os.path.join(dirFeature, sentenceNumStr + '.mdl')
    with open(fileClassifier, 'wb') as fileModel:
        pickle.dump(modelfit, fileModel)
fout.close()
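# A saved per-sentence model can be reloaded later; a minimal sketch, assuming
# the new samples are one-hot encoded the same way as during training:
#   with open(os.path.join(dirFeature, '01.mdl'), 'rb') as fileModel:
#       modelfit = pickle.load(fileModel)
#   y_prediction = modelfit.predict(X_encoded)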
### confusion matrix over the test predictions of all sentences
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label,
                                   labels=['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
print(confusionMatrix)
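# rows are true regions, columns are predicted regions, in the label order above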
### make userID list
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
#    userid = testset_userID[sentenceNum]
#    userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)