327 lines
9.8 KiB
Python
327 lines
9.8 KiB
Python
import os
|
|
import sys
|
|
import configparser
|
|
|
|
import pypyodbc
|
|
import numpy as np
|
|
from collections import Counter
|
|
import matplotlib.pyplot as plt
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.model_selection import cross_val_score
|
|
from sklearn import preprocessing
|
|
from sklearn.metrics import confusion_matrix
|
|
from sklearn.metrics import accuracy_score
|
|
|
|
# Absolute path of the project directory. The helper modules imported right
# after this block are looked up here, and config.ini / the result folder used
# later in the script are addressed relative to it.
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'

# Register the project directory on the module search path. currDir is the
# last component handed to os.path.join, so when it is an absolute path for
# the running platform the joined result is currDir itself.
_projectSearchPath = os.path.join(os.path.dirname(sys.path[0]), currDir)
sys.path.append(_projectSearchPath)
|
|
import dataManipulation as mani
|
|
import evaluation as eval
|
|
import speaker_based_functions as sb_func
|
|
|
|
|
|
#####################
##   USER DEFINE   ##
#####################
# Presumably the maximum sentence number per speaker; the value is not
# referenced anywhere else in this file.
sentenceNumMax = 10

# Settings file and output folder, both addressed directly under currDir
# (Windows-style separator, as everywhere else in this script).
configFile = '\\'.join((currDir, 'config.ini'))
dirOut = '\\'.join((currDir, 'result'))
|
|
|
|
# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0

# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2

# Class labels of the 3-region experiment.
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']


def two_region_labels(exp_type):
    """Return the two class labels that belong to experiment *exp_type*.

    Args:
        exp_type: 0 (3 regions), 1 (saxon vs limburg) or
            2 (groningen vs limburg).

    Returns:
        A new two-element list of label strings. Type 1 yields the Low Saxon
        pair. Types 0 and 2 yield the Groningen pair; type 0 still needs a
        pair because the 2-region encoders are fitted on regionLabels2 for
        every experiment type further down in this file.

    Raises:
        ValueError: if exp_type is not 0, 1 or 2.
    """
    if exp_type not in (0, 1, 2):
        raise ValueError("experiment type should be 0, 1 or 2.")
    if exp_type == 1:
        return ['Low_Saxon', 'Limburg']
    return ['Groningen_and_Drenthe', 'Limburg']


# Previously the Low Saxon pair was assigned under `if experiment_type == 1`
# and then unconditionally overwritten by the Groningen pair, so it was never
# in effect. The selection is now exclusive.
# NOTE(review): for experiment type 1 this changes the 2-region labels to
# 'Low_Saxon' / 'Limburg' -- confirm that the label-based steps further down
# (stored pidlist2 files, confusion-matrix labels) use the same pair.
regionLabels2 = two_region_labels(experiment_type)
|
|
|
|
|
|
##########################
##   DATA PREPARATION   ##
##########################

## load init file
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of the
# files it actually parsed. A missing config.ini is therefore reported here
# instead of surfacing as a KeyError on the section lookup below.
if not config.read(configFile):
    raise FileNotFoundError('cannot read config file: {}'.format(configFile))
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']

## database connection
# NOTE(review): presumably keeps column names in their original case -- confirm
# against the pypyodbc documentation.
pypyodbc.lowercase = False
# ODBC connection string for the Access database named in config.ini.
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
|
|
|
|
|
|
## get data from Access database
def _fetchAllRows(query):
    """Execute *query* on the module-level cursor and return every row as one numpy array."""
    cursor.execute(query)
    return np.array(cursor.fetchall())


# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
data = _fetchAllRows("{CALL dataset_with_cities}\n")
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))


## make list of LabelBinarizer object per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
pronvarList = _fetchAllRows("{CALL pronunciation_variant}\n")
|
|
|
|
|
|
def _fitWordBinarizer(wordID):
    """Fit a LabelBinarizer on the distinct pronunciations (data column 7) of one unique word ID."""
    binarizer = preprocessing.LabelBinarizer()
    binarizer.fit(np.unique(data[uniqueWordIDlist == wordID, 7]))
    return binarizer


#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)

# One binarizer for every ID from 1 up to the largest ID found in data, in
# ascending order, so LBlist[k] belongs to unique word ID k + 1.
LBlist = [
    _fitWordBinarizer(wordID)
    for wordID in range(1, max(uniqueWordIDlist) + 1)
]

# for y (=region)
# Integer encoders for the 3-region and the 2-region label sets.
LE_y = preprocessing.LabelEncoder()
LE_y2 = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2.fit(regionLabels2)

# Binarizers for the same two label sets.
LB_y = preprocessing.LabelBinarizer()
LB_y2 = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2.fit(regionLabels2)
|
|
|
|
|
|
#################
##  ITERATION  ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make balanced dataset
# Distinct (pid, region) pairs, taken from columns 1 and 2 of data.
pidlist = data[:, (1, 2)]
pidlist = np.unique(pidlist, axis=0)

# count number of samples
# The smallest number of pids found for any region; it is used below as the
# number of pids drawn per region.
sampleNumMax = min(Counter(pidlist[:, 1]).values())
|
|
|
|
|
|
## make train/eval/test set or load
if makeTrainTestSet == 1:
    # Per-region pieces, gathered first and stacked once after the loop.
    trainParts, evalParts, testParts = [], [], []
    for regionName in regionLabels:
        regionPids = pidlist[pidlist[:, 1] == regionName, :]
        # Draw sampleNumMax pids for this region; the second return value of
        # the helper is not used.
        sampledPids, sampledIdx = mani.extractRandomSample(
            regionPids, sampleNumMax)

        # split dataset into train, eval and test.
        # 20% goes to test, then 10% of the remainder goes to eval.
        trainAndEvalPart, testPart = train_test_split(
            sampledPids, test_size=0.2, random_state=0)
        trainPart, evalPart = train_test_split(
            trainAndEvalPart, test_size=0.1, random_state=0)

        trainParts.append(trainPart)
        evalParts.append(evalPart)
        testParts.append(testPart)

    # Stack the regions row-wise, in regionLabels order.
    pidlist_train = np.concatenate(trainParts)
    pidlist_eval = np.concatenate(evalParts)
    pidlist_test = np.concatenate(testParts)

    del regionName, regionPids, sampledPids, sampledIdx
    del trainAndEvalPart, trainPart, evalPart, testPart
    del trainParts, evalParts, testParts

    for splitName, splitPids in (("train", pidlist_train),
                                 ("eval", pidlist_eval),
                                 ("test", pidlist_test)):
        np.save(dirOut + "\\pidlist_" + splitName + ".npy", splitPids)
    del splitName, splitPids
else:
    # Reuse the split stored by an earlier run.
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")
|
|
|
|
|
|
## make dataset for 2 regions or load
# The sb_func converter and the stored .npy files of a 2-region experiment
# share one name; experiment type 0 has no 2-region data set.
_twoRegionTask = {1: 'saxon_vs_limburg',
                  2: 'groningen_vs_limburg'}.get(experiment_type)

if conv3to2region == 1:
    # train and eval pids are merged before the conversion.
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if _twoRegionTask is not None:
        _convert = getattr(sb_func, _twoRegionTask)
        pidlist2_train = _convert(pidlist2_train_)
        pidlist2_test = _convert(pidlist_test)
        # np.save appends the ".npy" extension that the load branch spells out.
        np.save(dirOut + "\\pidlist2_" + _twoRegionTask + "_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_" + _twoRegionTask + "_test", pidlist2_test)

    del pidlist2_train_
elif _twoRegionTask is not None:
    pidlist2_train = np.load(dirOut + "\\pidlist2_" + _twoRegionTask + "_train.npy")
    pidlist2_test = np.load(dirOut + "\\pidlist2_" + _twoRegionTask + "_test.npy")
|
|
|
|
|
|
## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train, data_eval, data_test = (
        sb_func.extractPid(pids, data)
        for pids in (pidlist_train, pidlist_eval, pidlist_test))

elif experiment_type in (1, 2):
    # Work on a copy so the region column of data itself stays untouched.
    data2 = np.array(data)

    if experiment_type == 1:
        # Rows whose region is the first or the third 3-region label are
        # relabelled with the first 2-region label.
        regionColumn = data[:, 2]
        mergeMask = ((regionColumn == regionLabels[0])
                     | (regionColumn == regionLabels[2]))
        data2[mergeMask, 2] = regionLabels2[0]
        del regionColumn, mergeMask

    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)
|
|
|
|
|
|
#####################################
##   EXPERIMENTS START FROM HERE   ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData, testData = np.r_[data_train, data_eval], data_test
    testPID = pidlist_test
    LB, LE = LB_y, LE_y
elif experiment_type in (1, 2):
    # 2 region: saxon vs limburg/ groningen vs limburg
    trainData, testData = data2_train, data2_test
    testPID = pidlist2_test
    LB, LE = LB_y2, LE_y2
    # From here on regionLabels refers to the 2-region label set.
    regionLabels = regionLabels2


# check the number of utterance
allData = np.r_[trainData, testData]
# (filename, region) pairs: columns 0 and 2.
filenames = allData[:, (0, 2)]
filenames_unique = np.unique(filenames, axis=0)
# Number of distinct filenames per region. The result is not stored or
# printed, so it is only visible when this line is evaluated interactively.
Counter(filenames_unique[:, 1])
|
|
|
|
|
|
# Result files inside dirOut. Only filePerformance is passed on below; the
# other two are referenced by commented-out code only.
fileComparison, filePerformance, fileConfusionMatrix = (
    dirOut + "\\" + csvName
    for csvName in ("algorithm_comparison.csv",
                    "sentence-level.csv",
                    "confusion_matrix.csv"))

## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train sentence-level classifiers.
_trainingResult = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)
modelList, scoreList, confusionMatrixList = _trainingResult
del _trainingResult

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(
    testData, modelList, LBlist, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(
    testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
|
|
|
|
### confusion matrix
# The database connection is closed in the finally clause, so an error while
# scoring or saving no longer leaves it open.
try:
    # Column 1 of the per-speaker result is passed in the y_true position and
    # column 2 in the y_pred position of both metrics.
    _firstLabels = pred_per_pid_majority[:, 1]
    _secondLabels = pred_per_pid_majority[:, 2]

    if experiment_type == 0:
        # Explicit row/column order of the 3-region matrix (differs from the
        # order of the 3-region label list defined at the top of the file).
        _cmLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
    else:
        # Use the 2-region label list the encoders were fitted on instead of
        # a second hard-coded copy of it.
        _cmLabels = regionLabels2
    confusionMatrix_majority = confusion_matrix(
        _firstLabels, _secondLabels, labels=_cmLabels)

    #confusionMatrix_weighted = confusion_matrix(
    #    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)


    ## output
    accuracy = accuracy_score(
        _firstLabels, _secondLabels, normalize=True, sample_weight=None)
    print('accuracy: {}%'.format(accuracy * 100))

    cm = confusionMatrix_majority
    print(cm)

    np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
    np.save(dirOut + "\\confusion_matrix.npy", cm)

    #fout = open(fileConfusionMatrix, "w")
    #fout.write('< confusion matrix for majority vote in evaluation set >\n')
    #sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
    #fout.write('< confusion matrix for weighted vote in evaluation set >\n')
    #sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
    #fout.write('\n')
    #fout.close()
finally:
    ##### iteration finish #####
    conn.close()
|
|
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
|
|
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
|
|
|