267 lines
7.7 KiB
Python
267 lines
7.7 KiB
Python
import os
|
|
import sys
|
|
import configparser
|
|
|
|
import pypyodbc
|
|
import numpy as np
|
|
from collections import Counter
|
|
import matplotlib.pyplot as plt
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.model_selection import cross_val_score
|
|
from sklearn import preprocessing
|
|
from sklearn.metrics import confusion_matrix
|
|
from sklearn.metrics import accuracy_score
|
|
|
|
# Absolute location of the repository checkout and of the package folder
# inside it (Windows-style paths).
repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'

# The project-local modules imported below live in curr_dir, so it has to be
# on the module search path before those imports run.
sys.path.append(
    os.path.join(os.path.dirname(sys.path[0]), curr_dir))

import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
|
|
|
|
|
|
## ======================= user define =======================
# NOTE(review): sentence_num_max is not referenced anywhere else in the
# visible part of this script -- confirm whether it is still needed.
sentence_num_max = 10

# where results are written / intermediate splits are stored.
output_dir = repo_dir + '\\output'

# ini file holding the location of the Access database.
config_file = curr_dir + '\\config.ini'
|
|
|
|
# 1: build the train/eval/test split from scratch and save it.
# 0: load the previously saved split.
make_train_test_set = 0

# which experiment to perform.
# - 3: groningen vs oost_overijssel vs limburg
# - 2: groningen vs limburg
experiment_type = 2

# region labels of the 3-class experiment; the 2-class experiment keeps only
# the first (Groningen) and the last (Limburg) of them.
region_labels3 = [
    'Groningen_and_Drenthe',
    'Oost_Overijsel-Gelderland',
    'Limburg',
]
region_labels2 = [region_labels3[0], region_labels3[-1]]
|
|
|
|
|
|
## ======================= data preparation =======================

## load variables from the ini file
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed.  Check that list so a wrong path is reported
# here, instead of surfacing later as a KeyError on the section lookup.
if not config.read(config_file):
    raise FileNotFoundError(
        'cannot read config file: {}'.format(config_file))

# path of the Access database (.mdb) that holds the dataset.
MDB_file = config['sentence_based']['fileMDB']
|
|
|
|
|
|
## connect to the database
# NOTE(review): presumably stops pypyodbc from lower-casing column names --
# confirm against the pypyodbc documentation.
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)

# Everything that uses the connection sits inside try/finally so that the
# connection is closed even when a query or the array conversion fails.
try:
    cursor = conn.cursor()

    ## get data from Access database
    # data format
    # 0: filename
    # 1: pid
    # 2: region
    # 3: ID (unique word_id)
    # 4: sentence_id
    # 5: word_id
    # 6: word
    # 7: pronunciation
    SQL_string = "{CALL dataset_with_cities}\n"
    cursor.execute(SQL_string)
    rows = cursor.fetchall()
    data = np.array(rows)
    del SQL_string, rows

    ## get the list of pronunciation variant (pronvarList) from Access database
    # pronvarList format
    # 0: ID (unique word_id)
    # 1: word
    # 2: pronvar
    SQL_string = "{CALL pronunciation_variant}\n"
    cursor.execute(SQL_string)
    rows = cursor.fetchall()
    pronvarList = np.array(rows)
    del SQL_string, rows
finally:
    conn.close()
|
|
|
|
|
|
## make list of LabelBinarizer object per word for X (=pronunciation variant).
# Column 3 of data is the unique word ID and column 7 the pronunciation.
# LB_list[k] is the binarizer fitted on the pronunciations observed for
# word ID k + 1 (IDs are walked from 1 up to the largest ID in the data).
unique_wordID_list = data[:, 3].astype(int)
LB_list = [
    preprocessing.LabelBinarizer().fit(
        np.unique(data[unique_wordID_list == word_id, 7]))
    for word_id in range(1, max(unique_wordID_list) + 1)
]

## make LabelEncoder/LabelBinarizer objects for y (=region).
# fit() returns the fitted estimator itself, so creation and fitting can be
# chained.
LE_y3 = preprocessing.LabelEncoder().fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder().fit(region_labels2)

LB_y3 = preprocessing.LabelBinarizer().fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer().fit(region_labels2)
|
|
|
|
|
|
|
|
## ======================= make train/eval/test set or load =======================

## find the smallest group to balance the number of samples per group.
# One row per distinct (pid, region) pair: column 0 = pid, column 1 = region.
pidlist3 = np.unique(data[:, [1, 2]], axis=0)

# Number of speakers in the least represented region; every region is later
# reduced to this many speakers.
sample_num_max = min(Counter(pidlist3[:, 1]).values())
|
|
|
|
|
|
## make train/eval/test set or load them.
if make_train_test_set == 1:
    # per-region pieces, concatenated once all regions have been processed.
    train_parts = []
    eval_parts = []
    test_parts = []

    for region_name in region_labels3:
        # speakers of this region only.
        region_rows = pidlist3[pidlist3[:, 1] == region_name, :]

        # presumably draws sample_num_max rows at random so that all regions
        # end up equally sized -- see data_manipulation.extractRandomSample.
        # The second return value is not used here.
        region_sample, sample_idx = mani.extractRandomSample(
            region_rows, sample_num_max)

        # split dataset into train, eval and test:
        # 20% of the speakers go to test, then 10% of the rest to eval.
        rest_part, test_part = train_test_split(
            region_sample, test_size=0.2, random_state=0)
        train_part, eval_part = train_test_split(
            rest_part, test_size=0.1, random_state=0)

        train_parts.append(train_part)
        eval_parts.append(eval_part)
        test_parts.append(test_part)

    # stack the regions on top of each other (row-wise).
    pidlist3_train = np.concatenate(train_parts)
    pidlist3_eval = np.concatenate(eval_parts)
    pidlist3_test = np.concatenate(test_parts)

    del region_name, region_rows, region_sample, sample_idx
    del rest_part, train_part, eval_part, test_part
    del train_parts, eval_parts, test_parts

    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)

    if experiment_type == 2:
        # the 2-class experiment has no separate eval set: train and eval
        # speakers are merged before being handed to groningen_vs_limburg.
        pidlist2_train = sb_func.groningen_vs_limburg(
            np.r_[pidlist3_train, pidlist3_eval])
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)

        # np.save appends ".npy" itself when the name has no extension.
        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
        np.save(output_dir + "\\pidlist2_test", pidlist2_test)
else:
    # reuse the split stored by an earlier run.
    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")

    if experiment_type == 2:
        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")
|
|
|
|
|
|
## extract corresponding data using pid
# presumably extractPid returns the rows of the data array that belong to the
# speakers in the given pid list -- see speaker_based_functions.extractPid.
data3_train, data3_eval, data3_test = (
    sb_func.extractPid(pid_rows, data)
    for pid_rows in (pidlist3_train, pidlist3_eval, pidlist3_test)
)

if experiment_type == 2:
    # the 2-class experiment works on its own copy of the data.
    data2 = np.array(data)
    data2_train, data2_test = (
        sb_func.extractPid(pid_rows, data2)
        for pid_rows in (pidlist2_train, pidlist2_test)
    )
|
|
|
|
|
|
## ======================= experiments =======================

## specify the dataset
# The variables set here drive the rest of the script:
#   trainData / testData : rows used for training / testing
#   testPID              : speakers in the test set
#   LB / LE              : LabelBinarizer / LabelEncoder for the regions
#   region_labels        : region names of the selected experiment

# train vs eval (alternative setup, kept for reference):
#   trainData = data3_train
#   testData = data3_eval
#   testPID = pidlist3_eval
#   LB = LB_y3
#   LE = LE_y3
#   region_labels = region_labels3

# train+eval vs test
if experiment_type == 3:
    trainData = np.r_[data3_train, data3_eval]
    testData = data3_test
    testPID = pidlist3_test
    LB = LB_y3
    LE = LE_y3
    region_labels = region_labels3
elif experiment_type == 2:
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    region_labels = region_labels2
else:
    # Without this branch an unsupported value would leave trainData etc.
    # undefined and only fail further down with a NameError.
    raise ValueError(
        'experiment_type must be 2 or 3, got {!r}'.format(experiment_type))
|
|
|
|
## check the number of utterance
|
|
#data_all = np.r_[trainData, testData]
|
|
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
|
|
#filenames_unique = np.unique(filenames, axis=0)
|
|
#Counter(filenames_unique[:, 1])
|
|
|
|
|
|
## output filenames
# csv files inside output_dir: algorithm comparison, sentence-level
# performance, and confusion matrix.
fileComparison, filePerformance, fileConfusionMatrix = (
    output_dir + csv_name
    for csv_name in (
        "\\algorithm_comparison.csv",
        "\\sentence-level.csv",
        "\\confusion_matrix.csv",
    )
)
|
|
|
|
|
|
## compare classification algorithms for the sentence-classifiers.
|
|
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)
|
|
|
|
|
|
## train sentence-level classifiers.
(model_list,
 score_list,
 confusion_matrix_list) = sb_func.train_sentence_level_classifiers(
    trainData, LB_list, LE, filePerformance)

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(
    testData, model_list, LB_list, LE)

## combine sentence-level classifiers
# presumably a majority vote over the sentence-level predictions of each
# speaker -- see speaker_based_functions.prediction_per_pid_majority.
pred_per_pid_majority = sb_func.prediction_per_pid_majority(
    testPID, pred_per_sentence)

# Column 1 is handed to scikit-learn as y_true and column 2 as y_pred.
# NOTE(review): confirm that this matches the column layout produced by
# prediction_per_pid_majority.
labels_first = pred_per_pid_majority[:, 1]
labels_second = pred_per_pid_majority[:, 2]

## confusion matrix
confusionMatrix_majority = confusion_matrix(
    labels_first, labels_second, labels=region_labels)

## output
# fraction of correctly classified speakers (accuracy_score defaults).
accuracy = accuracy_score(labels_first, labels_second)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

# NOTE(review): the "2" in these file names is fixed, so the results of the
# 3-class experiment are written to the same files -- confirm this is wanted.
np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)