# accent_classification/accent_classification/speaker_based.py

import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
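# Note: curr_dir is an absolute path, so os.path.join(os.path.dirname(sys.path[0]), curr_dir)
# simply evaluates to curr_dir; the dirname() part is effectively ignored.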
import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
## ======================= user-defined settings =======================
sentence_num_max = 10
config_file = curr_dir + '\\config.ini'
output_dir = repo_dir + '\\output'
# make train/test set: 1, load: 0
make_train_test_set = 0
# specify which experiment is to be performed.
# - 3: groningen vs oost_overijsel-gelderland vs limburg
# - 2: groningen vs limburg
experiment_type = 2
region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
region_labels2 = ['Groningen_and_Drenthe', 'Limburg']
## ======================= data preparation =======================
## load variables from the ini file
config = configparser.ConfigParser()
config.sections()
config.read(config_file)
MDB_file = config['sentence_based']['fileMDB']
## connect to the database
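# Note: the Microsoft Access ODBC driver must match the bitness (32/64-bit) of
# the Python interpreter; otherwise the connection typically fails with a
# "driver not found" error.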
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
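# The {CALL ...} string below is the ODBC escape sequence for executing a
# stored procedure; against an Access .mdb it runs the saved query of that name.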
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
data = np.array(rows)
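# Note: np.array() over the fetched rows yields a 2-D array of strings/objects,
# which is why numeric columns are cast back explicitly below (e.g. .astype(int)).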
del SQL_string, rows
## get the list of pronunciation variants (pronvarList) from the Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows
conn.close()
## make a list of LabelBinarizer objects, one per word, for X (= pronunciation variant).
LB_list = []
unique_wordID_list = data[:, 3].astype(int)
unique_wordID_max = max(unique_wordID_list)
for unique_wordID in range(1, unique_wordID_max+1):
    pronvar = data[unique_wordID_list == unique_wordID, 7]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LB_list.append(LB)
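# Illustration only (hypothetical variants): if a word was realised as
# ['hoes', 'huis', 'huus'], then LB.classes_ is that sorted list and
# LB.transform(['huus']) returns the one-hot row [[0, 0, 1]].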
## make LabelEncoder/LabelBinarizer objects for y (= region).
LE_y3 = preprocessing.LabelEncoder()
LE_y3.fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(region_labels2)
LB_y3 = preprocessing.LabelBinarizer()
LB_y3.fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(region_labels2)
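# Note (sklearn behaviour): LabelEncoder maps the region names to integers in
# sorted order, e.g. LE_y2.transform(['Limburg']) -> [1]. LabelBinarizer gives
# one-hot rows in the 3-class case, but collapses to a single 0/1 column when
# only two classes are fitted (LB_y2).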
del unique_wordID, unique_wordID_max, pronvar, LB
## ======================= make train/eval/test set or load =======================
## find the smallest group to balance the number of samples per group.
pidlist3 = np.unique(data[:, (1, 2)], axis=0)
pidlist3_counter = Counter(pidlist3[:, 1])
sample_num_max = min(pidlist3_counter.values())
del pidlist3_counter
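# pidlist3 holds the unique (pid, region) pairs, so the Counter above counts
# speakers per region and sample_num_max is the size of the smallest region.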
## make train/eval/test set or load them.
if make_train_test_set == 1:
    pidlist3_train = []
    pidlist3_eval = []
    pidlist3_test = []
    for region_num in range(0, len(region_labels3)):
        region_name = region_labels3[region_num]
        pidlist3_per_region_ = pidlist3[pidlist3[:, 1] == region_labels3[region_num], :]
        pidlist3_per_region, idx = mani.extractRandomSample(
            pidlist3_per_region_, sample_num_max)
        # split dataset into train, eval and test.
        [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
            pidlist3_per_region, test_size=0.2, random_state=0)
        [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
            pidlist3_per_region_train, test_size=0.1, random_state=0)
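        # Net split per region: 0.8 * 0.9 = 72% train, 0.8 * 0.1 = 8% eval,
        # and 20% test of the balanced speaker list.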
        # append numpy arrays.
        if region_num == 0:
            pidlist3_train = pidlist3_per_region_train
            pidlist3_eval = pidlist3_per_region_eval
            pidlist3_test = pidlist3_per_region_test
        else:
            pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
            pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
            pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]
    del region_num, region_name
    del pidlist3_per_region_, pidlist3_per_region, idx
    del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test
    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)
    if experiment_type == 2:
        pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
        np.save(output_dir + "\\pidlist2_test", pidlist2_test)
        del pidlist2_train_
else:
    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")
    if experiment_type == 2:
        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")
## extract corresponding data using pid
data3_train = sb_func.extractPid(pidlist3_train, data)
data3_eval = sb_func.extractPid(pidlist3_eval, data)
data3_test = sb_func.extractPid(pidlist3_test, data)
if experiment_type == 2:
    data2 = np.array(data)
    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)
## ======================= experiments =======================
## specify the dataset
# train vs eval
#trainData = data3_train
#testData = data3_eval
#testPID = pidlist3_eval
#LB = LB_y3
#LE = LE_y3
#region_labels = region_labels3
# train+eval vs test
if experiment_type == 3:
    trainData = np.r_[data3_train, data3_eval]
    testData = data3_test
    testPID = pidlist3_test
    LB = LB_y3
    LE = LE_y3
    region_labels = region_labels3
elif experiment_type == 2:
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    region_labels = region_labels2
## check the number of utterances
#data_all = np.r_[trainData, testData]
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
#filenames_unique = np.unique(filenames, axis=0)
#Counter(filenames_unique[:, 1])
## output filenames
fileComparison = output_dir + "\\algorithm_comparison.csv"
filePerformance = output_dir + "\\sentence-level.csv"
fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"
## compare classification algorithms for the sentence-level classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)
## train sentence-level classifiers.
model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
    trainData, LB_list, LE, filePerformance)
## prediction over the test data with each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)
## combine sentence-level predictions by majority vote per speaker (pid)
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
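# Judging from its use in confusion_matrix/accuracy_score below, each row of
# pred_per_pid_majority appears to hold [pid, true_region, predicted_region]
# (an assumption; the exact layout is defined in speaker_based_functions).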
## confusion matrix
confusionMatrix_majority = confusion_matrix(
    pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)
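# sklearn convention: rows of the confusion matrix are the true labels and
# columns the predicted labels, both ordered as in region_labels.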
## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)
np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)