import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


## ======================= user defined =======================
sentence_num_max = 10
config_file = curr_dir + '\\config.ini'
output_dir = repo_dir + '\\output'

# 1: make the train/test set, 0: load an existing one.
make_train_test_set = 0

# specify which experiment to perform.
# - 3: groningen vs oost_overijssel vs limburg
# - 2: groningen vs limburg
experiment_type = 2

region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
region_labels2 = ['Groningen_and_Drenthe', 'Limburg']


## ======================= data preparation =======================

## load variables from the ini file
config = configparser.ConfigParser()
config.sections()
config.read(config_file)
MDB_file = config['sentence_based']['fileMDB']

## connect to the database
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

## get data from the Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
data = np.array(rows)
del SQL_string, rows

## get the list of pronunciation variants (pronvarList) from the Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows

conn.close()

## make a list of LabelBinarizer objects, one per word, for X (= pronunciation variants).
LB_list = []
unique_wordID_list = data[:, 3].astype(int)
unique_wordID_max = max(unique_wordID_list)
for unique_wordID in range(1, unique_wordID_max+1):
    pronvar = data[unique_wordID_list == unique_wordID, 7]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LB_list.append(LB)

## make LabelEncoder/LabelBinarizer objects for y (= region).
LE_y3 = preprocessing.LabelEncoder()
LE_y3.fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(region_labels2)
LB_y3 = preprocessing.LabelBinarizer()
LB_y3.fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(region_labels2)

del unique_wordID, unique_wordID_max, pronvar, LB


## ======================= make train/eval/test set or load =======================

## find the smallest group to balance the number of samples per group.
pidlist3 = np.unique(data[:, (1, 2)], axis=0)
pidlist3_counter = Counter(pidlist3[:, 1])
sample_num_max = min(pidlist3_counter.values())
del pidlist3_counter
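## illustration only (not part of the pipeline): each per-word LabelBinarizer
## built above one-hot encodes that word's pronunciation variants; these
## encodings presumably form the feature vectors X inside sb_func.
## toy example with made-up variants:
_demo_LB = preprocessing.LabelBinarizer()
_demo_LB.fit(['wAter', 'waotr', 'weter'])        # classes_ are ordered alphabetically
_demo_onehot = _demo_LB.transform(['waotr'])     # -> array([[0, 1, 0]])
del _demo_LB, _demo_onehot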
## make the train/eval/test sets or load them.
if make_train_test_set == 1:
    pidlist3_train = []
    pidlist3_eval = []
    pidlist3_test = []
    for region_num in range(0, len(region_labels3)):
        region_name = region_labels3[region_num]
        pidlist3_per_region_ = pidlist3[pidlist3[:, 1] == region_name, :]
        pidlist3_per_region, idx = mani.extractRandomSample(
            pidlist3_per_region_, sample_num_max)

        # split the dataset into train, eval and test.
        [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
            pidlist3_per_region, test_size=0.2, random_state=0)
        [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
            pidlist3_per_region_train, test_size=0.1, random_state=0)

        # append the numpy arrays.
        if region_num == 0:
            pidlist3_train = pidlist3_per_region_train
            pidlist3_eval = pidlist3_per_region_eval
            pidlist3_test = pidlist3_per_region_test
        else:
            pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
            pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
            pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]

    del region_num, region_name
    del pidlist3_per_region_, pidlist3_per_region, idx
    del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test

    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)

    if experiment_type == 2:
        pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)

        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
        np.save(output_dir + "\\pidlist2_test", pidlist2_test)
        del pidlist2_train_
else:
    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")
    if experiment_type == 2:
        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")

## extract the corresponding data using pid.
data3_train = sb_func.extractPid(pidlist3_train, data)
data3_eval = sb_func.extractPid(pidlist3_eval, data)
data3_test = sb_func.extractPid(pidlist3_test, data)
if experiment_type == 2:
    data2 = np.array(data)
    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


## ======================= experiments =======================

## specify the dataset.

# train vs eval
#trainData = data3_train
#testData = data3_eval
#testPID = pidlist3_eval
#LB = LB_y3
#LE = LE_y3
#region_labels = region_labels3

# train+eval vs test
if experiment_type == 3:
    trainData = np.r_[data3_train, data3_eval]
    testData = data3_test
    testPID = pidlist3_test
    LB = LB_y3
    LE = LE_y3
    region_labels = region_labels3
elif experiment_type == 2:
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    region_labels = region_labels2

## check the number of utterances.
#data_all = np.r_[trainData, testData]
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
#filenames_unique = np.unique(filenames, axis=0)
#Counter(filenames_unique[:, 1])

## output filenames
fileComparison = output_dir + "\\algorithm_comparison.csv"
filePerformance = output_dir + "\\sentence-level.csv"
fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-level classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)
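## note on the nested splits above (illustration only): per region, the first
## train_test_split keeps 80% of the speakers and the second one carves 10% of
## that 80% off as the eval set, i.e. roughly 72/8/20 train/eval/test.
_demo_ids = np.arange(100)
_demo_train, _demo_test = train_test_split(_demo_ids, test_size=0.2, random_state=0)
_demo_train, _demo_eval = train_test_split(_demo_train, test_size=0.1, random_state=0)
# len(_demo_train), len(_demo_eval), len(_demo_test) -> (72, 8, 20)
del _demo_ids, _demo_train, _demo_eval, _demo_test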
## train the sentence-level classifiers.
model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
    trainData, LB_list, LE, filePerformance)

## predict on the test data with each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)

## combine the sentence-level predictions into one prediction per speaker (pid) by majority vote.
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## confusion matrix
confusionMatrix_majority = confusion_matrix(
    pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)

## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2],
                          normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)

np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)
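## sketch (assumption, for illustration only): the per-speaker majority vote
## could look roughly like the helper below, assuming pred_per_sentence rows
## hold (pid, true_region, predicted_region); the combination actually used
## above is sb_func.prediction_per_pid_majority.
def _majority_vote_per_pid(pid_list, pred_per_sentence_):
    combined = []
    for pid, true_region in pid_list:
        # collect all sentence-level predictions for this speaker
        preds = pred_per_sentence_[pred_per_sentence_[:, 0] == pid, 2]
        if len(preds) > 0:
            # keep the most frequent predicted region
            majority = Counter(preds).most_common(1)[0][0]
            combined.append([pid, true_region, majority])
    return np.array(combined)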