diff --git a/.vs/accent_classification/v15/.suo b/.vs/accent_classification/v15/.suo new file mode 100644 index 0000000..ffd4fc5 Binary files /dev/null and b/.vs/accent_classification/v15/.suo differ diff --git a/dialect_identification.sln b/accent_classification.sln similarity index 92% rename from dialect_identification.sln rename to accent_classification.sln index 3d3874c..0ba6b7a 100644 --- a/dialect_identification.sln +++ b/accent_classification.sln @@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.26730.12 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}" -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}" ProjectSection(SolutionItems) = preProject ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py @@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py EndProjectSection EndProject +Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "accent_classification", "accent_classification\accent_classification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU diff --git a/accent_classification/__pycache__/data_manipulation.cpython-36.pyc b/accent_classification/__pycache__/data_manipulation.cpython-36.pyc new file mode 100644 index 0000000..ccf73c9 Binary files /dev/null and b/accent_classification/__pycache__/data_manipulation.cpython-36.pyc differ diff --git a/accent_classification/__pycache__/evaluation.cpython-36.pyc b/accent_classification/__pycache__/evaluation.cpython-36.pyc new file mode 100644 index 0000000..00f80fb Binary files /dev/null and b/accent_classification/__pycache__/evaluation.cpython-36.pyc differ diff --git a/accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc b/accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc new file mode 100644 index 0000000..b438b55 Binary files /dev/null and b/accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc differ diff --git a/dialect_identification/dialect_identification.pyproj b/accent_classification/accent_classification.pyproj similarity index 92% rename from dialect_identification/dialect_identification.pyproj rename to accent_classification/accent_classification.pyproj index ae47cc7..9905a37 100644 --- a/dialect_identification/dialect_identification.pyproj +++ b/accent_classification/accent_classification.pyproj @@ -5,7 +5,7 @@ fe1b1358-adbe-4446-affd-a0802d13d15b {a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52} . - output_confusion_matrix.py + speaker_based.py . 
@@ -22,6 +22,8 @@ false + + Code @@ -29,9 +31,6 @@ Code - - Code - Code @@ -53,7 +52,6 @@ Code - diff --git a/dialect_identification/audio2db.py b/accent_classification/audio2db.py similarity index 89% rename from dialect_identification/audio2db.py rename to accent_classification/audio2db.py index 6b73f90..3375fb8 100644 --- a/dialect_identification/audio2db.py +++ b/accent_classification/audio2db.py @@ -1,6 +1,5 @@ import os import sys -import configparser import numpy as np import pypyodbc @@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module) from forced_alignment import forced_alignment -## check if forced-alignment work in each sentence +## delete all automatically generated pronunciations #from forced_alignment import pronunciations #pronunciations.delete_all_g2p_entries() -#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav' -#script_file = script_dir + '\\script10.txt' -#with open(script_file, 'r') as fin: -# script = fin.readline() -#fa = forced_alignment(wav_file, script) - ## make database connection param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";" diff --git a/dialect_identification/classifier.py b/accent_classification/classifier.py similarity index 100% rename from dialect_identification/classifier.py rename to accent_classification/classifier.py diff --git a/dialect_identification/config.ini b/accent_classification/config.ini similarity index 100% rename from dialect_identification/config.ini rename to accent_classification/config.ini diff --git a/dialect_identification/data_io.py b/accent_classification/data_io.py similarity index 100% rename from dialect_identification/data_io.py rename to accent_classification/data_io.py diff --git a/dialect_identification/data_manipulation.py b/accent_classification/data_manipulation.py similarity index 100% rename from dialect_identification/data_manipulation.py rename to accent_classification/data_manipulation.py diff --git a/dialect_identification/evaluation.py b/accent_classification/evaluation.py similarity index 100% rename from dialect_identification/evaluation.py rename to accent_classification/evaluation.py diff --git a/dialect_identification/manipulate_db.py b/accent_classification/manipulate_db.py similarity index 100% rename from dialect_identification/manipulate_db.py rename to accent_classification/manipulate_db.py diff --git a/dialect_identification/output_confusion_matrix.py b/accent_classification/output_confusion_matrix.py similarity index 100% rename from dialect_identification/output_confusion_matrix.py rename to accent_classification/output_confusion_matrix.py diff --git a/dialect_identification/sentence_based.py b/accent_classification/sentence_based.py similarity index 100% rename from dialect_identification/sentence_based.py rename to accent_classification/sentence_based.py diff --git a/accent_classification/speaker_based.py b/accent_classification/speaker_based.py new file mode 100644 index 0000000..3679eb5 --- /dev/null +++ b/accent_classification/speaker_based.py @@ -0,0 +1,267 @@ +import os +import sys +import configparser + +import pypyodbc +import numpy as np +from collections import Counter +import matplotlib.pyplot as plt + +from sklearn.model_selection import train_test_split +from sklearn.model_selection import cross_val_score +from sklearn import preprocessing +from sklearn.metrics import confusion_matrix +from sklearn.metrics import accuracy_score + +repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification' +curr_dir = 
repo_dir + '\\accent_classification' +sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir)) +import data_manipulation as mani +import evaluation as eval +import speaker_based_functions as sb_func + + +## ======================= user define ======================= +sentence_num_max = 10 +config_file = curr_dir + '\\config.ini' +output_dir = repo_dir + '\\output' + +# make train/test set: 1, load: 0 +make_train_test_set = 0 + +# specify which experiment is to be performed. +# - 3: groningen vs oost_overijssel vs limburg +# - 2: groningen vs limburg +experiment_type = 2 + +region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'] +region_labels2 = ['Groningen_and_Drenthe', 'Limburg'] + + +## ======================= data preparation ======================= + +## load variables from the ini file +config = configparser.ConfigParser() +config.sections() +config.read(config_file) +MDB_file = config['sentence_based']['fileMDB'] + + +## connect to the database +pypyodbc.lowercase = False +param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";" +conn = pypyodbc.connect(param) +cursor = conn.cursor() + + +## get data from Access database +# data format +# 0: filename +# 1: pid +# 2: region +# 3: ID (unique word_id) +# 4: sentence_id +# 5: word_id +# 6: word +# 7: pronunciation +SQL_string = """\ +{CALL dataset_with_cities} +""" +cursor.execute(SQL_string) + +rows = cursor.fetchall() +data = np.array(rows) +del SQL_string, rows + + +## get the list of pronunciation variants (pronvarList) from the Access database +# pronvarList format +# 0: ID (unique word_id) +# 1: word +# 2: pronvar +SQL_string = """\ +{CALL pronunciation_variant} +""" +cursor.execute(SQL_string) +rows = cursor.fetchall() +pronvarList = np.array(rows) +del SQL_string, rows + +conn.close() + + +## make a list of LabelBinarizer objects per word for X (=pronunciation variant). +LB_list = [] +unique_wordID_list = data[:, 3].astype(int) +unique_wordID_max = max(unique_wordID_list) +for unique_wordID in range(1, unique_wordID_max+1): + pronvar = data[unique_wordID_list == unique_wordID, 7] + LB = preprocessing.LabelBinarizer() + LB.fit(np.unique(pronvar)) + LB_list.append(LB) + + +## make LabelEncoder/LabelBinarizer objects for y (=region). +LE_y3 = preprocessing.LabelEncoder() +LE_y3.fit(region_labels3) +LE_y2 = preprocessing.LabelEncoder() +LE_y2.fit(region_labels2) + +LB_y3 = preprocessing.LabelBinarizer() +LB_y3.fit(region_labels3) +LB_y2 = preprocessing.LabelBinarizer() +LB_y2.fit(region_labels2) + +del unique_wordID, unique_wordID_max, pronvar, LB + + + +## ======================= make train/eval/test set or load ======================= + +## find the smallest group to balance the number of samples per group. +pidlist3 = np.unique(data[:, (1, 2)], axis=0) +pidlist3_counter = Counter(pidlist3[:, 1]) +sample_num_max = min(pidlist3_counter.values()) +del pidlist3_counter + + +## make train/eval/test set or load them. + +if make_train_test_set==1: + pidlist3_train = [] + pidlist3_eval = [] + pidlist3_test = [] + for region_num in range(0, len(region_labels3)): + region_name = region_labels3[region_num] + + pidlist3_per_region_ = pidlist3[pidlist3[:, 1]==region_labels3[region_num], :] + pidlist3_per_region, idx = mani.extractRandomSample( + pidlist3_per_region_, sample_num_max) + + # split dataset into train, eval and test.
+ [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split( + pidlist3_per_region, test_size = 0.2, random_state = 0) + [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split( + pidlist3_per_region_train, test_size = 0.1, random_state = 0) + + # append numpy arrays. + if region_num == 0: + pidlist3_train = pidlist3_per_region_train + pidlist3_eval = pidlist3_per_region_eval + pidlist3_test = pidlist3_per_region_test + else: + pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train] + pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval] + pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test] + del region_num, region_name + del pidlist3_per_region_, pidlist3_per_region, idx + del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test + np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train) + np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval) + np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test) + + + if experiment_type == 2: + pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval] + + pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_) + pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test) + np.save(output_dir + "\\pidlist2_train", pidlist2_train) + np.save(output_dir + "\\pidlist2_test", pidlist2_test) + + del pidlist2_train_ +else: + pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy") + pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy") + pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy") + + if experiment_type == 2: + pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy") + pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy") + + +## extract corresponding data using pid + +data3_train = sb_func.extractPid(pidlist3_train, data) +data3_eval = sb_func.extractPid(pidlist3_eval, data) +data3_test = sb_func.extractPid(pidlist3_test, data) + +if experiment_type == 2: + data2 = np.array(data) + data2_train = sb_func.extractPid(pidlist2_train, data2) + data2_test = sb_func.extractPid(pidlist2_test, data2) + + +## ======================= experiments ======================= + +## specify the dataset + +# train vs eval +#trainData = data3_train +#testData = data3_eval +#testPID = pidlist3_eval +#LB = LB_y3 +#LE = LE_y3 +#region_labels = region_labels3 + +# train+eval vs test +if experiment_type == 3: + trainData = np.r_[data3_train, data3_eval] + testData = data3_test + testPID = pidlist3_test + LB = LB_y3 + LE = LE_y3 + region_labels = region_labels3 + +elif experiment_type == 2: + trainData = data2_train + testData = data2_test + testPID = pidlist2_test + LB = LB_y2 + LE = LE_y2 + region_labels = region_labels2 + +## check the number of utterance +#data_all = np.r_[trainData, testData] +#filenames = np.c_[data_all[:, 0], data_all[:, 2]] +#filenames_unique = np.unique(filenames, axis=0) +#Counter(filenames_unique[:, 1]) + + +## output filenames +fileComparison = output_dir + "\\algorithm_comparison.csv" +filePerformance = output_dir + "\\sentence-level.csv" +fileConfusionMatrix = output_dir + "\\confusion_matrix.csv" + + +## compare classification algorithms for the sentence-classifiers. +#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison) + + +## train sentence-level classifiers. +model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers( + trainData, LB_list, LE, filePerformance) + + +## prediction over evaluation data per each sentence-level classifier. 
+pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE) + + +## combine sentence-level classifiers +pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence) + + +## confusion matrix +confusionMatrix_majority = confusion_matrix( + pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels) + + +## output +accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None) +print('accuracy: {}%'.format(accuracy * 100)) + +cm = confusionMatrix_majority +print(cm) + +np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority) +np.save(output_dir + "\\confusion_matrix2.npy", cm) \ No newline at end of file diff --git a/dialect_identification/speaker_based_functions.py b/accent_classification/speaker_based_functions.py similarity index 90% rename from dialect_identification/speaker_based_functions.py rename to accent_classification/speaker_based_functions.py index 1421376..69293e7 100644 --- a/dialect_identification/speaker_based_functions.py +++ b/accent_classification/speaker_based_functions.py @@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.model_selection import cross_val_score from sklearn.metrics import confusion_matrix -import dataManipulation as mani +import data_manipulation as mani import evaluation as eval @@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y): return np.array(prediction_per_pid) -def saxon_vs_limburg(pidlist3): - """convert a pidlist for 3 regions into that for 2 regions. - - Notes: - 3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] - 2 regions include ['Limburg', 'Low_Saxon'] - where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland' - samples are randomly chosen so that each class has the same amount of data. - - """ - - regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] - regionLabels2 = ['Low_Saxon', 'Limburg'] - - index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0) - pidlist_saxon_ = pidlist3[index_saxon, :] - pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :] - - # extract the same amout of samples as Limburg. - pidlistCounter3 = Counter(pidlist3[:, 1]) - pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg']) - pidlist_saxon[:, 1] = regionLabels2[0] - - pidlist2 = np.r_[pidlist_limburg, pidlist_saxon] - #pidlistCounter2 = Counter(pidlist2[:, 1]) - return pidlist2 - - def groningen_vs_limburg(pidlist3): """convert a pidlist for 3 regions into that for 2 regions. 
@@ -374,7 +346,7 @@ def groningen_vs_limburg(pidlist3): 2 regions include ['Groningen_and_Drenthe', 'Limburg'] """ - regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'] pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :] - pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :] + pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[2], :] diff --git a/dialect_identification/test_code.py b/accent_classification/test_code.py similarity index 100% rename from dialect_identification/test_code.py rename to accent_classification/test_code.py diff --git a/dialect_identification/word_based.py b/accent_classification/word_based.py similarity index 100% rename from dialect_identification/word_based.py rename to accent_classification/word_based.py diff --git a/dialect_identification/speaker_based.py b/dialect_identification/speaker_based.py deleted file mode 100644 index c7d1536..0000000 --- a/dialect_identification/speaker_based.py +++ /dev/null @@ -1,326 +0,0 @@ -import os -import sys -import configparser - -import pypyodbc -import numpy as np -from collections import Counter -import matplotlib.pyplot as plt - -from sklearn.model_selection import train_test_split -from sklearn.model_selection import cross_val_score -from sklearn import preprocessing -from sklearn.metrics import confusion_matrix -from sklearn.metrics import accuracy_score - -currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification' -sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir)) -import dataManipulation as mani -import evaluation as eval -import speaker_based_functions as sb_func - - -##################### -## USER DEFINE ## -##################### -sentenceNumMax = 10 -configFile = currDir + '\\config.ini' -dirOut = currDir + '\\result' - -# make train/test set: 1, load: 0 -makeTrainTestSet = 0 -# convert 3 regions to 2 regions: 1, load: 0 -conv3to2region = 0 - -# 3 regions: 0 -# saxon vs limburg: 1 -# groningen vs limburg: 2 -experiment_type = 2 - -regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] - -# a bit useless error handling. -#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2." -if experiment_type == 1: - regionLabels2 = ['Low_Saxon', 'Limburg'] -regionLabels2 = ['Groningen_and_Drenthe', 'Limburg'] - - -########################## -## DATA PREPARATION ## -########################## - -## load init file -config = configparser.ConfigParser() -config.sections() -config.read(configFile) -dirFeature = config['sentence_based']['dirFeature'] -fileMDB = config['sentence_based']['fileMDB'] - - -## database connection -pypyodbc.lowercase = False -param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";" -conn = pypyodbc.connect(param) -cursor = conn.cursor() - - -## get data from Access database -# data format -# 0: filename -# 1: pid -# 2: region -# 3: ID (unique word_id) -# 4: sentence_id -# 5: word_id -# 6: word -# 7: pronunciation -SQL_string = """\ -{CALL dataset_with_cities} -""" -cursor.execute(SQL_string) - -rows = cursor.fetchall() -data = np.array(rows) -#dataNumMax = data.shape[0] -#uniqueWordIDmax = max(data[:, 3].astype(int)) -del SQL_string, rows - - -## make list of LabelBinarizer object per word.
-# for X -# get pronvarList from Access database -# pronvarList format -# 0: ID (unique word_id) -# 1: word -# 2: pronvar -SQL_string = """\ -{CALL pronunciation_variant} -""" -cursor.execute(SQL_string) -rows = cursor.fetchall() -pronvarList = np.array(rows) -del SQL_string, rows - - -LBlist = [] -#uniqueWordIDlist = pronvarList[:, 0].astype(int) -uniqueWordIDlist = data[:, 3].astype(int) -uniqueWordIDmax = max(uniqueWordIDlist) -for uniqueWordID in range(1, uniqueWordIDmax+1): - pronvar = data[uniqueWordIDlist == uniqueWordID, 7] - #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2] - LB = preprocessing.LabelBinarizer() - LB.fit(np.unique(pronvar)) - LBlist.append(LB) - -# for y (=region) -LE_y = preprocessing.LabelEncoder() -LE_y.fit(regionLabels) -LE_y2 = preprocessing.LabelEncoder() -LE_y2.fit(regionLabels2) - -LB_y = preprocessing.LabelBinarizer() -LB_y.fit(regionLabels) -LB_y2 = preprocessing.LabelBinarizer() -LB_y2.fit(regionLabels2) - -del uniqueWordID, uniqueWordIDmax, pronvar, LB - - -################# -## ITERATION ## -################# -#CM_majority = np.zeros((1, 9)).astype(int) -#CM_weighted = np.zeros((1, 9)).astype(int) -#for iter in range(0, 1): -# print(iter) - -## make balanced dataset -pidlist = np.unique(data[:, (1, 2)], axis=0) - -# count number of samples -pidlistCounter = Counter(pidlist[:, 1]) -sampleNumMax = min(pidlistCounter.values()) -del pidlistCounter - - -## make train/eval/test set or load -if makeTrainTestSet==1: - pidlist_train = [] - pidlist_eval = [] - pidlist_test = [] - for regionNum in range(0, len(regionLabels)): - regionName = regionLabels[regionNum] - - pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :] - pidlist_per_region, idx = mani.extractRandomSample( - pidlist_per_region_, sampleNumMax) - - # split dataset into train, eval and test. 
- [pidlist_per_region_train, pidlist_per_region_test] = train_test_split( - pidlist_per_region, test_size = 0.2, random_state = 0) - [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split( - pidlist_per_region_train, test_size = 0.1, random_state = 0) - - # append numpy arrays - if regionNum == 0: - pidlist_train = pidlist_per_region_train - pidlist_eval = pidlist_per_region_eval - pidlist_test = pidlist_per_region_test - else: - pidlist_train = np.r_[pidlist_train, pidlist_per_region_train] - pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval] - pidlist_test = np.r_[pidlist_test, pidlist_per_region_test] - del regionNum, regionName - del pidlist_per_region_, pidlist_per_region, idx - del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test - np.save(dirOut + "\\pidlist_train.npy", pidlist_train) - np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval) - np.save(dirOut + "\\pidlist_test.npy", pidlist_test) -else: - pidlist_train = np.load(dirOut + "\\pidlist_train.npy") - pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy") - pidlist_test = np.load(dirOut + "\\pidlist_test.npy") - - -## make dataset for 2 regions or load -if conv3to2region==1: - pidlist2_train_ = np.r_[pidlist_train, pidlist_eval] - - if experiment_type == 1: - pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_) - pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test) - np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train) - np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test) - - elif experiment_type == 2: - pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_) - pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test) - np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train) - np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test) - - del pidlist2_train_ -else: - if experiment_type == 1: - pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy") - pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy") - - elif experiment_type == 2: - pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy") - pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy") - - -## train/test data -if experiment_type == 0: - # Groningen vs Overijsel vs Limburg - data_train = sb_func.extractPid(pidlist_train, data) - data_eval = sb_func.extractPid(pidlist_eval, data) - data_test = sb_func.extractPid(pidlist_test, data) - -elif experiment_type == 1 or experiment_type == 2: - data2 = np.array(data) - - if experiment_type == 1: - for row, row2 in zip(data, data2): - if row[2] == regionLabels[0] or row[2] == regionLabels[2]: - row2[2] = regionLabels2[0] - - data2_train = sb_func.extractPid(pidlist2_train, data2) - data2_test = sb_func.extractPid(pidlist2_test, data2) - - -##################################### -## EXPERIMENTS START FROM HERE ## -##################################### - -## actual training -# train vs eval -#trainData = data_train -#testData = data_eval -#testPID = pidlist_eval -#LB = LB_y -#LE = LE_y -#regionLabels = regionLabels3 - -# train+eval vs test -if experiment_type == 0: - trainData = np.r_[data_train, data_eval] - testData = data_test - testPID = pidlist_test - LB = LB_y - LE = LE_y -elif experiment_type == 1 or experiment_type == 2: -# 2 region: saxon vs limburg/ groningen vs limburg - trainData = data2_train - testData = data2_test - testPID = pidlist2_test - LB = LB_y2 - LE = LE_y2 - regionLabels = regionLabels2 - - -# 
check the number of utterance -allData = np.r_[trainData, testData] -filenames = np.c_[allData[:, 0], allData[:, 2]] -filenames_unique = np.unique(filenames, axis=0) -Counter(filenames_unique[:, 1]) - - -fileComparison = dirOut + "\\algorithm_comparison.csv" -filePerformance = dirOut + "\\sentence-level.csv" -fileConfusionMatrix = dirOut + "\\confusion_matrix.csv" - -## compare classification algorithms for the sentence-classifiers. -#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison) - -## train sentence-level classifiers. -modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers( - trainData, LBlist, LE, filePerformance) - -## prediction over evaluation data per each sentence-level classifier. -pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE) - -## combine sentence-level classifiers -pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence) - -## majority vote (weighted) -#weight = sb_func.calc_weight(confusionMatrixList) -#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE) - -### confusion matrix -if experiment_type == 0: - confusionMatrix_majority = confusion_matrix( - pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']) -else: - confusionMatrix_majority = confusion_matrix( - pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg']) - - #confusionMatrix_weighted = confusion_matrix( -# pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels) - - -## output -accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None) -print('accuracy: {}%'.format(accuracy * 100)) - -cm = confusionMatrix_majority -print(cm) - -np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority) -np.save(dirOut + "\\confusion_matrix.npy", cm) - -#fout = open(fileConfusionMatrix, "w") -#fout.write('< confusion matrix for majority vote in evaluation set >\n') -#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels) -#fout.write('< confusion matrix for weighted vote in evaluation set >\n') -#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels) -#fout.write('\n') -#fout.close() - - -##### iteration finish ##### -conn.close() -#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',') -#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',') - diff --git a/output/confusion_matrix_2regions.npy b/output/confusion_matrix_2regions.npy new file mode 100644 index 0000000..e766cb8 Binary files /dev/null and b/output/confusion_matrix_2regions.npy differ diff --git a/output/confusion_matrix_2regions.png b/output/confusion_matrix_2regions.png new file mode 100644 index 0000000..8a67f8d Binary files /dev/null and b/output/confusion_matrix_2regions.png differ diff --git a/output/confusion_matrix_2regions_normalized.png b/output/confusion_matrix_2regions_normalized.png new file mode 100644 index 0000000..02b7621 Binary files /dev/null and b/output/confusion_matrix_2regions_normalized.png differ diff --git a/output/confusion_matrix_3regions.npy b/output/confusion_matrix_3regions.npy new file mode 100644 index 0000000..09e5359 Binary files /dev/null and b/output/confusion_matrix_3regions.npy differ diff --git a/output/confusion_matrix_3regions.png 
b/output/confusion_matrix_3regions.png new file mode 100644 index 0000000..8b2c7f0 Binary files /dev/null and b/output/confusion_matrix_3regions.png differ diff --git a/output/confusion_matrix_3regions_normalized.png b/output/confusion_matrix_3regions_normalized.png new file mode 100644 index 0000000..c187d32 Binary files /dev/null and b/output/confusion_matrix_3regions_normalized.png differ diff --git a/output/pidlist_2regions_test.npy b/output/pidlist_2regions_test.npy new file mode 100644 index 0000000..2a9701b Binary files /dev/null and b/output/pidlist_2regions_test.npy differ diff --git a/output/pidlist_2regions_train.npy b/output/pidlist_2regions_train.npy new file mode 100644 index 0000000..85652b5 Binary files /dev/null and b/output/pidlist_2regions_train.npy differ diff --git a/output/pidlist_3regions_eval.npy b/output/pidlist_3regions_eval.npy new file mode 100644 index 0000000..258b029 Binary files /dev/null and b/output/pidlist_3regions_eval.npy differ diff --git a/output/pidlist_3regions_test.npy b/output/pidlist_3regions_test.npy new file mode 100644 index 0000000..d7f1a78 Binary files /dev/null and b/output/pidlist_3regions_test.npy differ diff --git a/output/pidlist_3regions_train.npy b/output/pidlist_3regions_train.npy new file mode 100644 index 0000000..0649478 Binary files /dev/null and b/output/pidlist_3regions_train.npy differ diff --git a/output/pred_per_pid_2regions.npy b/output/pred_per_pid_2regions.npy new file mode 100644 index 0000000..a6256fa Binary files /dev/null and b/output/pred_per_pid_2regions.npy differ diff --git a/output/pred_per_pid_3regions.npy b/output/pred_per_pid_3regions.npy new file mode 100644 index 0000000..4cef437 Binary files /dev/null and b/output/pred_per_pid_3regions.npy differ