diff --git a/.vs/accent_classification/v15/.suo b/.vs/accent_classification/v15/.suo
new file mode 100644
index 0000000..ffd4fc5
Binary files /dev/null and b/.vs/accent_classification/v15/.suo differ
diff --git a/dialect_identification.sln b/accent_classification.sln
similarity index 92%
rename from dialect_identification.sln
rename to accent_classification.sln
index 3d3874c..0ba6b7a 100644
--- a/dialect_identification.sln
+++ b/accent_classification.sln
@@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
-Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
-EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
ProjectSection(SolutionItems) = preProject
..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
@@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
EndProjectSection
EndProject
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "accent_classification", "accent_classification\accent_classification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
diff --git a/accent_classification/__pycache__/data_manipulation.cpython-36.pyc b/accent_classification/__pycache__/data_manipulation.cpython-36.pyc
new file mode 100644
index 0000000..ccf73c9
Binary files /dev/null and b/accent_classification/__pycache__/data_manipulation.cpython-36.pyc differ
diff --git a/accent_classification/__pycache__/evaluation.cpython-36.pyc b/accent_classification/__pycache__/evaluation.cpython-36.pyc
new file mode 100644
index 0000000..00f80fb
Binary files /dev/null and b/accent_classification/__pycache__/evaluation.cpython-36.pyc differ
diff --git a/accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc b/accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc
new file mode 100644
index 0000000..b438b55
Binary files /dev/null and b/accent_classification/__pycache__/speaker_based_functions.cpython-36.pyc differ
diff --git a/dialect_identification/dialect_identification.pyproj b/accent_classification/accent_classification.pyproj
similarity index 92%
rename from dialect_identification/dialect_identification.pyproj
rename to accent_classification/accent_classification.pyproj
index ae47cc7..9905a37 100644
--- a/dialect_identification/dialect_identification.pyproj
+++ b/accent_classification/accent_classification.pyproj
@@ -5,7 +5,7 @@
fe1b1358-adbe-4446-affd-a0802d13d15b
{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}
.
- <StartupFile>output_confusion_matrix.py</StartupFile>
+ <StartupFile>speaker_based.py</StartupFile>
.
@@ -22,6 +22,8 @@
false
+
+
Code
@@ -29,9 +31,6 @@
Code
-
- Code
-
Code
@@ -53,7 +52,6 @@
Code
-
diff --git a/dialect_identification/audio2db.py b/accent_classification/audio2db.py
similarity index 89%
rename from dialect_identification/audio2db.py
rename to accent_classification/audio2db.py
index 6b73f90..3375fb8 100644
--- a/dialect_identification/audio2db.py
+++ b/accent_classification/audio2db.py
@@ -1,6 +1,5 @@
import os
import sys
-import configparser
import numpy as np
import pypyodbc
@@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment
-## check if forced-alignment work in each sentence
+## delete all automatically generated pronunciations
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()
-#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
-#script_file = script_dir + '\\script10.txt'
-#with open(script_file, 'r') as fin:
-# script = fin.readline()
-#fa = forced_alignment(wav_file, script)
-
## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
diff --git a/dialect_identification/classifier.py b/accent_classification/classifier.py
similarity index 100%
rename from dialect_identification/classifier.py
rename to accent_classification/classifier.py
diff --git a/dialect_identification/config.ini b/accent_classification/config.ini
similarity index 100%
rename from dialect_identification/config.ini
rename to accent_classification/config.ini
diff --git a/dialect_identification/data_io.py b/accent_classification/data_io.py
similarity index 100%
rename from dialect_identification/data_io.py
rename to accent_classification/data_io.py
diff --git a/dialect_identification/data_manipulation.py b/accent_classification/data_manipulation.py
similarity index 100%
rename from dialect_identification/data_manipulation.py
rename to accent_classification/data_manipulation.py
diff --git a/dialect_identification/evaluation.py b/accent_classification/evaluation.py
similarity index 100%
rename from dialect_identification/evaluation.py
rename to accent_classification/evaluation.py
diff --git a/dialect_identification/manipulate_db.py b/accent_classification/manipulate_db.py
similarity index 100%
rename from dialect_identification/manipulate_db.py
rename to accent_classification/manipulate_db.py
diff --git a/dialect_identification/output_confusion_matrix.py b/accent_classification/output_confusion_matrix.py
similarity index 100%
rename from dialect_identification/output_confusion_matrix.py
rename to accent_classification/output_confusion_matrix.py
diff --git a/dialect_identification/sentence_based.py b/accent_classification/sentence_based.py
similarity index 100%
rename from dialect_identification/sentence_based.py
rename to accent_classification/sentence_based.py
diff --git a/accent_classification/speaker_based.py b/accent_classification/speaker_based.py
new file mode 100644
index 0000000..3679eb5
--- /dev/null
+++ b/accent_classification/speaker_based.py
@@ -0,0 +1,267 @@
+import os
+import sys
+import configparser
+
+import pypyodbc
+import numpy as np
+from collections import Counter
+import matplotlib.pyplot as plt
+
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import cross_val_score
+from sklearn import preprocessing
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+
+repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
+curr_dir = repo_dir + '\\accent_classification'
+sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
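+# NB: curr_dir is absolute, so os.path.join discards its first argument here
+# and this simply appends curr_dir itself to the module search path.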
+import data_manipulation as mani
+import evaluation as eval
+import speaker_based_functions as sb_func
+
+
+## ======================= user define =======================
+sentence_num_max = 10
+config_file = curr_dir + '\\config.ini'
+output_dir = repo_dir + '\\output'
+
+# make train/test set: 1, load: 0
+make_train_test_set = 0
+
+# specify which experiment is to be performed.
+# - 3: groningen vs oost_overijssel vs limburg
+# - 2: groningen vs limburg
+experiment_type = 2
+
+region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
+region_labels2 = ['Groningen_and_Drenthe', 'Limburg']
+
+
+## ======================= data preparation =======================
+
+## load variables from the ini file
+config = configparser.ConfigParser()
+config.sections()
+config.read(config_file)
+MDB_file = config['sentence_based']['fileMDB']
+
+
+## connect to the database
+pypyodbc.lowercase = False
+param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
+conn = pypyodbc.connect(param)
+cursor = conn.cursor()
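+# NB: this connection string requires the Microsoft Access ODBC driver; its
+# bitness (32- vs 64-bit) must match the Python interpreter, or connect() fails.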
+
+
+## get data from Access database
+# data format
+# 0: filename
+# 1: pid
+# 2: region
+# 3: ID (unique word_id)
+# 4: sentence_id
+# 5: word_id
+# 6: word
+# 7: pronunciation
+SQL_string = """\
+{CALL dataset_with_cities}
+"""
+cursor.execute(SQL_string)
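+# {CALL name} is the ODBC escape syntax for stored procedures; against an
+# Access database it executes the saved query of that name.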
+
+rows = cursor.fetchall()
+data = np.array(rows)
+del SQL_string, rows
+
+
+## get the list of pronunciation variants (pronvarList) from the Access database
+# pronvarList format
+# 0: ID (unique word_id)
+# 1: word
+# 2: pronvar
+SQL_string = """\
+{CALL pronunciation_variant}
+"""
+cursor.execute(SQL_string)
+rows = cursor.fetchall()
+pronvarList = np.array(rows)
+del SQL_string, rows
+
+conn.close()
+
+
+## make a list of LabelBinarizer objects, one per word, for X (=pronunciation variants).
+LB_list = []
+unique_wordID_list = data[:, 3].astype(int)
+unique_wordID_max = max(unique_wordID_list)
+for unique_wordID in range(1, unique_wordID_max+1):
+ pronvar = data[unique_wordID_list == unique_wordID, 7]
+ LB = preprocessing.LabelBinarizer()
+ LB.fit(np.unique(pronvar))
+ LB_list.append(LB)
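+# illustration (hypothetical variants): for a word with observed pronunciations
+# ['bos', 'boss', 'bus'], LB.transform(['boss']) returns [[0, 1, 0]], i.e. a
+# one-hot row over that word's own variants.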
+
+
+## make LabelEncoder/LabelBinarizer objects for y (=region).
+LE_y3 = preprocessing.LabelEncoder()
+LE_y3.fit(region_labels3)
+LE_y2 = preprocessing.LabelEncoder()
+LE_y2.fit(region_labels2)
+
+LB_y3 = preprocessing.LabelBinarizer()
+LB_y3.fit(region_labels3)
+LB_y2 = preprocessing.LabelBinarizer()
+LB_y2.fit(region_labels2)
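+# LabelEncoder maps region names to integer codes; LabelBinarizer to indicator
+# rows. NB: with only two classes, sklearn's LabelBinarizer returns a single
+# 0/1 column instead of two one-hot columns.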
+
+del unique_wordID, unique_wordID_max, pronvar, LB
+
+
+
+## ======================= make train/eval/test set or load =======================
+
+## find the smallest group to balance the number of samples per group.
+pidlist3 = np.unique(data[:, (1, 2)], axis=0)
+pidlist3_counter = Counter(pidlist3[:, 1])
+sample_num_max = min(pidlist3_counter.values())
+del pidlist3_counter
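+# e.g. (hypothetical counts) with Counter({'Groningen_and_Drenthe': 120,
+# 'Oost_Overijsel-Gelderland': 95, 'Limburg': 80}), sample_num_max is 80 and
+# each region is later downsampled to 80 speakers.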
+
+
+## make the train/eval/test sets or load them.
+
+if make_train_test_set==1:
+ pidlist3_train = []
+ pidlist3_eval = []
+ pidlist3_test = []
+ for region_num in range(0, len(region_labels3)):
+ region_name = region_labels3[region_num]
+
+ pidlist3_per_region_ = pidlist3[pidlist3[:, 1]==region_labels3[region_num], :]
+ pidlist3_per_region, idx = mani.extractRandomSample(
+ pidlist3_per_region_, sample_num_max)
+
+ # split dataset into train, eval and test.
+ [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
+ pidlist3_per_region, test_size = 0.2, random_state = 0)
+ [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
+ pidlist3_per_region_train, test_size = 0.1, random_state = 0)
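+ # the two splits give 80% x 90% = 72% train, 8% eval, 20% test per region.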
+
+ # append numpy arrays.
+ if region_num == 0:
+ pidlist3_train = pidlist3_per_region_train
+ pidlist3_eval = pidlist3_per_region_eval
+ pidlist3_test = pidlist3_per_region_test
+ else:
+ pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
+ pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
+ pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]
+ del region_num, region_name
+ del pidlist3_per_region_, pidlist3_per_region, idx
+ del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test
+ np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
+ np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
+ np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)
+
+
+ if experiment_type == 2:
+ pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]
+
+ pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
+ pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
+ np.save(output_dir + "\\pidlist2_train", pidlist2_train)
+ np.save(output_dir + "\\pidlist2_test", pidlist2_test)
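+ # NB: np.save appends '.npy' when the filename lacks it, which is why the
+ # np.load calls below read pidlist2_train.npy and pidlist2_test.npy.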
+
+ del pidlist2_train_
+else:
+ pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
+ pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
+ pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")
+
+ if experiment_type == 2:
+ pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
+ pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")
+
+
+## extract the corresponding data rows by pid
+
+data3_train = sb_func.extractPid(pidlist3_train, data)
+data3_eval = sb_func.extractPid(pidlist3_eval, data)
+data3_test = sb_func.extractPid(pidlist3_test, data)
+
+if experiment_type == 2:
+ data2 = np.array(data)
+ data2_train = sb_func.extractPid(pidlist2_train, data2)
+ data2_test = sb_func.extractPid(pidlist2_test, data2)
+
+
+## ======================= experiments =======================
+
+## specify the dataset
+
+# train vs eval
+#trainData = data3_train
+#testData = data3_eval
+#testPID = pidlist3_eval
+#LB = LB_y3
+#LE = LE_y3
+#region_labels = region_labels3
+
+# train+eval vs test
+if experiment_type == 3:
+ trainData = np.r_[data3_train, data3_eval]
+ testData = data3_test
+ testPID = pidlist3_test
+ LB = LB_y3
+ LE = LE_y3
+ region_labels = region_labels3
+
+elif experiment_type == 2:
+ trainData = data2_train
+ testData = data2_test
+ testPID = pidlist2_test
+ LB = LB_y2
+ LE = LE_y2
+ region_labels = region_labels2
+
+## check the number of utterances
+#data_all = np.r_[trainData, testData]
+#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
+#filenames_unique = np.unique(filenames, axis=0)
+#Counter(filenames_unique[:, 1])
+
+
+## output filenames
+fileComparison = output_dir + "\\algorithm_comparison.csv"
+filePerformance = output_dir + "\\sentence-level.csv"
+fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"
+
+
+## compare classification algorithms for the sentence-level classifiers.
+#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)
+
+
+## train sentence-level classifiers.
+model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
+ trainData, LB_list, LE, filePerformance)
+
+
+## prediction on the test data from each sentence-level classifier.
+pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)
+
+
+## combine sentence-level classifiers
+pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
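+# i.e. each speaker (pid) is assigned the region predicted by the majority of
+# the per-sentence classifiers (per the function name; see speaker_based_functions.py).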
+
+
+## confusion matrix
+confusionMatrix_majority = confusion_matrix(
+ pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)
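+# sklearn convention: rows are true labels, columns are predictions; here
+# column 1 of pred_per_pid_majority is taken as the true region and column 2
+# as the majority-vote prediction.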
+
+
+## output
+accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
+print('accuracy: {}%'.format(accuracy * 100))
+
+cm = confusionMatrix_majority
+print(cm)
+
+np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
+np.save(output_dir + "\\confusion_matrix2.npy", cm)
\ No newline at end of file
diff --git a/dialect_identification/speaker_based_functions.py b/accent_classification/speaker_based_functions.py
similarity index 90%
rename from dialect_identification/speaker_based_functions.py
rename to accent_classification/speaker_based_functions.py
index 1421376..69293e7 100644
--- a/dialect_identification/speaker_based_functions.py
+++ b/accent_classification/speaker_based_functions.py
@@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
-import dataManipulation as mani
+import data_manipulation as mani
import evaluation as eval
@@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
return np.array(prediction_per_pid)
-def saxon_vs_limburg(pidlist3):
- """convert a pidlist for 3 regions into that for 2 regions.
-
- Notes:
- 3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
- 2 regions include ['Limburg', 'Low_Saxon']
- where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'
- samples are randomly chosen so that each class has the same amount of data.
-
- """
-
- regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
- regionLabels2 = ['Low_Saxon', 'Limburg']
-
- index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
- pidlist_saxon_ = pidlist3[index_saxon, :]
- pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
-
- # extract the same amout of samples as Limburg.
- pidlistCounter3 = Counter(pidlist3[:, 1])
- pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
- pidlist_saxon[:, 1] = regionLabels2[0]
-
- pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
- #pidlistCounter2 = Counter(pidlist2[:, 1])
- return pidlist2
-
-
def groningen_vs_limburg(pidlist3):
"""convert a pidlist for 3 regions into that for 2 regions.
@@ -374,7 +346,7 @@ def groningen_vs_limburg(pidlist3):
2 regions include ['Groningen_and_Drenthe', 'Limburg']
"""
- regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
+ regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
- pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
+ # index updated to match the reordered regionLabels above
+ pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[2], :]
diff --git a/dialect_identification/test_code.py b/accent_classification/test_code.py
similarity index 100%
rename from dialect_identification/test_code.py
rename to accent_classification/test_code.py
diff --git a/dialect_identification/word_based.py b/accent_classification/word_based.py
similarity index 100%
rename from dialect_identification/word_based.py
rename to accent_classification/word_based.py
diff --git a/dialect_identification/speaker_based.py b/dialect_identification/speaker_based.py
deleted file mode 100644
index c7d1536..0000000
--- a/dialect_identification/speaker_based.py
+++ /dev/null
@@ -1,326 +0,0 @@
-import os
-import sys
-import configparser
-
-import pypyodbc
-import numpy as np
-from collections import Counter
-import matplotlib.pyplot as plt
-
-from sklearn.model_selection import train_test_split
-from sklearn.model_selection import cross_val_score
-from sklearn import preprocessing
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import accuracy_score
-
-currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
-sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
-import dataManipulation as mani
-import evaluation as eval
-import speaker_based_functions as sb_func
-
-
-#####################
-## USER DEFINE ##
-#####################
-sentenceNumMax = 10
-configFile = currDir + '\\config.ini'
-dirOut = currDir + '\\result'
-
-# make train/test set: 1, load: 0
-makeTrainTestSet = 0
-# convert 3 regions to 2 regions: 1, load: 0
-conv3to2region = 0
-
-# 3 regions: 0
-# saxon vs limburg: 1
-# groningen vs limburg: 2
-experiment_type = 2
-
-regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
-
-# a bit useless error handling.
-#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
-if experiment_type == 1:
- regionLabels2 = ['Low_Saxon', 'Limburg']
-regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
-
-
-##########################
-## DATA PREPARATION ##
-##########################
-
-## load init file
-config = configparser.ConfigParser()
-config.sections()
-config.read(configFile)
-dirFeature = config['sentence_based']['dirFeature']
-fileMDB = config['sentence_based']['fileMDB']
-
-
-## database connection
-pypyodbc.lowercase = False
-param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
-conn = pypyodbc.connect(param)
-cursor = conn.cursor()
-
-
-## get data from Access database
-# data format
-# 0: filename
-# 1: pid
-# 2: region
-# 3: ID (unique word_id)
-# 4: sentence_id
-# 5: word_id
-# 6: word
-# 7: pronunciation
-SQL_string = """\
-{CALL dataset_with_cities}
-"""
-cursor.execute(SQL_string)
-
-rows = cursor.fetchall()
-data = np.array(rows)
-#dataNumMax = data.shape[0]
-#uniqueWordIDmax = max(data[:, 3].astype(int))
-del SQL_string, rows
-
-
-## make list of LabelBinarizer object per word.
-# for X
-# get pronvarList from Access database
-# pronvarList format
-# 0: ID (unique word_id)
-# 1: word
-# 2: pronvar
-SQL_string = """\
-{CALL pronunciation_variant}
-"""
-cursor.execute(SQL_string)
-rows = cursor.fetchall()
-pronvarList = np.array(rows)
-del SQL_string, rows
-
-
-LBlist = []
-#uniqueWordIDlist = pronvarList[:, 0].astype(int)
-uniqueWordIDlist = data[:, 3].astype(int)
-uniqueWordIDmax = max(uniqueWordIDlist)
-for uniqueWordID in range(1, uniqueWordIDmax+1):
- pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
- #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
- LB = preprocessing.LabelBinarizer()
- LB.fit(np.unique(pronvar))
- LBlist.append(LB)
-
-# for y (=region)
-LE_y = preprocessing.LabelEncoder()
-LE_y.fit(regionLabels)
-LE_y2 = preprocessing.LabelEncoder()
-LE_y2.fit(regionLabels2)
-
-LB_y = preprocessing.LabelBinarizer()
-LB_y.fit(regionLabels)
-LB_y2 = preprocessing.LabelBinarizer()
-LB_y2.fit(regionLabels2)
-
-del uniqueWordID, uniqueWordIDmax, pronvar, LB
-
-
-#################
-## ITERATION ##
-#################
-#CM_majority = np.zeros((1, 9)).astype(int)
-#CM_weighted = np.zeros((1, 9)).astype(int)
-#for iter in range(0, 1):
-# print(iter)
-
-## make balanced dataset
-pidlist = np.unique(data[:, (1, 2)], axis=0)
-
-# count number of samples
-pidlistCounter = Counter(pidlist[:, 1])
-sampleNumMax = min(pidlistCounter.values())
-del pidlistCounter
-
-
-## make train/eval/test set or load
-if makeTrainTestSet==1:
- pidlist_train = []
- pidlist_eval = []
- pidlist_test = []
- for regionNum in range(0, len(regionLabels)):
- regionName = regionLabels[regionNum]
-
- pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
- pidlist_per_region, idx = mani.extractRandomSample(
- pidlist_per_region_, sampleNumMax)
-
- # split dataset into train, eval and test.
- [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
- pidlist_per_region, test_size = 0.2, random_state = 0)
- [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
- pidlist_per_region_train, test_size = 0.1, random_state = 0)
-
- # append numpy arrays
- if regionNum == 0:
- pidlist_train = pidlist_per_region_train
- pidlist_eval = pidlist_per_region_eval
- pidlist_test = pidlist_per_region_test
- else:
- pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
- pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
- pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
- del regionNum, regionName
- del pidlist_per_region_, pidlist_per_region, idx
- del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
- np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
- np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
- np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
-else:
- pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
- pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
- pidlist_test = np.load(dirOut + "\\pidlist_test.npy")
-
-
-## make dataset for 2 regions or load
-if conv3to2region==1:
- pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
-
- if experiment_type == 1:
- pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
- pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
- np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
- np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
-
- elif experiment_type == 2:
- pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
- pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
- np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
- np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
-
- del pidlist2_train_
-else:
- if experiment_type == 1:
- pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
- pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
-
- elif experiment_type == 2:
- pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
- pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
-
-
-## train/test data
-if experiment_type == 0:
- # Groningen vs Overijsel vs Limburg
- data_train = sb_func.extractPid(pidlist_train, data)
- data_eval = sb_func.extractPid(pidlist_eval, data)
- data_test = sb_func.extractPid(pidlist_test, data)
-
-elif experiment_type == 1 or experiment_type == 2:
- data2 = np.array(data)
-
- if experiment_type == 1:
- for row, row2 in zip(data, data2):
- if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
- row2[2] = regionLabels2[0]
-
- data2_train = sb_func.extractPid(pidlist2_train, data2)
- data2_test = sb_func.extractPid(pidlist2_test, data2)
-
-
-#####################################
-## EXPERIMENTS START FROM HERE ##
-#####################################
-
-## actual training
-# train vs eval
-#trainData = data_train
-#testData = data_eval
-#testPID = pidlist_eval
-#LB = LB_y
-#LE = LE_y
-#regionLabels = regionLabels3
-
-# train+eval vs test
-if experiment_type == 0:
- trainData = np.r_[data_train, data_eval]
- testData = data_test
- testPID = pidlist_test
- LB = LB_y
- LE = LE_y
-elif experiment_type == 1 or experiment_type == 2:
-# 2 region: saxon vs limburg/ groningen vs limburg
- trainData = data2_train
- testData = data2_test
- testPID = pidlist2_test
- LB = LB_y2
- LE = LE_y2
- regionLabels = regionLabels2
-
-
-# check the number of utterance
-allData = np.r_[trainData, testData]
-filenames = np.c_[allData[:, 0], allData[:, 2]]
-filenames_unique = np.unique(filenames, axis=0)
-Counter(filenames_unique[:, 1])
-
-
-fileComparison = dirOut + "\\algorithm_comparison.csv"
-filePerformance = dirOut + "\\sentence-level.csv"
-fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
-
-## compare classification algorithms for the sentence-classifiers.
-#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
-
-## train sentence-level classifiers.
-modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
- trainData, LBlist, LE, filePerformance)
-
-## prediction over evaluation data per each sentence-level classifier.
-pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
-
-## combine sentence-level classifiers
-pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
-
-## majority vote (weighted)
-#weight = sb_func.calc_weight(confusionMatrixList)
-#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
-
-### confusion matrix
-if experiment_type == 0:
- confusionMatrix_majority = confusion_matrix(
- pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
-else:
- confusionMatrix_majority = confusion_matrix(
- pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
-
- #confusionMatrix_weighted = confusion_matrix(
-# pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
-
-
-## output
-accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
-print('accuracy: {}%'.format(accuracy * 100))
-
-cm = confusionMatrix_majority
-print(cm)
-
-np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
-np.save(dirOut + "\\confusion_matrix.npy", cm)
-
-#fout = open(fileConfusionMatrix, "w")
-#fout.write('< confusion matrix for majority vote in evaluation set >\n')
-#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
-#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
-#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
-#fout.write('\n')
-#fout.close()
-
-
-##### iteration finish #####
-conn.close()
-#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
-#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
-
diff --git a/output/confusion_matrix_2regions.npy b/output/confusion_matrix_2regions.npy
new file mode 100644
index 0000000..e766cb8
Binary files /dev/null and b/output/confusion_matrix_2regions.npy differ
diff --git a/output/confusion_matrix_2regions.png b/output/confusion_matrix_2regions.png
new file mode 100644
index 0000000..8a67f8d
Binary files /dev/null and b/output/confusion_matrix_2regions.png differ
diff --git a/output/confusion_matrix_2regions_normalized.png b/output/confusion_matrix_2regions_normalized.png
new file mode 100644
index 0000000..02b7621
Binary files /dev/null and b/output/confusion_matrix_2regions_normalized.png differ
diff --git a/output/confusion_matrix_3regions.npy b/output/confusion_matrix_3regions.npy
new file mode 100644
index 0000000..09e5359
Binary files /dev/null and b/output/confusion_matrix_3regions.npy differ
diff --git a/output/confusion_matrix_3regions.png b/output/confusion_matrix_3regions.png
new file mode 100644
index 0000000..8b2c7f0
Binary files /dev/null and b/output/confusion_matrix_3regions.png differ
diff --git a/output/confusion_matrix_3regions_normalized.png b/output/confusion_matrix_3regions_normalized.png
new file mode 100644
index 0000000..c187d32
Binary files /dev/null and b/output/confusion_matrix_3regions_normalized.png differ
diff --git a/output/pidlist_2regions_test.npy b/output/pidlist_2regions_test.npy
new file mode 100644
index 0000000..2a9701b
Binary files /dev/null and b/output/pidlist_2regions_test.npy differ
diff --git a/output/pidlist_2regions_train.npy b/output/pidlist_2regions_train.npy
new file mode 100644
index 0000000..85652b5
Binary files /dev/null and b/output/pidlist_2regions_train.npy differ
diff --git a/output/pidlist_3regions_eval.npy b/output/pidlist_3regions_eval.npy
new file mode 100644
index 0000000..258b029
Binary files /dev/null and b/output/pidlist_3regions_eval.npy differ
diff --git a/output/pidlist_3regions_test.npy b/output/pidlist_3regions_test.npy
new file mode 100644
index 0000000..d7f1a78
Binary files /dev/null and b/output/pidlist_3regions_test.npy differ
diff --git a/output/pidlist_3regions_train.npy b/output/pidlist_3regions_train.npy
new file mode 100644
index 0000000..0649478
Binary files /dev/null and b/output/pidlist_3regions_train.npy differ
diff --git a/output/pred_per_pid_2regions.npy b/output/pred_per_pid_2regions.npy
new file mode 100644
index 0000000..a6256fa
Binary files /dev/null and b/output/pred_per_pid_2regions.npy differ
diff --git a/output/pred_per_pid_3regions.npy b/output/pred_per_pid_3regions.npy
new file mode 100644
index 0000000..4cef437
Binary files /dev/null and b/output/pred_per_pid_3regions.npy differ