Cleaned up the INTERSPEECH-related code.
parent a1379caced
commit eb65543781
BIN
.vs/accent_classification/v15/.suo
Normal file
Binary file not shown.
@@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio 15
 VisualStudioVersion = 15.0.26730.12
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
 ProjectSection(SolutionItems) = preProject
 ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
@@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
 ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
 EndProjectSection
 EndProject
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "accent_classification", "accent_classification\accent_classification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
+EndProject
 Global
 GlobalSection(SolutionConfigurationPlatforms) = preSolution
 Debug|Any CPU = Debug|Any CPU
BIN
accent_classification/__pycache__/evaluation.cpython-36.pyc
Normal file
Binary file not shown.
@@ -5,7 +5,7 @@
 <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
 <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
 <ProjectHome>.</ProjectHome>
-<StartupFile>output_confusion_matrix.py</StartupFile>
+<StartupFile>speaker_based.py</StartupFile>
 <SearchPath>
 </SearchPath>
 <WorkingDirectory>.</WorkingDirectory>
@@ -22,6 +22,8 @@
 <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
 </PropertyGroup>
 <ItemGroup>
+<Compile Include="data_io.py" />
+<Compile Include="data_manipulation.py" />
 <Compile Include="manipulate_db.py">
 <SubType>Code</SubType>
 </Compile>
@@ -29,9 +31,6 @@
 <SubType>Code</SubType>
 </Compile>
 <Compile Include="classifier.py" />
-<Compile Include="dataManipulation.py">
-<SubType>Code</SubType>
-</Compile>
 <Compile Include="output_confusion_matrix.py">
 <SubType>Code</SubType>
 </Compile>
@@ -53,7 +52,6 @@
 <Compile Include="word_based.py">
 <SubType>Code</SubType>
 </Compile>
-<Compile Include="dataIO.py" />
 </ItemGroup>
 <ItemGroup>
 <Content Include="config.ini" />
@@ -1,6 +1,5 @@
 import os
 import sys
-import configparser
 
 import numpy as np
 import pypyodbc
@@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module)
 from forced_alignment import forced_alignment
 
 
-## check if forced-alignment work in each sentence
+## delete all automatically generated pronunciations
 #from forced_alignment import pronunciations
 #pronunciations.delete_all_g2p_entries()
 
-#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
-#script_file = script_dir + '\\script10.txt'
-#with open(script_file, 'r') as fin:
-#    script = fin.readline()
-#fa = forced_alignment(wav_file, script)
-
 
 ## make database connection
 param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
267
accent_classification/speaker_based.py
Normal file
@@ -0,0 +1,267 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
curr_dir = repo_dir + '\\accent_classification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
import data_manipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


## ======================= user define =======================
sentence_num_max = 10
config_file = curr_dir + '\\config.ini'
output_dir = repo_dir + '\\output'

# make train/test set: 1, load: 0
make_train_test_set = 0

# specify which experiment to be performed.
# - 3: groninven vs oost_overijssel vs limburg
# - 2: groningen vs limburg
experiment_type = 2

region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
region_labels2 = ['Groningen_and_Drenthe', 'Limburg']


## ======================= data preparation =======================

## load variables from the ini file
config = configparser.ConfigParser()
config.sections()
config.read(config_file)
MDB_file = config['sentence_based']['fileMDB']


## connect to the database
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
del SQL_string, rows


## get the list of pronunciation variant (pronvarList) from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows

conn.close()


## make list of LabelBinarizer object per word for X (=pronunciation variant).
LB_list = []
unique_wordID_list = data[:, 3].astype(int)
unique_wordID_max = max(unique_wordID_list)
for unique_wordID in range(1, unique_wordID_max+1):
    pronvar = data[unique_wordID_list == unique_wordID, 7]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LB_list.append(LB)


## make LabelEncorder/LabelBinilizer objects for y (=region).
LE_y3 = preprocessing.LabelEncoder()
LE_y3.fit(region_labels3)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(region_labels2)

LB_y3 = preprocessing.LabelBinarizer()
LB_y3.fit(region_labels3)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(region_labels2)

del unique_wordID, unique_wordID_max, pronvar, LB



## ======================= make train/eval/test set or load =======================

## find the smallest group to balance the number of samples per group.
pidlist3 = np.unique(data[:, (1, 2)], axis=0)
pidlist3_counter = Counter(pidlist3[:, 1])
sample_num_max = min(pidlist3_counter.values())
del pidlist3_counter


## make train/eval/test set or load them.

if make_train_test_set==1:
    pidlist3_train = []
    pidlist3_eval = []
    pidlist3_test = []
    for region_num in range(0, len(region_labels3)):
        region_name = region_labels3[region_num]

        pidlist3_per_region_ = pidlist3[pidlist3[:, 1]==region_labels3[region_num], :]
        pidlist3_per_region, idx = mani.extractRandomSample(
            pidlist3_per_region_, sample_num_max)

        # split dataset into train, eval and test.
        [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
            pidlist3_per_region, test_size = 0.2, random_state = 0)
        [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
            pidlist3_per_region_train, test_size = 0.1, random_state = 0)

        # append numpy arrays.
        if region_num == 0:
            pidlist3_train = pidlist3_per_region_train
            pidlist3_eval = pidlist3_per_region_eval
            pidlist3_test = pidlist3_per_region_test
        else:
            pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
            pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
            pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]
    del region_num, region_name
    del pidlist3_per_region_, pidlist3_per_region, idx
    del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test
    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)


    if experiment_type == 2:
        pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]

        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
        np.save(output_dir + "\\pidlist2_test", pidlist2_test)

        del pidlist2_train_
else:
    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")

    if experiment_type == 2:
        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")


## extract corresponding data using pid

data3_train = sb_func.extractPid(pidlist3_train, data)
data3_eval = sb_func.extractPid(pidlist3_eval, data)
data3_test = sb_func.extractPid(pidlist3_test, data)

if experiment_type == 2:
    data2 = np.array(data)
    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


## ======================= experiments =======================

## specify the dataset

# train vs eval
#trainData = data3_train
#testData = data3_eval
#testPID = pidlist3_eval
#LB = LB_y3
#LE = LE_y3
#region_labels = region_labels3

# train+eval vs test
if experiment_type == 3:
    trainData = np.r_[data3_train, data3_eval]
    testData = data3_test
    testPID = pidlist3_test
    LB = LB_y3
    LE = LE_y3
    region_labels = region_labels3

elif experiment_type == 2:
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    region_labels = region_labels2

## check the number of utterance
#data_all = np.r_[trainData, testData]
#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
#filenames_unique = np.unique(filenames, axis=0)
#Counter(filenames_unique[:, 1])


## output filenames
fileComparison = output_dir + "\\algorithm_comparison.csv"
filePerformance = output_dir + "\\sentence-level.csv"
fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"


## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)


## train sentence-level classifiers.
model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
    trainData, LB_list, LE, filePerformance)


## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)


## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)


## confusion matrix
confusionMatrix_majority = confusion_matrix(
    pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
np.save(output_dir + "\\confusion_matrix2.npy", cm)
@@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import confusion_matrix
 
-import dataManipulation as mani
+import data_manipulation as mani
 import evaluation as eval
 
 
@@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
     return np.array(prediction_per_pid)
 
 
-def saxon_vs_limburg(pidlist3):
-    """convert a pidlist for 3 regions into that for 2 regions.
-
-    Notes:
-        3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
-        2 regions include ['Limburg', 'Low_Saxon']
-        where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'
-        samples are randomly chosen so that each class has the same amount of data.
-
-    """
-
-    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
-    regionLabels2 = ['Low_Saxon', 'Limburg']
-
-    index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
-    pidlist_saxon_ = pidlist3[index_saxon, :]
-    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
-
-    # extract the same amout of samples as Limburg.
-    pidlistCounter3 = Counter(pidlist3[:, 1])
-    pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
-    pidlist_saxon[:, 1] = regionLabels2[0]
-
-    pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
-    #pidlistCounter2 = Counter(pidlist2[:, 1])
-    return pidlist2
-
-
 def groningen_vs_limburg(pidlist3):
     """convert a pidlist for 3 regions into that for 2 regions.
 
@@ -374,7 +346,7 @@ def groningen_vs_limburg(pidlist3):
         2 regions include ['Groningen_and_Drenthe', 'Limburg']
 
     """
-    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
+    regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
 
     pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
     pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
@@ -1,326 +0,0 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


#####################
## USER DEFINE ##
#####################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'

# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0

# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2

regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
if experiment_type == 1:
    regionLabels2 = ['Low_Saxon', 'Limburg']
    regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']


##########################
## DATA PREPARATION ##
##########################

## load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']


## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows


## make list of LabelBinarizer object per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows


LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LBlist.append(LB)

# for y (=region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)

LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)

del uniqueWordID, uniqueWordIDmax, pronvar, LB


#################
## ITERATION ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)

# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter


## make train/eval/test set or load
if makeTrainTestSet==1:
    pidlist_train = []
    pidlist_eval = []
    pidlist_test = []
    for regionNum in range(0, len(regionLabels)):
        regionName = regionLabels[regionNum]

        pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
        pidlist_per_region, idx = mani.extractRandomSample(
            pidlist_per_region_, sampleNumMax)

        # split dataset into train, eval and test.
        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
            pidlist_per_region, test_size = 0.2, random_state = 0)
        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
            pidlist_per_region_train, test_size = 0.1, random_state = 0)

        # append numpy arrays
        if regionNum == 0:
            pidlist_train = pidlist_per_region_train
            pidlist_eval = pidlist_per_region_eval
            pidlist_test = pidlist_per_region_test
        else:
            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
    del regionNum, regionName
    del pidlist_per_region_, pidlist_per_region, idx
    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")


## make dataset for 2 regions or load
if conv3to2region==1:
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if experiment_type == 1:
        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)

    elif experiment_type == 2:
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)

    del pidlist2_train_
else:
    if experiment_type == 1:
        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")

    elif experiment_type == 2:
        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")


## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train = sb_func.extractPid(pidlist_train, data)
    data_eval = sb_func.extractPid(pidlist_eval, data)
    data_test = sb_func.extractPid(pidlist_test, data)

elif experiment_type == 1 or experiment_type == 2:
    data2 = np.array(data)

    if experiment_type == 1:
        for row, row2 in zip(data, data2):
            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
                row2[2] = regionLabels2[0]

    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


#####################################
## EXPERIMENTS START FROM HERE ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData = np.r_[data_train, data_eval]
    testData = data_test
    testPID = pidlist_test
    LB = LB_y
    LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
    # 2 region: saxon vs limburg/ groningen vs limburg
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    regionLabels = regionLabels2


# check the number of utterance
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])


fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)

### confusion matrix
if experiment_type == 0:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])

#confusionMatrix_weighted = confusion_matrix(
#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)

#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()


##### iteration finish #####
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
BIN
output/confusion_matrix_2regions.npy
Normal file
Binary file not shown.
BIN
output/confusion_matrix_2regions.png
Normal file
Binary file not shown.
BIN
output/confusion_matrix_2regions_normalized.png
Normal file
Binary file not shown.
BIN
output/confusion_matrix_3regions.npy
Normal file
Binary file not shown.
BIN
output/confusion_matrix_3regions.png
Normal file
Binary file not shown.
BIN
output/confusion_matrix_3regions_normalized.png
Normal file
Binary file not shown.
BIN
output/pidlist_2regions_test.npy
Normal file
Binary file not shown.
BIN
output/pidlist_2regions_train.npy
Normal file
Binary file not shown.
BIN
output/pidlist_3regions_eval.npy
Normal file
Binary file not shown.
BIN
output/pidlist_3regions_test.npy
Normal file
Binary file not shown.
BIN
output/pidlist_3regions_train.npy
Normal file
Binary file not shown.
BIN
output/pred_per_pid_2regions.npy
Normal file
Binary file not shown.
BIN
output/pred_per_pid_3regions.npy
Normal file
Binary file not shown.