cleaned up the INTERSPEECH-related code.
This commit is contained in:
parent
a1379caced
commit
eb65543781
BIN
.vs/accent_classification/v15/.suo
Normal file
Binary file not shown.
@@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio 15
 VisualStudioVersion = 15.0.26730.12
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
     ProjectSection(SolutionItems) = preProject
         ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
@@ -20,6 +18,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
         ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
     EndProjectSection
 EndProject
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "accent_classification", "accent_classification\accent_classification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
+EndProject
 Global
     GlobalSection(SolutionConfigurationPlatforms) = preSolution
         Debug|Any CPU = Debug|Any CPU
BIN
accent_classification/__pycache__/evaluation.cpython-36.pyc
Normal file
Binary file not shown.
@@ -5,7 +5,7 @@
   <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
   <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
   <ProjectHome>.</ProjectHome>
-  <StartupFile>output_confusion_matrix.py</StartupFile>
+  <StartupFile>speaker_based.py</StartupFile>
   <SearchPath>
   </SearchPath>
   <WorkingDirectory>.</WorkingDirectory>
@@ -22,6 +22,8 @@
   <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
 </PropertyGroup>
 <ItemGroup>
+  <Compile Include="data_io.py" />
+  <Compile Include="data_manipulation.py" />
   <Compile Include="manipulate_db.py">
     <SubType>Code</SubType>
   </Compile>
@@ -29,9 +31,6 @@
     <SubType>Code</SubType>
   </Compile>
   <Compile Include="classifier.py" />
-  <Compile Include="dataManipulation.py">
-    <SubType>Code</SubType>
-  </Compile>
   <Compile Include="output_confusion_matrix.py">
     <SubType>Code</SubType>
   </Compile>
@@ -53,7 +52,6 @@
   <Compile Include="word_based.py">
     <SubType>Code</SubType>
   </Compile>
-  <Compile Include="dataIO.py" />
 </ItemGroup>
 <ItemGroup>
   <Content Include="config.ini" />
@@ -1,6 +1,5 @@
 import os
 import sys
 import configparser

 import numpy as np
 import pypyodbc
@@ -20,16 +19,10 @@ sys.path.append(forced_alignment_module)
 from forced_alignment import forced_alignment


 ## check if forced-alignment works in each sentence
 ## delete all automatically generated pronunciations
 #from forced_alignment import pronunciations
 #pronunciations.delete_all_g2p_entries()

 #wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
 #script_file = script_dir + '\\script10.txt'
 #with open(script_file, 'r') as fin:
 #    script = fin.readline()
 #fa = forced_alignment(wav_file, script)


 ## make database connection
 param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
267
accent_classification/speaker_based.py
Normal file
@@ -0,0 +1,267 @@
+import os
+import sys
+import configparser
+
+import pypyodbc
+import numpy as np
+from collections import Counter
+import matplotlib.pyplot as plt
+
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import cross_val_score
+from sklearn import preprocessing
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+
+repo_dir = 'C:\\Users\\Aki\\source\\repos\\accent_classification'
+curr_dir = repo_dir + '\\accent_classification'
+sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
+import data_manipulation as mani
+import evaluation as eval
+import speaker_based_functions as sb_func
+
+
+## ======================= user define =======================
+sentence_num_max = 10
+config_file = curr_dir + '\\config.ini'
+output_dir = repo_dir + '\\output'
+
+# make train/test set: 1, load: 0
+make_train_test_set = 0
+
+# specify which experiment is to be performed:
+# - 3: groningen vs oost_overijssel vs limburg
+# - 2: groningen vs limburg
+experiment_type = 2
+
+region_labels3 = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
+region_labels2 = ['Groningen_and_Drenthe', 'Limburg']
+
+
+## ======================= data preparation =======================
+
+## load variables from the ini file
+config = configparser.ConfigParser()
+config.sections()
+config.read(config_file)
+MDB_file = config['sentence_based']['fileMDB']
+
+
+## connect to the database
+pypyodbc.lowercase = False
+param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + MDB_file + ";"
+conn = pypyodbc.connect(param)
+cursor = conn.cursor()
+
+
+## get data from Access database
+# data format
+# 0: filename
+# 1: pid
+# 2: region
+# 3: ID (unique word_id)
+# 4: sentence_id
+# 5: word_id
+# 6: word
+# 7: pronunciation
+SQL_string = """\
+{CALL dataset_with_cities}
+"""
+cursor.execute(SQL_string)
+
+rows = cursor.fetchall()
+data = np.array(rows)
+del SQL_string, rows
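For reference, the column layout above maps directly onto the numpy indexing used in the rest of the script; a small sketch (word ID 5 is hypothetical):

    # data[:, 1] -> pid, data[:, 2] -> region,
    # data[:, 3] -> unique word ID, data[:, 7] -> pronunciation
    word5_pronunciations = data[data[:, 3].astype(int) == 5, 7]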
+
+
+## get the list of pronunciation variants (pronvarList) from the Access database
+# pronvarList format
+# 0: ID (unique word_id)
+# 1: word
+# 2: pronvar
+SQL_string = """\
+{CALL pronunciation_variant}
+"""
+cursor.execute(SQL_string)
+rows = cursor.fetchall()
+pronvarList = np.array(rows)
+del SQL_string, rows
+
+conn.close()
+
+
+## make a list of LabelBinarizer objects, one per word, for X (=pronunciation variant).
+LB_list = []
+unique_wordID_list = data[:, 3].astype(int)
+unique_wordID_max = max(unique_wordID_list)
+for unique_wordID in range(1, unique_wordID_max+1):
+    pronvar = data[unique_wordID_list == unique_wordID, 7]
+    LB = preprocessing.LabelBinarizer()
+    LB.fit(np.unique(pronvar))
+    LB_list.append(LB)
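Each word gets its own LabelBinarizer so that its pronunciation variants can be one-hot encoded independently of every other word. A minimal sketch with made-up variants of a single word:

    from sklearn import preprocessing
    import numpy as np

    variants = np.array(['w@rt', 'wOrt', 'wort'])  # hypothetical variants of one word
    lb = preprocessing.LabelBinarizer()
    lb.fit(np.unique(variants))
    print(lb.classes_)             # ['w@rt' 'wOrt' 'wort']
    print(lb.transform(['wort']))  # [[0 0 1]]
    # note: for a word with only two variants, LabelBinarizer
    # returns a single 0/1 column instead of a one-hot row.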
+
+
+## make LabelEncoder/LabelBinarizer objects for y (=region).
+LE_y3 = preprocessing.LabelEncoder()
+LE_y3.fit(region_labels3)
+LE_y2 = preprocessing.LabelEncoder()
+LE_y2.fit(region_labels2)
+
+LB_y3 = preprocessing.LabelBinarizer()
+LB_y3.fit(region_labels3)
+LB_y2 = preprocessing.LabelBinarizer()
+LB_y2.fit(region_labels2)
+
+del unique_wordID, unique_wordID_max, pronvar, LB
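The two encoder families differ only in output shape: LabelEncoder yields integer class indices, LabelBinarizer yields indicator columns, which is what makes vote counting convenient later. For instance:

    LE_y2.transform(['Limburg'])  # array([1])   -- integer index
    LB_y2.transform(['Limburg'])  # array([[1]]) -- two-class case collapses to one 0/1 column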
+
+
+
+## ======================= make train/eval/test set or load =======================
+
+## find the smallest group to balance the number of samples per group.
+pidlist3 = np.unique(data[:, (1, 2)], axis=0)
+pidlist3_counter = Counter(pidlist3[:, 1])
+sample_num_max = min(pidlist3_counter.values())
+del pidlist3_counter
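extractRandomSample is defined in data_manipulation.py, which this diff does not show; judging from its call sites it subsamples n rows and also returns the chosen indices. A plausible sketch, assuming exactly that contract:

    import numpy as np

    def extract_random_sample(arr, n, seed=None):
        # hypothetical stand-in for mani.extractRandomSample:
        # draw n rows without replacement, return (sample, indices)
        rng = np.random.RandomState(seed)
        idx = rng.choice(arr.shape[0], size=n, replace=False)
        return arr[idx, :], idx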
+
+
+## make train/eval/test set or load them.
+
+if make_train_test_set==1:
+    pidlist3_train = []
+    pidlist3_eval = []
+    pidlist3_test = []
+    for region_num in range(0, len(region_labels3)):
+        region_name = region_labels3[region_num]
+
+        pidlist3_per_region_ = pidlist3[pidlist3[:, 1]==region_labels3[region_num], :]
+        pidlist3_per_region, idx = mani.extractRandomSample(
+            pidlist3_per_region_, sample_num_max)
+
+        # split dataset into train, eval and test.
+        [pidlist3_per_region_train, pidlist3_per_region_test] = train_test_split(
+            pidlist3_per_region, test_size = 0.2, random_state = 0)
+        [pidlist3_per_region_train, pidlist3_per_region_eval] = train_test_split(
+            pidlist3_per_region_train, test_size = 0.1, random_state = 0)
+
+        # append numpy arrays.
+        if region_num == 0:
+            pidlist3_train = pidlist3_per_region_train
+            pidlist3_eval = pidlist3_per_region_eval
+            pidlist3_test = pidlist3_per_region_test
+        else:
+            pidlist3_train = np.r_[pidlist3_train, pidlist3_per_region_train]
+            pidlist3_eval = np.r_[pidlist3_eval, pidlist3_per_region_eval]
+            pidlist3_test = np.r_[pidlist3_test, pidlist3_per_region_test]
+    del region_num, region_name
+    del pidlist3_per_region_, pidlist3_per_region, idx
+    del pidlist3_per_region_train, pidlist3_per_region_eval, pidlist3_per_region_test
+    np.save(output_dir + "\\pidlist3_train.npy", pidlist3_train)
+    np.save(output_dir + "\\pidlist3_eval.npy", pidlist3_eval)
+    np.save(output_dir + "\\pidlist3_test.npy", pidlist3_test)
+
+
+    if experiment_type == 2:
+        pidlist2_train_ = np.r_[pidlist3_train, pidlist3_eval]
+
+        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
+        pidlist2_test = sb_func.groningen_vs_limburg(pidlist3_test)
+        np.save(output_dir + "\\pidlist2_train", pidlist2_train)
+        np.save(output_dir + "\\pidlist2_test", pidlist2_test)
+
+        del pidlist2_train_
+else:
+    pidlist3_train = np.load(output_dir + "\\pidlist3_train.npy")
+    pidlist3_eval = np.load(output_dir + "\\pidlist3_eval.npy")
+    pidlist3_test = np.load(output_dir + "\\pidlist3_test.npy")
+
+    if experiment_type == 2:
+        pidlist2_train = np.load(output_dir + "\\pidlist2_train.npy")
+        pidlist2_test = np.load(output_dir + "\\pidlist2_test.npy")
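The nested train_test_split calls above give each region roughly 72% train, 8% eval and 20% test: test_size=0.2 takes the test share first, and test_size=0.1 then carves the eval share out of the remaining 80% (0.8 * 0.1 = 0.08, 0.8 * 0.9 = 0.72).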
+
+
+## extract corresponding data using pid
+
+data3_train = sb_func.extractPid(pidlist3_train, data)
+data3_eval = sb_func.extractPid(pidlist3_eval, data)
+data3_test = sb_func.extractPid(pidlist3_test, data)
+
+if experiment_type == 2:
+    data2 = np.array(data)
+    data2_train = sb_func.extractPid(pidlist2_train, data2)
+    data2_test = sb_func.extractPid(pidlist2_test, data2)
+
+
+## ======================= experiments =======================
+
+## specify the dataset
+
+# train vs eval
+#trainData = data3_train
+#testData = data3_eval
+#testPID = pidlist3_eval
+#LB = LB_y3
+#LE = LE_y3
+#region_labels = region_labels3
+
+# train+eval vs test
+if experiment_type == 3:
+    trainData = np.r_[data3_train, data3_eval]
+    testData = data3_test
+    testPID = pidlist3_test
+    LB = LB_y3
+    LE = LE_y3
+    region_labels = region_labels3
+
+elif experiment_type == 2:
+    trainData = data2_train
+    testData = data2_test
+    testPID = pidlist2_test
+    LB = LB_y2
+    LE = LE_y2
+    region_labels = region_labels2
+
+## check the number of utterances
+#data_all = np.r_[trainData, testData]
+#filenames = np.c_[data_all[:, 0], data_all[:, 2]]
+#filenames_unique = np.unique(filenames, axis=0)
+#Counter(filenames_unique[:, 1])
+
+
+## output filenames
+fileComparison = output_dir + "\\algorithm_comparison.csv"
+filePerformance = output_dir + "\\sentence-level.csv"
+fileConfusionMatrix = output_dir + "\\confusion_matrix.csv"
+
+
+## compare classification algorithms for the sentence-level classifiers.
+#sb_func.compare_sentence_level_classifiers(trainData, LB_list, LE, fileComparison)
+
+
+## train sentence-level classifiers.
+model_list, score_list, confusion_matrix_list = sb_func.train_sentence_level_classifiers(
+    trainData, LB_list, LE, filePerformance)
+
+
+## prediction over evaluation data for each sentence-level classifier.
+pred_per_sentence = sb_func.prediction_per_sentence(testData, model_list, LB_list, LE)
+
+
+## combine sentence-level classifiers
+pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
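prediction_per_pid_majority lives in speaker_based_functions.py; the idea is a per-speaker majority vote over the sentence-level predictions. A hedged sketch, assuming pred_per_sentence rows carry (pid, true_region, predicted_region) and testPID rows carry (pid, region):

    from collections import Counter
    import numpy as np

    def majority_vote_per_pid(pid_list, pred_per_sentence):
        # hypothetical equivalent of sb_func.prediction_per_pid_majority
        out = []
        for pid, true_region in pid_list:
            votes = pred_per_sentence[pred_per_sentence[:, 0] == pid, 2]
            winner = Counter(votes).most_common(1)[0][0]
            out.append([pid, true_region, winner])
        return np.array(out)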
+
+
+## confusion matrix
+confusionMatrix_majority = confusion_matrix(
+    pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=region_labels)
+
+
+## output
+accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
+print('accuracy: {}%'.format(accuracy * 100))
+
+cm = confusionMatrix_majority
+print(cm)
+
+np.save(output_dir + "\\pred_per_pid2.npy", pred_per_pid_majority)
+np.save(output_dir + "\\confusion_matrix2.npy", cm)
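matplotlib is imported above but unused in this listing; the *_normalized.png confusion matrices under output/ were presumably rendered along these lines (a sketch, not the repo's actual plotting code, which likely sits in output_confusion_matrix.py):

    # row-normalize: each row (true region) sums to 1
    cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)

    fig, ax = plt.subplots()
    im = ax.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_xticks(range(len(region_labels)))
    ax.set_yticks(range(len(region_labels)))
    ax.set_xticklabels(region_labels, rotation=45, ha='right')
    ax.set_yticklabels(region_labels)
    ax.set_xlabel('predicted region')
    ax.set_ylabel('true region')
    fig.colorbar(im)
    fig.tight_layout()
    fig.savefig(output_dir + "\\confusion_matrix_normalized.png")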
@@ -14,7 +14,7 @@ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import confusion_matrix

-import dataManipulation as mani
+import data_manipulation as mani
 import evaluation as eval


@@ -338,34 +338,6 @@ def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
     return np.array(prediction_per_pid)


-def saxon_vs_limburg(pidlist3):
-    """convert a pidlist for 3 regions into that for 2 regions.
-
-    Notes:
-        3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
-        2 regions include ['Limburg', 'Low_Saxon']
-        where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'
-        samples are randomly chosen so that each class has the same amount of data.
-
-    """
-
-    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
-    regionLabels2 = ['Low_Saxon', 'Limburg']
-
-    index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
-    pidlist_saxon_ = pidlist3[index_saxon, :]
-    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
-
-    # extract the same amout of samples as Limburg.
-    pidlistCounter3 = Counter(pidlist3[:, 1])
-    pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
-    pidlist_saxon[:, 1] = regionLabels2[0]
-
-    pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
-    #pidlistCounter2 = Counter(pidlist2[:, 1])
-    return pidlist2
-
-
 def groningen_vs_limburg(pidlist3):
     """convert a pidlist for 3 regions into that for 2 regions.

@@ -374,7 +346,6 @@ def groningen_vs_limburg(pidlist3):
         2 regions include ['Groningen_and_Drenthe', 'Limburg']

     """
-    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
+    regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
-
     pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
     pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
@@ -1,326 +0,0 @@
-import os
-import sys
-import configparser
-
-import pypyodbc
-import numpy as np
-from collections import Counter
-import matplotlib.pyplot as plt
-
-from sklearn.model_selection import train_test_split
-from sklearn.model_selection import cross_val_score
-from sklearn import preprocessing
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import accuracy_score
-
-currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
-sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
-import dataManipulation as mani
-import evaluation as eval
-import speaker_based_functions as sb_func
-
-
-#####################
-##  USER DEFINE    ##
-#####################
-sentenceNumMax = 10
-configFile = currDir + '\\config.ini'
-dirOut = currDir + '\\result'
-
-# make train/test set: 1, load: 0
-makeTrainTestSet = 0
-# convert 3 regions to 2 regions: 1, load: 0
-conv3to2region = 0
-
-# 3 regions: 0
-# saxon vs limburg: 1
-# groningen vs limburg: 2
-experiment_type = 2
-
-regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
-
-# a bit useless error handling.
-#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
-if experiment_type == 1:
-    regionLabels2 = ['Low_Saxon', 'Limburg']
-regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
-
-
-##########################
-##  DATA PREPARATION    ##
-##########################
-
-## load init file
-config = configparser.ConfigParser()
-config.sections()
-config.read(configFile)
-dirFeature = config['sentence_based']['dirFeature']
-fileMDB = config['sentence_based']['fileMDB']
-
-
-## database connection
-pypyodbc.lowercase = False
-param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
-conn = pypyodbc.connect(param)
-cursor = conn.cursor()
-
-
-## get data from Access database
-# data format
-# 0: filename
-# 1: pid
-# 2: region
-# 3: ID (unique word_id)
-# 4: sentence_id
-# 5: word_id
-# 6: word
-# 7: pronunciation
-SQL_string = """\
-{CALL dataset_with_cities}
-"""
-cursor.execute(SQL_string)
-
-rows = cursor.fetchall()
-data = np.array(rows)
-#dataNumMax = data.shape[0]
-#uniqueWordIDmax = max(data[:, 3].astype(int))
-del SQL_string, rows
-
-
-## make list of LabelBinarizer object per word.
-# for X
-# get pronvarList from Access database
-# pronvarList format
-# 0: ID (unique word_id)
-# 1: word
-# 2: pronvar
-SQL_string = """\
-{CALL pronunciation_variant}
-"""
-cursor.execute(SQL_string)
-rows = cursor.fetchall()
-pronvarList = np.array(rows)
-del SQL_string, rows
-
-
-LBlist = []
-#uniqueWordIDlist = pronvarList[:, 0].astype(int)
-uniqueWordIDlist = data[:, 3].astype(int)
-uniqueWordIDmax = max(uniqueWordIDlist)
-for uniqueWordID in range(1, uniqueWordIDmax+1):
-    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
-    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
-    LB = preprocessing.LabelBinarizer()
-    LB.fit(np.unique(pronvar))
-    LBlist.append(LB)
-
-# for y (=region)
-LE_y = preprocessing.LabelEncoder()
-LE_y.fit(regionLabels)
-LE_y2 = preprocessing.LabelEncoder()
-LE_y2.fit(regionLabels2)
-
-LB_y = preprocessing.LabelBinarizer()
-LB_y.fit(regionLabels)
-LB_y2 = preprocessing.LabelBinarizer()
-LB_y2.fit(regionLabels2)
-
-del uniqueWordID, uniqueWordIDmax, pronvar, LB
-
-
-#################
-##  ITERATION ##
-#################
-#CM_majority = np.zeros((1, 9)).astype(int)
-#CM_weighted = np.zeros((1, 9)).astype(int)
-#for iter in range(0, 1):
-#    print(iter)
-
-## make balanced dataset
-pidlist = np.unique(data[:, (1, 2)], axis=0)
-
-# count number of samples
-pidlistCounter = Counter(pidlist[:, 1])
-sampleNumMax = min(pidlistCounter.values())
-del pidlistCounter
-
-
-## make train/eval/test set or load
-if makeTrainTestSet==1:
-    pidlist_train = []
-    pidlist_eval = []
-    pidlist_test = []
-    for regionNum in range(0, len(regionLabels)):
-        regionName = regionLabels[regionNum]
-
-        pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
-        pidlist_per_region, idx = mani.extractRandomSample(
-            pidlist_per_region_, sampleNumMax)
-
-        # split dataset into train, eval and test.
-        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
-            pidlist_per_region, test_size = 0.2, random_state = 0)
-        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
-            pidlist_per_region_train, test_size = 0.1, random_state = 0)
-
-        # append numpy arrays
-        if regionNum == 0:
-            pidlist_train = pidlist_per_region_train
-            pidlist_eval = pidlist_per_region_eval
-            pidlist_test = pidlist_per_region_test
-        else:
-            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
-            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
-            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
-    del regionNum, regionName
-    del pidlist_per_region_, pidlist_per_region, idx
-    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
-    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
-    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
-    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
-else:
-    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
-    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
-    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")
-
-
-## make dataset for 2 regions or load
-if conv3to2region==1:
-    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
-
-    if experiment_type == 1:
-        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
-        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
-        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
-        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
-
-    elif experiment_type == 2:
-        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
-        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
-        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
-        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
-
-    del pidlist2_train_
-else:
-    if experiment_type == 1:
-        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
-        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
-
-    elif experiment_type == 2:
-        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
-        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
-
-
-## train/test data
-if experiment_type == 0:
-    # Groningen vs Overijsel vs Limburg
-    data_train = sb_func.extractPid(pidlist_train, data)
-    data_eval = sb_func.extractPid(pidlist_eval, data)
-    data_test = sb_func.extractPid(pidlist_test, data)
-
-elif experiment_type == 1 or experiment_type == 2:
-    data2 = np.array(data)
-
-    if experiment_type == 1:
-        for row, row2 in zip(data, data2):
-            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
-                row2[2] = regionLabels2[0]
-
-    data2_train = sb_func.extractPid(pidlist2_train, data2)
-    data2_test = sb_func.extractPid(pidlist2_test, data2)
-
-
-#####################################
-##  EXPERIMENTS START FROM HERE ##
-#####################################
-
-## actual training
-# train vs eval
-#trainData = data_train
-#testData = data_eval
-#testPID = pidlist_eval
-#LB = LB_y
-#LE = LE_y
-#regionLabels = regionLabels3
-
-# train+eval vs test
-if experiment_type == 0:
-    trainData = np.r_[data_train, data_eval]
-    testData = data_test
-    testPID = pidlist_test
-    LB = LB_y
-    LE = LE_y
-elif experiment_type == 1 or experiment_type == 2:
-    # 2 region: saxon vs limburg/ groningen vs limburg
-    trainData = data2_train
-    testData = data2_test
-    testPID = pidlist2_test
-    LB = LB_y2
-    LE = LE_y2
-    regionLabels = regionLabels2
-
-
-# check the number of utterance
-allData = np.r_[trainData, testData]
-filenames = np.c_[allData[:, 0], allData[:, 2]]
-filenames_unique = np.unique(filenames, axis=0)
-Counter(filenames_unique[:, 1])
-
-
-fileComparison = dirOut + "\\algorithm_comparison.csv"
-filePerformance = dirOut + "\\sentence-level.csv"
-fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
-
-## compare classification algorithms for the sentence-classifiers.
-#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
-
-## train sentence-level classifiers.
-modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
-    trainData, LBlist, LE, filePerformance)
-
-## prediction over evaluation data per each sentence-level classifier.
-pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
-
-## combine sentence-level classifiers
-pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
-
-## majority vote (weighted)
-#weight = sb_func.calc_weight(confusionMatrixList)
-#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
-
-### confusion matrix
-if experiment_type == 0:
-    confusionMatrix_majority = confusion_matrix(
-        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
-else:
-    confusionMatrix_majority = confusion_matrix(
-        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
-
-#confusionMatrix_weighted = confusion_matrix(
-#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
-
-
-## output
-accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
-print('accuracy: {}%'.format(accuracy * 100))
-
-cm = confusionMatrix_majority
-print(cm)
-
-np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
-np.save(dirOut + "\\confusion_matrix.npy", cm)
-
-#fout = open(fileConfusionMatrix, "w")
-#fout.write('< confusion matrix for majority vote in evaluation set >\n')
-#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
-#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
-#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
-#fout.write('\n')
-#fout.close()
-
-
-##### iteration finish #####
-conn.close()
-#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
-#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
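The commented-out weighted vote in the deleted script above (calc_weight / prediction_per_pid_weighted, both defined in speaker_based_functions.py and not shown in this diff) presumably scales each sentence classifier's vote by a reliability weight before taking the argmax. A rough sketch of that idea, under those assumptions:

    import numpy as np

    def weighted_vote(votes_onehot, weights):
        # hypothetical sketch, not the repo's implementation
        # votes_onehot: (n_sentences, n_regions) one-hot votes, LB_y style
        # weights:      (n_sentences,) reliability weight per sentence classifier
        scores = (votes_onehot * weights[:, None]).sum(axis=0)
        return int(scores.argmax())  # index of the winning region, LE_y style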
BIN
output/confusion_matrix_2regions.npy
Normal file
Binary file not shown.
BIN
output/confusion_matrix_2regions.png
Normal file
Binary file not shown.
BIN
output/confusion_matrix_2regions_normalized.png
Normal file
Binary file not shown.
BIN
output/confusion_matrix_3regions.npy
Normal file
Binary file not shown.
BIN
output/confusion_matrix_3regions.png
Normal file
Binary file not shown.
BIN
output/confusion_matrix_3regions_normalized.png
Normal file
Binary file not shown.
BIN
output/pidlist_2regions_test.npy
Normal file
Binary file not shown.
BIN
output/pidlist_2regions_train.npy
Normal file
Binary file not shown.
BIN
output/pidlist_3regions_eval.npy
Normal file
Binary file not shown.
BIN
output/pidlist_3regions_test.npy
Normal file
Binary file not shown.
BIN
output/pidlist_3regions_train.npy
Normal file
Binary file not shown.
BIN
output/pred_per_pid_2regions.npy
Normal file
Binary file not shown.
BIN
output/pred_per_pid_3regions.npy
Normal file
Binary file not shown.