commit to be sure.
commit a1379caced
38 dialect_identification.sln Normal file
@@ -0,0 +1,38 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
    ProjectSection(SolutionItems) = preProject
        ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
        ..\..\forced-alignment\forced_alignment\defaultfiles.py = ..\..\forced-alignment\forced_alignment\defaultfiles.py
        ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj
        ..\..\forced-alignment\forced_alignment\htk_dict.py = ..\..\forced-alignment\forced_alignment\htk_dict.py
        ..\..\forced-alignment\forced_alignment\lexicon.py = ..\..\forced-alignment\forced_alignment\lexicon.py
        ..\..\forced-alignment\forced_alignment\mlf.py = ..\..\forced-alignment\forced_alignment\mlf.py
        ..\..\forced-alignment\forced_alignment\pronunciations.py = ..\..\forced-alignment\forced_alignment\pronunciations.py
        ..\..\forced-alignment\forced_alignment\pyhtk.py = ..\..\forced-alignment\forced_alignment\pyhtk.py
        ..\..\forced-alignment\forced_alignment\scripts.py = ..\..\forced-alignment\forced_alignment\scripts.py
        ..\..\forced-alignment\forced_alignment\tempfilename.py = ..\..\forced-alignment\forced_alignment\tempfilename.py
        ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
    EndProjectSection
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|Any CPU = Debug|Any CPU
        Release|Any CPU = Release|Any CPU
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B}
    EndGlobalSection
EndGlobal
90 dialect_identification/audio2db.py Normal file
@@ -0,0 +1,90 @@
import os
import sys
import configparser

import numpy as np
import pypyodbc


## user define
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance'
wav_dir = dir_same_utterance + '\\wav_with_cities'
script_dir = dir_same_utterance + '\\script'
fileMDB = dir_same_utterance + '\\feature\\DialectClassification.accdb'
table = 'ForcedAlignmentResult'
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# these lines are not necessary once forced-alignment is installed as a package.
sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment


## check if forced-alignment works on each sentence
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()

#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
#    script = fin.readline()
#fa = forced_alignment(wav_file, script)


## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) '


## forced-alignment to all the wav files in dir_same_utterance
word_id_start = 1
for sentenceID in range(1, 11):
    sentenceIDstr = format(sentenceID, '02')

    # get script
    script_file = script_dir + '\\script' + sentenceIDstr + '.txt'
    with open(script_file, 'r') as fin:
        script = fin.readline()

    # loop over three regions
    for region in regionLabels:

        # loop over the wav_subdir
        wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region
        wav_files = os.listdir(wav_subdir)
        file_nr = 0
        for wav_file in wav_files:
            file_nr += 1
            filename = wav_file.replace('.wav', '')
            wav_file_fullpath = wav_subdir + '\\' + wav_file

            # forced-alignment
            print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files)))
            fa = forced_alignment(wav_file_fullpath, script)

            # send pronunciation variant to database
            word_id = word_id_start
            for row in fa:
                word = row[0]
                phonemes = np.array(row[1])

                ## get pronunciation variant
                pronvar_ = phonemes[:, 2]
                pronvar_[np.where(pronvar_=='ssil')] = ''  # remove 'ssil'
                pronvar = ''.join(pronvar_)

                ## insert the result into the database.
                SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')'
                SQLstring = SQLstring1 + SQLstring2
                cursor.execute(SQLstring)
                conn.commit()

                word_id = word_id + 1

    word_id_start += script.count(' ') + 1

conn.close()
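Note on the INSERT above: the SQL is assembled by string concatenation, so a filename or pronunciation containing a quote character would break the statement. A minimal parameterized sketch, reusing the conn, cursor, table and loop variables defined above, could look like this:

    # hypothetical alternative to the concatenated SQLstring above (DB API qmark parameters)
    SQLstring = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) VALUES (?, ?, ?, ?)'
    cursor.execute(SQLstring, (filename, region, str(word_id), pronvar))
    conn.commit()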
290 dialect_identification/classifier.py Normal file
@@ -0,0 +1,290 @@
'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
   1. Download a dataset (using pandas)
   2. Process the numeric data (using numpy)
   3. Train and evaluate learners (using scikit-learn)
   4. Plot and compare results (using matplotlib)


The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://mlr.cs.umass.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass

# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers row from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame


# =====================================================================


def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately.

    # If values are missing we could impute them from the training data
    #from sklearn.preprocessing import Imputer
    #imputer = Imputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test


# =====================================================================


def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall

# =====================================================================


def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves

    fig = plt.figure(figsize=(6, 6))
    fig.canvas.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, fmt='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()


# =====================================================================


if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
8 dialect_identification/config.ini Normal file
@@ -0,0 +1,8 @@
[word_based]
fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv
fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv

[sentence_based]
dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature
fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb
dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav
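For reference, a minimal sketch of how these settings are read, mirroring the configparser calls in sentence_based.py and speaker_based.py below:

    import configparser

    config = configparser.ConfigParser()
    config.read('config.ini')
    dirFeature = config['sentence_based']['dirFeature']
    fileMDB = config['sentence_based']['fileMDB']
    print(dirFeature, fileMDB)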
74 dialect_identification/data_io.py Normal file
@@ -0,0 +1,74 @@
#
# 2017/09/25
# select samples from the combined.csv for the further analysis
#
# HISTORY
# 2017/10/02 modularized.
#
# Aki Kunikoshi
# 428968@gmail.com
#
import numpy as np

def readFile(filename):
    with open(filename, 'r') as fin:
        lines = fin.read()
        linesEach = lines.split('\n')
    return linesEach


def selectSamplesFromCombinedData(word, fileCombined):
    # load combined data
    fin = open(fileCombined, 'r')
    line = fin.readline()

    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == 6 and lineList[5] == word:
            region = lineList[2]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (dataGroningen, dataLimburg, dataOverijsel)
    #print("{0}: {1} {2} {3}".format(word,len(listGroningen),len(listLimburg),len(listOverijsel))


def groupSamplesInCSV(fileCSV, idxRegion):
    fin = open(fileCSV, 'r')

    # first line is the header
    line = fin.readline()
    line = line.rstrip()
    header = line.split(',')

    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == len(header):
            region = lineList[idxRegion]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (header, dataGroningen, dataLimburg, dataOverijsel)

def addUserID(featureFile, recordingsCSV):
    dirFeature = config['sentence_based']['dirFeature']
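addUserID ends here unfinished and refers to a config object that this module never defines; the scripts below create their own. A minimal usage sketch for groupSamplesInCSV (the file name '01.csv' and the import path are assumptions; sentence_based.py imports this module under the name dataIO):

    from data_io import groupSamplesInCSV

    # column 1 of the per-sentence CSV holds the region label, as in sentence_based.py
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV('01.csv', 1)
    print(len(header), len(dataGroningen), len(dataLimburg), len(dataOverijsel))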
41 dialect_identification/data_manipulation.py Normal file
@@ -0,0 +1,41 @@
import numpy as np
from sklearn import manifold
import Levenshtein

# x: ndarray (dnum x dim)
# n: number of samples to extract
# OUTPUT
# index: index of the chosen samples
#
def extractRandomSample(x, n):
    xRowMax = x.shape[0]
    indexOriginal = np.arange(xRowMax)
    indexChosen = np.random.choice(indexOriginal, n, False)
    xChosen = x[indexChosen, :]
    return (xChosen, indexChosen)

# x: 1d string ndarray
def makeLevenshteinMatrix(x):
    xRowMax = x.shape[0]
    xLevenshtein = np.ones((xRowMax, xRowMax), dtype='int')

    for xRow in range(0, xRowMax):
        for xCol in range(0, xRowMax):
            dist = Levenshtein.distance(x[xRow], x[xCol])
            xLevenshtein[xRow, xCol] = dist
    return xLevenshtein

# x: 1d string ndarray
def calcLevenshteinArray(word, x):
    xRowMax = x.shape[0]
    xLevenshtein = np.zeros(x.shape, dtype='int')

    for xRow in range(0, xRowMax):
        dist = Levenshtein.distance(word, x[xRow])
        xLevenshtein[xRow] = dist
    return xLevenshtein

def MDS(x):
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    xmds = mds.fit_transform(x)
    return xmds
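A short usage sketch pairing the helpers above, computing pairwise edit distances between pronunciation variants and projecting them to 2-D (the variant strings and the import path are made up for illustration):

    import numpy as np
    from data_manipulation import makeLevenshteinMatrix, MDS

    pronunciations = np.array(['sxon', 'schon', 'sjoon', 'sxoon'])  # hypothetical variants
    dist = makeLevenshteinMatrix(pronunciations)  # pairwise Levenshtein distances
    coords = MDS(dist)                            # 2-D embedding of the distance matrix
    print(coords.shape)                           # (4, 2)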
70 dialect_identification/dialect_identification.pyproj Normal file
@@ -0,0 +1,70 @@
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
    <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
    <ProjectHome>.</ProjectHome>
    <StartupFile>output_confusion_matrix.py</StartupFile>
    <SearchPath>
    </SearchPath>
    <WorkingDirectory>.</WorkingDirectory>
    <OutputPath>.</OutputPath>
    <Name>dialect_identification</Name>
    <RootNamespace>dialect_identification</RootNamespace>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="manipulate_db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="audio2db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="classifier.py" />
    <Compile Include="dataManipulation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="output_confusion_matrix.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="sentence_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based_functions.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="test_code.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="evaluation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="word_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="dataIO.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
       Visual Studio and specify your pre- and post-build commands in
       the BeforeBuild and AfterBuild targets below. -->
  <!--<Target Name="CoreCompile" />-->
  <Target Name="BeforeBuild">
  </Target>
  <Target Name="AfterBuild">
  </Target>
</Project>
40 dialect_identification/evaluation.py Normal file
@@ -0,0 +1,40 @@
import numpy as np
import scipy as sp
import scipy.stats
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, confidence):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h

# accumulated confusion matrix is added on top of what cross_val_score provides
def cross_val_confusion_matrix(model, X, y, cv):
    kf = KFold(n_splits=cv)
    classLabels = np.unique(y)
    classNumMax = classLabels.shape[0]
    confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax))
    scores = []
    for idx_train, idx_test in kf.split(X):
        # split into train/test
        x_train = X[idx_train, :]
        x_test = X[idx_test, :]
        y_train = y[idx_train]
        y_test = y[idx_test]
        modelfit = model.fit(x_train, y_train)

        # evaluation
        y_pred = modelfit.predict(x_test)

        score = f1_score(y_test, y_pred, average='micro')
        scores.append(score)
        confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(y_test, y_pred,
                                                                                   labels=classLabels)
    scores = np.array(scores)
    return scores, confusionMatrixAccumulated
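A short usage sketch of the two helpers, following the way speaker_based_functions.py calls them (the toy features and labels here are hypothetical):

    import numpy as np
    from sklearn.ensemble import AdaBoostClassifier
    from evaluation import cross_val_confusion_matrix, mean_confidence_interval

    X = np.random.rand(60, 5)                                # hypothetical features
    y = np.repeat(['Groningen_and_Drenthe', 'Limburg'], 30)  # hypothetical labels
    scores, cm = cross_val_confusion_matrix(AdaBoostClassifier(), X, y, 10)
    ci_mean, ci_low, ci_high = mean_confidence_interval(scores, 0.95)
    print(ci_mean, ci_low, ci_high)  # mean F1 and its 95% confidence interval
    print(cm)                        # 2x2 accumulated confusion matrix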
48 dialect_identification/manipulate_db.py Normal file
@@ -0,0 +1,48 @@
import sys
import os
import pandas
import datetime
sys.path.append('..')

# these lines are not necessary once forced-alignment is installed as a package.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
sys.path.append(forced_alignment_module)
from forced_alignment import pronunciations
from forced_alignment.htk_dict import variances_table


#pronunciations.delete_word('kunikoshi')
#pronunciations.delete_all_g2p_entries()


#existing_pronunciations = set(pronunciations.get_all())
## only focus on word


## missing pronunciations
## (1) pronunciation is written in IPA.
## (2) pronunciation variants are made based on (1).
## (3) they are converted into HTK format.
#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt'

#with open(missing_pronunciations_file) as fin:
#    lines = fin.read()
#    lines = lines.split('\n')

#source = 'generated using ipa transcription by Marita Everhardt.'
#inserts = []
#for line in lines:
#    line = line.split('\t')
#    word = line[0].strip().lower()
#    pronounciation = line[1].strip().split()

#    # surely not in the table
#    #if (word, pronounciation) not in existing_pronunciations:
#    inserts.append("('{}', '{}', '{}', '{}', 0)".format(
#        word,
#        ' '.join(pronounciation),
#        source,
#        datetime.datetime.now(), ))

#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n    {};""".format(
#    ',\n    '.join(inserts)
79 dialect_identification/output_confusion_matrix.py Normal file
@@ -0,0 +1,79 @@
import os
import sys

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))

regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
dirOut = currDir + '\\result\\same-utterance_with_cities'


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Note:
    this code is downloaded from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    _fontsize = 24
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    #plt.title(title, fontsize=_fontsize+2)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    #plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2)
    plt.xticks(tick_marks, classes, fontsize=_fontsize-4)
    plt.yticks(tick_marks, classes, fontsize=_fontsize-4)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=_fontsize)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)
    plt.ylabel('True label', fontsize=_fontsize-4)
    plt.xlabel('Predicted label', fontsize=_fontsize-4)


pred = np.load(dirOut + '\\pred_per_pid_3regions.npy')

#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None)
#print('accuracy: {}%'.format(accuracy * 100))

# confusion matrix
cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels)
# human perception (2 regions)
#cm = np.array([[39, 57], [6, 104]])
# human perception (3 regions)
#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
print(cm)

np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True)
#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True)

#plt.show()
plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png')
197 dialect_identification/sentence_based.py Normal file
@@ -0,0 +1,197 @@
import os
import sys
import configparser

import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from collections import Counter

# database
import pypyodbc

# classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pickle

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util


configFile = currDir + '\\config.ini'
# load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']

sentenceNumMax = 10
classifierList = []
LE_X_decode = []
LE_y = preprocessing.LabelEncoder()
LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])

testset_X = []
testset_y = []
testset_userID = []
result_y_test = []
result_y_prediction = []
fout = open("comparison.csv", "w")
for sentenceNum in range(1, sentenceNumMax+1):
    #if sentenceNum != 10:
    #    sentenceNumStr = '0' + str(sentenceNum)
    #else:
    #    sentenceNumStr = str(sentenceNumStr)
    sentenceNumStr = format(sentenceNum, '02')
    fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv'


    ## load combined data
    fileCSV = fileSentence
    idxRegion = 1
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
    sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))


    ## make balanced dataset
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    XIndex = np.arange(idxRegion+1, len(header))
    yIndex = 1 # region
    userIDindex = 0 # userID


    ## categorical values into numbers
    X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
    y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
    userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]

    #X = np.zeros((X_.shape), 'int')
    for Xindex in XIndex:
        x = X_[:, Xindex-2]

        ## levenshtein distance
        #word_count = Counter(x)
        #frequent_word = max(word_count)
        #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)

        # hot encoding
        le_x = preprocessing.LabelBinarizer()
        le_x.fit(np.unique(x))
        x_ = le_x.transform(x)
        LE_X_decode.append(x_.shape[1])
        if Xindex == idxRegion+1:
            X = x_
        else:
            X = np.c_[X, x_]

    y = LE_y.transform(y_)


    ## split into train vs test set
    #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # each regional data should be split equally
    lenG = dataG.shape[0]
    lenL = dataL.shape[0]
    lenO = dataO.shape[0]
    indexG = np.arange(0, lenG)
    indexL = np.arange(lenG, lenG+lenL)
    indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
    [XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0)
    [XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0)
    [XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0)
    X_train = np.r_[XG_train, XL_train, XO_train]
    X_test = np.r_[XG_test, XL_test, XO_test]
    y_train = np.r_[yG_train, yL_train, yO_train]
    y_test = np.r_[yG_test, yL_test, yO_test]


    ## comparison
    ## classifiers
    #names = ["Nearest Neighbors",
    #         "Linear SVM",
    #         "Poly SVM",
    #         "RBF SVM",
    #         "Decision Tree",
    #         "Random Forest 2",
    #         "Random Forest 3",
    #         "Random Forest 4",
    #         "AdaBoost",
    #         #"Naive Bayes",
    #         "Linear Discriminant Analysis",
    #         #"Quadratic Discriminant Analysis"
    #         ]
    #classifiers = [
    #    KNeighborsClassifier(3),
    #    SVC(kernel="linear", C=0.025),
    #    SVC(kernel="poly", C=0.025),
    #    SVC(gamma=2, C=1),
    #    DecisionTreeClassifier(max_depth=4),
    #    RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    #    AdaBoostClassifier(),
    #    #GaussianNB(),
    #    LinearDiscriminantAnalysis(),
    #    #QuadraticDiscriminantAnalysis()
    #    ]
    #for name, model in zip(names, classifiers):
    #    scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro')
    #    fout = open("comparison.csv", "a")
    #    fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
    #    print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))

    # quasi-optimal model
    model = AdaBoostClassifier()
    # cross validation
    scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
    ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
    modelfit = model.fit(X_train, y_train)
    # f1 on test data
    y_prediction = modelfit.predict(X_test)
    f1score = f1_score(y_test, y_prediction, average='micro')
    fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))

    ## save for the test
    testset_X.append(X_test)
    testset_y.append(y_test)
    testset_userID.append(userID_)
    result_y_test = result_y_test + list(y_test)
    result_y_prediction = result_y_prediction + list(y_prediction)
    fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl'
    pickle.dump(modelfit, open(fileClassifier, 'wb'))
fout.close()

### confusion matrix
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[
    'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
print(confusionMatrix)


### make userID list
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
#    userid = testset_userID[sentenceNum]
#    userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)
326 dialect_identification/speaker_based.py Normal file
@@ -0,0 +1,326 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


#####################
##  USER DEFINE   ##
#####################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'

# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0

# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2

regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
if experiment_type == 1:
    regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']


##########################
##  DATA PREPARATION   ##
##########################

## load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']


## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows


## make list of LabelBinarizer objects, one per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows


LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LBlist.append(LB)

# for y (=region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)

LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)

del uniqueWordID, uniqueWordIDmax, pronvar, LB


#################
##  ITERATION  ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)

# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter


## make train/eval/test set or load
if makeTrainTestSet==1:
    pidlist_train = []
    pidlist_eval = []
    pidlist_test = []
    for regionNum in range(0, len(regionLabels)):
        regionName = regionLabels[regionNum]

        pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
        pidlist_per_region, idx = mani.extractRandomSample(
            pidlist_per_region_, sampleNumMax)

        # split dataset into train, eval and test.
        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
            pidlist_per_region, test_size = 0.2, random_state = 0)
        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
            pidlist_per_region_train, test_size = 0.1, random_state = 0)

        # append numpy arrays
        if regionNum == 0:
            pidlist_train = pidlist_per_region_train
            pidlist_eval = pidlist_per_region_eval
            pidlist_test = pidlist_per_region_test
        else:
            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
    del regionNum, regionName
    del pidlist_per_region_, pidlist_per_region, idx
    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")


## make dataset for 2 regions or load
if conv3to2region==1:
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if experiment_type == 1:
        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)

    elif experiment_type == 2:
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)

    del pidlist2_train_
else:
    if experiment_type == 1:
        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")

    elif experiment_type == 2:
        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")


## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train = sb_func.extractPid(pidlist_train, data)
    data_eval = sb_func.extractPid(pidlist_eval, data)
    data_test = sb_func.extractPid(pidlist_test, data)

elif experiment_type == 1 or experiment_type == 2:
    data2 = np.array(data)

    if experiment_type == 1:
        for row, row2 in zip(data, data2):
            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
                row2[2] = regionLabels2[0]

    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


#####################################
##  EXPERIMENTS START FROM HERE   ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData = np.r_[data_train, data_eval]
    testData = data_test
    testPID = pidlist_test
    LB = LB_y
    LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
    # 2 regions: saxon vs limburg / groningen vs limburg
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    regionLabels = regionLabels2


# check the number of utterances
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])


fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)

### confusion matrix
if experiment_type == 0:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])

#confusionMatrix_weighted = confusion_matrix(
#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)

#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()


##### iteration finish #####
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
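prediction_per_pid_majority is defined further down in speaker_based_functions.py and is not part of this excerpt. As a rough illustration only, a per-speaker majority vote over the documented row format [sentenceID, pid, answer, prediction] could be sketched as:

    import numpy as np
    from collections import Counter

    def majority_vote_per_pid(testPID, pred_per_sentence):
        # testPID rows: [pid, region]; pred_per_sentence rows: [sentenceID, pid, answer, prediction]
        pred = np.array(pred_per_sentence)
        results = []
        for pid, answer in testPID:
            votes = pred[pred[:, 1] == pid, 3]
            results.append([pid, answer, Counter(votes).most_common(1)[0][0]])
        return np.array(results)

This is an assumption about the helper's behaviour, not the author's implementation.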
383
dialect_identification/speaker_based_functions.py
Normal file
383
dialect_identification/speaker_based_functions.py
Normal file
@ -0,0 +1,383 @@
|
||||
import numpy as np
|
||||
from collections import Counter
|
||||
import matplotlib.pyplot as plt
|
||||
import itertools
|
||||
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
||||
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
import dataManipulation as mani
|
||||
import evaluation as eval
|
||||
|
||||
|
||||
# extract data that corresponds to pid in the pidlist
|
||||
def extractPid(pidlist, data):
|
||||
for pidnum in range(0, len(pidlist)):
|
||||
pid = pidlist[pidnum, 0]
|
||||
x = data[data[:, 1] == pid, :]
|
||||
if pidnum == 0:
|
||||
data_ = x
|
||||
else:
|
||||
data_ = np.r_[data_, x]
|
||||
return data_
|
||||
|
||||
|
||||
def OneHotEncoding(data, LB_X, LE_y):
|
||||
# one hot encoding of data using LabelBinalizer per word (LB_X) and for region (LB_y)
|
||||
# INPUT
|
||||
# data
|
||||
# 0: filename
|
||||
# 1: pid
|
||||
# 2: region
|
||||
# 3: ID (unique word_id)
|
||||
# 4: sentence_id
|
||||
# 5: word_id
|
||||
# 6: word
|
||||
# 7: pronunciation
|
||||
# LB_x: LabelBinalizer objects
|
||||
# LE_y: LabelEncoder object
|
||||
# OUTPUT
|
||||
# X: encoded variable data
|
||||
# y: encoded target data
|
||||
pidlist = data[:, 1]
|
||||
regionlist = data[:, 2]
|
||||
uniqueWordIDlist = data[:, 3].astype(int)
|
||||
pronvarlist = data[:, 7]
|
||||
|
||||
uniqueWordIDlist_unique = np.unique(uniqueWordIDlist)
|
||||
uniqueWordIDlist_unique.sort()
|
||||
for uniqueWordIDnum in uniqueWordIDlist_unique:
|
||||
x_ = pronvarlist[uniqueWordIDlist == uniqueWordIDnum]
|
||||
lb = LB_X[uniqueWordIDnum-1]
|
||||
x = lb.transform(x_)
|
||||
if uniqueWordIDnum == uniqueWordIDlist_unique[0]:
|
||||
X = x
|
||||
else:
|
||||
X = np.c_[X, x]
|
||||
|
||||
# pid and region of the speakers
|
||||
y_ = regionlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
|
||||
y = LE_y.transform(y_)
|
||||
|
||||
pid = pidlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
|
||||
return X, y, pid
|
||||
|
||||
|
||||
def outputConfusionMatrix33(foutName, matrixName, regionLabels):
|
||||
for r in range(0, len(regionLabels)):
|
||||
execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format('
|
||||
execString2 = 'regionLabels[' + str(r) + ']'
|
||||
execString3 = ''
|
||||
for c in range(0, len(regionLabels)):
|
||||
execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']'
|
||||
execString4 = '))'
|
||||
execString = execString1 + execString2 + execString3 + execString4
|
||||
exec(execString)
|
||||
|
||||
|
||||
def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
    """ compare the classification algorithms on sentence-level classifiers.

    Args:
        data_train: training data.
        LBlist: list of label binarizers, which are used to encode pronunciation variants.
        LE_y: label encoder, which is used to encode region names.
        fileCSV: output csv file path.

    """
    fout = open(fileCSV, "w")

    sentenceIDlist_train = data_train[:, 4].astype(int)
    sentenceIDmax_train = max(sentenceIDlist_train)

    for sentenceID in range(1, sentenceIDmax_train+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
        X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_train))

        ## classifier comparison
        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "Poly SVM",
            "RBF SVM",
            "Decision Tree",
            "Random Forest 2",
            "Random Forest 3",
            "Random Forest 4",
            "AdaBoost",
            "AdaBoost(SVM)",
            "AdaBoost(Random Forest 3)",
            "Naive Bayes",
            "Linear Discriminant Analysis",
            "Quadratic Discriminant Analysis"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025),
            SVC(kernel="poly", C=0.025),
            SVC(gamma=2, C=1),
            DecisionTreeClassifier(max_depth=4),
            RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
            RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
            RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
            AdaBoostClassifier(),
            AdaBoostClassifier(SVC(probability=True, kernel='linear')),
            AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)),
            GaussianNB(),
            LinearDiscriminantAnalysis(),
            QuadraticDiscriminantAnalysis()
        ]
        for name, model in zip(names, classifiers):
            scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
            fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var()))
            print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean()))

    fout.close()


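# Each line written to fileCSV above has the form (the numbers are made up):
#   sentenceID,classifier name,mean f1_micro,variance
#   e.g. 1,Linear SVM,0.62,0.0009

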
def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
    """ train sentence-level classifiers.

    Args:
        data_train: training data.
        LBlist: list of label binarizers, which are used to encode pronunciation variants.
        LE_y: label encoder, which is used to encode region names.
        fileCSV: output csv file path.

    Returns:
        modelList (list): list of models (length: sentenceNumMax)
        scoreList (list): list of scores (length: sentenceNumMax)
        confusionMatrixList (list): list of confusion matrices (length: sentenceNumMax)

    """
    fout = open(fileCSV, "w")

    fout.write('< cross-validation in training set >\n')

    sentenceIDlist_train = data_train[:, 4].astype(int)
    sentenceIDmax_train = max(sentenceIDlist_train)
    modelList = []
    scoreList = []
    confusionMatrixList = []

    for sentenceID in range(1, sentenceIDmax_train+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
        X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_train))

        ## cross-validation with the best classifier
        model = AdaBoostClassifier()
        #model = SVC(kernel="linear", C=0.025)
        #model = LinearDiscriminantAnalysis()

        #scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
        scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10)
        ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95)
        scoreList.append(scores)
        confusionMatrixList.append(confusionMatrix)

        ## model fitting
        modelfit = model.fit(X_train, y_train)
        modelList.append(modelfit)

        ## output
        fout.write("{},".format(sentenceID))
        #fout.write("{0},{1},{2},".format(
        #    regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland']))
        #fout.write("{0},{1},".format(
        #    regionCounter['Low_Saxon'], regionCounter['Limburg']))
        fout.write("{0},{1},".format(
            regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg']))

        fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high))

    fout.write('\n')
    fout.close()

    return modelList, scoreList, confusionMatrixList


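# Note: eval.cross_val_confusion_matrix and eval.mean_confidence_interval are project
# helpers that are not shown in this file. A minimal sketch of what the confidence
# interval presumably is (assumption, a standard t-interval over the fold scores):
#   import scipy.stats as st
#   n = len(scores); m = np.mean(scores)
#   h = st.sem(scores) * st.t.ppf((1 + 0.95) / 2.0, n - 1)
#   ci_mean, ci_low, ci_high = m, m - h, m + h

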
def prediction_per_sentence(data_eval, modelList, LBlist, LE_y):
    """ prediction using sentence-level classifiers.

    Args:
        data_eval: evaluation data.
        modelList: list of the models.
        LBlist: list of label binarizers, which are used to encode pronunciation variants.
        LE_y: label encoder, which is used to encode region names.

    Returns:
        prediction (ndarray): [sentenceID, pid, answer, prediction]

    """
    sentenceIDlist_eval = data_eval[:, 4].astype(int)
    sentenceIDmax_eval = max(sentenceIDlist_eval)
    for sentenceID in range(1, sentenceIDmax_eval+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :]
        X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_eval))

        ## evaluate model
        modelfit = modelList[sentenceID-1]
        y_pred = modelfit.predict(X_eval)
        y_pred_label = LE_y.inverse_transform(y_pred)
        y_eval_label = LE_y.inverse_transform(y_eval)

        # sentenceID, pid, y, y_pred
        sentenceIDvec = np.ones((y_eval_label.shape[0], 1)).astype(int) * sentenceID
        prediction_ = np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label]
        if sentenceID == 1:
            prediction = prediction_
        else:
            prediction = np.r_[prediction, prediction_]

    return prediction


def prediction_per_pid_majority(pidlist_eval, prediction):
    """ make a prediction per pid using majority vote.

    Returns:
        prediction_per_pid (ndarray): [pid, ans, prediction]

    """
    prediction_per_pid = []
    for pid_ in range(0, len(pidlist_eval[:, 0])):
        pid = pidlist_eval[pid_, 0]
        ans = pidlist_eval[pid_, 1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # majority vote
        predCounter = Counter(prediction_[:, -1])
        predMostCommon = predCounter.most_common(1)
        predLabel = predMostCommon[0][0]
        predRatio = predMostCommon[0][1] / prediction_.shape[0] * 100

        prediction_per_pid.append([pid, ans, predLabel])

    return np.array(prediction_per_pid)


def calc_weight(confusionMatrixList):
    """ calculate a weight (how trustworthy the prediction is) for the majority vote.

    Note:
        The per-region precision (of all subjects predicted to be GO/OG/LB, the fraction
        that actually belong to that region) is used as the weight.

    Args:
        confusionMatrixList: list of confusion matrices of the sentence-level classifiers.

    """
    sentenceID_max = len(confusionMatrixList)
    weight = np.zeros((sentenceID_max, confusionMatrixList[0].shape[0]))
    for sentenceID in range(1, sentenceID_max+1):
        cm = confusionMatrixList[sentenceID-1]

        # normalized confusion matrix
        #rTotal = np.sum(cm, axis=1)
        #cm_normalized = cm / rTotal
        #weight[sentenceID-1, :] = np.diag(cm_normalized)

        # precision per class: true positives / predicted totals (column sums)
        true_positives = np.diag(cm)
        predicted = np.sum(cm, axis=0)
        weight[sentenceID-1, :] = true_positives / predicted

    return weight


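# Worked example (hypothetical 2x2 confusion matrix, rows = true, columns = predicted):
#   cm = [[8, 2],
#         [4, 6]]
# The column sums (predicted totals) are [12, 8] and the diagonal (true positives) is
# [8, 6], so the precision-based weights are [8/12, 6/8] = [0.67, 0.75].

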
def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
    """ make a prediction per pid using a weighted (majority) vote.

    Args:
        weight (ndarray): how trustworthy the prediction of each sentence-based classifier is.
        LB_y: label binarizer, which is used to encode region names.
        LE_y: label encoder, which is used to encode region names.
    Returns:
        prediction_per_pid (ndarray): [pid, ans, prediction]

    """
    prediction_per_pid = []
    for pid_ in range(0, len(pidlist_eval[:, 0])):
        pid = pidlist_eval[pid_, 0]
        ans = pidlist_eval[pid_, 1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # calculate weighted (majority) vote
        vote_weighted = np.zeros((1, 3))
        for sentenceID_ in range(0, prediction_.shape[0]):
            sentenceID = prediction_[sentenceID_, 0].astype(int)
            w = weight[sentenceID-1, :]
            pred = prediction_[sentenceID_, 3]
            pred_int = LB_y.transform([pred])
            vote_weighted = vote_weighted + w * pred_int

        # choose the label with the most (weighted) votes
        vote_weighted = vote_weighted[0]
        maxindex = list(vote_weighted).index(max(vote_weighted))
        #predLabel = regionLabels[maxindex]
        predLabel = LE_y.inverse_transform(maxindex)
        prediction_per_pid.append([pid, ans, predLabel])

    return np.array(prediction_per_pid)


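# Example of one vote (hypothetical numbers): if the weights for a sentence are
# w = [0.6, 0.8, 0.7] and its prediction one-hot encodes to [0, 1, 0], that sentence
# adds [0.0, 0.8, 0.0] to vote_weighted; after all sentences are accumulated, the
# region with the largest total wins.

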
def saxon_vs_limburg(pidlist3):
    """convert a pidlist for 3 regions into that for 2 regions.

    Notes:
        The 3 regions are ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'].
        The 2 regions are ['Limburg', 'Low_Saxon'],
        where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'.
        Samples are randomly chosen so that each class has the same amount of data.

    """

    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    regionLabels2 = ['Low_Saxon', 'Limburg']

    index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
    pidlist_saxon_ = pidlist3[index_saxon, :]
    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

    # extract the same amount of samples as Limburg.
    pidlistCounter3 = Counter(pidlist3[:, 1])
    pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
    pidlist_saxon[:, 1] = regionLabels2[0]

    pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
    #pidlistCounter2 = Counter(pidlist2[:, 1])
    return pidlist2


def groningen_vs_limburg(pidlist3):
    """convert a pidlist for 3 regions into that for 2 regions.

    Notes:
        The 3 regions are ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'].
        The 2 regions are ['Groningen_and_Drenthe', 'Limburg'].

    """
    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

    pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

    pidlist2 = np.r_[pidlist_groningen, pidlist_limburg]
    return pidlist2
44
dialect_identification/test_code.py
Normal file
@ -0,0 +1,44 @@
import Levenshtein
import numpy as np

a = 'hello'
b = 'haall'

# approximate
infinite = 100

# make distance matrix D
len_a = len(a)
len_b = len(b)
D_ = np.zeros((len_a, len_b)).astype(int)
for ia in range(0, len_a):
    a_ = a[ia]
    for ib in range(0, len_b):
        b_ = b[ib]
        if a_ == b_:
            D_[ia, ib] = 1

D = np.zeros((len_a+1, len_b+1)).astype(int)
D[1:len_a+1, 1:len_b+1] = D_
D[0, :] = infinite
D[:, 0] = infinite
D[0, 0] = 0

# calculate accumulated distance
indexPath = np.zeros((len_a, len_b)).astype(int)
for ia in range(0, len_a):
    for ib in range(0, len_b):
        a_ = a[ia]
        b_ = b[ib]
        option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib])
        Dmin = np.min(option)
        D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin
        index = list(option).index(Dmin)
        indexPath[ia, ib] = index

# back trace
ia = len_a
ib = len_b
#while (ia > 0 or ib > 0):
#    tb
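# For comparison, the Levenshtein package imported above can compute the edit distance
# directly: Levenshtein.distance(a, b) should return 3 for 'hello' vs 'haall'
# (substitutions e->a, l->a, o->l).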
56
dialect_identification/word_based.py
Normal file
@ -0,0 +1,56 @@
import os
import sys
import configparser

import numpy as np
from matplotlib import pyplot

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import selectSamplesFromCombinedData
import dataManipulation


configFile = currDir + '\\config.ini'

config = configparser.ConfigParser()
config.sections()
config.read(configFile)
fileWordList = config['word_based']['fileWordList']
fileCombined = config['word_based']['fileCombined']

wordList = readFile(fileWordList)

for wordNum in range(1, len(wordList)):
    word = wordList[wordNum-1]  # target word
    #print("=== {} ===".format(word))

    dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined)

    sampleNumMax = 50
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    # combine pronunciation from three regions
    # data: (sampleNumMax x 3) x 1
    cPronunciation = 4
    data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]])

    # MDS
    dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data)
    dataMDS = dataManipulation.MDS(dataLevenshtein)

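    # dataManipulation.makeLevenshteinMatrix and dataManipulation.MDS are project helpers
    # (not shown here); a typical MDS on a precomputed distance matrix would look like
    # this (assumption):
    #   from sklearn.manifold import MDS
    #   dataMDS = MDS(n_components=2, dissimilarity='precomputed').fit_transform(dataLevenshtein)
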
    # plot
    pyplot.scatter(dataMDS[0:sampleNumMax, 0], dataMDS[0:sampleNumMax, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe")
    pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2, 0], dataMDS[sampleNumMax:sampleNumMax*2, 1], c='green', marker="^", facecolors='none', label="Limburg")
    pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3, 0], dataMDS[sampleNumMax*2:sampleNumMax*3, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland")

    pyplot.title(word)
    #ax.set_xlabel('x')
    #ax.set_ylabel('y')
    pyplot.legend(loc='upper right')
    #pyplot.show()
    pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png')
    pyplot.gcf().clear()