Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
ProjectSection(SolutionItems) = preProject
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
..\..\forced-alignment\forced_alignment\ = ..\..\forced-alignment\forced_alignment\
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B}
@ -0,0 +1,90 @@
import os
import sys
import configparser
import numpy as np
import pypyodbc
# Script: run forced alignment on every wav file of the "same utterance"
# experiment and build, per aligned word, an INSERT statement holding its
# pronunciation variant.
## user define
# NOTE(review): forced_alignment_module is assigned but not referenced again in
# the visible code; presumably it was meant to be put on sys.path - confirm.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance'
wav_dir = dir_same_utterance + '\\wav_with_cities'
script_dir = dir_same_utterance + '\\script'
fileMDB = dir_same_utterance + '\\feature\\DialectClassification.accdb'
table = 'ForcedAlignmentResult'
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
# these lines are not necessary once forced-alignment is installed as a package.
from forced_alignment import forced_alignment
## check if forced-alignment work in each sentence
#from forced_alignment import pronunciations
#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
# script = fin.readline()
#fa = forced_alignment(wav_file, script)
## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
# Fixed head of the INSERT statement; the VALUES part is appended per word below.
SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) '
## forced-alignment to all the wav files in dir_same_utterance
# word_id_start is the id given to the first word of the current sentence.
word_id_start = 1
for sentenceID in range(1, 11):
# two-digit, zero-padded sentence number ('01' ... '10')
sentenceIDstr = format(sentenceID, '02')
# get script
script_file = script_dir + '\\script' + sentenceIDstr + '.txt'
with open(script_file, 'r') as fin:
# only the first line of the script file is used
script = fin.readline()
# loop over three regions
for region in regionLabels:
# loop over the wav_subdir
wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region
wav_files = os.listdir(wav_subdir)
# file_nr is a 1-based progress counter used only in the print below
file_nr = 0
for wav_file in wav_files:
file_nr += 1
filename = wav_file.replace('.wav', '')
wav_file_fullpath = wav_subdir + '\\' + wav_file
# forced-alignment
print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files)))
fa = forced_alignment(wav_file_fullpath, script)
# send pronunciation variant to database
# word ids restart from the sentence's first id for every wav file
word_id = word_id_start
for row in fa:
word = row[0]
phonemes = np.array(row[1])
## get pronunciation variant
# the third column of the phoneme table is joined into one string
pronvar_ = phonemes[:, 2]
pronvar_[np.where(pronvar_=='ssil')]='' # remove 'ssil'
pronvar = ''.join(pronvar_)
## insert the result into the database.
# NOTE(review): the statement is built by string concatenation, and no
# cursor.execute()/commit is visible in this section - confirm the
# statement is actually sent, and consider a parameterized query.
SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')'
SQLstring = SQLstring1 + SQLstring2
word_id = word_id + 1
# advance by the number of space-separated words in this sentence's script
word_id_start += script.count(' ')+1
@ -0,0 +1,290 @@
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.
The four steps are:
1. Download a dataset (using pandas)
2. Process the numeric data (using numpy)
3. Train and evaluate learners (using scikit-learn)
4. Plot and compare results (using matplotlib)
The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain correct.
Example Data
The example is from
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.
This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
# Remember to update the script for the new data when you change this URL
URL = ""
# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt
# [OPTIONAL] Seaborn makes plots nicer
import seaborn
except ImportError:
# =====================================================================
def download_data():
    """Download the data for this script into a pandas DataFrame.

    The file at the module-level ``URL`` is read with ``pandas.read_table``
    as comma-separated text. It is assumed to have no header row and no
    index column, so both are generated from row/column numbers.

    Returns:
        The complete DataFrame.
    """
    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers row from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame
# =====================================================================
def get_features_and_labels(frame):
    """Transform and scale the input data into train/test numpy arrays.

    The last column of ``frame`` is used as the target; all other columns
    are features. 80% of the rows are used for training and the rest for
    testing. Features are standardized with statistics computed on the
    training rows only.

    Args:
        frame: table of numeric values (anything ``np.array`` can convert
            to a 2-d float array).

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats.
    # The builtin float is used: the np.float alias was removed from NumPy.
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately.

    # If values are missing we could impute them from the training data
    #from sklearn.preprocessing import Imputer
    #imputer = Imputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test
# =====================================================================
def evaluate_classifier(X_train, X_test, y_train, y_test):
    """Evaluate several classifiers to compare their relative performance.

    Each learner is fitted on the training data, scored with F1 on the test
    data, and its precision-recall curve is computed from its decision
    function on the test data.

    Yields:
        ``(title, precision, recall)`` for each learner, where ``title``
        includes the F1 score formatted with three decimals.
    """
    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.
    learners = (
        # the linear support vector classifier
        ('Linear SVC', LinearSVC(C=1)),
        # the Nu support vector classifier
        ('NuSVC', NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)),
        # the Ada boost classifier
        ('Ada Boost', AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')),
    )

    for name, classifier in learners:
        # Fit the classifier
        classifier.fit(X_train, y_train)
        score = f1_score(y_test, classifier.predict(X_test))
        # Generate the P-R curve
        y_prob = classifier.decision_function(X_test)
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        # Include the score in the title
        yield '{} (F1 score={:.3f})'.format(name, score), precision, recall
# =====================================================================
def plot(results):
    """Create a plot comparing multiple learners.

    Args:
        results: iterable of ``(title, precision, recall)`` tuples. Every
            element is plotted as one precision-recall curve labelled with
            its title.
    """
    # Plot the precision-recall curves
    fig = plt.figure(figsize=(6, 6))
    # The window title is set through the figure manager; the canvas-level
    # set_window_title was removed from Matplotlib.
    fig.canvas.manager.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, format='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()
# =====================================================================
if __name__ == '__main__':
    # Entry point: download, preprocess, evaluate and plot, in that order.

    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    # (list() forces the generator so all learners are fitted before plotting)
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
@ -0,0 +1,8 @@
fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv
fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv
dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature
fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb
dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav
@ -0,0 +1,74 @@
# 2017/09/25
# select samples from the combined.csv for the further analysis
# 2017/10/02 modularized.
# Aki Kunikoshi
import numpy as np
def readFile(filename):
    """Return the content of *filename* split on newline characters.

    The whole file is read at once and split on ``'\\n'``, so a file that
    ends with a newline yields a trailing empty string.
    """
    with open(filename, 'r') as fin:
        lines = fin.read()
    linesEach = lines.split('\n')
    return linesEach
def selectSamplesFromCombinedData(word, fileCombined):
    """Collect the rows of the combined CSV that belong to *word*, per region.

    The first line of the file is skipped. A row is kept when it has exactly
    six comma-separated fields and its sixth field equals *word*; it is then
    filed by its third field (the region name). Rows of other regions are
    ignored.

    Args:
        word: value the sixth field must match.
        fileCombined: path of the combined CSV file.

    Returns:
        ``(dataGroningen, dataLimburg, dataOverijsel)``, each a list of rows,
        each row being the list of its six string fields.
    """
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    dataPerRegion = {
        'Groningen_and_Drenthe': dataGroningen,
        'Limburg': dataLimburg,
        'Oost_Overijsel-Gelderland': dataOverijsel,
    }

    # 'with' guarantees the file is closed (it used to be left open).
    with open(fileCombined, 'r') as fin:
        # load combined data; the first line is not data
        fin.readline()

        # load data per region
        # Iterating the file keeps reading past blank lines, which used to
        # end the loop early and silently drop the remaining rows.
        for line in fin:
            lineList = line.rstrip().split(',')
            if len(lineList) == 6 and lineList[5] == word:
                regionData = dataPerRegion.get(lineList[2])
                if regionData is not None:
                    # NOTE(review): the whole row is stored - confirm this is
                    # what the callers of this function expect.
                    regionData.append(lineList)

    return (dataGroningen, dataLimburg, dataOverijsel)
    #print("{0}: {1} {2} {3}".format(word,len(listGroningen),len(listLimburg),len(listOverijsel))
def groupSamplesInCSV(fileCSV, idxRegion):
    """Read a CSV file with a header line and group its rows per region.

    A row is kept only when it has as many comma-separated fields as the
    header; it is then filed by the value of its ``idxRegion``-th field.
    Rows of other regions are ignored.

    Args:
        fileCSV: path of the CSV file.
        idxRegion: zero-based index of the field holding the region name.

    Returns:
        ``(header, dataGroningen, dataLimburg, dataOverijsel)`` where
        ``header`` is the list of column names and each data element is a
        list of rows (each row a list of string fields).
    """
    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    dataPerRegion = {
        'Groningen_and_Drenthe': dataGroningen,
        'Limburg': dataLimburg,
        'Oost_Overijsel-Gelderland': dataOverijsel,
    }

    # 'with' guarantees the file is closed (it used to be left open).
    with open(fileCSV, 'r') as fin:
        # first line is the header
        headerLine = fin.readline().rstrip()
        header = headerLine.split(',')

        # Nothing is read when the header line is blank.
        if headerLine:
            # Iterating the file keeps reading past blank lines, which used
            # to end the loop early and silently drop the remaining rows.
            for line in fin:
                lineList = line.rstrip().split(',')
                if len(lineList) != len(header):
                    continue
                regionData = dataPerRegion.get(lineList[idxRegion])
                if regionData is not None:
                    regionData.append(lineList)

    return (header, dataGroningen, dataLimburg, dataOverijsel)
# Reads the feature directory from the 'sentence_based' section of `config`.
# NOTE(review): `config` is not defined in the visible part of this module and
# neither parameter is used - this function looks unfinished; confirm.
def addUserID(featureFile, recordingsCSV):
dirFeature = config['sentence_based']['dirFeature']
@ -0,0 +1,41 @@
import numpy as np
from sklearn import manifold
import Levenshtein
# x: ndarray (dnum x dim)
# n: number of samples to extract
# index: index of the chosen samples
def extractRandomSample(x, n):
    """Pick *n* distinct rows of the 2-d array *x* at random.

    Returns a tuple ``(rows, indices)``: the chosen rows in the order they
    were drawn, and the row indices they came from.
    """
    candidateRows = np.arange(x.shape[0])
    # replace=False: no row is drawn twice
    pickedRows = np.random.choice(candidateRows, size=n, replace=False)
    return (x[pickedRows, :], pickedRows)
# x: 1d string ndarray
def makeLevenshteinMatrix(x):
    """Return the square integer matrix of pairwise Levenshtein distances.

    Element ``[i, j]`` is the distance between ``x[i]`` and ``x[j]``, where
    *x* is a 1-d array of strings.
    """
    size = x.shape[0]
    distances = np.ones((size, size), dtype='int')
    for i, left in enumerate(x):
        for j, right in enumerate(x):
            distances[i, j] = Levenshtein.distance(left, right)
    return distances
# x: 1d string ndarray
def calcLevenshteinArray(word, x):
    """Return the Levenshtein distance from *word* to every string in *x*.

    *x* is a 1-d array of strings; the result is an integer array of the
    same shape.
    """
    distances = np.zeros(x.shape, dtype='int')
    for i, candidate in enumerate(x):
        distances[i] = Levenshtein.distance(word, candidate)
    return distances
def MDS(x):
    """Embed the precomputed dissimilarity matrix *x* into two dimensions.

    The random state is fixed (6) so the embedding is reproducible.
    """
    embedder = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    return embedder.fit_transform(x)
@ -0,0 +1,70 @@
<Project DefaultTargets="Build" xmlns="" ToolsVersion="4.0">
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
<PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
<Compile Include="">
<Compile Include="">
<Compile Include="" />
<Compile Include="">
<Compile Include="">
<Compile Include="">
<Compile Include="">
<Compile Include="">
<Compile Include="">
<Compile Include="">
<Compile Include="">
<Compile Include="" />
<Content Include="config.ini" />
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
<!-- Uncomment the CoreCompile target to enable the Build command in
Visual Studio and specify your pre- and post-build commands in
the BeforeBuild and AfterBuild targets below. -->
<!--<Target Name="CoreCompile" />-->
<Target Name="BeforeBuild">
<Target Name="AfterBuild">
@ -0,0 +1,40 @@
import numpy as np
import scipy as sp
import scipy.stats
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
# from
def mean_confidence_interval(data, confidence):
    """Return the mean of *data* and its two-sided confidence interval.

    The interval is based on Student's t distribution with ``n - 1`` degrees
    of freedom and the standard error of the mean.

    Args:
        data: sequence of numbers.
        confidence: confidence level, e.g. ``0.95``.

    Returns:
        ``(mean, lower_bound, upper_bound)``.
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    # Use the public ppf(); the private _ppf() skips argument validation and
    # is not part of the supported SciPy API.
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h
# accumulated confusion matrix is added to cross_val_score
def cross_val_confusion_matrix(model, X, y, cv):
    """Cross-validate *model* and accumulate the confusion matrix over folds.

    The data is split with ``KFold(n_splits=cv)``. In each fold the model is
    fitted on the training part, then scored with micro-averaged F1 on the
    test part.

    Args:
        model: estimator providing ``fit`` and ``predict``.
        X: 2-d array of samples (rows) by features.
        y: 1-d array of class labels, one per row of ``X``.
        cv: number of folds.

    Returns:
        ``(scores, confusionMatrixAccumulated)``: the array of per-fold F1
        scores, and the sum of the per-fold confusion matrices with rows and
        columns ordered as ``np.unique(y)``.
    """
    kf = KFold(n_splits=cv)
    classLabels = np.unique(y)
    classNumMax = classLabels.shape[0]
    confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax))
    scores = []
    for idx_train, idx_test in kf.split(X):
        # split into train/test
        x_train, x_test = X[idx_train, :], X[idx_test, :]
        y_train, y_test = y[idx_train], y[idx_test]

        modelfit = model.fit(x_train, y_train)

        # evaluation
        y_pred = modelfit.predict(x_test)
        scores.append(f1_score(y_test, y_pred, average='micro'))
        # Explicit labels keep every per-fold matrix the same shape, even
        # when a class is absent from a fold.
        confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(
            y_test, y_pred, labels=classLabels)

    scores = np.array(scores)
    return scores, confusionMatrixAccumulated
@ -0,0 +1,48 @@
import sys
import os
import pandas
import datetime
# these lines are not necessary once forced-alignment is installed as a package.
# NOTE(review): forced_alignment_module is only assigned here; no sys.path
# update using it is visible - confirm how the package below is located.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
from forced_alignment import pronunciations
from forced_alignment.htk_dict import variances_table
#existing_pronunciations = set(pronunciations.get_all())
## only focus on word
## missing pronunciations
## (1) pronunciation is written in IPA.
## (2) pronunciation variants are made based on (1).
## (3) they are converted into HTK format.
#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt'
#with open(missing_pronunciations_file) as fin:
# lines =
# lines = lines.split('\n')
#source = 'generated using ipa transcription by Marita Everhardt.'
#inserts = []
#for line in lines:
# line = line.split('\t')
# word = line[0].strip().lower()
# pronounciation = line[1].strip().split()
# # surely not in the table
# #if (word, pronounciation) not in existing_pronunciations:
# inserts.append("('{}', '{}', '{}', '{}', 0)".format(
# word,
# ' '.join(pronounciation),
# source,
#, ))
#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n {};""".format(
# ',\n '.join(inserts)
@ -0,0 +1,79 @@
import os
import sys
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
dirOut = currDir + '\\result\\same-utterance_with_cities'
# Draws the confusion matrix `cm` as an image with tick labels taken from
# `classes`, and writes each cell's value on top of it.
# NOTE(review): this copy looks truncated: the parameter list is never closed,
# `normalize` and `cmap` are used below without a visible definition, no
# `else:` precedes the second print, and the plt.text(...) call is not closed.
# Confirm against the original file before relying on it.
def plot_confusion_matrix(cm, classes,
title='Confusion matrix',
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
this code is downloaded from:
# when normalizing, each row is divided by its own sum
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
print('Confusion matrix, without normalization')
# base font size; labels below use this value minus 4
_fontsize = 24
plt.imshow(cm, interpolation='nearest', cmap=cmap)
#plt.title(title, fontsize=_fontsize+2)
tick_marks = np.arange(len(classes))
#plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2)
plt.xticks(tick_marks, classes, fontsize=_fontsize-4)
plt.yticks(tick_marks, classes, fontsize=_fontsize-4)
# two decimals for normalized values, plain integers otherwise
fmt = '.2f' if normalize else 'd'
# cells above half of the maximum get white text, the others black
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
color="white" if cm[i, j] > thresh else "black",
plt.ylabel('True label', fontsize=_fontsize-4)
plt.xlabel('Predicted label', fontsize=_fontsize-4)
# Load the saved per-speaker predictions and plot their normalized confusion matrix.
# Columns 1 and 2 of `pred` are passed to confusion_matrix as (true, predicted).
# NOTE(review): the meaning of column 0 is not visible here - presumably the
# speaker id, given the file name; confirm.
pred = np.load(dirOut + '\\pred_per_pid_3regions.npy')
#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None)
#print('accuracy: {}%'.format(accuracy * 100))
# confusion matrix
cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels)
# human perception (2 regions)
#cm = np.array([[39, 57], [6, 104]])
# human perception (3 regions)
#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
# the short class names follow the order of regionLabels
plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True)
#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True)
plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png')
@ -0,0 +1,197 @@
import os
import sys
import configparser
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from collections import Counter
# database
import pypyodbc
# classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pickle
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util
# Setup for the sentence-based classification experiment: configuration,
# label encoder for the regions and result accumulators.
configFile = currDir + '\\config.ini'
# load init file
# NOTE(review): no call that reads configFile into `config` is visible before
# the lookup below - confirm the parser is actually loaded.
config = configparser.ConfigParser()
dirFeature = config['sentence_based']['dirFeature']
sentenceNumMax = 10
classifierList = []
LE_X_decode = []
LE_y = preprocessing.LabelEncoder()
# NOTE(review): the next line is garbled in this copy; presumably it fits LE_y
# on the three region names - confirm.
||||["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])
testset_X = []
testset_y = []
testset_userID = []
# true and predicted labels of the test sets, accumulated over all sentences
result_y_test = []
result_y_prediction = []
# one CSV line of scores is written per sentence
fout = open("comparison.csv", "w")
# Per sentence: load its feature CSV, balance the three regions and encode the
# categorical features and the region labels as numbers.
for sentenceNum in range(1, sentenceNumMax+1):
#if sentenceNum != 10:
# sentenceNumStr = '0' + str(sentenceNum)
# sentenceNumStr = str(sentenceNumStr)
# two-digit, zero-padded sentence number
sentenceNumStr = format(sentenceNum, '02')
# NOTE: '\\\\' is two literal backslashes between directory and file name
fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv'
## load combined data
fileCSV = fileSentence
idxRegion = 1
header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
# size of the smallest region group; every region is reduced to this size
sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))
## make balanced dataset
dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
# feature columns are all columns after the region column
XIndex = np.arange(idxRegion+1, len(header))
yIndex = 1 # region
userIDindex = 0 # userID
## categorical values into numbers
# rows are stacked in the order Groningen, Limburg, Overijsel
X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]
#X = np.zeros((X_.shape), 'int')
for Xindex in XIndex:
# X_ starts at original column 2 (idxRegion+1), hence the offset of 2
x = X_[:, Xindex-2]
## levenshtein distance
#word_count = Counter(x)
#frequent_word = max(word_count)
#X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)
# hot encoding
# NOTE(review): no fit of le_x is visible before transform, and no `else:`
# is visible between the two assignments to X below - confirm.
le_x = preprocessing.LabelBinarizer()
x_ = le_x.transform(x)
if Xindex == idxRegion+1:
X = x_
X = np.c_[X, x_]
# region names to integer class labels
y = LE_y.transform(y_)
## split into train vs test set
#[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)
# each regional data should be splited equally
lenG = dataG.shape[0]
lenL = dataL.shape[0]
lenO = dataO.shape[0]
# row ranges of each region inside X / y (stacked as Groningen, Limburg, Overijsel)
indexG = np.arange(0, lenG)
indexL = np.arange(lenG, lenG+lenL)
indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
# 80/20 split per region with a fixed seed, so each region contributes equally
[XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0)
[XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0)
[XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0)
# recombine the per-region parts into one training set and one test set
X_train = np.r_[XG_train, XL_train, XO_train]
X_test = np.r_[XG_test, XL_test, XO_test]
y_train = np.r_[yG_train, yL_train, yO_train]
y_test = np.r_[yG_test, yL_test, yO_test]
## comparison
## classifiers
#names = ["Nearest Neighbors",
# "Linear SVM",
# "Poly SVM",
# "RBF SVM",
# "Decision Tree",
# "Random Forest 2",
# "Random Forest 3",
# "Random Forest 4",
# "AdaBoost",
# #"Naive Bayes",
# "Linear Discriminant Analysis",
# #"Quadratic Discriminant Analysis"
# ]
#classifiers = [
# KNeighborsClassifier(3),
# SVC(kernel="linear", C=0.025),
# SVC(kernel="poly", C=0.025),
# SVC(gamma=2, C=1),
# DecisionTreeClassifier(max_depth=4),
# RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
# RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
# RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
# AdaBoostClassifier(),
# #GaussianNB(),
# LinearDiscriminantAnalysis(),
# #QuadraticDiscriminantAnalysis()
# ]
#for name, model in zip(names, classifiers):
# scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro')
# fout = open("comparison.csv", "a")
# fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
# print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))
# quasi-optimal model
model = AdaBoostClassifier()
# cross validation
# 10-fold, micro-averaged F1 on the training set, summarized as mean and 95% interval
scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
# NOTE(review): the next line is garbled in this copy; presumably it fits
# `model` on (X_train, y_train) - confirm.
modelfit =, y_train)
# f1 on test data
y_prediction = modelfit.predict(X_test)
f1score = f1_score(y_test, y_prediction, average='micro')
fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))
## save for the test
result_y_test = result_y_test + list(y_test)
result_y_prediction = result_y_prediction + list(y_prediction)
# NOTE: '\\\\' is two literal backslashes between directory and file name
fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl'
# NOTE(review): the file object passed to pickle.dump is never closed
# explicitly, and no close of `fout` is visible in this section.
pickle.dump(modelfit, open(fileClassifier, 'wb'))
### confusion matrix
# computed on the region names, over the test sets of all sentences
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[
'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
### make userID list
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
# userid = testset_userID[sentenceNum]
# userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)
@ -0,0 +1,326 @@
import os
import sys
import configparser
import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func
# Settings, configuration and database connection for the speaker-based experiment.
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'
# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0
# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
# NOTE(review): no `else:` is visible between the two assignments below;
# presumably the second one applies when experiment_type is not 1 - confirm.
if experiment_type == 1:
regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
## load init file
# NOTE(review): no call that reads configFile into `config` is visible before
# the lookups below - confirm the parser is actually loaded.
config = configparser.ConfigParser()
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']
## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()
## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows
## make list of LabelBinarizer object per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
rows = cursor.fetchall()
# Build the encoders: one LabelBinarizer per unique word id for the
# pronunciation variants, plus encoders/binarizers for the region labels.
pronvarList = np.array(rows)
del SQL_string, rows
LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
# column 3 of `data` is the unique word id, column 7 the pronunciation
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
#pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
# NOTE(review): no fit of LB on pronvar and no append to LBlist is visible
# in this copy - confirm against the original file.
LB = preprocessing.LabelBinarizer()
# for y (=region)
# NOTE(review): no fit call for the four objects below is visible here.
LE_y = preprocessing.LabelEncoder()
LE_y2 = preprocessing.LabelEncoder()
LB_y = preprocessing.LabelBinarizer()
LB_y2 = preprocessing.LabelBinarizer()
# drop the loop temporaries from the namespace
del uniqueWordID, uniqueWordIDmax, pronvar, LB
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
# print(iter)
## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)
# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter
## make train/eval/test set or load
if makeTrainTestSet==1:
pidlist_train = []
pidlist_eval = []
pidlist_test = []
for regionNum in range(0, len(regionLabels)):
regionName = regionLabels[regionNum]
pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
pidlist_per_region, idx = mani.extractRandomSample(
pidlist_per_region_, sampleNumMax)
# split dataset into train, eval and test.
[pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
pidlist_per_region, test_size = 0.2, random_state = 0)
[pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
pidlist_per_region_train, test_size = 0.1, random_state = 0)
# append numpy arrays
if regionNum == 0:
pidlist_train = pidlist_per_region_train
pidlist_eval = pidlist_per_region_eval
pidlist_test = pidlist_per_region_test
pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
del regionNum, regionName
del pidlist_per_region_, pidlist_per_region, idx
del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
|||| + "\\pidlist_train.npy", pidlist_train)
|||| + "\\pidlist_eval.npy", pidlist_eval)
|||| + "\\pidlist_test.npy", pidlist_test)
pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
pidlist_test = np.load(dirOut + "\\pidlist_test.npy")
## make dataset for 2 regions or load
if conv3to2region==1:
pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
if experiment_type == 1:
pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
|||| + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
|||| + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
elif experiment_type == 2:
pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
|||| + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
|||| + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
del pidlist2_train_
if experiment_type == 1:
pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
elif experiment_type == 2:
pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
## train/test data
if experiment_type == 0:
# Groningen vs Overijsel vs Limburg
data_train = sb_func.extractPid(pidlist_train, data)
data_eval = sb_func.extractPid(pidlist_eval, data)
data_test = sb_func.extractPid(pidlist_test, data)
elif experiment_type == 1 or experiment_type == 2:
data2 = np.array(data)
if experiment_type == 1:
for row, row2 in zip(data, data2):
if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
row2[2] = regionLabels2[0]
data2_train = sb_func.extractPid(pidlist2_train, data2)
data2_test = sb_func.extractPid(pidlist2_test, data2)
## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3
# train+eval vs test
if experiment_type == 0:
trainData = np.r_[data_train, data_eval]
testData = data_test
testPID = pidlist_test
LB = LB_y
LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
# 2 region: saxon vs limburg/ groningen vs limburg
trainData = data2_train
testData = data2_test
testPID = pidlist2_test
LB = LB_y2
LE = LE_y2
regionLabels = regionLabels2
# check the number of utterance
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])
fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
trainData, LBlist, LE, filePerformance)
## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
### confusion matrix
if experiment_type == 0:
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
confusionMatrix_majority = confusion_matrix(
pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
#confusionMatrix_weighted = confusion_matrix(
# pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
|||| + "\\pred_per_pid.npy", pred_per_pid_majority)
|||| + "\\confusion_matrix.npy", cm)
#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
##### iteration finish #####
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
@ -0,0 +1,383 @@
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import itertools
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import dataManipulation as mani
import evaluation as eval
# extract data that corresponds to pid in the pidlist
def extractPid(pidlist, data):
    """Extract the rows of ``data`` that belong to the speakers in ``pidlist``.

    Args:
        pidlist: 2-D array; column 0 holds the participant IDs to keep.
        data: 2-D array; column 1 holds the participant ID of every row.

    Returns:
        The matching rows of ``data``, grouped speaker by speaker in the
        order the IDs appear in ``pidlist``. An empty ``pidlist`` yields an
        array with zero rows (same number of columns as ``data``).
    """
    if len(pidlist) == 0:
        return data[:0, :]

    # Collect the per-speaker slices first and concatenate once; growing the
    # result with np.r_ inside the loop re-copies everything on every pass.
    chunks = [data[data[:, 1] == pid, :] for pid in pidlist[:, 0]]
    return np.concatenate(chunks, axis=0)
def OneHotEncoding(data, LB_X, LE_y):
    """One-hot encode the pronunciations in ``data``, word by word.

    Column layout of ``data`` (as used below):
        0: filename
        1: pid
        2: region
        3: ID (unique word_id)
        4: sentence_id
        5: word_id
        6: word
        7: pronunciation

    Args:
        data: 2-D array with the columns listed above.
        LB_X: sequence of fitted binarizers, indexed by ``unique word_id - 1``;
            each must provide ``transform``.
        LE_y: fitted label encoder for region names; must provide
            ``transform``.

    Returns:
        X: the encoded pronunciations of every word, placed side by side
           (one block of columns per unique word ID, in ascending ID order).
        y: encoded region of the rows that belong to the first word ID.
        pid: participant IDs of the rows that belong to the first word ID.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    pidlist = data[:, 1]
    regionlist = data[:, 2]
    uniqueWordIDlist = data[:, 3].astype(int)
    pronvarlist = data[:, 7]

    uniqueWordIDlist_unique = np.unique(uniqueWordIDlist)
    if uniqueWordIDlist_unique.size == 0:
        raise ValueError("OneHotEncoding: data contains no rows")

    # Encode each word with its own binarizer, then join the blocks once.
    encoded = []
    for uniqueWordIDnum in uniqueWordIDlist_unique:
        rows_of_word = uniqueWordIDlist == uniqueWordIDnum
        lb = LB_X[uniqueWordIDnum - 1]
        encoded.append(lb.transform(pronvarlist[rows_of_word]))
    X = encoded[0] if len(encoded) == 1 else np.c_[tuple(encoded)]

    # pid and region of the speakers, taken from the rows of the first word.
    rows_of_first_word = uniqueWordIDlist == uniqueWordIDlist_unique[0]
    y = LE_y.transform(regionlist[rows_of_first_word])
    pid = pidlist[rows_of_first_word]
    return X, y, pid
def outputConfusionMatrix33(foutName, matrixName, regionLabels):
# For every row index r of regionLabels, assembles the source text of a
# statement of the form
#   <foutName>.write("{0},{1},{2},{3}\n".format(regionLabels[r], <matrixName>[r][0], <matrixName>[r][1], ...))
# foutName and matrixName are concatenated into that text, so they have to be
# the *names* of variables given as strings, not the objects themselves.
# NOTE(review): nothing in the visible text executes or returns execString;
# confirm how the assembled statement is meant to be run.
# NOTE(review): the format string has exactly four placeholders (label + 3
# cells), so only three matrix columns would be written per row.
for r in range(0, len(regionLabels)):
# opening part of the statement: "<foutName>.write(<format string>.format("
execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format('
# first format argument: the label of row r
execString2 = 'regionLabels[' + str(r) + ']'
# remaining format arguments: one ",<matrixName>[r][c]" per column c
execString3 = ''
for c in range(0, len(regionLabels)):
execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']'
# closes format( and write(
execString4 = '))'
execString = execString1 + execString2 + execString3 + execString4
def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
""" compare the classification algorithms on sentence-level classifiers.
data_train: training data; column 4 is read as the integer sentence ID.
LBlist: list of label binarizers, which are used to encode pronunciation variants.
LE_y: label encoder, which is used to encode region names.
fileCSV: output csv file path; one line "sentenceID,name,mean,variance" is written per evaluated classifier.
"""
# NOTE(review): no close() of fout is visible in this function.
fout = open(fileCSV, "w")
sentenceIDlist_train = data_train[:, 4].astype(int)
sentenceIDmax_train = max(sentenceIDlist_train)
# sentence IDs are assumed to run from 1 up to the largest ID present in the data.
for sentenceID in range(1, sentenceIDmax_train+1):
sentenceIDstr = format(sentenceID, '02')
## categorical values into binary values.
data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
regionCounter = Counter(LE_y.inverse_transform(y_train))
## classifier comparison
# NOTE(review): `names` lists 11 labels while only 8 estimators are visible in
# `classifiers`, and neither list has a visible closing bracket; zip() below
# pairs them by position, so check that every label sits next to its estimator.
names = [
"Nearest Neighbors",
"Linear SVM",
"Poly SVM",
"Decision Tree",
"Random Forest 2",
"Random Forest 3",
"Random Forest 4",
"AdaBoost(Random Forest 3)",
"Naive Bayes",
"Linear Discriminant Analysis",
"Quadratic Discriminant Analysis"
classifiers = [
SVC(kernel="linear", C=0.025),
SVC(kernel="poly", C=0.025),
SVC(gamma=2, C=1),
RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
AdaBoostClassifier(SVC(probability=True, kernel='linear')),
AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)),
# 10-fold cross-validation per (name, model) pair, scored with micro-averaged F1;
# the mean and variance of the fold scores go to the csv, the mean also to stdout.
for name, model in zip(names, classifiers):
scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var()))
print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean()))
def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
""" train sentence-level classifiers.
data_train: training data; column 4 is read as the integer sentence ID.
LBlist: list of label binarizers, which are used to encode pronunciation variants.
LE_y: label encoder, which is used to encode region names.
fileCSV: output csv file path.
modelList (list): list of models (length: sentenceNumMax)
scoreList (list): list of scores (length: sentenceNumMax)
confusionMatrixList (list): third returned list (see the return statement).
"""
# NOTE(review): no close() of fout is visible in this function.
fout = open(fileCSV, "w")
fout.write('< cross-validation in training set >\n')
sentenceIDlist_train = data_train[:, 4].astype(int)
sentenceIDmax_train = max(sentenceIDlist_train)
# NOTE(review): the three lists below are returned, but no append to any of
# them is visible in this text; confirm they are filled inside the loop.
modelList = []
scoreList = []
confusionMatrixList = []
# sentence IDs are assumed to run from 1 up to the largest ID present in the data.
for sentenceID in range(1, sentenceIDmax_train+1):
sentenceIDstr = format(sentenceID, '02')
## categorical values into binary values.
data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
regionCounter = Counter(LE_y.inverse_transform(y_train))
## cross-validation with the best classifier
model = AdaBoostClassifier()
#model = SVC(kernel="linear", C=0.025)
#model = LinearDiscriminantAnalysis()
# #scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
# project helpers from the `evaluation` module: cross-validation called with 10
# as its last argument, then a confidence interval over the scores called with 0.95.
scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10)
ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95)
## model fitting
# NOTE(review): the next line is truncated in this text (the call before
# ", y_train)" is missing); presumably the model is fitted on X_train, y_train.
modelfit =, y_train)
## output
# regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland']))
# regionCounter['Low_Saxon'], regionCounter['Limburg']))
# NOTE(review): the beginning of the statement that the next line closes is
# not visible; it passes the per-region sample counts as trailing arguments.
regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg']))
fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high))
return modelList, scoreList, confusionMatrixList
def prediction_per_sentence(data_eval, modelList, LBlist, LE_y):
    """Predict the region of every speaker with each sentence-level classifier.

    Args:
        data_eval: evaluation data; column 4 is read as the integer
            sentence ID.
        modelList: fitted models, indexed by ``sentenceID - 1``.
        LBlist: list of label binarizers used to encode pronunciation
            variants (passed on to ``OneHotEncoding``).
        LE_y: label encoder used to encode / decode region names.

    Returns:
        prediction (ndarray): one row per (sentence, speaker) with the
        columns ``[sentenceID, pid, answer, prediction]``.
    """
    sentenceIDlist_eval = data_eval[:, 4].astype(int)
    sentenceIDmax_eval = max(sentenceIDlist_eval)

    per_sentence = []
    for sentenceID in range(1, sentenceIDmax_eval + 1):
        data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :]
        if data_sentence.shape[0] == 0:
            # No evaluation rows for this sentence: there is nothing to
            # encode or predict, so move on instead of failing.
            continue

        ## categorical values into binary values.
        X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y)

        ## evaluate model
        modelfit = modelList[sentenceID - 1]
        y_pred = modelfit.predict(X_eval)
        y_pred_label = LE_y.inverse_transform(y_pred)
        y_eval_label = LE_y.inverse_transform(y_eval)

        # sentenceID, pid, y, y_pred
        sentenceIDvec = np.full((y_eval_label.shape[0], 1), sentenceID, dtype=int)
        per_sentence.append(
            np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label])

    # Stack the per-sentence tables once instead of growing an array in the loop.
    return np.concatenate(per_sentence, axis=0)
def prediction_per_pid_majority(pidlist_eval, prediction):
    """Make one prediction per pid by majority vote over its sentences.

    Args:
        pidlist_eval: 2-D array; column 0 is the pid, column 1 the true
            region (the answer).
        prediction: 2-D array of sentence-level predictions; column 1 is the
            pid and the last column is the predicted label.

    Returns:
        prediction_per_pid (ndarray): rows of ``[pid, ans, prediction]`` in
        the order of ``pidlist_eval``.

    Raises:
        ValueError: if a pid in ``pidlist_eval`` has no row in ``prediction``.
    """
    prediction_per_pid = []
    for row in pidlist_eval:
        pid, ans = row[0], row[1]
        prediction_ = prediction[prediction[:, 1] == pid, :]
        if prediction_.shape[0] == 0:
            raise ValueError(
                "no sentence-level prediction found for pid {}".format(pid))

        # majority vote: the label predicted for the most sentences wins.
        predLabel = Counter(prediction_[:, -1]).most_common(1)[0][0]
        prediction_per_pid.append([pid, ans, predLabel])
    return np.array(prediction_per_pid)
def calc_weight(confusionMatrixList):
    """Calculate per-class weights (trustworthiness) for the weighted vote.

    For every sentence-level classifier the weight of class ``j`` is
    ``cm[j, j] / sum(cm[:, j])``, i.e. the diagonal divided by the column
    total. With columns indexing the predicted label (scikit-learn's
    ``confusion_matrix`` convention) this is the precision: of all subjects
    predicted to be GO/OG/LB, the fraction that actually are.

    Args:
        confusionMatrixList: list of square confusion matrices, one per
            sentence-level classifier.

    Returns:
        weight (ndarray): shape ``(number of sentences, number of classes)``.
        A class that was never predicted (column total 0) gets weight 0
        rather than NaN, so it cannot poison a later weighted sum. An empty
        list yields an array of shape ``(0, 0)``.
    """
    sentenceID_max = len(confusionMatrixList)
    if sentenceID_max == 0:
        return np.zeros((0, 0))

    class_num = np.asarray(confusionMatrixList[0]).shape[0]
    weight = np.zeros((sentenceID_max, class_num))
    for row, cm in enumerate(confusionMatrixList):
        cm = np.asarray(cm)
        true_positives = np.diag(cm)
        predicted = np.sum(cm, axis=0)
        # Divide only where something was predicted; the rest keeps the
        # zero that `out` was initialised with.
        weight[row, :] = np.divide(
            true_positives, predicted,
            out=np.zeros(class_num), where=predicted != 0)
    return weight
def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
    """Make one prediction per pid using a weighted (majority) vote.

    Args:
        pidlist_eval: 2-D array; column 0 is the pid, column 1 the true
            region (the answer).
        prediction: 2-D array of sentence-level predictions with the columns
            ``[sentenceID, pid, answer, prediction]``.
        weight (ndarray): how trustworthy the prediction of each
            sentence-based classifier is; row ``sentenceID - 1``, one column
            per class.
        LB_y: label binarizer used to encode region names.
        LE_y: label encoder used to decode the winning class index.

    Returns:
        prediction_per_pid (ndarray): rows of ``[pid, ans, prediction]`` in
        the order of ``pidlist_eval``.
    """
    weight = np.asarray(weight, dtype=float)
    # Take the number of classes from the weights instead of assuming 3, so
    # the two-region experiments work as well.
    class_num = weight.shape[1]

    prediction_per_pid = []
    for row in pidlist_eval:
        pid, ans = row[0], row[1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # calculate weighted (majority) vote
        vote_weighted = np.zeros(class_num)
        for sentence_pred in prediction_:
            sentenceID = int(sentence_pred[0])
            w = weight[sentenceID - 1, :]
            pred_int = np.asarray(LB_y.transform([sentence_pred[3]]), dtype=float)
            if class_num == 2 and pred_int.shape[1] == 1:
                # scikit-learn's LabelBinarizer encodes a two-class problem
                # as a single 0/1 column; expand it to [negative, positive].
                # NOTE(review): assumes the default neg_label=0 / pos_label=1.
                pred_int = np.hstack([1.0 - pred_int, pred_int])
            vote_weighted = vote_weighted + w * pred_int[0]

        # choose the class with the most (weighted) votes; ties go to the
        # lowest class index.
        maxindex = int(np.argmax(vote_weighted))
        # inverse_transform expects a 1-D sequence, not a bare scalar.
        predLabel = LE_y.inverse_transform([maxindex])[0]
        prediction_per_pid.append([pid, ans, predLabel])
    return np.array(prediction_per_pid)
def saxon_vs_limburg(pidlist3):
    """Convert a pidlist for 3 regions into one for 2 regions.

    3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    2 regions include ['Limburg', 'Low_Saxon']
    where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'.
    Low_Saxon speakers are sampled with ``mani.extractRandomSample``, asking
    for as many of them as there are Limburg speakers.

    Args:
        pidlist3: 2-D array; column 1 holds the region name of each pid.

    Returns:
        pidlist2 (ndarray): the Limburg rows followed by the sampled rows,
        the latter relabelled as 'Low_Saxon'.
    """
    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    regionLabels2 = ['Low_Saxon', 'Limburg']

    regions = pidlist3[:, 1]
    index_saxon = np.any(
        [regions == regionLabels[0], regions == regionLabels[2]], axis=0)
    pidlist_saxon_ = pidlist3[index_saxon, :]
    pidlist_limburg = pidlist3[regions == regionLabels[1], :]

    # extract the same amount of samples as Limburg.
    limburg_num = Counter(regions)[regionLabels[1]]
    pidlist_saxon, _ = mani.extractRandomSample(pidlist_saxon_, limburg_num)

    # Relabel a private copy so that whatever array the sampler handed back
    # is not modified behind the caller's back.
    pidlist_saxon = np.array(pidlist_saxon)
    pidlist_saxon[:, 1] = regionLabels2[0]

    pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
    return pidlist2
def groningen_vs_limburg(pidlist3):
    """Convert a pidlist for 3 regions into one for 2 regions.

    3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    2 regions include ['Groningen_and_Drenthe', 'Limburg']

    Args:
        pidlist3: 2-D array; column 1 holds the region name of each pid.

    Returns:
        pidlist2 (ndarray): all 'Groningen_and_Drenthe' rows followed by all
        'Limburg' rows; rows of any other region are dropped.
    """
    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    regions = pidlist3[:, 1]
    pidlist_groningen = pidlist3[regions == regionLabels[0], :]
    pidlist_limburg = pidlist3[regions == regionLabels[1], :]
    pidlist2 = np.r_[pidlist_groningen, pidlist_limburg]
    return pidlist2
@ -0,0 +1,44 @@
import Levenshtein
import numpy as np
# Scratch experiment: accumulate a character-level cost matrix between two
# strings and remember, per cell, which of the three candidate moves was the
# cheapest.
a = 'hello'
b = 'haall'

# approximate
infinite = 100

# make distance matrix D
len_a = len(a)
len_b = len(b)
# NOTE(review): as written, cells where the characters MATCH are set to 1 and
# all others stay 0; confirm this is the intended local cost.
D_ = np.zeros((len_a, len_b)).astype(int)
for ia, a_ in enumerate(a):
    for ib, b_ in enumerate(b):
        if a_ == b_:
            D_[ia, ib] = 1

# Pad with a border of `infinite` so that paths cannot leave the matrix;
# the origin costs nothing.
D = np.zeros((len_a+1, len_b+1)).astype(int)
D[1:len_a+1, 1:len_b+1] = D_
D[0, :] = infinite
D[:, 0] = infinite
D[0, 0] = 0

# calculate accumulated distance
# indexPath[ia, ib] stores which entry of `option` was the minimum
# (0: first entry, 1: second entry, 2: third entry). It has to be a 2-D
# array: a plain list cannot be indexed with an (ia, ib) pair.
indexPath = np.zeros((len_a, len_b), dtype=int)
for ia in range(0, len_a):
    for ib in range(0, len_b):
        option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib])
        Dmin = np.min(option)
        D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin
        index = list(option).index(Dmin)
        indexPath[ia, ib] = index

# back trace
ia = len_a
ib = len_b
#while (ia > 0 or ib > 0):
# tb
Normal file
@ -0,0 +1,56 @@
import os
import sys
import configparser
import numpy as np
from matplotlib import pyplot
# Plots, for every word in the word list, a 2-D MDS embedding of the
# Levenshtein distances between pronunciations sampled from three regions,
# and saves one PNG per word.
# The project directory is hard-coded and appended to sys.path so that the
# project modules (dataIO, dataManipulation) can be imported below.
currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import selectSamplesFromCombinedData
import dataManipulation
# NOTE(review): configFile is assigned but no visible line passes it to
# config.read(); with an unread parser the config[...] lookups below would
# raise KeyError - confirm the file is loaded.
configFile = currDir + '\\config.ini'
config = configparser.ConfigParser()
fileWordList = config['word_based']['fileWordList']
fileCombined = config['word_based']['fileCombined']
wordList = readFile(fileWordList)
# NOTE(review): range(1, len(wordList)) combined with wordList[wordNum-1]
# visits indices 0 .. len(wordList)-2, so the last entry is never processed -
# confirm whether that is intended.
for wordNum in range(1, len(wordList)):
word = wordList[wordNum-1] # target word
#print("=== {} ===".format(word))
dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined)
# the same sample count (sampleNumMax) is requested from each region.
sampleNumMax = 50
dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
# combine pronunciation from three regions
# data: (sampleNumMax x 3) x 1
# column 4 of the sampled rows is used as the pronunciation.
cPronunciation = 4
data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]])
dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data)
dataMDS = dataManipulation.MDS(dataLevenshtein)
# plot
# rows are addressed per region in the hstack order above: Groningen first,
# then Limburg, then Overijsel.
# NOTE(review): slice ends are exclusive, so "0:sampleNumMax-1" (and the two
# analogous slices) stop one row short and leave the last sample of each
# region unplotted - confirm whether that is intended.
# NOTE(review): no call that starts a new figure or clears the current one is
# visible inside the loop; presumably points from earlier words stay on the
# canvas for later PNGs - verify.
pyplot.scatter(dataMDS[0:sampleNumMax-1, 0], dataMDS[0:sampleNumMax-1, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe")
pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2-1, 0], dataMDS[sampleNumMax:sampleNumMax*2-1, 1], c='green', marker="^", facecolors='none', label="Limburg")
pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3-1, 0], dataMDS[sampleNumMax*2:sampleNumMax*3-1, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland")
pyplot.legend(loc='upper right')
# the output directory is hard-coded; the file is named after the word.
pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png')
