commit to be sure.
commit a1379caced
dialect_identification.sln (new file, 38 lines)
@@ -0,0 +1,38 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
	ProjectSection(SolutionItems) = preProject
		..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
		..\..\forced-alignment\forced_alignment\defaultfiles.py = ..\..\forced-alignment\forced_alignment\defaultfiles.py
		..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj
		..\..\forced-alignment\forced_alignment\htk_dict.py = ..\..\forced-alignment\forced_alignment\htk_dict.py
		..\..\forced-alignment\forced_alignment\lexicon.py = ..\..\forced-alignment\forced_alignment\lexicon.py
		..\..\forced-alignment\forced_alignment\mlf.py = ..\..\forced-alignment\forced_alignment\mlf.py
		..\..\forced-alignment\forced_alignment\pronunciations.py = ..\..\forced-alignment\forced_alignment\pronunciations.py
		..\..\forced-alignment\forced_alignment\pyhtk.py = ..\..\forced-alignment\forced_alignment\pyhtk.py
		..\..\forced-alignment\forced_alignment\scripts.py = ..\..\forced-alignment\forced_alignment\scripts.py
		..\..\forced-alignment\forced_alignment\tempfilename.py = ..\..\forced-alignment\forced_alignment\tempfilename.py
		..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
	EndProjectSection
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B}
	EndGlobalSection
EndGlobal
dialect_identification/audio2db.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import os
import sys
import configparser

import numpy as np
import pypyodbc


## user-defined settings
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance'
wav_dir = dir_same_utterance + '\\wav_with_cities'
script_dir = dir_same_utterance + '\\script'
fileMDB = dir_same_utterance + '\\feature\\DialectClassification.accdb'
table = 'ForcedAlignmentResult'
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# these lines are not necessary once forced-alignment is installed as a package.
sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment


## check if forced-alignment works on each sentence
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()

#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
#    script = fin.readline()
#fa = forced_alignment(wav_file, script)


## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) '


## forced-alignment to all the wav files in dir_same_utterance
word_id_start = 1
for sentenceID in range(1, 11):
    sentenceIDstr = format(sentenceID, '02')

    # get script
    script_file = script_dir + '\\script' + sentenceIDstr + '.txt'
    with open(script_file, 'r') as fin:
        script = fin.readline()

    # loop over three regions
    for region in regionLabels:

        # loop over the wav_subdir
        wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region
        wav_files = os.listdir(wav_subdir)
        file_nr = 0
        for wav_file in wav_files:
            file_nr += 1
            filename = wav_file.replace('.wav', '')
            wav_file_fullpath = wav_subdir + '\\' + wav_file

            # forced-alignment
            print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files)))
            fa = forced_alignment(wav_file_fullpath, script)

            # send pronunciation variant to database
            word_id = word_id_start
            for row in fa:
                word = row[0]
                phonemes = np.array(row[1])

                ## get pronunciation variant
                pronvar_ = phonemes[:, 2]
                pronvar_[np.where(pronvar_=='ssil')] = ''  # remove 'ssil'
                pronvar = ''.join(pronvar_)

                ## insert the result into the database.
                SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')'
                SQLstring = SQLstring1 + SQLstring2
                cursor.execute(SQLstring)
                conn.commit()

                word_id = word_id + 1

    word_id_start += script.count(' ') + 1

conn.close()
dialect_identification/classifier.py (new file, 290 lines)
@@ -0,0 +1,290 @@
'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
   1. Download a dataset (using pandas)
   2. Process the numeric data (using numpy)
   3. Train and evaluate learners (using scikit-learn)
   4. Plot and compare results (using matplotlib)


The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://mlr.cs.umass.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass

# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers row from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame


# =====================================================================


def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately.

    # If values are missing we could impute them from the training data
    #from sklearn.preprocessing import Imputer
    #imputer = Imputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test


# =====================================================================


def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall

# =====================================================================


def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves

    fig = plt.figure(figsize=(6, 6))
    fig.canvas.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, fmt='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()


# =====================================================================


if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
dialect_identification/config.ini (new file, 8 lines)
@@ -0,0 +1,8 @@
[word_based]
fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv
fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv

[sentence_based]
dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature
fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb
dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav
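A minimal sketch of how this file is consumed elsewhere in the commit (sentence_based.py and speaker_based.py read the [sentence_based] section with configparser); the relative path below is an assumption:

import configparser

# Load the project settings; config.ini is assumed to sit next to the scripts.
config = configparser.ConfigParser()
config.read('config.ini')

# The sentence-based scripts pull their working locations from this section.
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']
print(dirFeature, fileMDB)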
dialect_identification/data_io.py (new file, 74 lines)
@@ -0,0 +1,74 @@
#
# 2017/09/25
# select samples from the combined.csv for the further analysis
#
# HISTORY
# 2017/10/02 modularized.
#
# Aki Kunikoshi
# 428968@gmail.com
#
import numpy as np

def readFile(filename):
    with open(filename, 'r') as fin:
        lines = fin.read()
    linesEach = lines.split('\n')
    return linesEach


def selectSamplesFromCombinedData(word, fileCombined):
    # load combined data
    fin = open(fileCombined, 'r')
    line = fin.readline()

    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == 6 and lineList[5] == word:
            region = lineList[2]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (dataGroningen, dataLimburg, dataOverijsel)
    #print("{0}: {1} {2} {3}".format(word,len(listGroningen),len(listLimburg),len(listOverijsel))


def groupSamplesInCSV(fileCSV, idxRegion):
    fin = open(fileCSV, 'r')

    # first line is the header
    line = fin.readline()
    line = line.rstrip()
    header = line.split(',')

    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == len(header):
            region = lineList[idxRegion]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (header, dataGroningen, dataLimburg, dataOverijsel)

def addUserID(featureFile, recordingsCSV):
    dirFeature = config['sentence_based']['dirFeature']
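A brief usage sketch for groupSamplesInCSV, under the same assumptions sentence_based.py makes: a comma-separated per-sentence feature file whose second column holds the region label (idxRegion=1). The file name here is hypothetical, and note that the other scripts in this commit import the module as dataIO even though the file added here is data_io.py:

from data_io import groupSamplesInCSV  # imported as dataIO elsewhere in this commit

# '01.csv' is a hypothetical per-sentence feature file; column 1 holds the region.
header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV('01.csv', 1)
print(len(header), len(dataGroningen), len(dataLimburg), len(dataOverijsel))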
dialect_identification/data_manipulation.py (new file, 41 lines)
@@ -0,0 +1,41 @@
import numpy as np
from sklearn import manifold
import Levenshtein

# x: ndarray (dnum x dim)
# n: number of samples to extract
# OUTPUT
# index: index of the chosen samples
#
def extractRandomSample(x, n):
    xRowMax = x.shape[0]
    indexOriginal = np.arange(xRowMax)
    indexChosen = np.random.choice(indexOriginal, n, False)
    xChosen = x[indexChosen, :]
    return (xChosen, indexChosen)

# x: 1d string ndarray
def makeLevenshteinMatrix(x):
    xRowMax = x.shape[0]
    xLevenshtein = np.ones((xRowMax, xRowMax), dtype='int')

    for xRow in range(0, xRowMax):
        for xCol in range(0, xRowMax):
            dist = Levenshtein.distance(x[xRow], x[xCol])
            xLevenshtein[xRow, xCol] = dist
    return xLevenshtein

# x: 1d string ndarray
def calcLevenshteinArray(word, x):
    xRowMax = x.shape[0]
    xLevenshtein = np.zeros(x.shape, dtype='int')

    for xRow in range(0, xRowMax):
        dist = Levenshtein.distance(word, x[xRow])
        xLevenshtein[xRow] = dist
    return xLevenshtein

def MDS(x):
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    xmds = mds.fit_transform(x)
    return xmds
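A small usage sketch, not part of the commit, showing how these helpers fit together: Levenshtein distances between pronunciation-variant strings are computed and the resulting distance matrix is embedded in 2-D. The example strings are made up, and the other scripts import this module as dataManipulation rather than data_manipulation:

import numpy as np
from data_manipulation import makeLevenshteinMatrix, calcLevenshteinArray, MDS

# Hypothetical pronunciation variants of one word, as a 1-d string ndarray.
pronvars = np.array(['hofd', 'heufd', 'hoofd', 'hooft'])

# Distance of every variant to a chosen reference form.
print(calcLevenshteinArray('hoofd', pronvars))

# Full pairwise distance matrix, then a 2-D MDS embedding of it
# (MDS() uses dissimilarity="precomputed", so it expects exactly this kind of matrix).
dist = makeLevenshteinMatrix(pronvars)
coords = MDS(dist)
print(coords.shape)  # (4, 2)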
dialect_identification/dialect_identification.pyproj (new file, 70 lines)
@@ -0,0 +1,70 @@
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
    <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
    <ProjectHome>.</ProjectHome>
    <StartupFile>output_confusion_matrix.py</StartupFile>
    <SearchPath>
    </SearchPath>
    <WorkingDirectory>.</WorkingDirectory>
    <OutputPath>.</OutputPath>
    <Name>dialect_identification</Name>
    <RootNamespace>dialect_identification</RootNamespace>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="manipulate_db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="audio2db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="classifier.py" />
    <Compile Include="dataManipulation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="output_confusion_matrix.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="sentence_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based_functions.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="test_code.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="evaluation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="word_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="dataIO.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
       Visual Studio and specify your pre- and post-build commands in
       the BeforeBuild and AfterBuild targets below. -->
  <!--<Target Name="CoreCompile" />-->
  <Target Name="BeforeBuild">
  </Target>
  <Target Name="AfterBuild">
  </Target>
</Project>
dialect_identification/evaluation.py (new file, 40 lines)
@@ -0,0 +1,40 @@
import numpy as np
import scipy as sp
import scipy.stats
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, confidence):
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t.ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

# accumulated confusion matrix is added to cross_val_score
def cross_val_confusion_matrix(model, X, y, cv):
    kf = KFold(n_splits=cv)
    classLabels = np.unique(y)
    classNumMax = classLabels.shape[0]
    confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax))
    scores = []
    for idx_train, idx_test in kf.split(X):
        # split into train/test
        x_train = X[idx_train, :]
        x_test = X[idx_test, :]
        y_train = y[idx_train]
        y_test = y[idx_test]
        modelfit = model.fit(x_train, y_train)

        # evaluation
        y_pred = modelfit.predict(x_test)

        score = f1_score(y_test, y_pred, average='micro')
        scores.append(score)
        confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(y_test, y_pred,
                                                                                   labels=classLabels)
    scores = np.array(scores)
    return scores, confusionMatrixAccumulated
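A minimal, self-contained sketch of how these two helpers can be combined, using synthetic data and an AdaBoost model to mirror the usage in sentence_based.py; it is not part of the commit and only illustrates the call pattern:

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from evaluation import cross_val_confusion_matrix, mean_confidence_interval

# Synthetic two-class data, just enough to exercise the helpers.
rng = np.random.RandomState(0)
X = rng.rand(120, 5)
y = np.repeat(np.array(['Groningen_and_Drenthe', 'Limburg']), 60)

# 10-fold cross-validation: per-fold micro-F1 scores plus the summed confusion matrix.
scores, cm = cross_val_confusion_matrix(AdaBoostClassifier(), X, y, cv=10)

# 95% confidence interval of the fold scores.
m, lo, hi = mean_confidence_interval(scores, 0.95)
print(cm)
print('micro-F1: {:.3f} (95% CI {:.3f}-{:.3f})'.format(m, lo, hi))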
dialect_identification/manipulate_db.py (new file, 48 lines)
@@ -0,0 +1,48 @@
import sys
import os
import pandas
import datetime
sys.path.append('..')

# these lines are not necessary once forced-alignment is installed as a package.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
sys.path.append(forced_alignment_module)
from forced_alignment import pronunciations
from forced_alignment.htk_dict import variances_table


#pronunciations.delete_word('kunikoshi')
#pronunciations.delete_all_g2p_entries()


#existing_pronunciations = set(pronunciations.get_all())
## only focus on word


## missing pronunciations
## (1) pronunciation is written in IPA.
## (2) pronunciation variants are made based on (1).
## (3) they are converted into HTK format.
#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt'

#with open(missing_pronunciations_file) as fin:
#    lines = fin.read()
#    lines = lines.split('\n')

#source = 'generated using ipa transcription by Marita Everhardt.'
#inserts = []
#for line in lines:
#    line = line.split('\t')
#    word = line[0].strip().lower()
#    pronounciation = line[1].strip().split()

#    # surely not in the table
#    #if (word, pronounciation) not in existing_pronunciations:
#    inserts.append("('{}', '{}', '{}', '{}', 0)".format(
#        word,
#        ' '.join(pronounciation),
#        source,
#        datetime.datetime.now(), ))

#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n {};""".format(
#    ',\n '.join(inserts)
dialect_identification/output_confusion_matrix.py (new file, 79 lines)
@@ -0,0 +1,79 @@
import os
import sys

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))

regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
dirOut = currDir + '\\result\\same-utterance_with_cities'


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Note:
        this code is downloaded from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    _fontsize = 24
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    #plt.title(title, fontsize=_fontsize+2)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    #plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2)
    plt.xticks(tick_marks, classes, fontsize=_fontsize-4)
    plt.yticks(tick_marks, classes, fontsize=_fontsize-4)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=_fontsize)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)
    plt.ylabel('True label', fontsize=_fontsize-4)
    plt.xlabel('Predicted label', fontsize=_fontsize-4)


pred = np.load(dirOut + '\\pred_per_pid_3regions.npy')

#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None)
#print('accuracy: {}%'.format(accuracy * 100))

# confusion matrix
cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels)
# human perception (2 regions)
#cm = np.array([[39, 57], [6, 104]])
# human perception (3 regions)
#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
print(cm)

np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True)
#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True)

#plt.show()
plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png')
dialect_identification/sentence_based.py (new file, 197 lines)
@@ -0,0 +1,197 @@
import os
import sys
import configparser

import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from collections import Counter

# database
import pypyodbc

# classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pickle

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util


configFile = currDir + '\\config.ini'
# load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']

sentenceNumMax = 10
classifierList = []
LE_X_decode = []
LE_y = preprocessing.LabelEncoder()
LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])

testset_X = []
testset_y = []
testset_userID = []
result_y_test = []
result_y_prediction = []
fout = open("comparison.csv", "w")
for sentenceNum in range(1, sentenceNumMax+1):
    #if sentenceNum != 10:
    #    sentenceNumStr = '0' + str(sentenceNum)
    #else:
    #    sentenceNumStr = str(sentenceNumStr)
    sentenceNumStr = format(sentenceNum, '02')
    fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv'


    ## load combined data
    fileCSV = fileSentence
    idxRegion = 1
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
    sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))


    ## make balanced dataset
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    XIndex = np.arange(idxRegion+1, len(header))
    yIndex = 1  # region
    userIDindex = 0  # userID


    ## categorical values into numbers
    X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
    y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
    userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]

    #X = np.zeros((X_.shape), 'int')
    for Xindex in XIndex:
        x = X_[:, Xindex-2]

        ## levenshtein distance
        #word_count = Counter(x)
        #frequent_word = max(word_count)
        #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)

        # hot encoding
        le_x = preprocessing.LabelBinarizer()
        le_x.fit(np.unique(x))
        x_ = le_x.transform(x)
        LE_X_decode.append(x_.shape[1])
        if Xindex == idxRegion+1:
            X = x_
        else:
            X = np.c_[X, x_]

    y = LE_y.transform(y_)


    ## split into train vs test set
    #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # each regional data should be split equally
    lenG = dataG.shape[0]
    lenL = dataL.shape[0]
    lenO = dataO.shape[0]
    indexG = np.arange(0, lenG)
    indexL = np.arange(lenG, lenG+lenL)
    indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
    [XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0)
    [XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0)
    [XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0)
    X_train = np.r_[XG_train, XL_train, XO_train]
    X_test = np.r_[XG_test, XL_test, XO_test]
    y_train = np.r_[yG_train, yL_train, yO_train]
    y_test = np.r_[yG_test, yL_test, yO_test]


    ## comparison
    ## classifiers
    #names = ["Nearest Neighbors",
    #         "Linear SVM",
    #         "Poly SVM",
    #         "RBF SVM",
    #         "Decision Tree",
    #         "Random Forest 2",
    #         "Random Forest 3",
    #         "Random Forest 4",
    #         "AdaBoost",
    #         #"Naive Bayes",
    #         "Linear Discriminant Analysis",
    #         #"Quadratic Discriminant Analysis"
    #         ]
    #classifiers = [
    #    KNeighborsClassifier(3),
    #    SVC(kernel="linear", C=0.025),
    #    SVC(kernel="poly", C=0.025),
    #    SVC(gamma=2, C=1),
    #    DecisionTreeClassifier(max_depth=4),
    #    RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    #    AdaBoostClassifier(),
    #    #GaussianNB(),
    #    LinearDiscriminantAnalysis(),
    #    #QuadraticDiscriminantAnalysis()
    #    ]
    #for name, model in zip(names, classifiers):
    #    scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro')
    #    fout = open("comparison.csv", "a")
    #    fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
    #    print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))

    # quasi-optimal model
    model = AdaBoostClassifier()
    # cross validation
    scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
    ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
    modelfit = model.fit(X_train, y_train)
    # f1 on test data
    y_prediction = modelfit.predict(X_test)
    f1score = f1_score(y_test, y_prediction, average='micro')
    fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))

    ## save for the test
    testset_X.append(X_test)
    testset_y.append(y_test)
    testset_userID.append(userID_)
    result_y_test = result_y_test + list(y_test)
    result_y_prediction = result_y_prediction + list(y_prediction)
    fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl'
    pickle.dump(modelfit, open(fileClassifier, 'wb'))
fout.close()

### confusion matrix
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[
    'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
print(confusionMatrix)


### make userID list
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
#    userid = testset_userID[sentenceNum]
#    userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)
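A short sketch, with hypothetical paths, of reloading one of the per-sentence AdaBoost models that this script pickles into dirFeature; reuse requires encoding new features exactly as above (same per-word LabelBinarizer objects, same column order):

import pickle

# '01.mdl' is the hypothetical pickled model for sentence 01, written by sentence_based.py.
with open('01.mdl', 'rb') as fin:
    model = pickle.load(fin)

# X_new must be one-hot encoded with the same binarizers and column order as in training.
# y_pred = model.predict(X_new)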
dialect_identification/speaker_based.py (new file, 326 lines)
@@ -0,0 +1,326 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


#####################
## USER DEFINE ##
#####################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'

# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0

# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2

regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
if experiment_type == 1:
    regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']


##########################
## DATA PREPARATION ##
##########################

## load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']


## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows


## make list of LabelBinarizer object per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows


LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LBlist.append(LB)

# for y (=region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)

LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)

del uniqueWordID, uniqueWordIDmax, pronvar, LB


#################
## ITERATION ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)

# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter


## make train/eval/test set or load
if makeTrainTestSet==1:
    pidlist_train = []
    pidlist_eval = []
    pidlist_test = []
    for regionNum in range(0, len(regionLabels)):
        regionName = regionLabels[regionNum]

        pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
        pidlist_per_region, idx = mani.extractRandomSample(
            pidlist_per_region_, sampleNumMax)

        # split dataset into train, eval and test.
        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
            pidlist_per_region, test_size = 0.2, random_state = 0)
        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
            pidlist_per_region_train, test_size = 0.1, random_state = 0)

        # append numpy arrays
        if regionNum == 0:
            pidlist_train = pidlist_per_region_train
            pidlist_eval = pidlist_per_region_eval
            pidlist_test = pidlist_per_region_test
        else:
            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
    del regionNum, regionName
    del pidlist_per_region_, pidlist_per_region, idx
    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")


## make dataset for 2 regions or load
if conv3to2region==1:
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if experiment_type == 1:
        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)

    elif experiment_type == 2:
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)

    del pidlist2_train_
else:
    if experiment_type == 1:
        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")

    elif experiment_type == 2:
        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")


## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train = sb_func.extractPid(pidlist_train, data)
    data_eval = sb_func.extractPid(pidlist_eval, data)
    data_test = sb_func.extractPid(pidlist_test, data)

elif experiment_type == 1 or experiment_type == 2:
    data2 = np.array(data)

    if experiment_type == 1:
        for row, row2 in zip(data, data2):
            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
                row2[2] = regionLabels2[0]

    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


#####################################
## EXPERIMENTS START FROM HERE ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData = np.r_[data_train, data_eval]
    testData = data_test
    testPID = pidlist_test
    LB = LB_y
    LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
    # 2 region: saxon vs limburg/ groningen vs limburg
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    regionLabels = regionLabels2


# check the number of utterances
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])


fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)

### confusion matrix
if experiment_type == 0:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])

#confusionMatrix_weighted = confusion_matrix(
#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)

#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()


##### iteration finish #####
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
383
dialect_identification/speaker_based_functions.py
Normal file
@ -0,0 +1,383 @@
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import itertools

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

import dataManipulation as mani
import evaluation as eval


# extract data that corresponds to pid in the pidlist
def extractPid(pidlist, data):
    for pidnum in range(0, len(pidlist)):
        pid = pidlist[pidnum, 0]
        x = data[data[:, 1] == pid, :]
        if pidnum == 0:
            data_ = x
        else:
            data_ = np.r_[data_, x]
    return data_


def OneHotEncoding(data, LB_X, LE_y):
    # one-hot encode the data using a LabelBinarizer per word (LB_X) and a LabelEncoder for the region (LE_y)
    # INPUT
    #   data
    #     0: filename
    #     1: pid
    #     2: region
    #     3: ID (unique word_id)
    #     4: sentence_id
    #     5: word_id
    #     6: word
    #     7: pronunciation
    #   LB_X: list of LabelBinarizer objects
    #   LE_y: LabelEncoder object
    # OUTPUT
    #   X: encoded variable data
    #   y: encoded target data
    #   pid: pid of the speakers (one per row of X)
    pidlist = data[:, 1]
    regionlist = data[:, 2]
    uniqueWordIDlist = data[:, 3].astype(int)
    pronvarlist = data[:, 7]

    uniqueWordIDlist_unique = np.unique(uniqueWordIDlist)
    uniqueWordIDlist_unique.sort()
    for uniqueWordIDnum in uniqueWordIDlist_unique:
        x_ = pronvarlist[uniqueWordIDlist == uniqueWordIDnum]
        lb = LB_X[uniqueWordIDnum-1]
        x = lb.transform(x_)
        if uniqueWordIDnum == uniqueWordIDlist_unique[0]:
            X = x
        else:
            X = np.c_[X, x]

    # pid and region of the speakers
    y_ = regionlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
    y = LE_y.transform(y_)

    pid = pidlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
    return X, y, pid
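

# --- illustration (not part of the original module) ---------------------------------
# OneHotEncoding assumes LB_X is a list of fitted sklearn LabelBinarizer objects
# (one per unique word) and LE_y a fitted LabelEncoder for the region names.
# A minimal sketch of how such encoders could be built; the pronunciation variants
# and region pair below are invented for illustration only.
def _onehot_encoders_demo():
    from sklearn.preprocessing import LabelBinarizer, LabelEncoder

    # one LabelBinarizer per word, fitted on that word's pronunciation variants
    pronunciations_per_word = [['wOtR', 'watr', 'botR'],   # hypothetical variants of word 1
                               ['hys', 'hus']]             # hypothetical variants of word 2
    LB_X = [LabelBinarizer().fit(variants) for variants in pronunciations_per_word]

    # one LabelEncoder for the region labels
    LE_y = LabelEncoder().fit(['Groningen_and_Drenthe', 'Limburg'])

    print(LB_X[0].transform(['watr']))    # one-hot row over the variants of word 1
    print(LE_y.transform(['Limburg']))    # integer code of the region, e.g. [1]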


def outputConfusionMatrix33(foutName, matrixName, regionLabels):
    # writes one CSV row per region by building the write statement as a string and
    # running it with exec(); foutName and matrixName are the *names* of the file
    # object and of the confusion matrix (as strings), not the objects themselves.
    for r in range(0, len(regionLabels)):
        execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format('
        execString2 = 'regionLabels[' + str(r) + ']'
        execString3 = ''
        for c in range(0, len(regionLabels)):
            execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']'
        execString4 = '))'
        execString = execString1 + execString2 + execString3 + execString4
        exec(execString)
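

# --- illustration (not part of the original module) ---------------------------------
# exec() can only resolve names that are visible inside outputConfusionMatrix33 itself,
# which makes the function above fragile. A sketch of an exec-free variant that takes
# the file object and the matrix directly (the function name below is made up):
def outputConfusionMatrix_direct(fout, confusionMatrix, regionLabels):
    # one CSV row per true region, one column per predicted region
    for r in range(0, len(regionLabels)):
        row = [str(regionLabels[r])] + [str(confusionMatrix[r][c]) for c in range(0, len(regionLabels))]
        fout.write(','.join(row) + '\n')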


def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
    """ compare classification algorithms for the sentence-level classifiers.

    Args:
        data_train: training data.
        LBlist: list of label binarizers, used to encode pronunciation variants.
        LE_y: label encoder, used to encode region names.
        fileCSV: output csv file path.

    """
    fout = open(fileCSV, "w")

    sentenceIDlist_train = data_train[:, 4].astype(int)
    sentenceIDmax_train = max(sentenceIDlist_train)

    for sentenceID in range(1, sentenceIDmax_train+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
        X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_train))

        ## classifier comparison
        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "Poly SVM",
            "RBF SVM",
            "Decision Tree",
            "Random Forest 2",
            "Random Forest 3",
            "Random Forest 4",
            "AdaBoost",
            "AdaBoost(SVM)",
            "AdaBoost(Random Forest 3)",
            "Naive Bayes",
            "Linear Discriminant Analysis",
            "Quadratic Discriminant Analysis"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025),
            SVC(kernel="poly", C=0.025),
            SVC(gamma=2, C=1),
            DecisionTreeClassifier(max_depth=4),
            RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
            RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
            RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
            AdaBoostClassifier(),
            AdaBoostClassifier(SVC(probability=True, kernel='linear')),
            AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)),
            GaussianNB(),
            LinearDiscriminantAnalysis(),
            QuadraticDiscriminantAnalysis()
        ]
        for name, model in zip(names, classifiers):
            scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
            fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var()))
            print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean()))

    fout.close()


def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
    """ train the sentence-level classifiers.

    Args:
        data_train: training data.
        LBlist: list of label binarizers, used to encode pronunciation variants.
        LE_y: label encoder, used to encode region names.
        fileCSV: output csv file path.

    Returns:
        modelList (list): list of fitted models (length: sentenceIDmax_train)
        scoreList (list): list of cross-validation scores (length: sentenceIDmax_train)
        confusionMatrixList (list): list of cross-validation confusion matrices (length: sentenceIDmax_train)

    """
    fout = open(fileCSV, "w")
    fout.write('< cross-validation in training set >\n')

    sentenceIDlist_train = data_train[:, 4].astype(int)
    sentenceIDmax_train = max(sentenceIDlist_train)
    modelList = []
    scoreList = []
    confusionMatrixList = []

    for sentenceID in range(1, sentenceIDmax_train+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
        X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_train))

        ## cross-validation with the best classifier
        model = AdaBoostClassifier()
        #model = SVC(kernel="linear", C=0.025)
        #model = LinearDiscriminantAnalysis()

        #scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
        scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10)
        ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95)
        scoreList.append(scores)
        confusionMatrixList.append(confusionMatrix)

        ## model fitting
        modelfit = model.fit(X_train, y_train)
        modelList.append(modelfit)

        ## output
        fout.write("{},".format(sentenceID))
        #fout.write("{0},{1},{2},".format(
        #    regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland']))
        #fout.write("{0},{1},".format(
        #    regionCounter['Low_Saxon'], regionCounter['Limburg']))
        fout.write("{0},{1},".format(
            regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg']))

        fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high))

    fout.write('\n')
    fout.close()

    return modelList, scoreList, confusionMatrixList
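

# --- illustration (not part of the original module) ---------------------------------
# eval.cross_val_confusion_matrix and eval.mean_confidence_interval come from the
# repository's own evaluation module, which is not shown in this commit. Rough
# stand-ins with the assumed behaviour (the real implementations may differ):
def _cross_val_confusion_matrix_sketch(model, X, y, n_splits=10):
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import f1_score

    # per-fold micro-averaged f1 scores plus a confusion matrix summed over the held-out folds
    labels = np.unique(y)
    scores = []
    cm_total = np.zeros((len(labels), len(labels)), dtype=int)
    for train_idx, test_idx in StratifiedKFold(n_splits=n_splits).split(X, y):
        y_pred = model.fit(X[train_idx], y[train_idx]).predict(X[test_idx])
        scores.append(f1_score(y[test_idx], y_pred, average='micro'))
        cm_total += confusion_matrix(y[test_idx], y_pred, labels=labels)
    return np.array(scores), cm_total


def _mean_confidence_interval_sketch(scores, confidence=0.95):
    import scipy.stats

    # normal-theory confidence interval around the mean of the fold scores
    mean = np.mean(scores)
    h = scipy.stats.sem(scores) * scipy.stats.t.ppf((1 + confidence) / 2.0, len(scores) - 1)
    return mean, mean - h, mean + h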


def prediction_per_sentence(data_eval, modelList, LBlist, LE_y):
    """ predict using the sentence-level classifiers.

    Args:
        data_eval: evaluation data.
        modelList: list of the fitted models.
        LBlist: list of label binarizers, used to encode pronunciation variants.
        LE_y: label encoder, used to encode region names.

    Returns:
        prediction (ndarray): [sentenceID, pid, answer, prediction]

    """
    sentenceIDlist_eval = data_eval[:, 4].astype(int)
    sentenceIDmax_eval = max(sentenceIDlist_eval)
    for sentenceID in range(1, sentenceIDmax_eval+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :]
        X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_eval))

        ## evaluate the model
        modelfit = modelList[sentenceID-1]
        y_pred = modelfit.predict(X_eval)
        y_pred_label = LE_y.inverse_transform(y_pred)
        y_eval_label = LE_y.inverse_transform(y_eval)

        # sentenceID, pid, y, y_pred
        sentenceIDvec = np.ones((y_eval_label.shape[0], 1)).astype(int) * sentenceID
        prediction_ = np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label]
        if sentenceID == 1:
            prediction = prediction_
        else:
            prediction = np.r_[prediction, prediction_]

    return prediction


def prediction_per_pid_majority(pidlist_eval, prediction):
    """ make a prediction per pid using majority vote.

    Returns:
        prediction_per_pid (ndarray): [pid, ans, prediction]

    """
    prediction_per_pid = []
    for pid_ in range(0, len(pidlist_eval[:, 0])):
        pid = pidlist_eval[pid_, 0]
        ans = pidlist_eval[pid_, 1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # majority vote
        predCounter = Counter(prediction_[:, -1])
        predMostCommon = predCounter.most_common(1)
        predLabel = predMostCommon[0][0]
        predRatio = predMostCommon[0][1] / prediction_.shape[0] * 100

        prediction_per_pid.append([pid, ans, predLabel])

    return np.array(prediction_per_pid)
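

# --- illustration (not part of the original module) ---------------------------------
# How the Counter-based majority vote behaves on made-up sentence-level predictions
# for a single speaker:
def _majority_vote_demo():
    preds = ['Limburg', 'Groningen_and_Drenthe', 'Limburg']    # invented predictions
    predCounter = Counter(preds)
    print(predCounter.most_common(1))                           # [('Limburg', 2)]
    print(predCounter.most_common(1)[0][0])                     # Limburg  (majority label)
    print(predCounter.most_common(1)[0][1] / len(preds) * 100)  # 66.66...  (vote share in %)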


def calc_weight(confusionMatrixList):
    """ calculate a weight (how trustworthy each prediction is) for the weighted vote.

    Note:
        Of all subjects predicted as GO/OG/LB, the fraction that actually belong to
        that region (i.e. the per-class precision) is used as the weight.

    Args:
        confusionMatrixList: list of confusion matrices of the sentence-level classifiers.

    """
    sentenceID_max = len(confusionMatrixList)
    weight = np.zeros((sentenceID_max, confusionMatrixList[0].shape[0]))
    for sentenceID in range(1, sentenceID_max+1):
        cm = confusionMatrixList[sentenceID-1]

        # normalized confusion matrix (recall-based weight, not used)
        #rTotal = np.sum(cm, axis=1)
        #cm_normalized = cm / rTotal
        #weight[sentenceID-1, :] = np.diag(cm_normalized)

        true_positives = np.diag(cm)
        predicted = np.sum(cm, axis=0)
        weight[sentenceID-1, :] = true_positives / predicted

    return weight
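

# --- illustration (not part of the original module) ---------------------------------
# Worked toy example of the precision weights (numbers invented): rows are the true
# regions, columns the predicted regions.
def _calc_weight_demo():
    cm = np.array([[8, 2],
                   [3, 7]])
    true_positives = np.diag(cm)       # [8 7]
    predicted = np.sum(cm, axis=0)     # [11 9]  column sums = how often each region was predicted
    print(true_positives / predicted)  # [0.7272... 0.7777...]  per-class precision used as weight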


def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
    """ make a prediction per pid using a weighted (majority) vote.

    Args:
        weight (ndarray): how trustworthy the prediction of each sentence-level classifier is.
        LB_y: label binarizer, used to encode region names.
        LE_y: label encoder, used to encode region names.

    Returns:
        prediction_per_pid (ndarray): [pid, ans, prediction]

    """
    prediction_per_pid = []
    for pid_ in range(0, len(pidlist_eval[:, 0])):
        pid = pidlist_eval[pid_, 0]
        ans = pidlist_eval[pid_, 1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # accumulate the weighted (majority) vote
        vote_weighted = np.zeros((1, 3))
        for sentenceID_ in range(0, prediction_.shape[0]):
            sentenceID = prediction_[sentenceID_, 0].astype(int)
            w = weight[sentenceID-1, :]
            pred = prediction_[sentenceID_, 3]
            pred_int = LB_y.transform([pred])
            vote_weighted = vote_weighted + w * pred_int

        # choose the label with the most votes
        vote_weighted = vote_weighted[0]
        maxindex = list(vote_weighted).index(max(vote_weighted))
        #predLabel = regionLabels[maxindex]
        predLabel = LE_y.inverse_transform(maxindex)
        prediction_per_pid.append([pid, ans, predLabel])

    return np.array(prediction_per_pid)


def saxon_vs_limburg(pidlist3):
    """convert a pidlist for 3 regions into one for 2 regions.

    Notes:
        The 3 regions are ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'].
        The 2 regions are ['Limburg', 'Low_Saxon'],
        where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'.
        Samples are randomly chosen so that each class has the same amount of data.

    """
    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    regionLabels2 = ['Low_Saxon', 'Limburg']

    index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
    pidlist_saxon_ = pidlist3[index_saxon, :]
    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

    # extract the same amount of samples as Limburg.
    pidlistCounter3 = Counter(pidlist3[:, 1])
    pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
    pidlist_saxon[:, 1] = regionLabels2[0]

    pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
    #pidlistCounter2 = Counter(pidlist2[:, 1])
    return pidlist2
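

# --- illustration (not part of the original module) ---------------------------------
# mani.extractRandomSample comes from the repository's dataManipulation module, which is
# not shown in this commit. Its assumed behaviour (sample rows without replacement and
# return both the rows and their indices) could look like this; the real helper may differ.
def _extract_random_sample_sketch(data, sampleNum):
    idx = np.random.choice(data.shape[0], size=sampleNum, replace=False)
    return data[idx, :], idx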


def groningen_vs_limburg(pidlist3):
    """convert a pidlist for 3 regions into one for 2 regions.

    Notes:
        3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
        2 regions include ['Groningen_and_Drenthe', 'Limburg']

    """
    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

    pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

    pidlist2 = np.r_[pidlist_groningen, pidlist_limburg]
    return pidlist2
44
dialect_identification/test_code.py
Normal file
@ -0,0 +1,44 @@
import Levenshtein
import numpy as np

a = 'hello'
b = 'haall'

# approximation of infinity
infinite = 100

# make distance matrix D (D_ marks positions where the characters match)
len_a = len(a)
len_b = len(b)
D_ = np.zeros((len_a, len_b)).astype(int)
for ia in range(0, len_a):
    a_ = a[ia]
    for ib in range(0, len_b):
        b_ = b[ib]
        if a_ == b_:
            D_[ia, ib] = 1

D = np.zeros((len_a+1, len_b+1)).astype(int)
D[1:len_a+1, 1:len_b+1] = D_
D[0, :] = infinite
D[:, 0] = infinite
D[0, 0] = 0

# calculate the accumulated distance and remember which step was taken at each cell
indexPath = np.zeros((len_a, len_b)).astype(int)
for ia in range(0, len_a):
    for ib in range(0, len_b):
        a_ = a[ia]
        b_ = b[ib]
        option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib])
        Dmin = np.min(option)
        D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin
        index = list(option).index(Dmin)
        indexPath[ia, ib] = index

# back trace (not implemented yet)
ia = len_a
ib = len_b
#while (ia > 0 or ib > 0):
#    tb
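

# --- illustration (not part of the original file) -----------------------------------
# The Levenshtein import above is unused so far; presumably it is meant to sanity-check
# the hand-rolled matrix against the library's edit distance:
print(Levenshtein.distance(a, b))   # 3 for 'hello' vs 'haall' (e->a, l->a, o->l)
print(Levenshtein.editops(a, b))    # the corresponding edit operations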
56
dialect_identification/word_based.py
Normal file
@ -0,0 +1,56 @@
import os
import sys
import configparser

import numpy as np
from matplotlib import pyplot

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import selectSamplesFromCombinedData
import dataManipulation


configFile = currDir + '\\config.ini'

config = configparser.ConfigParser()
config.sections()
config.read(configFile)
fileWordList = config['word_based']['fileWordList']
fileCombined = config['word_based']['fileCombined']

wordList = readFile(fileWordList)

for wordNum in range(1, len(wordList)):
    word = wordList[wordNum-1]  # target word (note: the last entry of wordList is never reached)
    #print("=== {} ===".format(word))

    dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined)

    sampleNumMax = 50
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    # combine pronunciations from the three regions
    # data: (sampleNumMax x 3) x 1
    cPronunciation = 4
    data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]])

    # MDS
    dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data)
    dataMDS = dataManipulation.MDS(dataLevenshtein)

    # plot
    pyplot.scatter(dataMDS[0:sampleNumMax-1, 0], dataMDS[0:sampleNumMax-1, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe")
    pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2-1, 0], dataMDS[sampleNumMax:sampleNumMax*2-1, 1], c='green', marker="^", facecolors='none', label="Limburg")
    pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3-1, 0], dataMDS[sampleNumMax*2:sampleNumMax*3-1, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland")

    pyplot.title(word)
    #ax.set_xlabel('x')
    #ax.set_ylabel('y')
    pyplot.legend(loc='upper right')
    #pyplot.show()
    pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png')
    pyplot.gcf().clear()
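

# --- illustration (not part of the original file) -----------------------------------
# dataManipulation.makeLevenshteinMatrix and dataManipulation.MDS are repository helpers
# that are not part of this commit. Rough stand-ins with the assumed behaviour, based on
# python-Levenshtein and scikit-learn; the real implementations may differ.
def _make_levenshtein_matrix_sketch(pronunciations):
    import Levenshtein

    # pairwise edit distances between pronunciation strings
    n = len(pronunciations)
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            dist[i, j] = Levenshtein.distance(pronunciations[i], pronunciations[j])
    return dist


def _mds_sketch(distanceMatrix):
    from sklearn.manifold import MDS

    # 2-D embedding of a precomputed distance matrix
    return MDS(n_components=2, dissimilarity='precomputed').fit_transform(distanceMatrix)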