commit a1379caced09245dced3c7a12e7f9106a7d12b41 Author: yemaozi88 <428968@gmail.com> Date: Sun Mar 25 13:46:27 2018 +0200 commit to be sure. diff --git a/dialect_identification.sln b/dialect_identification.sln new file mode 100644 index 0000000..3d3874c --- /dev/null +++ b/dialect_identification.sln @@ -0,0 +1,38 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26730.12 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}" + ProjectSection(SolutionItems) = preProject + ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py + ..\..\forced-alignment\forced_alignment\defaultfiles.py = ..\..\forced-alignment\forced_alignment\defaultfiles.py + ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj + ..\..\forced-alignment\forced_alignment\htk_dict.py = ..\..\forced-alignment\forced_alignment\htk_dict.py + ..\..\forced-alignment\forced_alignment\lexicon.py = ..\..\forced-alignment\forced_alignment\lexicon.py + ..\..\forced-alignment\forced_alignment\mlf.py = ..\..\forced-alignment\forced_alignment\mlf.py + ..\..\forced-alignment\forced_alignment\pronunciations.py = ..\..\forced-alignment\forced_alignment\pronunciations.py + ..\..\forced-alignment\forced_alignment\pyhtk.py = ..\..\forced-alignment\forced_alignment\pyhtk.py + ..\..\forced-alignment\forced_alignment\scripts.py = ..\..\forced-alignment\forced_alignment\scripts.py + ..\..\forced-alignment\forced_alignment\tempfilename.py = ..\..\forced-alignment\forced_alignment\tempfilename.py + ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B} + EndGlobalSection +EndGlobal diff --git a/dialect_identification/audio2db.py b/dialect_identification/audio2db.py new file mode 100644 index 0000000..6b73f90 --- /dev/null +++ b/dialect_identification/audio2db.py @@ -0,0 +1,90 @@ +import os +import sys +import configparser + +import numpy as np +import pypyodbc + + +## user define +forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment' +dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance' +wav_dir = dir_same_utterance + '\\wav_with_cities' +script_dir = dir_same_utterance + '\\script' +fileMDB = dir_same_utterance + '\\feature\\DialectClassification.accdb' +table = 'ForcedAlignmentResult' +regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + +# these lines are not necessary 
once forced-alignment is intalled as a package. +sys.path.append(forced_alignment_module) +from forced_alignment import forced_alignment + + +## check if forced-alignment work in each sentence +#from forced_alignment import pronunciations +#pronunciations.delete_all_g2p_entries() + +#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav' +#script_file = script_dir + '\\script10.txt' +#with open(script_file, 'r') as fin: +# script = fin.readline() +#fa = forced_alignment(wav_file, script) + + +## make database connection +param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";" +conn = pypyodbc.connect(param) +cursor = conn.cursor() + +SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) ' + + +## forced-alignment to all the wav files in dir_same_utterance +word_id_start = 1 +for sentenceID in range(1, 11): + sentenceIDstr = format(sentenceID, '02') + + # get script + script_file = script_dir + '\\script' + sentenceIDstr + '.txt' + with open(script_file, 'r') as fin: + script = fin.readline() + + # loop over three regions + for region in regionLabels: + + # loop over the wav_subdir + wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region + wav_files = os.listdir(wav_subdir) + file_nr = 0 + for wav_file in wav_files: + file_nr += 1 + filename = wav_file.replace('.wav', '') + wav_file_fullpath = wav_subdir + '\\' + wav_file + + # forced-alignment + print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files))) + fa = forced_alignment(wav_file_fullpath, script) + + # send pronunciation variant to database + word_id = word_id_start + for row in fa: + word = row[0] + phonemes = np.array(row[1]) + + ## get pronunciation variant + pronvar_ = phonemes[:, 2] + pronvar_[np.where(pronvar_=='ssil')]='' # remove 'ssil' + pronvar = ''.join(pronvar_) + + ## insert the result into the database. + SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')' + SQLstring = SQLstring1 + SQLstring2 + cursor.execute(SQLstring) + conn.commit() + + word_id = word_id + 1 + + word_id_start += script.count(' ')+1 + +conn.close() + diff --git a/dialect_identification/classifier.py b/dialect_identification/classifier.py new file mode 100644 index 0000000..6ed30e6 --- /dev/null +++ b/dialect_identification/classifier.py @@ -0,0 +1,290 @@ +''' +This script perfoms the basic process for applying a machine learning +algorithm to a dataset using Python libraries. + +The four steps are: + 1. Download a dataset (using pandas) + 2. Process the numeric data (using numpy) + 3. Train and evaluate learners (using scikit-learn) + 4. Plot and compare results (using matplotlib) + + +The data is downloaded from URL, which is defined below. As is normal +for machine learning problems, the nature of the source data affects +the entire solution. When you change URL to refer to your own data, you +will need to review the data processing steps to ensure they remain +correct. + +============ +Example Data +============ +The example is from http://mlr.cs.umass.edu/ml/datasets/Spambase +It contains pre-processed metrics, such as the frequency of certain +words and letters, from a collection of emails. A classification for +each one indicating 'spam' or 'not spam' is in the final column. +See the linked page for full details of the data set. + +This script uses three classifiers to predict the class of an email +based on the metrics. 
These are not representative of modern spam +detection systems. +''' + +# Remember to update the script for the new data when you change this URL +URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/spambase/spambase.data" + +# Uncomment this call when using matplotlib to generate images +# rather than displaying interactive UI. +#import matplotlib +#matplotlib.use('Agg') + +from pandas import read_table +import numpy as np +import matplotlib.pyplot as plt + +try: + # [OPTIONAL] Seaborn makes plots nicer + import seaborn +except ImportError: + pass + +# ===================================================================== + +def download_data(): + ''' + Downloads the data for this script into a pandas DataFrame. + ''' + + # If your data is in an Excel file, install 'xlrd' and use + # pandas.read_excel instead of read_table + #from pandas import read_excel + #frame = read_excel(URL) + + # If your data is in a private Azure blob, install 'azure-storage' and use + # BlockBlobService.get_blob_to_path() with read_table() or read_excel() + #from azure.storage.blob import BlockBlobService + #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY) + #service.get_blob_to_path(container_name, blob_name, 'my_data.csv') + #frame = read_table('my_data.csv', ... + + frame = read_table( + URL, + + # Uncomment if the file needs to be decompressed + #compression='gzip', + #compression='bz2', + + # Specify the file encoding + # Latin-1 is common for data from US sources + encoding='latin-1', + #encoding='utf-8', # UTF-8 is also common + + # Specify the separator in the data + sep=',', # comma separated values + #sep='\t', # tab separated values + #sep=' ', # space separated values + + # Ignore spaces after the separator + skipinitialspace=True, + + # Generate row labels from each row number + index_col=None, + #index_col=0, # use the first column as row labels + #index_col=-1, # use the last column as row labels + + # Generate column headers row from each column number + header=None, + #header=0, # use the first line as headers + + # Use manual headers and skip the first row in the file + #header=0, + #names=['col1', 'col2', ...], + ) + + # Return a subset of the columns + #return frame[['col1', 'col4', ...]] + + # Return the entire frame + return frame + + +# ===================================================================== + + +def get_features_and_labels(frame): + ''' + Transforms and scales the input data and returns numpy arrays for + training and testing inputs and targets. + ''' + + # Replace missing values with 0.0, or we can use + # scikit-learn to calculate missing values (below) + #frame[frame.isnull()] = 0.0 + + # Convert values to floats + arr = np.array(frame, dtype=np.float) + + # Use the last column as the target value + X, y = arr[:, :-1], arr[:, -1] + # To use the first column instead, change the index value + #X, y = arr[:, 1:], arr[:, 0] + + # Use 80% of the data for training; test against the rest + from sklearn.model_selection import train_test_split + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + + # sklearn.pipeline.make_pipeline could also be used to chain + # processing and classification into a black box, but here we do + # them separately. 
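+    # A rough sketch of that alternative (kept commented out and untested here,
+    # with LinearSVC only as a placeholder classifier):
+    #from sklearn.pipeline import make_pipeline
+    #from sklearn.preprocessing import StandardScaler
+    #from sklearn.svm import LinearSVC
+    #pipe = make_pipeline(StandardScaler(), LinearSVC(C=1))
+    #pipe.fit(X_train, y_train)
+    #print('pipeline accuracy: {:.3f}'.format(pipe.score(X_test, y_test)))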
+ + # If values are missing we could impute them from the training data + #from sklearn.preprocessing import Imputer + #imputer = Imputer(strategy='mean') + #imputer.fit(X_train) + #X_train = imputer.transform(X_train) + #X_test = imputer.transform(X_test) + + # Normalize the attribute values to mean=0 and variance=1 + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + # To scale to a specified range, use MinMaxScaler + #from sklearn.preprocessing import MinMaxScaler + #scaler = MinMaxScaler(feature_range=(0, 1)) + + # Fit the scaler based on the training data, then apply the same + # scaling to both training and test sets. + scaler.fit(X_train) + X_train = scaler.transform(X_train) + X_test = scaler.transform(X_test) + + # Return the training and test sets + return X_train, X_test, y_train, y_test + + +# ===================================================================== + + +def evaluate_classifier(X_train, X_test, y_train, y_test): + ''' + Run multiple times with different classifiers to get an idea of the + relative performance of each configuration. + + Returns a sequence of tuples containing: + (title, precision, recall) + for each learner. + ''' + + # Import some classifiers to test + from sklearn.svm import LinearSVC, NuSVC + from sklearn.ensemble import AdaBoostClassifier + + # We will calculate the P-R curve for each classifier + from sklearn.metrics import precision_recall_curve, f1_score + + # Here we create classifiers with default parameters. These need + # to be adjusted to obtain optimal performance on your data set. + + # Test the linear support vector classifier + classifier = LinearSVC(C=1) + # Fit the classifier + classifier.fit(X_train, y_train) + score = f1_score(y_test, classifier.predict(X_test)) + # Generate the P-R curve + y_prob = classifier.decision_function(X_test) + precision, recall, _ = precision_recall_curve(y_test, y_prob) + # Include the score in the title + yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall + + # Test the Nu support vector classifier + classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3) + # Fit the classifier + classifier.fit(X_train, y_train) + score = f1_score(y_test, classifier.predict(X_test)) + # Generate the P-R curve + y_prob = classifier.decision_function(X_test) + precision, recall, _ = precision_recall_curve(y_test, y_prob) + # Include the score in the title + yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall + + # Test the Ada boost classifier + classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R') + # Fit the classifier + classifier.fit(X_train, y_train) + score = f1_score(y_test, classifier.predict(X_test)) + # Generate the P-R curve + y_prob = classifier.decision_function(X_test) + precision, recall, _ = precision_recall_curve(y_test, y_prob) + # Include the score in the title + yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall + +# ===================================================================== + + +def plot(results): + ''' + Create a plot comparing multiple learners. + + `results` is a list of tuples containing: + (title, precision, recall) + + All the elements in results will be plotted. 
+ ''' + + # Plot the precision-recall curves + + fig = plt.figure(figsize=(6, 6)) + fig.canvas.set_window_title('Classifying data from ' + URL) + + for label, precision, recall in results: + plt.plot(recall, precision, label=label) + + plt.title('Precision-Recall Curves') + plt.xlabel('Precision') + plt.ylabel('Recall') + plt.legend(loc='lower left') + + # Let matplotlib improve the layout + plt.tight_layout() + + # ================================== + # Display the plot in interactive UI + plt.show() + + # To save the plot to an image file, use savefig() + #plt.savefig('plot.png') + + # Open the image file with the default image viewer + #import subprocess + #subprocess.Popen('plot.png', shell=True) + + # To save the plot to an image in memory, use BytesIO and savefig() + # This can then be written to any stream-like object, such as a + # file or HTTP response. + #from io import BytesIO + #img_stream = BytesIO() + #plt.savefig(img_stream, fmt='png') + #img_bytes = img_stream.getvalue() + #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...')) + + # Closing the figure allows matplotlib to release the memory used. + plt.close() + + +# ===================================================================== + + +if __name__ == '__main__': + # Download the data set from URL + print("Downloading data from {}".format(URL)) + frame = download_data() + + # Process data into feature and label arrays + print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns))) + X_train, X_test, y_train, y_test = get_features_and_labels(frame) + + # Evaluate multiple classifiers on the data + print("Evaluating classifiers") + results = list(evaluate_classifier(X_train, X_test, y_train, y_test)) + + # Display the results + print("Plotting the results") + plot(results) diff --git a/dialect_identification/config.ini b/dialect_identification/config.ini new file mode 100644 index 0000000..47cc410 --- /dev/null +++ b/dialect_identification/config.ini @@ -0,0 +1,8 @@ +[word_based] +fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv +fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv + +[sentence_based] +dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature +fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb +dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav \ No newline at end of file diff --git a/dialect_identification/data_io.py b/dialect_identification/data_io.py new file mode 100644 index 0000000..bf36977 --- /dev/null +++ b/dialect_identification/data_io.py @@ -0,0 +1,74 @@ +# +# 2017/09/25 +# select samples from the combined.csv for the further analysis +# +# HISTORY +# 2017/10/02 modularized. 
+# +# Aki Kunikoshi +# 428968@gmail.com +# +import numpy as np + +def readFile(filename): + with open(filename, 'r') as fin: + lines = fin.read() + linesEach = lines.split('\n') + return linesEach + + +def selectSamplesFromCombinedData(word, fileCombined): + # load combined data + fin = open(fileCombined, 'r') + line = fin.readline() + + # load data per region + dataGroningen = [] + dataLimburg = [] + dataOverijsel = [] + while line: + line = fin.readline() + line = line.rstrip() + lineList = line.split(',') + if len(lineList) == 6 and lineList[5] == word: + region = lineList[2] + if region == 'Groningen_and_Drenthe': + dataGroningen.append(lineList) + elif region == 'Limburg': + dataLimburg.append(lineList) + elif region == 'Oost_Overijsel-Gelderland': + dataOverijsel.append(lineList) + fin.close() + return (dataGroningen, dataLimburg, dataOverijsel) + #print("{0}: {1} {2} {3}".format(word,len(listGroningen),len(listLimburg),len(listOverijsel)) + + +def groupSamplesInCSV(fileCSV, idxRegion): + fin = open(fileCSV, 'r') + + # first line is the header + line = fin.readline() + line = line.rstrip() + header = line.split(',') + + # load data per region + dataGroningen = [] + dataLimburg = [] + dataOverijsel = [] + while line: + line = fin.readline() + line = line.rstrip() + lineList = line.split(',') + if len(lineList) == len(header): + region = lineList[idxRegion] + if region == 'Groningen_and_Drenthe': + dataGroningen.append(lineList) + elif region == 'Limburg': + dataLimburg.append(lineList) + elif region == 'Oost_Overijsel-Gelderland': + dataOverijsel.append(lineList) + fin.close() + return (header, dataGroningen, dataLimburg, dataOverijsel) + +def addUserID(featureFile, recordingsCSV): + dirFeature = config['sentence_based']['dirFeature'] diff --git a/dialect_identification/data_manipulation.py b/dialect_identification/data_manipulation.py new file mode 100644 index 0000000..c7dbb80 --- /dev/null +++ b/dialect_identification/data_manipulation.py @@ -0,0 +1,41 @@ +import numpy as np +from sklearn import manifold +import Levenshtein + +# x: ndarray (dnum x dim) +# n: number of samples to extract +# OUTPUT +# index: index of the chosen samples +# +def extractRandomSample(x, n): + xRowMax = x.shape[0] + indexOriginal = np.arange(xRowMax) + indexChosen = np.random.choice(indexOriginal, n, False) + xChosen = x[indexChosen, :] + return (xChosen, indexChosen) + +# x: 1d string ndarray +def makeLevenshteinMatrix(x): + xRowMax = x.shape[0] + xLevenshtein = np.ones((xRowMax, xRowMax), dtype='int') + + for xRow in range(0, xRowMax): + for xCol in range(0, xRowMax): + dist = Levenshtein.distance(x[xRow], x[xCol]); + xLevenshtein[xRow, xCol] = dist + return xLevenshtein + +# x: 1d string ndarray +def calcLevenshteinArray(word, x): + xRowMax = x.shape[0] + xLevenshtein = np.zeros(x.shape, dtype='int') + + for xRow in range(0, xRowMax): + dist = Levenshtein.distance(word, x[xRow]); + xLevenshtein[xRow] = dist + return xLevenshtein + +def MDS(x): + mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6) + xmds = mds.fit_transform(x) + return xmds \ No newline at end of file diff --git a/dialect_identification/dialect_identification.pyproj b/dialect_identification/dialect_identification.pyproj new file mode 100644 index 0000000..ae47cc7 --- /dev/null +++ b/dialect_identification/dialect_identification.pyproj @@ -0,0 +1,70 @@ + + + Debug + 2.0 + fe1b1358-adbe-4446-affd-a0802d13d15b + {a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52} + . 
+ output_confusion_matrix.py + + + . + . + dialect_identification + dialect_identification + + + true + false + + + true + false + + + + Code + + + Code + + + + Code + + + Code + + + Code + + + Code + + + Code + + + Code + + + Code + + + Code + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dialect_identification/evaluation.py b/dialect_identification/evaluation.py new file mode 100644 index 0000000..8ca04b3 --- /dev/null +++ b/dialect_identification/evaluation.py @@ -0,0 +1,40 @@ +import numpy as np +import scipy as sp +import scipy.stats +from sklearn.model_selection import KFold +from sklearn.metrics import f1_score +from sklearn.metrics import confusion_matrix + + +# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data +def mean_confidence_interval(data, confidence): + a = 1.0*np.array(data) + n = len(a) + m, se = np.mean(a), scipy.stats.sem(a) + h = se * sp.stats.t._ppf((1+confidence)/2., n-1) + return m, m-h, m+h + +# accumulated confusion matrix is added to cross_val_score +def cross_val_confusion_matrix(model, X, y, cv): + kf = KFold(n_splits=cv) + classLabels = np.unique(y) + classNumMax = classLabels.shape[0] + confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax)) + scores = [] + for idx_train, idx_test in kf.split(X): + # split into train/test + x_train = X[idx_train, :] + x_test = X[idx_test, :] + y_train = y[idx_train] + y_test = y[idx_test] + modelfit = model.fit(x_train, y_train) + + # evaluation + y_pred = modelfit.predict(x_test) + + score = f1_score(y_test, y_pred, average='micro') + scores.append(score) + confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(y_test, y_pred, + labels=classLabels) + scores = np.array(scores) + return scores, confusionMatrixAccumulated \ No newline at end of file diff --git a/dialect_identification/manipulate_db.py b/dialect_identification/manipulate_db.py new file mode 100644 index 0000000..54877c1 --- /dev/null +++ b/dialect_identification/manipulate_db.py @@ -0,0 +1,48 @@ +import sys +import os +import pandas +import datetime +sys.path.append('..') + +# these lines are not necessary once forced-alignment is intalled as a package. +forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment' +sys.path.append(forced_alignment_module) +from forced_alignment import pronunciations +from forced_alignment.htk_dict import variances_table + + +#pronunciations.delete_word('kunikoshi') +#pronunciations.delete_all_g2p_entries() + + +#existing_pronunciations = set(pronunciations.get_all()) +## only focus on word + + +## missing pronunciations +## (1) pronunciation is written in IPA. +## (2) pronunciation variants are made based on (1). +## (3) they are converted into HTK format. +#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt' + +#with open(missing_pronunciations_file) as fin: +# lines = fin.read() +# lines = lines.split('\n') + +#source = 'generated using ipa transcription by Marita Everhardt.' 
+#inserts = [] +#for line in lines: +# line = line.split('\t') +# word = line[0].strip().lower() +# pronounciation = line[1].strip().split() + +# # surely not in the table +# #if (word, pronounciation) not in existing_pronunciations: +# inserts.append("('{}', '{}', '{}', '{}', 0)".format( +# word, +# ' '.join(pronounciation), +# source, +# datetime.datetime.now(), )) + +#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n {};""".format( +# ',\n '.join(inserts) \ No newline at end of file diff --git a/dialect_identification/output_confusion_matrix.py b/dialect_identification/output_confusion_matrix.py new file mode 100644 index 0000000..ae92fb2 --- /dev/null +++ b/dialect_identification/output_confusion_matrix.py @@ -0,0 +1,79 @@ +import os +import sys + +import itertools +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.metrics import accuracy_score +from sklearn.metrics import confusion_matrix + + +currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification' +sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir)) + +regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'] +regionLabels2 = ['Groningen_and_Drenthe', 'Limburg'] +dirOut = currDir + '\\result\\same-utterance_with_cities' + + +def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. + Note: + this code is downloaded from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html + """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + _fontsize = 24 + plt.imshow(cm, interpolation='nearest', cmap=cmap) + #plt.title(title, fontsize=_fontsize+2) + #plt.colorbar() + tick_marks = np.arange(len(classes)) + #plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2) + plt.xticks(tick_marks, classes, fontsize=_fontsize-4) + plt.yticks(tick_marks, classes, fontsize=_fontsize-4) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. 
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black", + fontsize=_fontsize) + + plt.tight_layout() + plt.subplots_adjust(bottom=0.2) + plt.ylabel('True label', fontsize=_fontsize-4) + plt.xlabel('Predicted label', fontsize=_fontsize-4) + + +pred = np.load(dirOut + '\\pred_per_pid_3regions.npy') + +#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None) +#print('accuracy: {}%'.format(accuracy * 100)) + +# confusion matrix +cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels) +# human perception (2 regions) +#cm = np.array([[39, 57], [6, 104]]) +# human perception (3 regions) +#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]]) +print(cm) + +np.set_printoptions(precision=2) + +plt.figure() +plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True) +#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True) + +#plt.show() +plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png') \ No newline at end of file diff --git a/dialect_identification/sentence_based.py b/dialect_identification/sentence_based.py new file mode 100644 index 0000000..390966d --- /dev/null +++ b/dialect_identification/sentence_based.py @@ -0,0 +1,197 @@ +import os +import sys +import configparser + +import numpy as np +import pandas as pd +from matplotlib import pyplot +from sklearn.model_selection import train_test_split +from sklearn.model_selection import cross_val_score +from sklearn import preprocessing +from collections import Counter + +# database +import pypyodbc + +# classifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis +from sklearn.metrics import f1_score +from sklearn.metrics import confusion_matrix +import pickle + +currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification' +sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir)) +from dataIO import readFile +from dataIO import groupSamplesInCSV +import dataManipulation +import utility as util + + +configFile = currDir + '\\config.ini' +# load init file +config = configparser.ConfigParser() +config.sections() +config.read(configFile) +dirFeature = config['sentence_based']['dirFeature'] + +sentenceNumMax = 10 +classifierList = [] +LE_X_decode = [] +LE_y = preprocessing.LabelEncoder() +LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"]) + +testset_X = [] +testset_y = [] +testset_userID = [] +result_y_test = [] +result_y_prediction = [] +fout = open("comparison.csv", "w") +for sentenceNum in range(1, sentenceNumMax+1): + #if sentenceNum != 10: + # sentenceNumStr = '0' + str(sentenceNum) + #else: + # sentenceNumStr = str(sentenceNumStr) + sentenceNumStr = format(sentenceNum, '02') + fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv' + + + ## load combined data + fileCSV = fileSentence + idxRegion = 1 + header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion) + sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel))) + + + ## make balanced dataset + dataG, indexG = 
dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax) + dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax) + dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax) + + XIndex = np.arange(idxRegion+1, len(header)) + yIndex = 1 # region + userIDindex = 0 # userID + + + ## cathegorical values into numbers + X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]] + y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]] + userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]] + + #X = np.zeros((X_.shape), 'int') + for Xindex in XIndex: + x = X_[:, Xindex-2] + + ## levenshtein distance + #word_count = Counter(x) + #frequent_word = max(word_count) + #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x) + + # hot encoding + le_x = preprocessing.LabelBinarizer() + le_x.fit(np.unique(x)) + x_ = le_x.transform(x) + LE_X_decode.append(x_.shape[1]) + if Xindex == idxRegion+1: + X = x_ + else: + X = np.c_[X, x_] + + y = LE_y.transform(y_) + + + ## split into train vs test set + #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0) + + # each regional data should be splited equally + lenG = dataG.shape[0] + lenL = dataL.shape[0] + lenO = dataO.shape[0] + indexG = np.arange(0, lenG) + indexL = np.arange(lenG, lenG+lenL) + indexO = np.arange(lenG+lenL, lenG+lenL+lenO) + [XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0) + [XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0) + [XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0) + X_train = np.r_[XG_train, XL_train, XO_train] + X_test = np.r_[XG_test, XL_test, XO_test] + y_train = np.r_[yG_train, yL_train, yO_train] + y_test = np.r_[yG_test, yL_test, yO_test] + + + ## comparison + ## classifiers + #names = ["Nearest Neighbors", + # "Linear SVM", + # "Poly SVM", + # "RBF SVM", + # "Decision Tree", + # "Random Forest 2", + # "Random Forest 3", + # "Random Forest 4", + # "AdaBoost", + # #"Naive Bayes", + # "Linear Discriminant Analysis", + # #"Quadratic Discriminant Analysis" + # ] + #classifiers = [ + # KNeighborsClassifier(3), + # SVC(kernel="linear", C=0.025), + # SVC(kernel="poly", C=0.025), + # SVC(gamma=2, C=1), + # DecisionTreeClassifier(max_depth=4), + # RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1), + # RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1), + # RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1), + # AdaBoostClassifier(), + # #GaussianNB(), + # LinearDiscriminantAnalysis(), + # #QuadraticDiscriminantAnalysis() + # ] + #for name, model in zip(names, classifiers): + # scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro') + # fout = open("comparison.csv", "a") + # fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean())) + # print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean())) + + # quasi-optimal model + model = AdaBoostClassifier() + # cross validation + scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro') + ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95) + modelfit = model.fit(X_train, y_train) + # f1 on test data + y_prediction = modelfit.predict(X_test) + f1score = f1_score(y_test, y_prediction, 
average='micro') + fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score)) + + ## save for the test + testset_X.append(X_test) + testset_y.append(y_test) + testset_userID.append(userID_) + result_y_test = result_y_test + list(y_test) + result_y_prediction = result_y_prediction + list(y_prediction) + fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl' + pickle.dump(modelfit, open(fileClassifier, 'wb')) +fout.close() + +### confusion matrix +result_y_test_label = LE_y.inverse_transform(result_y_test) +result_y_prediction_label = LE_y.inverse_transform(result_y_prediction) +confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[ + 'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']) +print(confusionMatrix) + + +### make userID list +#userID = testset_userID[0] +#for sentenceNum in range(1, sentenceNumMax): +# userid = testset_userID[sentenceNum] +# userID = np.r_[userID, userid] +#userIDlist = np.unique(userID) + diff --git a/dialect_identification/speaker_based.py b/dialect_identification/speaker_based.py new file mode 100644 index 0000000..c7d1536 --- /dev/null +++ b/dialect_identification/speaker_based.py @@ -0,0 +1,326 @@ +import os +import sys +import configparser + +import pypyodbc +import numpy as np +from collections import Counter +import matplotlib.pyplot as plt + +from sklearn.model_selection import train_test_split +from sklearn.model_selection import cross_val_score +from sklearn import preprocessing +from sklearn.metrics import confusion_matrix +from sklearn.metrics import accuracy_score + +currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification' +sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir)) +import dataManipulation as mani +import evaluation as eval +import speaker_based_functions as sb_func + + +##################### +## USER DEFINE ## +##################### +sentenceNumMax = 10 +configFile = currDir + '\\config.ini' +dirOut = currDir + '\\result' + +# make train/test set: 1, load: 0 +makeTrainTestSet = 0 +# convert 3 regions to 2 regions: 1, load: 0 +conv3to2region = 0 + +# 3 regions: 0 +# saxon vs limburg: 1 +# groningen vs limburg: 2 +experiment_type = 2 + +regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + +# a bit useless error handling. +#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2." +if experiment_type == 1: + regionLabels2 = ['Low_Saxon', 'Limburg'] +regionLabels2 = ['Groningen_and_Drenthe', 'Limburg'] + + +########################## +## DATA PREPARATION ## +########################## + +## load init file +config = configparser.ConfigParser() +config.sections() +config.read(configFile) +dirFeature = config['sentence_based']['dirFeature'] +fileMDB = config['sentence_based']['fileMDB'] + + +## database connection +pypyodbc.lowercase = False +param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";" +conn = pypyodbc.connect(param) +cursor = conn.cursor() + + +## get data from Access database +# data format +# 0: filename +# 1: pid +# 2: region +# 3: ID (unique word_id) +# 4: sentence_id +# 5: word_id +# 6: word +# 7: pronunciation +SQL_string = """\ +{CALL dataset_with_cities} +""" +cursor.execute(SQL_string) + +rows = cursor.fetchall() +data = np.array(rows) +#dataNumMax = data.shape[0] +#uniqueWordIDmax = max(data[:, 3].astype(int)) +del SQL_string, rows + + +## make list of LabelBinarizer object per word. 
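+# (one LabelBinarizer per unique word, fit only on the pronunciation variants that
+#  actually occur for that word, so a word with e.g. three observed variants is
+#  later one-hot encoded as a three-column block)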
+# for X +# get pronvarList from Access database +# pronvarList format +# 0: ID (unique word_id) +# 1: word +# 2: pronvar +SQL_string = """\ +{CALL pronunciation_variant} +""" +cursor.execute(SQL_string) +rows = cursor.fetchall() +pronvarList = np.array(rows) +del SQL_string, rows + + +LBlist = [] +#uniqueWordIDlist = pronvarList[:, 0].astype(int) +uniqueWordIDlist = data[:, 3].astype(int) +uniqueWordIDmax = max(uniqueWordIDlist) +for uniqueWordID in range(1, uniqueWordIDmax+1): + pronvar = data[uniqueWordIDlist == uniqueWordID, 7] + #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2] + LB = preprocessing.LabelBinarizer() + LB.fit(np.unique(pronvar)) + LBlist.append(LB) + +# for y (=region) +LE_y = preprocessing.LabelEncoder() +LE_y.fit(regionLabels) +LE_y2 = preprocessing.LabelEncoder() +LE_y2.fit(regionLabels2) + +LB_y = preprocessing.LabelBinarizer() +LB_y.fit(regionLabels) +LB_y2 = preprocessing.LabelBinarizer() +LB_y2.fit(regionLabels2) + +del uniqueWordID, uniqueWordIDmax, pronvar, LB + + +################# +## ITERATION ## +################# +#CM_majority = np.zeros((1, 9)).astype(int) +#CM_weighted = np.zeros((1, 9)).astype(int) +#for iter in range(0, 1): +# print(iter) + +## make balanced dataset +pidlist = np.unique(data[:, (1, 2)], axis=0) + +# count number of samples +pidlistCounter = Counter(pidlist[:, 1]) +sampleNumMax = min(pidlistCounter.values()) +del pidlistCounter + + +## make train/eval/test set or load +if makeTrainTestSet==1: + pidlist_train = [] + pidlist_eval = [] + pidlist_test = [] + for regionNum in range(0, len(regionLabels)): + regionName = regionLabels[regionNum] + + pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :] + pidlist_per_region, idx = mani.extractRandomSample( + pidlist_per_region_, sampleNumMax) + + # split dataset into train, eval and test. 
+ [pidlist_per_region_train, pidlist_per_region_test] = train_test_split( + pidlist_per_region, test_size = 0.2, random_state = 0) + [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split( + pidlist_per_region_train, test_size = 0.1, random_state = 0) + + # append numpy arrays + if regionNum == 0: + pidlist_train = pidlist_per_region_train + pidlist_eval = pidlist_per_region_eval + pidlist_test = pidlist_per_region_test + else: + pidlist_train = np.r_[pidlist_train, pidlist_per_region_train] + pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval] + pidlist_test = np.r_[pidlist_test, pidlist_per_region_test] + del regionNum, regionName + del pidlist_per_region_, pidlist_per_region, idx + del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test + np.save(dirOut + "\\pidlist_train.npy", pidlist_train) + np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval) + np.save(dirOut + "\\pidlist_test.npy", pidlist_test) +else: + pidlist_train = np.load(dirOut + "\\pidlist_train.npy") + pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy") + pidlist_test = np.load(dirOut + "\\pidlist_test.npy") + + +## make dataset for 2 regions or load +if conv3to2region==1: + pidlist2_train_ = np.r_[pidlist_train, pidlist_eval] + + if experiment_type == 1: + pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_) + pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test) + np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train) + np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test) + + elif experiment_type == 2: + pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_) + pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test) + np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train) + np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test) + + del pidlist2_train_ +else: + if experiment_type == 1: + pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy") + pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy") + + elif experiment_type == 2: + pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy") + pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy") + + +## train/test data +if experiment_type == 0: + # Groningen vs Overijsel vs Limburg + data_train = sb_func.extractPid(pidlist_train, data) + data_eval = sb_func.extractPid(pidlist_eval, data) + data_test = sb_func.extractPid(pidlist_test, data) + +elif experiment_type == 1 or experiment_type == 2: + data2 = np.array(data) + + if experiment_type == 1: + for row, row2 in zip(data, data2): + if row[2] == regionLabels[0] or row[2] == regionLabels[2]: + row2[2] = regionLabels2[0] + + data2_train = sb_func.extractPid(pidlist2_train, data2) + data2_test = sb_func.extractPid(pidlist2_test, data2) + + +##################################### +## EXPERIMENTS START FROM HERE ## +##################################### + +## actual training +# train vs eval +#trainData = data_train +#testData = data_eval +#testPID = pidlist_eval +#LB = LB_y +#LE = LE_y +#regionLabels = regionLabels3 + +# train+eval vs test +if experiment_type == 0: + trainData = np.r_[data_train, data_eval] + testData = data_test + testPID = pidlist_test + LB = LB_y + LE = LE_y +elif experiment_type == 1 or experiment_type == 2: +# 2 region: saxon vs limburg/ groningen vs limburg + trainData = data2_train + testData = data2_test + testPID = pidlist2_test + LB = LB_y2 + LE = LE_y2 + regionLabels = regionLabels2 + + +# 
check the number of utterance +allData = np.r_[trainData, testData] +filenames = np.c_[allData[:, 0], allData[:, 2]] +filenames_unique = np.unique(filenames, axis=0) +Counter(filenames_unique[:, 1]) + + +fileComparison = dirOut + "\\algorithm_comparison.csv" +filePerformance = dirOut + "\\sentence-level.csv" +fileConfusionMatrix = dirOut + "\\confusion_matrix.csv" + +## compare classification algorithms for the sentence-classifiers. +#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison) + +## train sentence-level classifiers. +modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers( + trainData, LBlist, LE, filePerformance) + +## prediction over evaluation data per each sentence-level classifier. +pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE) + +## combine sentence-level classifiers +pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence) + +## majority vote (weighted) +#weight = sb_func.calc_weight(confusionMatrixList) +#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE) + +### confusion matrix +if experiment_type == 0: + confusionMatrix_majority = confusion_matrix( + pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']) +else: + confusionMatrix_majority = confusion_matrix( + pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg']) + + #confusionMatrix_weighted = confusion_matrix( +# pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels) + + +## output +accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None) +print('accuracy: {}%'.format(accuracy * 100)) + +cm = confusionMatrix_majority +print(cm) + +np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority) +np.save(dirOut + "\\confusion_matrix.npy", cm) + +#fout = open(fileConfusionMatrix, "w") +#fout.write('< confusion matrix for majority vote in evaluation set >\n') +#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels) +#fout.write('< confusion matrix for weighted vote in evaluation set >\n') +#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels) +#fout.write('\n') +#fout.close() + + +##### iteration finish ##### +conn.close() +#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',') +#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',') + diff --git a/dialect_identification/speaker_based_functions.py b/dialect_identification/speaker_based_functions.py new file mode 100644 index 0000000..1421376 --- /dev/null +++ b/dialect_identification/speaker_based_functions.py @@ -0,0 +1,383 @@ +import numpy as np +from collections import Counter +import matplotlib.pyplot as plt +import itertools + +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis + +from sklearn.model_selection import cross_val_score +from sklearn.metrics import confusion_matrix + +import dataManipulation as mani +import evaluation as eval + + +# extract data that 
corresponds to pid in the pidlist +def extractPid(pidlist, data): + for pidnum in range(0, len(pidlist)): + pid = pidlist[pidnum, 0] + x = data[data[:, 1] == pid, :] + if pidnum == 0: + data_ = x + else: + data_ = np.r_[data_, x] + return data_ + + +def OneHotEncoding(data, LB_X, LE_y): +# one hot encoding of data using LabelBinalizer per word (LB_X) and for region (LB_y) +# INPUT +# data +# 0: filename +# 1: pid +# 2: region +# 3: ID (unique word_id) +# 4: sentence_id +# 5: word_id +# 6: word +# 7: pronunciation +# LB_x: LabelBinalizer objects +# LE_y: LabelEncoder object +# OUTPUT +# X: encoded variable data +# y: encoded target data + pidlist = data[:, 1] + regionlist = data[:, 2] + uniqueWordIDlist = data[:, 3].astype(int) + pronvarlist = data[:, 7] + + uniqueWordIDlist_unique = np.unique(uniqueWordIDlist) + uniqueWordIDlist_unique.sort() + for uniqueWordIDnum in uniqueWordIDlist_unique: + x_ = pronvarlist[uniqueWordIDlist == uniqueWordIDnum] + lb = LB_X[uniqueWordIDnum-1] + x = lb.transform(x_) + if uniqueWordIDnum == uniqueWordIDlist_unique[0]: + X = x + else: + X = np.c_[X, x] + + # pid and region of the speakers + y_ = regionlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]] + y = LE_y.transform(y_) + + pid = pidlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]] + return X, y, pid + + +def outputConfusionMatrix33(foutName, matrixName, regionLabels): + for r in range(0, len(regionLabels)): + execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format(' + execString2 = 'regionLabels[' + str(r) + ']' + execString3 = '' + for c in range(0, len(regionLabels)): + execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']' + execString4 = '))' + execString = execString1 + execString2 + execString3 + execString4 + exec(execString) + + +def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV): + """ compare the classification algorithms on sentence-level classifiers. + + Args: + data_train: training data. + LBlist: list of label binarizer, which is used to encode pronunciation variants. + LE_y: label encorder, which is used to encode rigion names. + fileCSV: output csv file path. + + """ + fout = open(fileCSV, "w") + + sentenceIDlist_train = data_train[:, 4].astype(int) + sentenceIDmax_train = max(sentenceIDlist_train) + + for sentenceID in range(1, sentenceIDmax_train+1): + sentenceIDstr = format(sentenceID, '02') + + ## categorical values into binary values. 
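+        # (OneHotEncoding, defined above, turns every word position into a one-hot
+        #  block over its pronunciation variants and concatenates the blocks
+        #  column-wise, so X_train holds one binary block per word)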
+ data_sentence = data_train[sentenceIDlist_train == sentenceID, :] + X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y) + regionCounter = Counter(LE_y.inverse_transform(y_train)) + + ## classifier comparison + names = [ + "Nearest Neighbors", + "Linear SVM", + "Poly SVM", + "RBF SVM", + "Decision Tree", + "Random Forest 2", + "Random Forest 3", + "Random Forest 4", + "AdaBoost", + "AdaBoost(SVM)", + "AdaBoost(Random Forest 3)", + "Naive Bayes", + "Linear Discriminant Analysis", + "Quadratic Discriminant Analysis" + ] + classifiers = [ + KNeighborsClassifier(3), + SVC(kernel="linear", C=0.025), + SVC(kernel="poly", C=0.025), + SVC(gamma=2, C=1), + DecisionTreeClassifier(max_depth=4), + RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1), + RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1), + RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1), + AdaBoostClassifier(), + AdaBoostClassifier(SVC(probability=True, kernel='linear')), + AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)), + GaussianNB(), + LinearDiscriminantAnalysis(), + QuadraticDiscriminantAnalysis() + ] + for name, model in zip(names, classifiers): + scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro') + fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var())) + print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean())) + + fout.close() + + +def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV): + """ train sentence-level classifiers. + + Args: + data_train: training data. + LBlist: list of label binarizer, which is used to encode pronunciation variants. + LE_y: label encorder, which is used to encode rigion names. + fileCSV: output csv file path. + + Returns: + modelList (list): list of models (length: sentenceNumMax) + scoreList (list): list of scores (length: sentenceNumMax) + + """ + fout = open(fileCSV, "w") + + fout.write('< cross-validation in training set >\n') + + sentenceIDlist_train = data_train[:, 4].astype(int) + sentenceIDmax_train = max(sentenceIDlist_train) + modelList = [] + scoreList = [] + confusionMatrixList = [] + + for sentenceID in range(1, sentenceIDmax_train+1): + sentenceIDstr = format(sentenceID, '02') + + ## categorical values into binary values. 
+ data_sentence = data_train[sentenceIDlist_train == sentenceID, :] + X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y) + regionCounter = Counter(LE_y.inverse_transform(y_train)) + + ## cross-validation with the best classifier + model = AdaBoostClassifier() + #model = SVC(kernel="linear", C=0.025) + #model = LinearDiscriminantAnalysis() + +# #scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro') + scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10) + ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95) + scoreList.append(scores) + confusionMatrixList.append(confusionMatrix) + + ## model fitting + modelfit = model.fit(X_train, y_train) + modelList.append(modelfit) + + ## output + fout.write("{},".format(sentenceID)) + #fout.write("{0},{1},{2},".format( + # regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland'])) + #fout.write("{0},{1},".format( + # regionCounter['Low_Saxon'], regionCounter['Limburg'])) + fout.write("{0},{1},".format( + regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'])) + + fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high)) + fout.write('\n') + fout.close() + + return modelList, scoreList, confusionMatrixList + + +def prediction_per_sentence(data_eval, modelList, LBlist, LE_y): + """ prediction using sentence-level classifiers. + + Args: + data_eval: evaluation data. + modelList: list of the models. + LBlist: list of label binarizer, which is used to encode pronunciation variants. + LE_y: label encorder, which is used to encode rigion names. + + Returns: + prediction (list): [sentenceID, pid, answer, prediction] + + """ + sentenceIDlist_eval = data_eval[:, 4].astype(int) + sentenceIDmax_eval = max(sentenceIDlist_eval) + for sentenceID in range(1, sentenceIDmax_eval+1): + sentenceIDstr = format(sentenceID, '02') + + ## categorical values into binary values. + data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :] + X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y) + regionCounter = Counter(LE_y.inverse_transform(y_eval)) + + ## evaluate model + modelfit = modelList[sentenceID-1] + y_pred = modelfit.predict(X_eval) + y_pred_label = LE_y.inverse_transform(y_pred) + y_eval_label = LE_y.inverse_transform(y_eval) + + # pid, y, y_pred + sentenceIDvec = np.ones((y_eval_label.shape[0], 1)).astype(int) * sentenceID + prediction_ = np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label] + if sentenceID == 1: + prediction = prediction_ + else: + prediction = np.r_[prediction, prediction_] + + return prediction + + +def prediction_per_pid_majority(pidlist_eval, prediction): + """ make a prediction per pid using majority vote + + Returns: + prediction_per_pid (ndarray): [pid, ans, prediction] + + """ + prediction_per_pid = [] + for pid_ in range(0, len(pidlist_eval[:, 0])): + pid = pidlist_eval[pid_, 0] + ans = pidlist_eval[pid_, 1] + prediction_ = prediction[prediction[:, 1] == pid, :] + + # majority vote + predCounter = Counter(prediction_[:, -1]) + predMostCommon = predCounter.most_common(1) + predLabel = predMostCommon[0][0] + predRatio = predMostCommon[0][1] / prediction_.shape[0] * 100 + + prediction_per_pid.append([pid, ans, predLabel]) + + return np.array(prediction_per_pid) + + +def calc_weight(confusionMatrixList): + """ calculate weight (how trustworthy the prediction is) for majority vote. 
+ + Note: + Of all subjects we predicted are GO/OG/LB, what fraction of them actually are (precision) is used as weight. + + Args: + confusionMarixList: list of confusion matrix of sentence-level classifiers. + + """ + sentenceID_max = len(confusionMatrixList) + weight = np.zeros((sentenceID_max, confusionMatrixList[0].shape[0])) + for sentenceID in range(1, sentenceID_max+1): + cm = confusionMatrixList[sentenceID-1] + + # normalized confusion matrix + #rTotal = np.sum(cm, axis=1) + #cm_normalized = cm / rTotal + #weight[sentenceID-1, :] = np.diag(cm_normalized) + + true_positives = np.diag(cm) + predicted = np.sum(cm, axis=0) + weight[sentenceID-1, :] = true_positives / predicted + + return weight + + +def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y): + """ make a prediction per pid using weighted (majority) vote. + + Args: + weight (ndarray): how trustworthy the prediction of each sentence-based classifier is. + LB_y: label binalizer, which is used to encode region names. + LE_y: label encorder, which is used to encode region names. + Returns: + prediction_per_pid (ndarray): [pid, ans, prediction] + + """ + + prediction_per_pid = [] + for pid_ in range(0, len(pidlist_eval[:, 0])): + pid = pidlist_eval[pid_, 0] + ans = pidlist_eval[pid_, 1] + prediction_ = prediction[prediction[:, 1] == pid, :] + + # calculate weighted (majority) vote + vote_weighted = np.zeros((1, 3)) + for sentenceID_ in range(0, prediction_.shape[0]): + sentenceID = prediction_[sentenceID_, 0].astype(int) + w = weight[sentenceID-1, :] + pred = prediction_[sentenceID_, 3] + pred_int = LB_y.transform([pred]) + vote_weighted = vote_weighted + w * pred_int + + # choose the most vote + vote_weighted = vote_weighted[0] + maxindex = list(vote_weighted).index(max(vote_weighted)) + #predLabel = regionLabels[maxindex] + predLabel = LE_y.inverse_transform(maxindex) + prediction_per_pid.append([pid, ans, predLabel]) + + return np.array(prediction_per_pid) + + +def saxon_vs_limburg(pidlist3): + """convert a pidlist for 3 regions into that for 2 regions. + + Notes: + 3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + 2 regions include ['Limburg', 'Low_Saxon'] + where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland' + samples are randomly chosen so that each class has the same amount of data. + + """ + + regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + regionLabels2 = ['Low_Saxon', 'Limburg'] + + index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0) + pidlist_saxon_ = pidlist3[index_saxon, :] + pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :] + + # extract the same amout of samples as Limburg. + pidlistCounter3 = Counter(pidlist3[:, 1]) + pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg']) + pidlist_saxon[:, 1] = regionLabels2[0] + + pidlist2 = np.r_[pidlist_limburg, pidlist_saxon] + #pidlistCounter2 = Counter(pidlist2[:, 1]) + return pidlist2 + + +def groningen_vs_limburg(pidlist3): + """convert a pidlist for 3 regions into that for 2 regions. 
+ + Notes: + 3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + 2 regions include ['Groningen_and_Drenthe', 'Limburg'] + + """ + regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'] + + pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :] + pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :] + + pidlist2 = np.r_[pidlist_groningen, pidlist_limburg] + return pidlist2 \ No newline at end of file diff --git a/dialect_identification/test_code.py b/dialect_identification/test_code.py new file mode 100644 index 0000000..47a905a --- /dev/null +++ b/dialect_identification/test_code.py @@ -0,0 +1,44 @@ + +import Levenshtein +import numpy as np + +a = 'hello' +b = 'haall' + +# approximate +infinite = 100 + +# make distance matrix D +len_a = len(a) +len_b = len(b) +D_ = np.zeros((len_a, len_b)).astype(int) +for ia in range(0, len_a): + a_ = a[ia] + for ib in range(0, len_b): + b_ = b[ib] + if a_ == b_: + D_[ia, ib] = 1 + +D = np.zeros((len_a+1, len_b+1)).astype(int) +D[1:len_a+1, 1:len_b+1] = D_ +D[0, :] = infinite +D[:, 0] = infinite +D[0, 0] = 0 + +# calculate accumulated distance +indexPath = [] +for ia in range(0, len_a): + for ib in range(0, len_b): + a_ = a[ia] + b_ = b[ib] + option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib]) + Dmin = np.min(option) + D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin + index = list(option).index(Dmin) + indexPath[ia, ib] = index + +# back trace +ia = len_a +ib = len_b +#while (ia > 0 or ib > 0): +# tb diff --git a/dialect_identification/word_based.py b/dialect_identification/word_based.py new file mode 100644 index 0000000..1b043a2 --- /dev/null +++ b/dialect_identification/word_based.py @@ -0,0 +1,56 @@ +import os +import sys +import configparser + +import numpy as np +from matplotlib import pyplot + +currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification' +sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir)) +from dataIO import readFile +from dataIO import selectSamplesFromCombinedData +import dataManipulation + + +configFile = currDir + '\\config.ini' + +config = configparser.ConfigParser() +config.sections() +config.read(configFile) +fileWordList = config['word_based']['fileWordList'] +fileCombined = config['word_based']['fileCombined'] + +wordList = readFile(fileWordList) + +for wordNum in range(1, len(wordList)): + word = wordList[wordNum-1] # target word + #print("=== {} ===".format(word)) + + dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined) + + sampleNumMax = 50 + dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax) + dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax) + dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax) + + # combine pronunciation from three regions + # data: (sampleNumMax x 3) x 1 + cPronunciation = 4 + data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]]) + + # MDS + dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data) + dataMDS = dataManipulation.MDS(dataLevenshtein) + + # plot + pyplot.scatter(dataMDS[0:sampleNumMax-1, 0], dataMDS[0:sampleNumMax-1, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe") + pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2-1, 0], dataMDS[sampleNumMax:sampleNumMax*2-1, 1], c='green', marker="^", facecolors='none', 
label="Limburg") + pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3-1, 0], dataMDS[sampleNumMax*2:sampleNumMax*3-1, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland") + + pyplot.title(word) + #ax.set_xlabel('x') + #ax.set_ylabel('y') + pyplot.legend(loc='upper right') + #pyplot.show() + pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png') + pyplot.gcf().clear() \ No newline at end of file
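The word_based.py script above embeds pronunciation variants by computing pairwise Levenshtein distances and projecting them to 2-D with MDS (via data_manipulation.py). A minimal, self-contained sketch of that pipeline, using made-up variant strings instead of the combined.csv data, could look like this:

import numpy as np
import Levenshtein                     # pip install python-Levenshtein
from sklearn import manifold
from matplotlib import pyplot

# toy pronunciation variants standing in for one word's entries in combined.csv
variants = np.array(['hus', 'huus', 'hoes', 'huis', 'hys'])

# pairwise Levenshtein distance matrix, mirroring makeLevenshteinMatrix()
n = variants.shape[0]
distances = np.zeros((n, n), dtype='int')
for i in range(n):
    for j in range(n):
        distances[i, j] = Levenshtein.distance(variants[i], variants[j])

# 2-D embedding of the precomputed distances, mirroring MDS()
mds = manifold.MDS(n_components=2, dissimilarity='precomputed', random_state=6)
coords = mds.fit_transform(distances)

# scatter plot with one label per variant
pyplot.scatter(coords[:, 0], coords[:, 1])
for label, (x, y) in zip(variants, coords):
    pyplot.annotate(label, (x, y))
pyplot.show()

Because the dissimilarities are precomputed, only the relative distances between points are meaningful; the MDS axes themselves carry no unit.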