commit to be sure.
commit a1379caced
38 dialect_identification.sln Normal file
@@ -0,0 +1,38 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
    ProjectSection(SolutionItems) = preProject
        ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
        ..\..\forced-alignment\forced_alignment\defaultfiles.py = ..\..\forced-alignment\forced_alignment\defaultfiles.py
        ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj
        ..\..\forced-alignment\forced_alignment\htk_dict.py = ..\..\forced-alignment\forced_alignment\htk_dict.py
        ..\..\forced-alignment\forced_alignment\lexicon.py = ..\..\forced-alignment\forced_alignment\lexicon.py
        ..\..\forced-alignment\forced_alignment\mlf.py = ..\..\forced-alignment\forced_alignment\mlf.py
        ..\..\forced-alignment\forced_alignment\pronunciations.py = ..\..\forced-alignment\forced_alignment\pronunciations.py
        ..\..\forced-alignment\forced_alignment\pyhtk.py = ..\..\forced-alignment\forced_alignment\pyhtk.py
        ..\..\forced-alignment\forced_alignment\scripts.py = ..\..\forced-alignment\forced_alignment\scripts.py
        ..\..\forced-alignment\forced_alignment\tempfilename.py = ..\..\forced-alignment\forced_alignment\tempfilename.py
        ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
    EndProjectSection
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|Any CPU = Debug|Any CPU
        Release|Any CPU = Release|Any CPU
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B}
    EndGlobalSection
EndGlobal
90 dialect_identification/audio2db.py Normal file
@@ -0,0 +1,90 @@
import os
import sys
import configparser

import numpy as np
import pypyodbc


## user define
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance'
wav_dir = dir_same_utterance + '\\wav_with_cities'
script_dir = dir_same_utterance + '\\script'
fileMDB = dir_same_utterance + '\\feature\\DialectClassification.accdb'
table = 'ForcedAlignmentResult'
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# these lines are not necessary once forced-alignment is installed as a package.
sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment


## check if forced-alignment works on each sentence
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()

#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
#    script = fin.readline()
#fa = forced_alignment(wav_file, script)


## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) '


## forced-alignment to all the wav files in dir_same_utterance
word_id_start = 1
for sentenceID in range(1, 11):
    sentenceIDstr = format(sentenceID, '02')

    # get script
    script_file = script_dir + '\\script' + sentenceIDstr + '.txt'
    with open(script_file, 'r') as fin:
        script = fin.readline()

    # loop over three regions
    for region in regionLabels:

        # loop over the wav_subdir
        wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region
        wav_files = os.listdir(wav_subdir)
        file_nr = 0
        for wav_file in wav_files:
            file_nr += 1
            filename = wav_file.replace('.wav', '')
            wav_file_fullpath = wav_subdir + '\\' + wav_file

            # forced-alignment
            print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files)))
            fa = forced_alignment(wav_file_fullpath, script)

            # send pronunciation variant to database
            word_id = word_id_start
            for row in fa:
                word = row[0]
                phonemes = np.array(row[1])

                ## get pronunciation variant
                pronvar_ = phonemes[:, 2]
                pronvar_[np.where(pronvar_=='ssil')] = ''  # remove 'ssil'
                pronvar = ''.join(pronvar_)

                ## insert the result into the database.
                SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')'
                SQLstring = SQLstring1 + SQLstring2
                cursor.execute(SQLstring)
                conn.commit()

                word_id = word_id + 1

    word_id_start += script.count(' ') + 1

conn.close()
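Note on the INSERT above: the SQL is assembled by string concatenation, so a filename or pronunciation containing a quote character would break the statement. A minimal parameterized sketch, reusing the conn, cursor, table and loop variables defined above, could look like this:

    # hypothetical alternative to the concatenated SQLstring above (DB API qmark parameters)
    SQLstring = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) VALUES (?, ?, ?, ?)'
    cursor.execute(SQLstring, (filename, region, str(word_id), pronvar))
    conn.commit()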
290 dialect_identification/classifier.py Normal file
@@ -0,0 +1,290 @@
'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
   1. Download a dataset (using pandas)
   2. Process the numeric data (using numpy)
   3. Train and evaluate learners (using scikit-learn)
   4. Plot and compare results (using matplotlib)


The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://mlr.cs.umass.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass

# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers row from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame


# =====================================================================


def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately.

    # If values are missing we could impute them from the training data
    #from sklearn.preprocessing import Imputer
    #imputer = Imputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test


# =====================================================================


def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall

# =====================================================================


def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves

    fig = plt.figure(figsize=(6, 6))
    fig.canvas.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, fmt='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()


# =====================================================================


if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
8 dialect_identification/config.ini Normal file
@@ -0,0 +1,8 @@
[word_based]
fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv
fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv

[sentence_based]
dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature
fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb
dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav
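For reference, a minimal sketch of how these settings are read, mirroring the configparser calls in sentence_based.py and speaker_based.py below:

    import configparser

    config = configparser.ConfigParser()
    config.read('config.ini')
    dirFeature = config['sentence_based']['dirFeature']
    fileMDB = config['sentence_based']['fileMDB']
    print(dirFeature, fileMDB)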
74 dialect_identification/data_io.py Normal file
@@ -0,0 +1,74 @@
#
# 2017/09/25
# select samples from the combined.csv for the further analysis
#
# HISTORY
# 2017/10/02 modularized.
#
# Aki Kunikoshi
# 428968@gmail.com
#
import numpy as np

def readFile(filename):
    with open(filename, 'r') as fin:
        lines = fin.read()
        linesEach = lines.split('\n')
    return linesEach


def selectSamplesFromCombinedData(word, fileCombined):
    # load combined data
    fin = open(fileCombined, 'r')
    line = fin.readline()

    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == 6 and lineList[5] == word:
            region = lineList[2]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (dataGroningen, dataLimburg, dataOverijsel)
    #print("{0}: {1} {2} {3}".format(word,len(listGroningen),len(listLimburg),len(listOverijsel))


def groupSamplesInCSV(fileCSV, idxRegion):
    fin = open(fileCSV, 'r')

    # first line is the header
    line = fin.readline()
    line = line.rstrip()
    header = line.split(',')

    # load data per region
    dataGroningen = []
    dataLimburg = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == len(header):
            region = lineList[idxRegion]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (header, dataGroningen, dataLimburg, dataOverijsel)

def addUserID(featureFile, recordingsCSV):
    dirFeature = config['sentence_based']['dirFeature']
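addUserID ends here unfinished and refers to a config object that this module never defines; the scripts below create their own. A minimal usage sketch for groupSamplesInCSV (the file name '01.csv' and the import path are assumptions; sentence_based.py imports this module under the name dataIO):

    from data_io import groupSamplesInCSV

    # column 1 of the per-sentence CSV holds the region label, as in sentence_based.py
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV('01.csv', 1)
    print(len(header), len(dataGroningen), len(dataLimburg), len(dataOverijsel))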
41 dialect_identification/data_manipulation.py Normal file
@@ -0,0 +1,41 @@
import numpy as np
from sklearn import manifold
import Levenshtein

# x: ndarray (dnum x dim)
# n: number of samples to extract
# OUTPUT
# index: index of the chosen samples
#
def extractRandomSample(x, n):
    xRowMax = x.shape[0]
    indexOriginal = np.arange(xRowMax)
    indexChosen = np.random.choice(indexOriginal, n, False)
    xChosen = x[indexChosen, :]
    return (xChosen, indexChosen)

# x: 1d string ndarray
def makeLevenshteinMatrix(x):
    xRowMax = x.shape[0]
    xLevenshtein = np.ones((xRowMax, xRowMax), dtype='int')

    for xRow in range(0, xRowMax):
        for xCol in range(0, xRowMax):
            dist = Levenshtein.distance(x[xRow], x[xCol])
            xLevenshtein[xRow, xCol] = dist
    return xLevenshtein

# x: 1d string ndarray
def calcLevenshteinArray(word, x):
    xRowMax = x.shape[0]
    xLevenshtein = np.zeros(x.shape, dtype='int')

    for xRow in range(0, xRowMax):
        dist = Levenshtein.distance(word, x[xRow])
        xLevenshtein[xRow] = dist
    return xLevenshtein

def MDS(x):
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    xmds = mds.fit_transform(x)
    return xmds
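A short usage sketch pairing the helpers above, computing pairwise edit distances between pronunciation variants and projecting them to 2-D (the variant strings and the import path are made up for illustration):

    import numpy as np
    from data_manipulation import makeLevenshteinMatrix, MDS

    pronunciations = np.array(['sxon', 'schon', 'sjoon', 'sxoon'])  # hypothetical variants
    dist = makeLevenshteinMatrix(pronunciations)  # pairwise Levenshtein distances
    coords = MDS(dist)                            # 2-D embedding of the distance matrix
    print(coords.shape)                           # (4, 2)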
70 dialect_identification/dialect_identification.pyproj Normal file
@@ -0,0 +1,70 @@
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
    <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
    <ProjectHome>.</ProjectHome>
    <StartupFile>output_confusion_matrix.py</StartupFile>
    <SearchPath>
    </SearchPath>
    <WorkingDirectory>.</WorkingDirectory>
    <OutputPath>.</OutputPath>
    <Name>dialect_identification</Name>
    <RootNamespace>dialect_identification</RootNamespace>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="manipulate_db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="audio2db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="classifier.py" />
    <Compile Include="dataManipulation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="output_confusion_matrix.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="sentence_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based_functions.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="test_code.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="evaluation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="word_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="dataIO.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
       Visual Studio and specify your pre- and post-build commands in
       the BeforeBuild and AfterBuild targets below. -->
  <!--<Target Name="CoreCompile" />-->
  <Target Name="BeforeBuild">
  </Target>
  <Target Name="AfterBuild">
  </Target>
</Project>
40 dialect_identification/evaluation.py Normal file
@@ -0,0 +1,40 @@
import numpy as np
import scipy as sp
import scipy.stats
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, confidence):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h

# accumulated confusion matrix is added on top of what cross_val_score provides
def cross_val_confusion_matrix(model, X, y, cv):
    kf = KFold(n_splits=cv)
    classLabels = np.unique(y)
    classNumMax = classLabels.shape[0]
    confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax))
    scores = []
    for idx_train, idx_test in kf.split(X):
        # split into train/test
        x_train = X[idx_train, :]
        x_test = X[idx_test, :]
        y_train = y[idx_train]
        y_test = y[idx_test]
        modelfit = model.fit(x_train, y_train)

        # evaluation
        y_pred = modelfit.predict(x_test)

        score = f1_score(y_test, y_pred, average='micro')
        scores.append(score)
        confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(y_test, y_pred,
                                                                                   labels=classLabels)
    scores = np.array(scores)
    return scores, confusionMatrixAccumulated
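A short usage sketch of the two helpers, following the way speaker_based_functions.py calls them (the toy features and labels here are hypothetical):

    import numpy as np
    from sklearn.ensemble import AdaBoostClassifier
    from evaluation import cross_val_confusion_matrix, mean_confidence_interval

    X = np.random.rand(60, 5)                                # hypothetical features
    y = np.repeat(['Groningen_and_Drenthe', 'Limburg'], 30)  # hypothetical labels
    scores, cm = cross_val_confusion_matrix(AdaBoostClassifier(), X, y, 10)
    ci_mean, ci_low, ci_high = mean_confidence_interval(scores, 0.95)
    print(ci_mean, ci_low, ci_high)  # mean F1 and its 95% confidence interval
    print(cm)                        # 2x2 accumulated confusion matrix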
48 dialect_identification/manipulate_db.py Normal file
@@ -0,0 +1,48 @@
import sys
import os
import pandas
import datetime
sys.path.append('..')

# these lines are not necessary once forced-alignment is installed as a package.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
sys.path.append(forced_alignment_module)
from forced_alignment import pronunciations
from forced_alignment.htk_dict import variances_table


#pronunciations.delete_word('kunikoshi')
#pronunciations.delete_all_g2p_entries()


#existing_pronunciations = set(pronunciations.get_all())
## only focus on word


## missing pronunciations
## (1) pronunciation is written in IPA.
## (2) pronunciation variants are made based on (1).
## (3) they are converted into HTK format.
#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt'

#with open(missing_pronunciations_file) as fin:
#    lines = fin.read()
#    lines = lines.split('\n')

#source = 'generated using ipa transcription by Marita Everhardt.'
#inserts = []
#for line in lines:
#    line = line.split('\t')
#    word = line[0].strip().lower()
#    pronounciation = line[1].strip().split()

#    # surely not in the table
#    #if (word, pronounciation) not in existing_pronunciations:
#    inserts.append("('{}', '{}', '{}', '{}', 0)".format(
#        word,
#        ' '.join(pronounciation),
#        source,
#        datetime.datetime.now(), ))

#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n    {};""".format(
#    ',\n    '.join(inserts)
79 dialect_identification/output_confusion_matrix.py Normal file
@@ -0,0 +1,79 @@
import os
import sys

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))

regionLabels = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
dirOut = currDir + '\\result\\same-utterance_with_cities'


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Note:
    this code is downloaded from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    _fontsize = 24
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    #plt.title(title, fontsize=_fontsize+2)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    #plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2)
    plt.xticks(tick_marks, classes, fontsize=_fontsize-4)
    plt.yticks(tick_marks, classes, fontsize=_fontsize-4)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=_fontsize)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)
    plt.ylabel('True label', fontsize=_fontsize-4)
    plt.xlabel('Predicted label', fontsize=_fontsize-4)


pred = np.load(dirOut + '\\pred_per_pid_3regions.npy')

#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None)
#print('accuracy: {}%'.format(accuracy * 100))

# confusion matrix
cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels)
# human perception (2 regions)
#cm = np.array([[39, 57], [6, 104]])
# human perception (3 regions)
#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
print(cm)

np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True)
#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True)

#plt.show()
plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png')
197 dialect_identification/sentence_based.py Normal file
@@ -0,0 +1,197 @@
import os
import sys
import configparser

import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from collections import Counter

# database
import pypyodbc

# classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pickle

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import groupSamplesInCSV
import dataManipulation
import utility as util


configFile = currDir + '\\config.ini'
# load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']

sentenceNumMax = 10
classifierList = []
LE_X_decode = []
LE_y = preprocessing.LabelEncoder()
LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])

testset_X = []
testset_y = []
testset_userID = []
result_y_test = []
result_y_prediction = []
fout = open("comparison.csv", "w")
for sentenceNum in range(1, sentenceNumMax+1):
    #if sentenceNum != 10:
    #    sentenceNumStr = '0' + str(sentenceNum)
    #else:
    #    sentenceNumStr = str(sentenceNumStr)
    sentenceNumStr = format(sentenceNum, '02')
    fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv'


    ## load combined data
    fileCSV = fileSentence
    idxRegion = 1
    header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
    sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))


    ## make balanced dataset
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    XIndex = np.arange(idxRegion+1, len(header))
    yIndex = 1 # region
    userIDindex = 0 # userID


    ## categorical values into numbers
    X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
    y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
    userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]

    #X = np.zeros((X_.shape), 'int')
    for Xindex in XIndex:
        x = X_[:, Xindex-2]

        ## levenshtein distance
        #word_count = Counter(x)
        #frequent_word = max(word_count)
        #X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)

        # hot encoding
        le_x = preprocessing.LabelBinarizer()
        le_x.fit(np.unique(x))
        x_ = le_x.transform(x)
        LE_X_decode.append(x_.shape[1])
        if Xindex == idxRegion+1:
            X = x_
        else:
            X = np.c_[X, x_]

    y = LE_y.transform(y_)


    ## split into train vs test set
    #[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # each regional data should be split equally
    lenG = dataG.shape[0]
    lenL = dataL.shape[0]
    lenO = dataO.shape[0]
    indexG = np.arange(0, lenG)
    indexL = np.arange(lenG, lenG+lenL)
    indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
    [XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0)
    [XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0)
    [XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0)
    X_train = np.r_[XG_train, XL_train, XO_train]
    X_test = np.r_[XG_test, XL_test, XO_test]
    y_train = np.r_[yG_train, yL_train, yO_train]
    y_test = np.r_[yG_test, yL_test, yO_test]


    ## comparison
    ## classifiers
    #names = ["Nearest Neighbors",
    #         "Linear SVM",
    #         "Poly SVM",
    #         "RBF SVM",
    #         "Decision Tree",
    #         "Random Forest 2",
    #         "Random Forest 3",
    #         "Random Forest 4",
    #         "AdaBoost",
    #         #"Naive Bayes",
    #         "Linear Discriminant Analysis",
    #         #"Quadratic Discriminant Analysis"
    #         ]
    #classifiers = [
    #    KNeighborsClassifier(3),
    #    SVC(kernel="linear", C=0.025),
    #    SVC(kernel="poly", C=0.025),
    #    SVC(gamma=2, C=1),
    #    DecisionTreeClassifier(max_depth=4),
    #    RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    #    RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
    #    AdaBoostClassifier(),
    #    #GaussianNB(),
    #    LinearDiscriminantAnalysis(),
    #    #QuadraticDiscriminantAnalysis()
    #    ]
    #for name, model in zip(names, classifiers):
    #    scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro')
    #    fout = open("comparison.csv", "a")
    #    fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
    #    print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))

    # quasi-optimal model
    model = AdaBoostClassifier()
    # cross validation
    scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
    ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
    modelfit = model.fit(X_train, y_train)
    # f1 on test data
    y_prediction = modelfit.predict(X_test)
    f1score = f1_score(y_test, y_prediction, average='micro')
    fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))

    ## save for the test
    testset_X.append(X_test)
    testset_y.append(y_test)
    testset_userID.append(userID_)
    result_y_test = result_y_test + list(y_test)
    result_y_prediction = result_y_prediction + list(y_prediction)
    fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl'
    pickle.dump(modelfit, open(fileClassifier, 'wb'))
fout.close()

### confusion matrix
result_y_test_label = LE_y.inverse_transform(result_y_test)
result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[
    'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
print(confusionMatrix)


### make userID list
#userID = testset_userID[0]
#for sentenceNum in range(1, sentenceNumMax):
#    userid = testset_userID[sentenceNum]
#    userID = np.r_[userID, userid]
#userIDlist = np.unique(userID)
326 dialect_identification/speaker_based.py Normal file
@@ -0,0 +1,326 @@
import os
import sys
import configparser

import pypyodbc
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval
import speaker_based_functions as sb_func


#####################
##  USER DEFINE   ##
#####################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'

# make train/test set: 1, load: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load: 0
conv3to2region = 0

# 3 regions: 0
# saxon vs limburg: 1
# groningen vs limburg: 2
experiment_type = 2

regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# a bit useless error handling.
#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
if experiment_type == 1:
    regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']


##########################
##  DATA PREPARATION   ##
##########################

## load init file
config = configparser.ConfigParser()
config.sections()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']


## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()


## get data from Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)

rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows


## make list of LabelBinarizer objects, one per word.
# for X
# get pronvarList from Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows


LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax+1):
    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LBlist.append(LB)

# for y (=region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)

LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)

del uniqueWordID, uniqueWordIDmax, pronvar, LB


#################
##  ITERATION  ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)

# count number of samples
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter


## make train/eval/test set or load
if makeTrainTestSet==1:
    pidlist_train = []
    pidlist_eval = []
    pidlist_test = []
    for regionNum in range(0, len(regionLabels)):
        regionName = regionLabels[regionNum]

        pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
        pidlist_per_region, idx = mani.extractRandomSample(
            pidlist_per_region_, sampleNumMax)

        # split dataset into train, eval and test.
        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
            pidlist_per_region, test_size = 0.2, random_state = 0)
        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
            pidlist_per_region_train, test_size = 0.1, random_state = 0)

        # append numpy arrays
        if regionNum == 0:
            pidlist_train = pidlist_per_region_train
            pidlist_eval = pidlist_per_region_eval
            pidlist_test = pidlist_per_region_test
        else:
            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]
    del regionNum, regionName
    del pidlist_per_region_, pidlist_per_region, idx
    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")


## make dataset for 2 regions or load
if conv3to2region==1:
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if experiment_type == 1:
        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)

    elif experiment_type == 2:
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)

    del pidlist2_train_
else:
    if experiment_type == 1:
        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")

    elif experiment_type == 2:
        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")


## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train = sb_func.extractPid(pidlist_train, data)
    data_eval = sb_func.extractPid(pidlist_eval, data)
    data_test = sb_func.extractPid(pidlist_test, data)

elif experiment_type == 1 or experiment_type == 2:
    data2 = np.array(data)

    if experiment_type == 1:
        for row, row2 in zip(data, data2):
            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
                row2[2] = regionLabels2[0]

    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


#####################################
##  EXPERIMENTS START FROM HERE   ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData = np.r_[data_train, data_eval]
    testData = data_test
    testPID = pidlist_test
    LB = LB_y
    LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
    # 2 regions: saxon vs limburg / groningen vs limburg
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    regionLabels = regionLabels2


# check the number of utterances
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
Counter(filenames_unique[:, 1])


fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)

## prediction over evaluation data per each sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)

## combine sentence-level classifiers
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)

### confusion matrix
if experiment_type == 0:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])

#confusionMatrix_weighted = confusion_matrix(
#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)


## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))

cm = confusionMatrix_majority
print(cm)

np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)

#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()


##### iteration finish #####
conn.close()
#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
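prediction_per_pid_majority is defined further down in speaker_based_functions.py and is not part of this excerpt. As a rough illustration only, a per-speaker majority vote over the documented row format [sentenceID, pid, answer, prediction] could be sketched as:

    import numpy as np
    from collections import Counter

    def majority_vote_per_pid(testPID, pred_per_sentence):
        # testPID rows: [pid, region]; pred_per_sentence rows: [sentenceID, pid, answer, prediction]
        pred = np.array(pred_per_sentence)
        results = []
        for pid, answer in testPID:
            votes = pred[pred[:, 1] == pid, 3]
            results.append([pid, answer, Counter(votes).most_common(1)[0][0]])
        return np.array(results)

This is an assumption about the helper's behaviour, not the author's implementation.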
383
dialect_identification/speaker_based_functions.py
Normal file
383
dialect_identification/speaker_based_functions.py
Normal file
@ -0,0 +1,383 @@
|
||||
import numpy as np
|
||||
from collections import Counter
|
||||
import matplotlib.pyplot as plt
|
||||
import itertools
|
||||
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
||||
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
import dataManipulation as mani
|
||||
import evaluation as eval
|
||||
|
||||
|
||||
# extract data that corresponds to pid in the pidlist
|
||||
def extractPid(pidlist, data):
|
||||
for pidnum in range(0, len(pidlist)):
|
||||
pid = pidlist[pidnum, 0]
|
||||
x = data[data[:, 1] == pid, :]
|
||||
if pidnum == 0:
|
||||
data_ = x
|
||||
else:
|
||||
data_ = np.r_[data_, x]
|
||||
return data_
|
||||
|
||||
|
||||
def OneHotEncoding(data, LB_X, LE_y):
|
||||
# one hot encoding of data using LabelBinalizer per word (LB_X) and for region (LB_y)
|
||||
# INPUT
|
||||
# data
|
||||
# 0: filename
|
||||
# 1: pid
|
||||
# 2: region
|
||||
# 3: ID (unique word_id)
|
||||
# 4: sentence_id
|
||||
# 5: word_id
|
||||
# 6: word
|
||||
# 7: pronunciation
|
||||
# LB_x: LabelBinalizer objects
|
||||
# LE_y: LabelEncoder object
|
||||
# OUTPUT
|
||||
# X: encoded variable data
|
||||
# y: encoded target data
|
||||
pidlist = data[:, 1]
|
||||
regionlist = data[:, 2]
|
||||
uniqueWordIDlist = data[:, 3].astype(int)
|
||||
pronvarlist = data[:, 7]
|
||||
|
||||
uniqueWordIDlist_unique = np.unique(uniqueWordIDlist)
|
||||
uniqueWordIDlist_unique.sort()
|
||||
for uniqueWordIDnum in uniqueWordIDlist_unique:
|
||||
x_ = pronvarlist[uniqueWordIDlist == uniqueWordIDnum]
|
||||
lb = LB_X[uniqueWordIDnum-1]
|
||||
x = lb.transform(x_)
|
||||
if uniqueWordIDnum == uniqueWordIDlist_unique[0]:
|
||||
X = x
|
||||
else:
|
||||
X = np.c_[X, x]
|
||||
|
||||
# pid and region of the speakers
|
||||
y_ = regionlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
|
||||
y = LE_y.transform(y_)
|
||||
|
||||
pid = pidlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
|
||||
return X, y, pid
|
||||
|
||||
|
||||
def outputConfusionMatrix33(foutName, matrixName, regionLabels):
|
||||
for r in range(0, len(regionLabels)):
|
||||
execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format('
|
||||
execString2 = 'regionLabels[' + str(r) + ']'
|
||||
execString3 = ''
|
||||
for c in range(0, len(regionLabels)):
|
||||
execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']'
|
||||
execString4 = '))'
|
||||
execString = execString1 + execString2 + execString3 + execString4
|
||||
exec(execString)
|
||||
|
||||
|
||||
def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
    """ compare the classification algorithms on sentence-level classifiers.

    Args:
        data_train: training data.
        LBlist: list of label binarizers, which are used to encode pronunciation variants.
        LE_y: label encoder, which is used to encode region names.
        fileCSV: output csv file path.

    """
    fout = open(fileCSV, "w")

    sentenceIDlist_train = data_train[:, 4].astype(int)
    sentenceIDmax_train = max(sentenceIDlist_train)

    for sentenceID in range(1, sentenceIDmax_train+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
        X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_train))

        ## classifier comparison
        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "Poly SVM",
            "RBF SVM",
            "Decision Tree",
            "Random Forest 2",
            "Random Forest 3",
            "Random Forest 4",
            "AdaBoost",
            "AdaBoost(SVM)",
            "AdaBoost(Random Forest 3)",
            "Naive Bayes",
            "Linear Discriminant Analysis",
            "Quadratic Discriminant Analysis"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025),
            SVC(kernel="poly", C=0.025),
            SVC(gamma=2, C=1),
            DecisionTreeClassifier(max_depth=4),
            RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
            RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
            RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
            AdaBoostClassifier(),
            AdaBoostClassifier(SVC(probability=True, kernel='linear')),
            AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)),
            GaussianNB(),
            LinearDiscriminantAnalysis(),
            QuadraticDiscriminantAnalysis()
        ]
        for name, model in zip(names, classifiers):
            scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
            fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var()))
            print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean()))

    fout.close()


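# Each line written to fileCSV above has the form (the numbers are made up):
#   sentenceID,classifier name,mean f1_micro,variance
#   e.g. 1,Linear SVM,0.62,0.0009

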
def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
    """ train sentence-level classifiers.

    Args:
        data_train: training data.
        LBlist: list of label binarizers, which are used to encode pronunciation variants.
        LE_y: label encoder, which is used to encode region names.
        fileCSV: output csv file path.

    Returns:
        modelList (list): list of models (length: sentenceNumMax)
        scoreList (list): list of scores (length: sentenceNumMax)
        confusionMatrixList (list): list of confusion matrices (length: sentenceNumMax)

    """
    fout = open(fileCSV, "w")

    fout.write('< cross-validation in training set >\n')

    sentenceIDlist_train = data_train[:, 4].astype(int)
    sentenceIDmax_train = max(sentenceIDlist_train)
    modelList = []
    scoreList = []
    confusionMatrixList = []

    for sentenceID in range(1, sentenceIDmax_train+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
        X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_train))

        ## cross-validation with the best classifier
        model = AdaBoostClassifier()
        #model = SVC(kernel="linear", C=0.025)
        #model = LinearDiscriminantAnalysis()

        #scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_micro')
        scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10)
        ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95)
        scoreList.append(scores)
        confusionMatrixList.append(confusionMatrix)

        ## model fitting
        modelfit = model.fit(X_train, y_train)
        modelList.append(modelfit)

        ## output
        fout.write("{},".format(sentenceID))
        #fout.write("{0},{1},{2},".format(
        #    regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland']))
        #fout.write("{0},{1},".format(
        #    regionCounter['Low_Saxon'], regionCounter['Limburg']))
        fout.write("{0},{1},".format(
            regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg']))

        fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high))

    fout.write('\n')
    fout.close()

    return modelList, scoreList, confusionMatrixList


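# Note: eval.cross_val_confusion_matrix and eval.mean_confidence_interval are project
# helpers that are not shown in this file. A minimal sketch of what the confidence
# interval presumably is (assumption, a standard t-interval over the fold scores):
#   import scipy.stats as st
#   n = len(scores); m = np.mean(scores)
#   h = st.sem(scores) * st.t.ppf((1 + 0.95) / 2.0, n - 1)
#   ci_mean, ci_low, ci_high = m, m - h, m + h

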
def prediction_per_sentence(data_eval, modelList, LBlist, LE_y):
    """ prediction using sentence-level classifiers.

    Args:
        data_eval: evaluation data.
        modelList: list of the models.
        LBlist: list of label binarizers, which are used to encode pronunciation variants.
        LE_y: label encoder, which is used to encode region names.

    Returns:
        prediction (ndarray): [sentenceID, pid, answer, prediction]

    """
    sentenceIDlist_eval = data_eval[:, 4].astype(int)
    sentenceIDmax_eval = max(sentenceIDlist_eval)
    for sentenceID in range(1, sentenceIDmax_eval+1):
        sentenceIDstr = format(sentenceID, '02')

        ## categorical values into binary values.
        data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :]
        X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y)
        regionCounter = Counter(LE_y.inverse_transform(y_eval))

        ## evaluate model
        modelfit = modelList[sentenceID-1]
        y_pred = modelfit.predict(X_eval)
        y_pred_label = LE_y.inverse_transform(y_pred)
        y_eval_label = LE_y.inverse_transform(y_eval)

        # sentenceID, pid, y, y_pred
        sentenceIDvec = np.ones((y_eval_label.shape[0], 1)).astype(int) * sentenceID
        prediction_ = np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label]
        if sentenceID == 1:
            prediction = prediction_
        else:
            prediction = np.r_[prediction, prediction_]

    return prediction


def prediction_per_pid_majority(pidlist_eval, prediction):
    """ make a prediction per pid using majority vote.

    Returns:
        prediction_per_pid (ndarray): [pid, ans, prediction]

    """
    prediction_per_pid = []
    for pid_ in range(0, len(pidlist_eval[:, 0])):
        pid = pidlist_eval[pid_, 0]
        ans = pidlist_eval[pid_, 1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # majority vote
        predCounter = Counter(prediction_[:, -1])
        predMostCommon = predCounter.most_common(1)
        predLabel = predMostCommon[0][0]
        predRatio = predMostCommon[0][1] / prediction_.shape[0] * 100

        prediction_per_pid.append([pid, ans, predLabel])

    return np.array(prediction_per_pid)


def calc_weight(confusionMatrixList):
    """ calculate a weight (how trustworthy the prediction is) for the majority vote.

    Note:
        The per-region precision (of all subjects predicted to be GO/OG/LB, the fraction
        that actually belong to that region) is used as the weight.

    Args:
        confusionMatrixList: list of confusion matrices of the sentence-level classifiers.

    """
    sentenceID_max = len(confusionMatrixList)
    weight = np.zeros((sentenceID_max, confusionMatrixList[0].shape[0]))
    for sentenceID in range(1, sentenceID_max+1):
        cm = confusionMatrixList[sentenceID-1]

        # normalized confusion matrix
        #rTotal = np.sum(cm, axis=1)
        #cm_normalized = cm / rTotal
        #weight[sentenceID-1, :] = np.diag(cm_normalized)

        # precision per class: true positives / predicted totals (column sums)
        true_positives = np.diag(cm)
        predicted = np.sum(cm, axis=0)
        weight[sentenceID-1, :] = true_positives / predicted

    return weight


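# Worked example (hypothetical 2x2 confusion matrix, rows = true, columns = predicted):
#   cm = [[8, 2],
#         [4, 6]]
# The column sums (predicted totals) are [12, 8] and the diagonal (true positives) is
# [8, 6], so the precision-based weights are [8/12, 6/8] = [0.67, 0.75].

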
def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
    """ make a prediction per pid using a weighted (majority) vote.

    Args:
        weight (ndarray): how trustworthy the prediction of each sentence-based classifier is.
        LB_y: label binarizer, which is used to encode region names.
        LE_y: label encoder, which is used to encode region names.
    Returns:
        prediction_per_pid (ndarray): [pid, ans, prediction]

    """
    prediction_per_pid = []
    for pid_ in range(0, len(pidlist_eval[:, 0])):
        pid = pidlist_eval[pid_, 0]
        ans = pidlist_eval[pid_, 1]
        prediction_ = prediction[prediction[:, 1] == pid, :]

        # calculate weighted (majority) vote
        vote_weighted = np.zeros((1, 3))
        for sentenceID_ in range(0, prediction_.shape[0]):
            sentenceID = prediction_[sentenceID_, 0].astype(int)
            w = weight[sentenceID-1, :]
            pred = prediction_[sentenceID_, 3]
            pred_int = LB_y.transform([pred])
            vote_weighted = vote_weighted + w * pred_int

        # choose the label with the most (weighted) votes
        vote_weighted = vote_weighted[0]
        maxindex = list(vote_weighted).index(max(vote_weighted))
        #predLabel = regionLabels[maxindex]
        predLabel = LE_y.inverse_transform(maxindex)
        prediction_per_pid.append([pid, ans, predLabel])

    return np.array(prediction_per_pid)


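# Example of one vote (hypothetical numbers): if the weights for a sentence are
# w = [0.6, 0.8, 0.7] and its prediction one-hot encodes to [0, 1, 0], that sentence
# adds [0.0, 0.8, 0.0] to vote_weighted; after all sentences are accumulated, the
# region with the largest total wins.

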
def saxon_vs_limburg(pidlist3):
    """convert a pidlist for 3 regions into that for 2 regions.

    Notes:
        The 3 regions are ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'].
        The 2 regions are ['Limburg', 'Low_Saxon'],
        where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'.
        Samples are randomly chosen so that each class has the same amount of data.

    """

    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
    regionLabels2 = ['Low_Saxon', 'Limburg']

    index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
    pidlist_saxon_ = pidlist3[index_saxon, :]
    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

    # extract the same amount of samples as Limburg.
    pidlistCounter3 = Counter(pidlist3[:, 1])
    pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
    pidlist_saxon[:, 1] = regionLabels2[0]

    pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
    #pidlistCounter2 = Counter(pidlist2[:, 1])
    return pidlist2


def groningen_vs_limburg(pidlist3):
    """convert a pidlist for 3 regions into that for 2 regions.

    Notes:
        The 3 regions are ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'].
        The 2 regions are ['Groningen_and_Drenthe', 'Limburg'].

    """
    regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

    pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :]
    pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]

    pidlist2 = np.r_[pidlist_groningen, pidlist_limburg]
    return pidlist2
44
dialect_identification/test_code.py
Normal file
@ -0,0 +1,44 @@
import Levenshtein
import numpy as np

a = 'hello'
b = 'haall'

# approximate
infinite = 100

# make distance matrix D
len_a = len(a)
len_b = len(b)
D_ = np.zeros((len_a, len_b)).astype(int)
for ia in range(0, len_a):
    a_ = a[ia]
    for ib in range(0, len_b):
        b_ = b[ib]
        if a_ == b_:
            D_[ia, ib] = 1

D = np.zeros((len_a+1, len_b+1)).astype(int)
D[1:len_a+1, 1:len_b+1] = D_
D[0, :] = infinite
D[:, 0] = infinite
D[0, 0] = 0

# calculate accumulated distance
indexPath = np.zeros((len_a, len_b)).astype(int)
for ia in range(0, len_a):
    for ib in range(0, len_b):
        a_ = a[ia]
        b_ = b[ib]
        option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib])
        Dmin = np.min(option)
        D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin
        index = list(option).index(Dmin)
        indexPath[ia, ib] = index

# back trace
ia = len_a
ib = len_b
#while (ia > 0 or ib > 0):
#    tb
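# For comparison, the Levenshtein package imported above can compute the edit distance
# directly: Levenshtein.distance(a, b) should return 3 for 'hello' vs 'haall'
# (substitutions e->a, l->a, o->l).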
56
dialect_identification/word_based.py
Normal file
@ -0,0 +1,56 @@
import os
import sys
import configparser

import numpy as np
from matplotlib import pyplot

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
from dataIO import readFile
from dataIO import selectSamplesFromCombinedData
import dataManipulation


configFile = currDir + '\\config.ini'

config = configparser.ConfigParser()
config.sections()
config.read(configFile)
fileWordList = config['word_based']['fileWordList']
fileCombined = config['word_based']['fileCombined']

wordList = readFile(fileWordList)

for wordNum in range(1, len(wordList)):
    word = wordList[wordNum-1]  # target word
    #print("=== {} ===".format(word))

    dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined)

    sampleNumMax = 50
    dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
    dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
    dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)

    # combine pronunciation from three regions
    # data: (sampleNumMax x 3) x 1
    cPronunciation = 4
    data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]])

    # MDS
    dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data)
    dataMDS = dataManipulation.MDS(dataLevenshtein)

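    # dataManipulation.makeLevenshteinMatrix and dataManipulation.MDS are project helpers
    # (not shown here); a typical MDS on a precomputed distance matrix would look like
    # this (assumption):
    #   from sklearn.manifold import MDS
    #   dataMDS = MDS(n_components=2, dissimilarity='precomputed').fit_transform(dataLevenshtein)
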
    # plot
    pyplot.scatter(dataMDS[0:sampleNumMax, 0], dataMDS[0:sampleNumMax, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe")
    pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2, 0], dataMDS[sampleNumMax:sampleNumMax*2, 1], c='green', marker="^", facecolors='none', label="Limburg")
    pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3, 0], dataMDS[sampleNumMax*2:sampleNumMax*3, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland")

    pyplot.title(word)
    #ax.set_xlabel('x')
    #ax.set_ylabel('y')
    pyplot.legend(loc='upper right')
    #pyplot.show()
    pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png')
    pyplot.gcf().clear()