commit to be sure.
dialect_identification.sln (new file, 38 lines)
							@@ -0,0 +1,38 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
    ProjectSection(SolutionItems) = preProject
        ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
        ..\..\forced-alignment\forced_alignment\defaultfiles.py = ..\..\forced-alignment\forced_alignment\defaultfiles.py
        ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj
        ..\..\forced-alignment\forced_alignment\htk_dict.py = ..\..\forced-alignment\forced_alignment\htk_dict.py
        ..\..\forced-alignment\forced_alignment\lexicon.py = ..\..\forced-alignment\forced_alignment\lexicon.py
        ..\..\forced-alignment\forced_alignment\mlf.py = ..\..\forced-alignment\forced_alignment\mlf.py
        ..\..\forced-alignment\forced_alignment\pronunciations.py = ..\..\forced-alignment\forced_alignment\pronunciations.py
        ..\..\forced-alignment\forced_alignment\pyhtk.py = ..\..\forced-alignment\forced_alignment\pyhtk.py
        ..\..\forced-alignment\forced_alignment\scripts.py = ..\..\forced-alignment\forced_alignment\scripts.py
        ..\..\forced-alignment\forced_alignment\tempfilename.py = ..\..\forced-alignment\forced_alignment\tempfilename.py
        ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
    EndProjectSection
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|Any CPU = Debug|Any CPU
        Release|Any CPU = Release|Any CPU
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B}
    EndGlobalSection
EndGlobal
							
								
								
									
dialect_identification/audio2db.py (new file, 90 lines)
							@@ -0,0 +1,90 @@
import os
import sys
import configparser

import numpy as np
import pypyodbc


## user-defined settings
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance'
wav_dir      = dir_same_utterance + '\\wav_with_cities'
script_dir   = dir_same_utterance + '\\script'
fileMDB      = dir_same_utterance + '\\feature\\DialectClassification.accdb'
table        = 'ForcedAlignmentResult'
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# these lines are not necessary once forced-alignment is installed as a package.
sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment


## check whether forced alignment works on a single sentence
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()

#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
#    script = fin.readline()
#fa = forced_alignment(wav_file, script)


## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) '


## forced alignment of all the wav files in dir_same_utterance
word_id_start = 1
for sentenceID in range(1, 11):
    sentenceIDstr = format(sentenceID, '02')

    # get script
    script_file = script_dir + '\\script' + sentenceIDstr + '.txt'
    with open(script_file, 'r') as fin:
        script = fin.readline()

        # loop over three regions
        for region in regionLabels:

            # loop over the wav_subdir
            wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region
            wav_files = os.listdir(wav_subdir)
            file_nr = 0
            for wav_file in wav_files:
                file_nr += 1
                filename = wav_file.replace('.wav', '')
                wav_file_fullpath = wav_subdir + '\\' + wav_file

                # forced alignment
                print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files)))
                fa = forced_alignment(wav_file_fullpath, script)

                # send pronunciation variant to database
                word_id = word_id_start
                for row in fa:
                    word     = row[0]
                    phonemes = np.array(row[1])

                    ## get pronunciation variant
                    pronvar_ = phonemes[:, 2]
                    pronvar_[np.where(pronvar_ == 'ssil')] = ''  # remove 'ssil'
                    pronvar = ''.join(pronvar_)

                    ## insert the result into the database.
                    SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')'
                    SQLstring  = SQLstring1 + SQLstring2
                    cursor.execute(SQLstring)
                    conn.commit()

                    word_id = word_id + 1

    word_id_start += script.count(' ') + 1

conn.close()
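The INSERT above is assembled by string concatenation, so a pronunciation variant containing a quote character would break the statement. A minimal sketch of a parameterized alternative, reusing param and table from the script above; the values passed are placeholders (the sample filename is the one from the commented-out test):

# Sketch only, not part of the commit: the same INSERT with '?' placeholders,
# so quoting inside a pronunciation variant cannot break the SQL statement.
import pypyodbc

conn = pypyodbc.connect(param)          # param as defined in audio2db.py
cursor = conn.cursor()
sql = ('INSERT INTO ' + table +
       ' (filename, region, word_id, pronunciation) VALUES (?, ?, ?, ?)')
cursor.execute(sql, ('9935-1464218044-1951631', 'Limburg', '1', "pron'var"))
conn.commit()
conn.close()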
							
								
								
									
dialect_identification/classifier.py (new file, 290 lines)
							@@ -0,0 +1,290 @@
'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
   1. Download a dataset (using pandas)
   2. Process the numeric data (using numpy)
   3. Train and evaluate learners (using scikit-learn)
   4. Plot and compare results (using matplotlib)


The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://mlr.cs.umass.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass

# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers row from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame


# =====================================================================


def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately.

    # If values are missing we could impute them from the training data
    #from sklearn.preprocessing import Imputer
    #imputer = Imputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test


# =====================================================================


def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall

# =====================================================================


def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves

    fig = plt.figure(figsize=(6, 6))
    fig.canvas.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, fmt='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()


# =====================================================================


if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
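As the docstring notes, URL can point at other data as long as the read_table options are reviewed. A minimal sketch, assuming a hypothetical local file spam_features.csv with comma-separated values, no header row, and binary labels in the last column:

# Sketch only: pointing the template at a hypothetical local CSV instead of
# the Spambase URL; read_table() accepts local paths as well as URLs.
import classifier

classifier.URL = 'spam_features.csv'   # hypothetical file; review sep/header/encoding in download_data()
frame = classifier.download_data()
X_train, X_test, y_train, y_test = classifier.get_features_and_labels(frame)
classifier.plot(list(classifier.evaluate_classifier(X_train, X_test, y_train, y_test)))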
							
								
								
									
dialect_identification/config.ini (new file, 8 lines)
							@@ -0,0 +1,8 @@
[word_based]
fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv
fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv

[sentence_based]
dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature
fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb
dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav
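These keys are read back with the standard-library configparser; a minimal sketch of the lookup, mirroring what sentence_based.py does further down:

# Sketch: reading config.ini, mirroring the lookup in sentence_based.py.
import configparser

config = configparser.ConfigParser()
config.read('config.ini')
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']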
							
								
								
									
dialect_identification/data_io.py (new file, 74 lines)
							@@ -0,0 +1,74 @@
#
# 2017/09/25
# select samples from the combined.csv for further analysis
#
# HISTORY
# 2017/10/02 modularized.
#
# Aki Kunikoshi
# 428968@gmail.com
#
import numpy as np


def readFile(filename):
    with open(filename, 'r') as fin:
        lines = fin.read()
    linesEach = lines.split('\n')
    return linesEach


def selectSamplesFromCombinedData(word, fileCombined):
    # load combined data
    fin = open(fileCombined, 'r')
    line = fin.readline()

    # load data per region
    dataGroningen = []
    dataLimburg   = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == 6 and lineList[5] == word:
            region = lineList[2]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (dataGroningen, dataLimburg, dataOverijsel)
    #print("{0}: {1} {2} {3}".format(word, len(listGroningen), len(listLimburg), len(listOverijsel)))


def groupSamplesInCSV(fileCSV, idxRegion):
    fin = open(fileCSV, 'r')

    # first line is the header
    line = fin.readline()
    line = line.rstrip()
    header = line.split(',')

    # load data per region
    dataGroningen = []
    dataLimburg   = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == len(header):
            region = lineList[idxRegion]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (header, dataGroningen, dataLimburg, dataOverijsel)


def addUserID(featureFile, recordingsCSV):
    dirFeature = config['sentence_based']['dirFeature']
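groupSamplesInCSV expects a CSV with a header row and the region label ('Groningen_and_Drenthe', 'Limburg' or 'Oost_Overijsel-Gelderland') in the given column. A minimal usage sketch with a hypothetical per-sentence file 01.csv; the module is imported under its committed file name, data_io:

# Sketch only: group one hypothetical feature CSV by region.
from data_io import groupSamplesInCSV

header, dataG, dataL, dataO = groupSamplesInCSV('01.csv', idxRegion=1)
print(len(header), 'columns')
print(len(dataG), len(dataL), len(dataO), 'samples per region')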
							
								
								
									
dialect_identification/data_manipulation.py (new file, 41 lines)
							@@ -0,0 +1,41 @@
import numpy as np
from sklearn import manifold
import Levenshtein

# x: ndarray (dnum x dim)
# n: number of samples to extract
# OUTPUT
# index: index of the chosen samples
#
def extractRandomSample(x, n):
    xRowMax = x.shape[0]
    indexOriginal = np.arange(xRowMax)
    indexChosen = np.random.choice(indexOriginal, n, False)
    xChosen = x[indexChosen, :]
    return (xChosen, indexChosen)

# x: 1d string ndarray
def makeLevenshteinMatrix(x):
    xRowMax = x.shape[0]
    xLevenshtein = np.ones((xRowMax, xRowMax), dtype='int')

    for xRow in range(0, xRowMax):
        for xCol in range(0, xRowMax):
            dist = Levenshtein.distance(x[xRow], x[xCol])
            xLevenshtein[xRow, xCol] = dist
    return xLevenshtein

# x: 1d string ndarray
def calcLevenshteinArray(word, x):
    xRowMax = x.shape[0]
    xLevenshtein = np.zeros(x.shape, dtype='int')

    for xRow in range(0, xRowMax):
        dist = Levenshtein.distance(word, x[xRow])
        xLevenshtein[xRow] = dist
    return xLevenshtein

def MDS(x):
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    xmds = mds.fit_transform(x)
    return xmds
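makeLevenshteinMatrix returns a symmetric distance matrix, which MDS embeds directly because the MDS instance is created with dissimilarity="precomputed". A minimal sketch with made-up pronunciation strings (the module is imported under its committed file name):

# Sketch only: pairwise Levenshtein distances between made-up pronunciation
# variants, embedded in 2-D with the precomputed-dissimilarity MDS above.
import numpy as np
import data_manipulation

pronvars = np.array(['hOndart', 'hOndert', 'hondErt', 'hOnderd'])
dist = data_manipulation.makeLevenshteinMatrix(pronvars)  # (4, 4) int matrix
coords = data_manipulation.MDS(dist)                      # (4, 2) embedding
print(coords)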
							
								
								
									
dialect_identification/dialect_identification.pyproj (new file, 70 lines)
							@@ -0,0 +1,70 @@
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
    <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
    <ProjectHome>.</ProjectHome>
    <StartupFile>output_confusion_matrix.py</StartupFile>
    <SearchPath>
    </SearchPath>
    <WorkingDirectory>.</WorkingDirectory>
    <OutputPath>.</OutputPath>
    <Name>dialect_identification</Name>
    <RootNamespace>dialect_identification</RootNamespace>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="manipulate_db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="audio2db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="classifier.py" />
    <Compile Include="dataManipulation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="output_confusion_matrix.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="sentence_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based_functions.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="test_code.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="evaluation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="word_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="dataIO.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
       Visual Studio and specify your pre- and post-build commands in
       the BeforeBuild and AfterBuild targets below. -->
  <!--<Target Name="CoreCompile" />-->
  <Target Name="BeforeBuild">
  </Target>
  <Target Name="AfterBuild">
  </Target>
</Project>
							
								
								
									
dialect_identification/evaluation.py (new file, 40 lines)
							@@ -0,0 +1,40 @@
import numpy as np
import scipy as sp
import scipy.stats
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, confidence):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h

# like cross_val_score, but an accumulated confusion matrix is returned as well
def cross_val_confusion_matrix(model, X, y, cv):
    kf = KFold(n_splits=cv)
    classLabels = np.unique(y)
    classNumMax = classLabels.shape[0]
    confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax))
    scores = []
    for idx_train, idx_test in kf.split(X):
        # split into train/test
        x_train = X[idx_train, :]
        x_test  = X[idx_test, :]
        y_train = y[idx_train]
        y_test  = y[idx_test]
        modelfit = model.fit(x_train, y_train)

        # evaluation
        y_pred = modelfit.predict(x_test)

        score = f1_score(y_test, y_pred, average='micro')
        scores.append(score)
        confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(y_test, y_pred,
            labels=classLabels)
    scores = np.array(scores)
    return scores, confusionMatrixAccumulated
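A minimal sketch of how the two helpers combine, with placeholder features and labels standing in for the one-hot features and encoded region labels built in sentence_based.py:

# Sketch only: 5-fold evaluation with the helpers above; X and y are placeholders.
import numpy as np
from sklearn.svm import SVC
from evaluation import cross_val_confusion_matrix, mean_confidence_interval

X = np.random.rand(60, 10)        # placeholder feature matrix
y = np.random.randint(0, 3, 60)   # placeholder labels for three regions
scores, cm = cross_val_confusion_matrix(SVC(kernel='linear'), X, y, cv=5)
m, lower, upper = mean_confidence_interval(scores, confidence=0.95)
print('micro F1: {:.2f} [{:.2f}, {:.2f}]'.format(m, lower, upper))
print(cm)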
							
								
								
									
dialect_identification/manipulate_db.py (new file, 48 lines)
							@@ -0,0 +1,48 @@
import sys
import os
import pandas
import datetime
sys.path.append('..')

# these lines are not necessary once forced-alignment is installed as a package.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
sys.path.append(forced_alignment_module)
from forced_alignment import pronunciations
from forced_alignment.htk_dict import variances_table


#pronunciations.delete_word('kunikoshi')
#pronunciations.delete_all_g2p_entries()


#existing_pronunciations = set(pronunciations.get_all())
## only focus on word


## missing pronunciations
## (1) pronunciation is written in IPA.
## (2) pronunciation variants are made based on (1).
## (3) they are converted into HTK format.
#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt'

#with open(missing_pronunciations_file) as fin:
#    lines = fin.read()
#    lines = lines.split('\n')

#source = 'generated using ipa transcription by Marita Everhardt.'
#inserts = []
#for line in lines:
#    line = line.split('\t')
#    word = line[0].strip().lower()
#    pronounciation = line[1].strip().split()

#    # surely not in the table
#    #if (word, pronounciation) not in existing_pronunciations:
#    inserts.append("('{}', '{}', '{}', '{}', 0)".format(
#        word,
#        ' '.join(pronounciation),
#        source,
#        datetime.datetime.now(), ))

#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n  {};""".format(
#    ',\n  '.join(inserts)
							
								
								
									
dialect_identification/output_confusion_matrix.py (new file, 79 lines)
							@@ -0,0 +1,79 @@
import os
import sys

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))

regionLabels  = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
dirOut = currDir + '\\result\\same-utterance_with_cities'


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Note:
    this code is downloaded from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    _fontsize = 24
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    #plt.title(title, fontsize=_fontsize+2)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    #plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2)
    plt.xticks(tick_marks, classes, fontsize=_fontsize-4)
    plt.yticks(tick_marks, classes, fontsize=_fontsize-4)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=_fontsize)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)
    plt.ylabel('True label', fontsize=_fontsize-4)
    plt.xlabel('Predicted label', fontsize=_fontsize-4)


pred = np.load(dirOut + '\\pred_per_pid_3regions.npy')

#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None)
#print('accuracy: {}%'.format(accuracy * 100))

# confusion matrix
cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels)
# human perception (2 regions)
#cm = np.array([[39, 57], [6, 104]])
# human perception (3 regions)
#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
print(cm)

np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True)
#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True)

#plt.show()
plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png')
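plot_confusion_matrix only needs a square count matrix and class labels; pasted at the end of the script above (where plt and the helper are already defined), this sketch would render the commented-out 3-region human-perception counts instead of the machine predictions:

# Sketch: the same helper applied to the commented-out human-perception counts.
cm_perception = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
plt.figure()
plot_confusion_matrix(cm_perception, classes=['GD', 'OG', 'LB'], normalize=True)
plt.show()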
							
								
								
									
dialect_identification/sentence_based.py (new file, 197 lines; diff truncated below)
							@@ -0,0 +1,197 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import configparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					from matplotlib import pyplot
 | 
				
			||||||
 | 
					from sklearn.model_selection import train_test_split
 | 
				
			||||||
 | 
					from sklearn.model_selection import cross_val_score
 | 
				
			||||||
 | 
					from sklearn import preprocessing
 | 
				
			||||||
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# database
 | 
				
			||||||
 | 
					import pypyodbc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# classifier
 | 
				
			||||||
 | 
					from sklearn.neighbors import KNeighborsClassifier
 | 
				
			||||||
 | 
					from sklearn.svm import SVC
 | 
				
			||||||
 | 
					from sklearn.tree import DecisionTreeClassifier
 | 
				
			||||||
 | 
					from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 | 
				
			||||||
 | 
					from sklearn.naive_bayes import GaussianNB
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 | 
				
			||||||
 | 
					from sklearn.metrics import f1_score
 | 
				
			||||||
 | 
					from sklearn.metrics import confusion_matrix
 | 
				
			||||||
 | 
					import pickle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					currDir    = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
 | 
				
			||||||
 | 
					sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
 | 
				
			||||||
 | 
					from dataIO import readFile
 | 
				
			||||||
 | 
					from dataIO import groupSamplesInCSV
 | 
				
			||||||
 | 
					import dataManipulation
 | 
				
			||||||
 | 
					import utility as util
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					configFile = currDir + '\\config.ini'
 | 
				
			||||||
 | 
					# load init file
 | 
				
			||||||
 | 
					config = configparser.ConfigParser()
 | 
				
			||||||
 | 
					config.sections()
 | 
				
			||||||
 | 
					config.read(configFile)
 | 
				
			||||||
 | 
					dirFeature = config['sentence_based']['dirFeature']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					sentenceNumMax = 10
 | 
				
			||||||
 | 
					classifierList = []
 | 
				
			||||||
 | 
					LE_X_decode	   = []
 | 
				
			||||||
 | 
					LE_y = preprocessing.LabelEncoder()
 | 
				
			||||||
 | 
					LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					testset_X = []
 | 
				
			||||||
 | 
					testset_y = []
 | 
				
			||||||
 | 
					testset_userID = []
 | 
				
			||||||
 | 
					result_y_test = []
 | 
				
			||||||
 | 
					result_y_prediction = []
 | 
				
			||||||
 | 
					fout = open("comparison.csv", "w")
 | 
				
			||||||
 | 
					for sentenceNum in range(1, sentenceNumMax+1):
 | 
				
			||||||
 | 
						#if sentenceNum != 10:
 | 
				
			||||||
 | 
						#	sentenceNumStr = '0' + str(sentenceNum)
 | 
				
			||||||
 | 
						#else:
 | 
				
			||||||
 | 
						#	sentenceNumStr = str(sentenceNumStr)
 | 
				
			||||||
 | 
						sentenceNumStr = format(sentenceNum, '02')
 | 
				
			||||||
 | 
						fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## load combined data 
 | 
				
			||||||
 | 
						fileCSV = fileSentence
 | 
				
			||||||
 | 
						idxRegion = 1
 | 
				
			||||||
 | 
						header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
 | 
				
			||||||
 | 
						sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## make balanced dataset
 | 
				
			||||||
 | 
						dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
 | 
				
			||||||
 | 
						dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
 | 
				
			||||||
 | 
						dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						XIndex = np.arange(idxRegion+1, len(header))
 | 
				
			||||||
 | 
						yIndex = 1 # region
 | 
				
			||||||
 | 
						userIDindex = 0 # userID
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						## categorical values into numbers
 | 
				
			||||||
 | 
						X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
 | 
				
			||||||
 | 
						y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
 | 
				
			||||||
 | 
						userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						#X = np.zeros((X_.shape), 'int')
 | 
				
			||||||
 | 
						for Xindex in XIndex:
 | 
				
			||||||
 | 
							x = X_[:, Xindex-2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## levenshtein distance
 | 
				
			||||||
 | 
							#word_count = Counter(x)
 | 
				
			||||||
 | 
							#frequent_word = max(word_count)
 | 
				
			||||||
 | 
							#X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# one-hot encoding
 | 
				
			||||||
 | 
							le_x = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
							le_x.fit(np.unique(x))
 | 
				
			||||||
 | 
							x_ = le_x.transform(x)
 | 
				
			||||||
 | 
							LE_X_decode.append(x_.shape[1])
 | 
				
			||||||
 | 
							if Xindex == idxRegion+1:
 | 
				
			||||||
 | 
								X = x_
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								X = np.c_[X, x_]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						y = LE_y.transform(y_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## split into train vs test set
 | 
				
			||||||
 | 
						#[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						# data from each region should be split equally
 | 
				
			||||||
 | 
						lenG = dataG.shape[0]
 | 
				
			||||||
 | 
						lenL = dataL.shape[0]
 | 
				
			||||||
 | 
						lenO = dataO.shape[0]
 | 
				
			||||||
 | 
						indexG = np.arange(0, lenG)
 | 
				
			||||||
 | 
						indexL = np.arange(lenG, lenG+lenL)
 | 
				
			||||||
 | 
						indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
 | 
				
			||||||
 | 
						[XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						[XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						[XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						X_train = np.r_[XG_train, XL_train, XO_train]
 | 
				
			||||||
 | 
						X_test  = np.r_[XG_test, XL_test, XO_test]
 | 
				
			||||||
 | 
						y_train = np.r_[yG_train, yL_train, yO_train]
 | 
				
			||||||
 | 
						y_test  = np.r_[yG_test, yL_test, yO_test]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## comparison
 | 
				
			||||||
 | 
						## classifiers
 | 
				
			||||||
 | 
						#names = ["Nearest Neighbors", 
 | 
				
			||||||
 | 
						#		 "Linear SVM",
 | 
				
			||||||
 | 
						#		 "Poly SVM",
 | 
				
			||||||
 | 
						#		 "RBF SVM", 
 | 
				
			||||||
 | 
						#		 "Decision Tree",
 | 
				
			||||||
 | 
						#		 "Random Forest 2", 
 | 
				
			||||||
 | 
						#		 "Random Forest 3", 
 | 
				
			||||||
 | 
						#		 "Random Forest 4", 
 | 
				
			||||||
 | 
						#		 "AdaBoost", 
 | 
				
			||||||
 | 
						#		 #"Naive Bayes", 
 | 
				
			||||||
 | 
						#		 "Linear Discriminant Analysis",
 | 
				
			||||||
 | 
						#		 #"Quadratic Discriminant Analysis"
 | 
				
			||||||
 | 
						#		 ]
 | 
				
			||||||
 | 
						#classifiers = [
 | 
				
			||||||
 | 
						#	KNeighborsClassifier(3),
 | 
				
			||||||
 | 
						#	SVC(kernel="linear", C=0.025),
 | 
				
			||||||
 | 
						#	SVC(kernel="poly", C=0.025),
 | 
				
			||||||
 | 
						#	SVC(gamma=2, C=1),
 | 
				
			||||||
 | 
						#	DecisionTreeClassifier(max_depth=4),
 | 
				
			||||||
 | 
						#	RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
						#	RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
						#	RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
						#	AdaBoostClassifier(),
 | 
				
			||||||
 | 
						#	#GaussianNB(),
 | 
				
			||||||
 | 
						#	LinearDiscriminantAnalysis(),
 | 
				
			||||||
 | 
						#	#QuadraticDiscriminantAnalysis()
 | 
				
			||||||
 | 
						#	]
 | 
				
			||||||
 | 
						#for name, model in zip(names, classifiers):
 | 
				
			||||||
 | 
						#	scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
						#	fout = open("comparison.csv", "a")
 | 
				
			||||||
 | 
						#	fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
 | 
				
			||||||
 | 
						#	print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						# quasi-optimal model
 | 
				
			||||||
 | 
						model = AdaBoostClassifier()
 | 
				
			||||||
 | 
						# cross validation
 | 
				
			||||||
 | 
						scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
						ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
 | 
				
			||||||
 | 
						modelfit = model.fit(X_train, y_train)
 | 
				
			||||||
 | 
						# f1 on test data
 | 
				
			||||||
 | 
						y_prediction = modelfit.predict(X_test)
 | 
				
			||||||
 | 
						f1score = f1_score(y_test, y_prediction, average='micro')
 | 
				
			||||||
 | 
						fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						## save for the test
 | 
				
			||||||
 | 
						testset_X.append(X_test)
 | 
				
			||||||
 | 
						testset_y.append(y_test)
 | 
				
			||||||
 | 
						testset_userID.append(userID_)
 | 
				
			||||||
 | 
						result_y_test = result_y_test + list(y_test)
 | 
				
			||||||
 | 
						result_y_prediction = result_y_prediction + list(y_prediction)
 | 
				
			||||||
 | 
						fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl'
 | 
				
			||||||
 | 
						pickle.dump(modelfit, open(fileClassifier, 'wb'))
 | 
				
			||||||
 | 
					fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### confusion matrix
 | 
				
			||||||
 | 
					result_y_test_label = LE_y.inverse_transform(result_y_test)
 | 
				
			||||||
 | 
					result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
 | 
				
			||||||
 | 
					confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[
 | 
				
			||||||
 | 
						'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
 | 
				
			||||||
 | 
					print(confusionMatrix)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### make userID list 
 | 
				
			||||||
 | 
					#userID = testset_userID[0]
 | 
				
			||||||
 | 
					#for sentenceNum in range(1, sentenceNumMax):
 | 
				
			||||||
 | 
					#	userid = testset_userID[sentenceNum]
 | 
				
			||||||
 | 
					#	userID = np.r_[userID, userid]
 | 
				
			||||||
 | 
					#userIDlist = np.unique(userID)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										326
									
								
								dialect_identification/speaker_based.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,326 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import configparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pypyodbc
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from sklearn.model_selection import train_test_split
 | 
				
			||||||
 | 
					from sklearn.model_selection import cross_val_score
 | 
				
			||||||
 | 
					from sklearn import preprocessing
 | 
				
			||||||
 | 
					from sklearn.metrics import confusion_matrix
 | 
				
			||||||
 | 
					from sklearn.metrics import accuracy_score
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
 | 
				
			||||||
 | 
					sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
 | 
				
			||||||
 | 
					import dataManipulation as mani
 | 
				
			||||||
 | 
					import evaluation as eval
 | 
				
			||||||
 | 
					import speaker_based_functions as sb_func
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#####################
 | 
				
			||||||
 | 
					##   USER DEFINE   ##
 | 
				
			||||||
 | 
					#####################
 | 
				
			||||||
 | 
					sentenceNumMax = 10
 | 
				
			||||||
 | 
					configFile = currDir + '\\config.ini'
 | 
				
			||||||
 | 
					dirOut = currDir + '\\result'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# make train/test set: 1, load: 0
 | 
				
			||||||
 | 
					makeTrainTestSet = 0
 | 
				
			||||||
 | 
					# convert 3 regions to 2 regions: 1, load: 0
 | 
				
			||||||
 | 
					conv3to2region   = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# 3 regions: 0
 | 
				
			||||||
 | 
					# saxon vs limburg: 1
 | 
				
			||||||
 | 
					# groningen vs limburg: 2
 | 
				
			||||||
 | 
					experiment_type = 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					regionLabels  = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# a bit useless error handling.
					#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
					if experiment_type == 1:
						regionLabels2 = ['Low_Saxon', 'Limburg']
					else:
						regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					##########################
 | 
				
			||||||
 | 
					##   DATA PREPARATION   ##
 | 
				
			||||||
 | 
					##########################
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## load init file
 | 
				
			||||||
 | 
					config = configparser.ConfigParser()
 | 
				
			||||||
 | 
					config.sections()
 | 
				
			||||||
 | 
					config.read(configFile)
 | 
				
			||||||
 | 
					dirFeature = config['sentence_based']['dirFeature']
 | 
				
			||||||
 | 
					fileMDB = config['sentence_based']['fileMDB']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## database connection
 | 
				
			||||||
 | 
					pypyodbc.lowercase = False
 | 
				
			||||||
 | 
					param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
 | 
				
			||||||
 | 
					conn = pypyodbc.connect(param)
 | 
				
			||||||
 | 
					cursor = conn.cursor()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## get data from Access database
 | 
				
			||||||
 | 
					# data format
 | 
				
			||||||
 | 
					#	0: filename
 | 
				
			||||||
 | 
					#	1: pid
 | 
				
			||||||
 | 
					#	2: region
 | 
				
			||||||
 | 
					#	3: ID (unique word_id)
 | 
				
			||||||
 | 
					#	4: sentence_id
 | 
				
			||||||
 | 
					#	5: word_id
 | 
				
			||||||
 | 
					#	6: word
 | 
				
			||||||
 | 
					#	7: pronunciation
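					# e.g. one row (purely hypothetical values, for illustration only):
					# ['rec_0001.wav', 'p023', 'Limburg', 12, 2, 3, 'huis', 'hoes']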
 | 
				
			||||||
 | 
					SQL_string = """\
 | 
				
			||||||
 | 
					{CALL dataset_with_cities}
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					cursor.execute(SQL_string)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					rows = cursor.fetchall()
 | 
				
			||||||
 | 
					data = np.array(rows)
 | 
				
			||||||
 | 
					#dataNumMax = data.shape[0]
 | 
				
			||||||
 | 
					#uniqueWordIDmax = max(data[:, 3].astype(int))
 | 
				
			||||||
 | 
					del SQL_string, rows
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make a list of LabelBinarizer objects, one per word.
 | 
				
			||||||
 | 
					# for X
 | 
				
			||||||
 | 
					# get pronvarList from Access database 
 | 
				
			||||||
 | 
					# pronvarList format
 | 
				
			||||||
 | 
					#	0: ID (unique word_id)
 | 
				
			||||||
 | 
					#	1: word
 | 
				
			||||||
 | 
					#	2: pronvar
 | 
				
			||||||
 | 
					SQL_string = """\
 | 
				
			||||||
 | 
					{CALL pronunciation_variant}
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					cursor.execute(SQL_string)
 | 
				
			||||||
 | 
					rows = cursor.fetchall()
 | 
				
			||||||
 | 
					pronvarList = np.array(rows)
 | 
				
			||||||
 | 
					del SQL_string, rows
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					LBlist = []
 | 
				
			||||||
 | 
					#uniqueWordIDlist = pronvarList[:, 0].astype(int)
 | 
				
			||||||
 | 
					uniqueWordIDlist = data[:, 3].astype(int)
 | 
				
			||||||
 | 
					uniqueWordIDmax  = max(uniqueWordIDlist)
 | 
				
			||||||
 | 
					for uniqueWordID in range(1, uniqueWordIDmax+1):
 | 
				
			||||||
 | 
						pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
 | 
				
			||||||
 | 
						#pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
 | 
				
			||||||
 | 
						LB = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
						LB.fit(np.unique(pronvar))
 | 
				
			||||||
 | 
						LBlist.append(LB)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# for y (=region)
 | 
				
			||||||
 | 
					LE_y = preprocessing.LabelEncoder()
 | 
				
			||||||
 | 
					LE_y.fit(regionLabels)
 | 
				
			||||||
 | 
					LE_y2 = preprocessing.LabelEncoder()
 | 
				
			||||||
 | 
					LE_y2.fit(regionLabels2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					LB_y = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
					LB_y.fit(regionLabels)
 | 
				
			||||||
 | 
					LB_y2 = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
					LB_y2.fit(regionLabels2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					del uniqueWordID, uniqueWordIDmax, pronvar, LB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#################
 | 
				
			||||||
 | 
					##  ITERATION  ##
 | 
				
			||||||
 | 
					#################
 | 
				
			||||||
 | 
					#CM_majority = np.zeros((1, 9)).astype(int)
 | 
				
			||||||
 | 
					#CM_weighted = np.zeros((1, 9)).astype(int)
 | 
				
			||||||
 | 
					#for iter in range(0, 1):
 | 
				
			||||||
 | 
					#	print(iter)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make balanced dataset
 | 
				
			||||||
 | 
					pidlist = np.unique(data[:, (1, 2)], axis=0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# count number of samples
 | 
				
			||||||
 | 
					pidlistCounter = Counter(pidlist[:, 1])
 | 
				
			||||||
 | 
					sampleNumMax = min(pidlistCounter.values())
 | 
				
			||||||
 | 
					del pidlistCounter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make train/eval/test set or load
 | 
				
			||||||
 | 
					if makeTrainTestSet==1:
 | 
				
			||||||
 | 
						pidlist_train = []
 | 
				
			||||||
 | 
						pidlist_eval  = []
 | 
				
			||||||
 | 
						pidlist_test  = []
 | 
				
			||||||
 | 
						for regionNum in range(0, len(regionLabels)):
 | 
				
			||||||
 | 
							regionName = regionLabels[regionNum]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
 | 
				
			||||||
 | 
							pidlist_per_region, idx = mani.extractRandomSample(
 | 
				
			||||||
 | 
								pidlist_per_region_, sampleNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# split dataset into train, eval and test.
 | 
				
			||||||
 | 
							[pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
 | 
				
			||||||
 | 
								pidlist_per_region, test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
							[pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
 | 
				
			||||||
 | 
								pidlist_per_region_train, test_size = 0.1, random_state = 0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# append numpy arrays
 | 
				
			||||||
 | 
							if regionNum == 0:
 | 
				
			||||||
 | 
								pidlist_train = pidlist_per_region_train
 | 
				
			||||||
 | 
								pidlist_eval  = pidlist_per_region_eval
 | 
				
			||||||
 | 
								pidlist_test  = pidlist_per_region_test
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
 | 
				
			||||||
 | 
								pidlist_eval  = np.r_[pidlist_eval, pidlist_per_region_eval]
 | 
				
			||||||
 | 
								pidlist_test  = np.r_[pidlist_test, pidlist_per_region_test]
 | 
				
			||||||
 | 
						del regionNum, regionName
 | 
				
			||||||
 | 
						del pidlist_per_region_, pidlist_per_region, idx
 | 
				
			||||||
 | 
						del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
 | 
				
			||||||
 | 
						np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
 | 
				
			||||||
 | 
						np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
 | 
				
			||||||
 | 
						np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
 | 
				
			||||||
 | 
					else:
 | 
				
			||||||
 | 
						pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
 | 
				
			||||||
 | 
						pidlist_eval  = np.load(dirOut + "\\pidlist_eval.npy")
 | 
				
			||||||
 | 
						pidlist_test  = np.load(dirOut + "\\pidlist_test.npy")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make dataset for 2 regions or load
 | 
				
			||||||
 | 
					if conv3to2region==1:
 | 
				
			||||||
 | 
						pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if experiment_type == 1:
 | 
				
			||||||
 | 
							pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
 | 
				
			||||||
 | 
							pidlist2_test  = sb_func.saxon_vs_limburg(pidlist_test)	
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						elif experiment_type == 2:
 | 
				
			||||||
 | 
							pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
 | 
				
			||||||
 | 
							pidlist2_test  = sb_func.groningen_vs_limburg(pidlist_test)
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						del pidlist2_train_
 | 
				
			||||||
 | 
					else:
 | 
				
			||||||
 | 
						if experiment_type == 1:
 | 
				
			||||||
 | 
							pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
 | 
				
			||||||
 | 
							pidlist2_test  = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						elif experiment_type == 2:
 | 
				
			||||||
 | 
							pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
 | 
				
			||||||
 | 
							pidlist2_test  = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## train/test data
 | 
				
			||||||
 | 
					if experiment_type == 0:
 | 
				
			||||||
 | 
						# Groningen vs Overijsel vs Limburg
 | 
				
			||||||
 | 
						data_train = sb_func.extractPid(pidlist_train, data)
 | 
				
			||||||
 | 
						data_eval  = sb_func.extractPid(pidlist_eval, data)
 | 
				
			||||||
 | 
						data_test  = sb_func.extractPid(pidlist_test, data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					elif experiment_type == 1 or experiment_type == 2:
 | 
				
			||||||
 | 
						data2 = np.array(data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if experiment_type == 1:
 | 
				
			||||||
 | 
							for row, row2 in zip(data, data2):
 | 
				
			||||||
 | 
								if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
 | 
				
			||||||
 | 
									row2[2] = regionLabels2[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						data2_train = sb_func.extractPid(pidlist2_train, data2)
 | 
				
			||||||
 | 
						data2_test  = sb_func.extractPid(pidlist2_test, data2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#####################################
 | 
				
			||||||
 | 
					##   EXPERIMENTS START FROM HERE   ##
 | 
				
			||||||
 | 
					#####################################
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## actual training
 | 
				
			||||||
 | 
					# train vs eval
 | 
				
			||||||
 | 
					#trainData = data_train
 | 
				
			||||||
 | 
					#testData  = data_eval
 | 
				
			||||||
 | 
					#testPID   = pidlist_eval
 | 
				
			||||||
 | 
					#LB = LB_y
 | 
				
			||||||
 | 
					#LE = LE_y
 | 
				
			||||||
 | 
					#regionLabels = regionLabels3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# train+eval vs test
 | 
				
			||||||
 | 
					if experiment_type == 0:
 | 
				
			||||||
 | 
						trainData = np.r_[data_train, data_eval]
 | 
				
			||||||
 | 
						testData  = data_test
 | 
				
			||||||
 | 
						testPID   = pidlist_test
 | 
				
			||||||
 | 
						LB = LB_y
 | 
				
			||||||
 | 
						LE = LE_y
 | 
				
			||||||
 | 
					elif experiment_type == 1 or experiment_type == 2:
 | 
				
			||||||
 | 
					# 2 region: saxon vs limburg/ groningen vs limburg
 | 
				
			||||||
 | 
						trainData = data2_train
 | 
				
			||||||
 | 
						testData  = data2_test
 | 
				
			||||||
 | 
						testPID   = pidlist2_test
 | 
				
			||||||
 | 
						LB = LB_y2
 | 
				
			||||||
 | 
						LE = LE_y2
 | 
				
			||||||
 | 
						regionLabels = regionLabels2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# check the number of utterances
 | 
				
			||||||
 | 
					allData = np.r_[trainData, testData]
 | 
				
			||||||
 | 
					filenames = np.c_[allData[:, 0], allData[:, 2]]
 | 
				
			||||||
 | 
					filenames_unique = np.unique(filenames, axis=0)
 | 
				
			||||||
 | 
					Counter(filenames_unique[:, 1])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					fileComparison		= dirOut + "\\algorithm_comparison.csv"
 | 
				
			||||||
 | 
					filePerformance		= dirOut + "\\sentence-level.csv"
 | 
				
			||||||
 | 
					fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## compare classification algorithms for the sentence-classifiers.
 | 
				
			||||||
 | 
					#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## train sentence-level classifiers.
 | 
				
			||||||
 | 
					modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
 | 
				
			||||||
 | 
						trainData, LBlist, LE, filePerformance)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## prediction over evaluation data per each sentence-level classifier.
 | 
				
			||||||
 | 
					pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## combine sentence-level classifiers 
 | 
				
			||||||
 | 
					pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## majority vote (weighted)
 | 
				
			||||||
 | 
					#weight = sb_func.calc_weight(confusionMatrixList)
 | 
				
			||||||
 | 
					#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### confusion matrix
 | 
				
			||||||
 | 
					if experiment_type == 0:
 | 
				
			||||||
 | 
						confusionMatrix_majority = confusion_matrix(
 | 
				
			||||||
 | 
							pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
 | 
				
			||||||
 | 
					else:
 | 
				
			||||||
 | 
						confusionMatrix_majority = confusion_matrix(
 | 
				
			||||||
 | 
							pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						#confusionMatrix_weighted = confusion_matrix(
 | 
				
			||||||
 | 
					#	pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## output
 | 
				
			||||||
 | 
					accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
 | 
				
			||||||
 | 
					print('accuracy: {}%'.format(accuracy * 100))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cm = confusionMatrix_majority
 | 
				
			||||||
 | 
					print(cm)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
 | 
				
			||||||
 | 
					np.save(dirOut + "\\confusion_matrix.npy", cm)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#fout = open(fileConfusionMatrix, "w")
 | 
				
			||||||
 | 
					#fout.write('< confusion matrix for majority vote in evaluation set >\n')
 | 
				
			||||||
 | 
					#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
 | 
				
			||||||
 | 
					#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
 | 
				
			||||||
 | 
					#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
 | 
				
			||||||
 | 
					#fout.write('\n')
 | 
				
			||||||
 | 
					#fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					##### iteration finish #####
 | 
				
			||||||
 | 
					conn.close()
 | 
				
			||||||
 | 
					#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',') 
 | 
				
			||||||
 | 
					#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',') 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										383
									
								
								dialect_identification/speaker_based_functions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,383 @@
 | 
				
			|||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
 | 
					import itertools
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from sklearn.neighbors import KNeighborsClassifier
 | 
				
			||||||
 | 
					from sklearn.svm import SVC
 | 
				
			||||||
 | 
					from sklearn.tree import DecisionTreeClassifier
 | 
				
			||||||
 | 
					from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 | 
				
			||||||
 | 
					from sklearn.naive_bayes import GaussianNB
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from sklearn.model_selection import cross_val_score
 | 
				
			||||||
 | 
					from sklearn.metrics import confusion_matrix
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import dataManipulation as mani
 | 
				
			||||||
 | 
					import evaluation as eval
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# extract data that corresponds to pid in the pidlist
 | 
				
			||||||
 | 
					def extractPid(pidlist, data):
 | 
				
			||||||
 | 
						for pidnum in range(0, len(pidlist)):
 | 
				
			||||||
 | 
							pid = pidlist[pidnum, 0]
 | 
				
			||||||
 | 
							x = data[data[:, 1] == pid, :]
 | 
				
			||||||
 | 
							if pidnum == 0:
 | 
				
			||||||
 | 
								data_ = x
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								data_ = np.r_[data_, x]	
 | 
				
			||||||
 | 
						return data_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def OneHotEncoding(data, LB_X, LE_y):
 | 
				
			||||||
 | 
					# one-hot encoding of data using a LabelBinarizer per word (LB_X) and a LabelEncoder for the region (LE_y)
 | 
				
			||||||
 | 
					# INPUT
 | 
				
			||||||
 | 
					#  data
 | 
				
			||||||
 | 
					#	0: filename
 | 
				
			||||||
 | 
					#	1: pid
 | 
				
			||||||
 | 
					#	2: region
 | 
				
			||||||
 | 
					#	3: ID (unique word_id)
 | 
				
			||||||
 | 
					#	4: sentence_id
 | 
				
			||||||
 | 
					#	5: word_id
 | 
				
			||||||
 | 
					#	6: word
 | 
				
			||||||
 | 
					#	7: pronunciation
 | 
				
			||||||
 | 
					#  LB_X: list of LabelBinarizer objects (one per word)
 | 
				
			||||||
 | 
					#  LE_y: LabelEncoder object
 | 
				
			||||||
 | 
					# OUTPUT
 | 
				
			||||||
 | 
					#  X: encoded variable data
 | 
				
			||||||
 | 
					#  y: encoded target data
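					#
					# Illustration (hypothetical pronunciation variants, not from the database):
					# if a word occurs as ['hoes', 'huis', 'huus'], its LabelBinarizer maps each
					# sample to an indicator vector over the sorted classes, e.g. 'huus' -> [0, 0, 1],
					# and np.c_ concatenates these vectors over all words into one row of X per speaker.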
 | 
				
			||||||
 | 
						pidlist			 = data[:, 1]
 | 
				
			||||||
 | 
						regionlist		 = data[:, 2]
 | 
				
			||||||
 | 
						uniqueWordIDlist = data[:, 3].astype(int)
 | 
				
			||||||
 | 
						pronvarlist		 = data[:, 7]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						uniqueWordIDlist_unique = np.unique(uniqueWordIDlist)
 | 
				
			||||||
 | 
						uniqueWordIDlist_unique.sort()
 | 
				
			||||||
 | 
						for uniqueWordIDnum in uniqueWordIDlist_unique:
 | 
				
			||||||
 | 
							x_ = pronvarlist[uniqueWordIDlist == uniqueWordIDnum]	
 | 
				
			||||||
 | 
							lb = LB_X[uniqueWordIDnum-1]
 | 
				
			||||||
 | 
							x  = lb.transform(x_)
 | 
				
			||||||
 | 
							if uniqueWordIDnum == uniqueWordIDlist_unique[0]:
 | 
				
			||||||
 | 
								X = x
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								X = np.c_[X, x]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# pid and region of the speakers
 | 
				
			||||||
 | 
						y_ = regionlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
 | 
				
			||||||
 | 
						y = LE_y.transform(y_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pid = pidlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
 | 
				
			||||||
 | 
						return X, y, pid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def outputConfusionMatrix33(foutName, matrixName, regionLabels):
 | 
				
			||||||
 | 
						for r in range(0, len(regionLabels)):
 | 
				
			||||||
 | 
							execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format('
 | 
				
			||||||
 | 
							execString2 = 'regionLabels[' + str(r) + ']'
 | 
				
			||||||
 | 
							execString3 = ''
 | 
				
			||||||
 | 
							for c in range(0, len(regionLabels)):
 | 
				
			||||||
 | 
								execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']'
 | 
				
			||||||
 | 
							execString4 = '))'
 | 
				
			||||||
 | 
							execString  = execString1 + execString2 + execString3 + execString4
 | 
				
			||||||
 | 
							exec(execString)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
 | 
				
			||||||
 | 
						""" compare the classification algorithms on sentence-level classifiers. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							data_train: training data.
 | 
				
			||||||
 | 
							LBlist: list of label binarizer, which is used to encode pronunciation variants.
 | 
				
			||||||
 | 
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
							fileCSV: output csv file path.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						fout = open(fileCSV, "w")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sentenceIDlist_train = data_train[:, 4].astype(int)
 | 
				
			||||||
 | 
						sentenceIDmax_train  = max(sentenceIDlist_train)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceIDmax_train+1):
 | 
				
			||||||
 | 
							sentenceIDstr = format(sentenceID, '02')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## categorical values into binary values.
 | 
				
			||||||
 | 
							data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
 | 
				
			||||||
 | 
							X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
 | 
				
			||||||
 | 
							regionCounter = Counter(LE_y.inverse_transform(y_train))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## classifier comparison
 | 
				
			||||||
 | 
							names = [
 | 
				
			||||||
 | 
								"Nearest Neighbors", 
 | 
				
			||||||
 | 
								"Linear SVM",
 | 
				
			||||||
 | 
								"Poly SVM",
 | 
				
			||||||
 | 
								"RBF SVM", 
 | 
				
			||||||
 | 
								"Decision Tree",
 | 
				
			||||||
 | 
								"Random Forest 2", 
 | 
				
			||||||
 | 
								"Random Forest 3", 
 | 
				
			||||||
 | 
								"Random Forest 4", 
 | 
				
			||||||
 | 
								"AdaBoost", 
 | 
				
			||||||
 | 
								"AdaBoost(SVM)",
 | 
				
			||||||
 | 
								"AdaBoost(Random Forest 3)",
 | 
				
			||||||
 | 
								"Naive Bayes", 
 | 
				
			||||||
 | 
								"Linear Discriminant Analysis",
 | 
				
			||||||
 | 
								"Quadratic Discriminant Analysis"
 | 
				
			||||||
 | 
								]
 | 
				
			||||||
 | 
							classifiers = [
 | 
				
			||||||
 | 
								KNeighborsClassifier(3),
 | 
				
			||||||
 | 
								SVC(kernel="linear", C=0.025),
 | 
				
			||||||
 | 
								SVC(kernel="poly", C=0.025),
 | 
				
			||||||
 | 
								SVC(gamma=2, C=1),
 | 
				
			||||||
 | 
								DecisionTreeClassifier(max_depth=4),
 | 
				
			||||||
 | 
								RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
								RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
								RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
								AdaBoostClassifier(),
 | 
				
			||||||
 | 
								AdaBoostClassifier(SVC(probability=True, kernel='linear')),
 | 
				
			||||||
 | 
								AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)),
 | 
				
			||||||
 | 
								GaussianNB(),
 | 
				
			||||||
 | 
								LinearDiscriminantAnalysis(),
 | 
				
			||||||
 | 
								QuadraticDiscriminantAnalysis()
 | 
				
			||||||
 | 
								]
 | 
				
			||||||
 | 
							for name, model in zip(names, classifiers):
 | 
				
			||||||
 | 
								scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
								fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var()))
 | 
				
			||||||
 | 
								print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean()))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
 | 
				
			||||||
 | 
						""" train sentence-level classifiers.
 | 
				
			||||||
 | 
							
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							data_train: training data.
 | 
				
			||||||
 | 
							LBlist: list of label binarizer, which is used to encode pronunciation variants.
 | 
				
			||||||
 | 
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
							fileCSV: output csv file path.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
							modelList (list): list of models (length: sentenceNumMax)
 | 
				
			||||||
 | 
							scoreList (list): list of scores (length: sentenceNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						fout = open(fileCSV, "w")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fout.write('< cross-validation in training set >\n')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sentenceIDlist_train = data_train[:, 4].astype(int)
 | 
				
			||||||
 | 
						sentenceIDmax_train  = max(sentenceIDlist_train)
 | 
				
			||||||
 | 
						modelList = []
 | 
				
			||||||
 | 
						scoreList = []
 | 
				
			||||||
 | 
						confusionMatrixList = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceIDmax_train+1):
 | 
				
			||||||
 | 
							sentenceIDstr = format(sentenceID, '02')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## categorical values into binary values.
 | 
				
			||||||
 | 
							data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
 | 
				
			||||||
 | 
							X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
 | 
				
			||||||
 | 
							regionCounter = Counter(LE_y.inverse_transform(y_train))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## cross-validation with the best classifier
 | 
				
			||||||
 | 
							model = AdaBoostClassifier()
 | 
				
			||||||
 | 
							#model = SVC(kernel="linear", C=0.025)
 | 
				
			||||||
 | 
							#model = LinearDiscriminantAnalysis()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#		#scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
							scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10)
 | 
				
			||||||
 | 
							ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95)
 | 
				
			||||||
 | 
							scoreList.append(scores)
 | 
				
			||||||
 | 
							confusionMatrixList.append(confusionMatrix)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## model fitting
 | 
				
			||||||
 | 
							modelfit = model.fit(X_train, y_train)
 | 
				
			||||||
 | 
							modelList.append(modelfit)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## output 
 | 
				
			||||||
 | 
							fout.write("{},".format(sentenceID))
 | 
				
			||||||
 | 
							#fout.write("{0},{1},{2},".format(
 | 
				
			||||||
 | 
							#	regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland']))
 | 
				
			||||||
 | 
							#fout.write("{0},{1},".format(
 | 
				
			||||||
 | 
							#	regionCounter['Low_Saxon'], regionCounter['Limburg']))
 | 
				
			||||||
 | 
							fout.write("{0},{1},".format(
 | 
				
			||||||
 | 
								regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg']))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high))
 | 
				
			||||||
 | 
						fout.write('\n')
 | 
				
			||||||
 | 
						fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return modelList, scoreList, confusionMatrixList
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prediction_per_sentence(data_eval, modelList, LBlist, LE_y):
 | 
				
			||||||
 | 
						""" prediction using sentence-level classifiers.
 | 
				
			||||||
 | 
							
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							data_eval: evaluation data.
 | 
				
			||||||
 | 
							modelList: list of the models.
 | 
				
			||||||
 | 
							LBlist: list of label binarizer, which is used to encode pronunciation variants.
 | 
				
			||||||
 | 
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
						prediction (ndarray): [sentenceID, pid, answer, prediction]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						sentenceIDlist_eval = data_eval[:, 4].astype(int)
 | 
				
			||||||
 | 
						sentenceIDmax_eval  = max(sentenceIDlist_eval)
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceIDmax_eval+1):
 | 
				
			||||||
 | 
							sentenceIDstr = format(sentenceID, '02')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## categorical values into binary values.
 | 
				
			||||||
 | 
							data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :]
 | 
				
			||||||
 | 
							X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y)
 | 
				
			||||||
 | 
							regionCounter = Counter(LE_y.inverse_transform(y_eval))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## evaluate model
 | 
				
			||||||
 | 
							modelfit = modelList[sentenceID-1]
 | 
				
			||||||
 | 
							y_pred  = modelfit.predict(X_eval)
 | 
				
			||||||
 | 
							y_pred_label = LE_y.inverse_transform(y_pred)
 | 
				
			||||||
 | 
							y_eval_label = LE_y.inverse_transform(y_eval)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# pid, y, y_pred
 | 
				
			||||||
 | 
							sentenceIDvec = np.ones((y_eval_label.shape[0], 1)).astype(int) * sentenceID
 | 
				
			||||||
 | 
							prediction_   = np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label]
 | 
				
			||||||
 | 
							if sentenceID == 1:
 | 
				
			||||||
 | 
								prediction = prediction_
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								prediction = np.r_[prediction, prediction_]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return prediction
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prediction_per_pid_majority(pidlist_eval, prediction):
 | 
				
			||||||
 | 
						""" make a prediction per pid using majority vote 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
							prediction_per_pid (ndarray): [pid, ans, prediction]
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						prediction_per_pid = []
 | 
				
			||||||
 | 
						for pid_ in range(0, len(pidlist_eval[:, 0])):
 | 
				
			||||||
 | 
							pid = pidlist_eval[pid_, 0]
 | 
				
			||||||
 | 
							ans = pidlist_eval[pid_, 1]
 | 
				
			||||||
 | 
							prediction_ = prediction[prediction[:, 1] == pid, :]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# majority vote
 | 
				
			||||||
 | 
							predCounter = Counter(prediction_[:, -1])
 | 
				
			||||||
 | 
							predMostCommon = predCounter.most_common(1)
 | 
				
			||||||
 | 
							predLabel = predMostCommon[0][0]
 | 
				
			||||||
 | 
							predRatio = predMostCommon[0][1] / prediction_.shape[0] * 100
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							prediction_per_pid.append([pid, ans, predLabel])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return np.array(prediction_per_pid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def calc_weight(confusionMatrixList):
 | 
				
			||||||
 | 
						""" calculate weight (how trustworthy the prediction is) for majority vote.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Note:
 | 
				
			||||||
 | 
							Of all subjects we predicted are GO/OG/LB, what fraction of them actually are (precision) is used as weight.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
						confusionMatrixList: list of confusion matrices of the sentence-level classifiers.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						sentenceID_max = len(confusionMatrixList)
 | 
				
			||||||
 | 
						weight = np.zeros((sentenceID_max, confusionMatrixList[0].shape[0]))
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceID_max+1):
 | 
				
			||||||
 | 
							cm = confusionMatrixList[sentenceID-1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# normalized confusion matrix
 | 
				
			||||||
 | 
							#rTotal = np.sum(cm, axis=1)
 | 
				
			||||||
 | 
							#cm_normalized = cm / rTotal
 | 
				
			||||||
 | 
							#weight[sentenceID-1, :] = np.diag(cm_normalized)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							true_positives = np.diag(cm)
 | 
				
			||||||
 | 
							predicted = np.sum(cm, axis=0)
 | 
				
			||||||
 | 
							weight[sentenceID-1, :] = true_positives / predicted
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return weight
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
 | 
				
			||||||
 | 
						""" make a prediction per pid using weighted (majority) vote. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							weight (ndarray): how trustworthy the prediction of each sentence-based classifier is.
 | 
				
			||||||
 | 
						LB_y: label binarizer, which is used to encode region names.
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
							prediction_per_pid (ndarray): [pid, ans, prediction]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						prediction_per_pid = []
 | 
				
			||||||
 | 
						for pid_ in range(0, len(pidlist_eval[:, 0])):
 | 
				
			||||||
 | 
							pid = pidlist_eval[pid_, 0]
 | 
				
			||||||
 | 
							ans = pidlist_eval[pid_, 1]
 | 
				
			||||||
 | 
							prediction_ = prediction[prediction[:, 1] == pid, :]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# calculate weighted (majority) vote
 | 
				
			||||||
 | 
							vote_weighted = np.zeros((1, 3))
 | 
				
			||||||
 | 
							for sentenceID_ in range(0, prediction_.shape[0]):
 | 
				
			||||||
 | 
								sentenceID = prediction_[sentenceID_, 0].astype(int)
 | 
				
			||||||
 | 
								w = weight[sentenceID-1, :]
 | 
				
			||||||
 | 
								pred = prediction_[sentenceID_, 3]
 | 
				
			||||||
 | 
								pred_int = LB_y.transform([pred])
 | 
				
			||||||
 | 
								vote_weighted = vote_weighted + w * pred_int
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# choose the label with the most votes
 | 
				
			||||||
 | 
							vote_weighted = vote_weighted[0]
 | 
				
			||||||
 | 
							maxindex = list(vote_weighted).index(max(vote_weighted))
 | 
				
			||||||
 | 
							#predLabel = regionLabels[maxindex]
 | 
				
			||||||
 | 
							predLabel = LE_y.inverse_transform(maxindex)
 | 
				
			||||||
 | 
							prediction_per_pid.append([pid, ans, predLabel])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return np.array(prediction_per_pid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def saxon_vs_limburg(pidlist3):
 | 
				
			||||||
 | 
						"""convert a pidlist for 3 regions into that for 2 regions.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Notes:
 | 
				
			||||||
 | 
							3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
							2 regions include ['Limburg', 'Low_Saxon']
 | 
				
			||||||
 | 
							where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'
 | 
				
			||||||
 | 
							samples are randomly chosen so that each class has the same amount of data. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						regionLabels  = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
						regionLabels2 = ['Low_Saxon', 'Limburg']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
 | 
				
			||||||
 | 
						pidlist_saxon_  = pidlist3[index_saxon, :]
 | 
				
			||||||
 | 
						pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						# extract the same amount of samples as Limburg.
 | 
				
			||||||
 | 
						pidlistCounter3 = Counter(pidlist3[:, 1])
 | 
				
			||||||
 | 
						pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
 | 
				
			||||||
 | 
						pidlist_saxon[:, 1] = regionLabels2[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
 | 
				
			||||||
 | 
						#pidlistCounter2 = Counter(pidlist2[:, 1])
 | 
				
			||||||
 | 
						return pidlist2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def groningen_vs_limburg(pidlist3):
 | 
				
			||||||
 | 
						"""convert a pidlist for 3 regions into that for 2 regions.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Notes:
 | 
				
			||||||
 | 
							3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
							2 regions include ['Groningen_and_Drenthe', 'Limburg']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						regionLabels  = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :] 
 | 
				
			||||||
 | 
						pidlist_limburg   = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pidlist2 = np.r_[pidlist_groningen, pidlist_limburg]
 | 
				
			||||||
 | 
						return pidlist2
 | 
				
			||||||
							
								
								
									
										44
									
								
								dialect_identification/test_code.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,44 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					import Levenshtein
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					a = 'hello'
 | 
				
			||||||
 | 
					b = 'haall'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# approximate
 | 
				
			||||||
 | 
					infinite = 100
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# make distance matrix D
 | 
				
			||||||
 | 
					len_a = len(a)
 | 
				
			||||||
 | 
					len_b = len(b)
 | 
				
			||||||
 | 
					D_ = np.zeros((len_a, len_b)).astype(int)
 | 
				
			||||||
 | 
					for ia in range(0, len_a):
 | 
				
			||||||
 | 
						a_ = a[ia]
 | 
				
			||||||
 | 
						for ib in range(0, len_b):
 | 
				
			||||||
 | 
							b_ = b[ib]
 | 
				
			||||||
 | 
							if a_ == b_:
 | 
				
			||||||
 | 
								D_[ia, ib] = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
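					# Illustration (computed from the hard-coded a='hello', b='haall' above):
					# D_ marks character matches; rows follow the letters of a, columns those of b:
					#   [[1 0 0 0 0]
					#    [0 0 0 0 0]
					#    [0 0 0 1 1]
					#    [0 0 0 1 1]
					#    [0 0 0 0 0]]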
					D = np.zeros((len_a+1, len_b+1)).astype(int)
 | 
				
			||||||
 | 
					D[1:len_a+1, 1:len_b+1] = D_
 | 
				
			||||||
 | 
					D[0, :] = infinite
 | 
				
			||||||
 | 
					D[:, 0] = infinite
 | 
				
			||||||
 | 
					D[0, 0] = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# calculate accumulated distance
 | 
				
			||||||
 | 
					# indexPath needs to be a 2-D array for the "indexPath[ia, ib] = index" assignment below
					indexPath = np.zeros((len_a, len_b)).astype(int)
 | 
				
			||||||
 | 
					for ia in range(0, len_a):
 | 
				
			||||||
 | 
						for ib in range(0, len_b):
 | 
				
			||||||
 | 
							a_ = a[ia]
 | 
				
			||||||
 | 
							b_ = b[ib]
 | 
				
			||||||
 | 
							option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib])
 | 
				
			||||||
 | 
							Dmin = np.min(option)
 | 
				
			||||||
 | 
							D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin
 | 
				
			||||||
 | 
							index = list(option).index(Dmin)
 | 
				
			||||||
 | 
							indexPath[ia, ib] = index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# back trace
 | 
				
			||||||
 | 
					ia = len_a
 | 
				
			||||||
 | 
					ib = len_b
 | 
				
			||||||
 | 
					#while (ia > 0 or ib > 0):
 | 
				
			||||||
 | 
					#	tb
 | 
				
			||||||
							
								
								
									
										56
									
								
								dialect_identification/word_based.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,56 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import configparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from matplotlib import pyplot
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					currDir    = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
 | 
				
			||||||
 | 
					sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
 | 
				
			||||||
 | 
					from dataIO import readFile
 | 
				
			||||||
 | 
					from dataIO import selectSamplesFromCombinedData
 | 
				
			||||||
 | 
					import dataManipulation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					configFile = currDir + '\\config.ini'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					config = configparser.ConfigParser()
 | 
				
			||||||
 | 
					config.sections()
 | 
				
			||||||
 | 
					config.read(configFile)
 | 
				
			||||||
 | 
					fileWordList = config['word_based']['fileWordList']
 | 
				
			||||||
 | 
					fileCombined = config['word_based']['fileCombined']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					wordList = readFile(fileWordList)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					for wordNum in range(1, len(wordList)):
 | 
				
			||||||
 | 
						word = wordList[wordNum-1] # target word
 | 
				
			||||||
 | 
						#print("=== {} ===".format(word))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sampleNumMax = 50
 | 
				
			||||||
 | 
						dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
 | 
				
			||||||
 | 
						dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
 | 
				
			||||||
 | 
						dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# combine pronunciation from three regions
 | 
				
			||||||
 | 
						# data: (sampleNumMax x 3) x 1
 | 
				
			||||||
 | 
						cPronunciation = 4
 | 
				
			||||||
 | 
						data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# MDS
 | 
				
			||||||
 | 
						dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data)
 | 
				
			||||||
 | 
						dataMDS = dataManipulation.MDS(dataLevenshtein)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# plot
 | 
				
			||||||
 | 
						# slice ends are exclusive, so no "-1" is needed; with it the last sample of each region is dropped
						pyplot.scatter(dataMDS[0:sampleNumMax, 0], dataMDS[0:sampleNumMax, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe")
						pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2, 0], dataMDS[sampleNumMax:sampleNumMax*2, 1], c='green', marker="^", facecolors='none', label="Limburg")
						pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3, 0], dataMDS[sampleNumMax*2:sampleNumMax*3, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pyplot.title(word)
 | 
				
			||||||
 | 
						#ax.set_xlabel('x')
 | 
				
			||||||
 | 
						#ax.set_ylabel('y')
 | 
				
			||||||
 | 
						pyplot.legend(loc='upper right')
 | 
				
			||||||
 | 
						#pyplot.show()
 | 
				
			||||||
 | 
						pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png')
 | 
				
			||||||
 | 
						pyplot.gcf().clear()
 | 
				
			||||||