commit to be sure.
dialect_identification.sln (new file, 38 lines)
							@@ -0,0 +1,38 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.12
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "dialect_identification", "dialect_identification\dialect_identification.pyproj", "{FE1B1358-ADBE-4446-AFFD-A0802D13D15B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5A4286D1-F037-43D4-90F8-05C5CCC0CA30}"
    ProjectSection(SolutionItems) = preProject
        ..\..\forced-alignment\forced_alignment\convert_phone_set.py = ..\..\forced-alignment\forced_alignment\convert_phone_set.py
        ..\..\forced-alignment\forced_alignment\defaultfiles.py = ..\..\forced-alignment\forced_alignment\defaultfiles.py
        ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj = ..\..\forced-alignment\forced_alignment\forced_alignment.pyproj
        ..\..\forced-alignment\forced_alignment\htk_dict.py = ..\..\forced-alignment\forced_alignment\htk_dict.py
        ..\..\forced-alignment\forced_alignment\lexicon.py = ..\..\forced-alignment\forced_alignment\lexicon.py
        ..\..\forced-alignment\forced_alignment\mlf.py = ..\..\forced-alignment\forced_alignment\mlf.py
        ..\..\forced-alignment\forced_alignment\pronunciations.py = ..\..\forced-alignment\forced_alignment\pronunciations.py
        ..\..\forced-alignment\forced_alignment\pyhtk.py = ..\..\forced-alignment\forced_alignment\pyhtk.py
        ..\..\forced-alignment\forced_alignment\scripts.py = ..\..\forced-alignment\forced_alignment\scripts.py
        ..\..\forced-alignment\forced_alignment\tempfilename.py = ..\..\forced-alignment\forced_alignment\tempfilename.py
        ..\..\forced-alignment\forced_alignment\test_environment.py = ..\..\forced-alignment\forced_alignment\test_environment.py
    EndProjectSection
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|Any CPU = Debug|Any CPU
        Release|Any CPU = Release|Any CPU
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {FE1B1358-ADBE-4446-AFFD-A0802D13D15B}.Release|Any CPU.ActiveCfg = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {FA4F83BB-D460-40C1-B10E-98E4877CA29B}
    EndGlobalSection
EndGlobal
							
								
								
									
dialect_identification/audio2db.py (new file, 90 lines)
							@@ -0,0 +1,90 @@
import os
import sys
import configparser

import numpy as np
import pypyodbc


## user-defined settings
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
dir_same_utterance = 'd:\\OneDrive\\Research\\rug\\experiments\\same_utterance'
wav_dir      = dir_same_utterance + '\\wav_with_cities'
script_dir   = dir_same_utterance + '\\script'
fileMDB      = dir_same_utterance + '\\feature\\DialectClassification.accdb'
table        = 'ForcedAlignmentResult'
regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

# these lines are not necessary once forced-alignment is installed as a package.
sys.path.append(forced_alignment_module)
from forced_alignment import forced_alignment


## check whether forced alignment works on a single sentence
#from forced_alignment import pronunciations
#pronunciations.delete_all_g2p_entries()

#wav_file = wav_dir + '\\10\\' + regionLabels[0] + '\\9935-1464218044-1951631.wav'
#script_file = script_dir + '\\script10.txt'
#with open(script_file, 'r') as fin:
#    script = fin.readline()
#fa = forced_alignment(wav_file, script)


## make database connection
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

SQLstring1 = 'INSERT INTO ' + table + ' (filename, region, word_id, pronunciation) '


## forced alignment of all the wav files in dir_same_utterance
word_id_start = 1
for sentenceID in range(1, 11):
    sentenceIDstr = format(sentenceID, '02')

    # get script
    script_file = script_dir + '\\script' + sentenceIDstr + '.txt'
    with open(script_file, 'r') as fin:
        script = fin.readline()

        # loop over three regions
        for region in regionLabels:

            # loop over the wav_subdir
            wav_subdir = wav_dir + '\\' + sentenceIDstr + '\\' + region
            wav_files = os.listdir(wav_subdir)
            file_nr = 0
            for wav_file in wav_files:
                file_nr += 1
                filename = wav_file.replace('.wav', '')
                wav_file_fullpath = wav_subdir + '\\' + wav_file

                # forced alignment
                print('{0} {1}: {2} ({3}/{4})'.format(sentenceIDstr, region, wav_file, file_nr, len(wav_files)))
                fa = forced_alignment(wav_file_fullpath, script)

                # send pronunciation variant to database
                word_id = word_id_start
                for row in fa:
                    word     = row[0]
                    phonemes = np.array(row[1])

                    ## get pronunciation variant
                    pronvar_ = phonemes[:, 2]
                    pronvar_[np.where(pronvar_ == 'ssil')] = ''  # remove 'ssil'
                    pronvar = ''.join(pronvar_)

                    ## insert the result into the database.
                    SQLstring2 = 'VALUES (\'' + filename + '\',\'' + region + '\',\'' + str(word_id) + '\',\'' + pronvar + '\')'
                    SQLstring  = SQLstring1 + SQLstring2
                    cursor.execute(SQLstring)
                    conn.commit()

                    word_id = word_id + 1

    word_id_start += script.count(' ') + 1

conn.close()
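The INSERT above is assembled by string concatenation, so a pronunciation variant containing a quote character would break the statement. A minimal sketch of a parameterized alternative, reusing param and table from the script above; the values passed are placeholders (the sample filename is the one from the commented-out test):

# Sketch only, not part of the commit: the same INSERT with '?' placeholders,
# so quoting inside a pronunciation variant cannot break the SQL statement.
import pypyodbc

conn = pypyodbc.connect(param)          # param as defined in audio2db.py
cursor = conn.cursor()
sql = ('INSERT INTO ' + table +
       ' (filename, region, word_id, pronunciation) VALUES (?, ?, ?, ?)')
cursor.execute(sql, ('9935-1464218044-1951631', 'Limburg', '1', "pron'var"))
conn.commit()
conn.close()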
							
								
								
									
dialect_identification/classifier.py (new file, 290 lines)
							@@ -0,0 +1,290 @@
'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
   1. Download a dataset (using pandas)
   2. Process the numeric data (using numpy)
   3. Train and evaluate learners (using scikit-learn)
   4. Plot and compare results (using matplotlib)


The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://mlr.cs.umass.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass

# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers row from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame


# =====================================================================


def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately.

    # If values are missing we could impute them from the training data
    #from sklearn.preprocessing import Imputer
    #imputer = Imputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test


# =====================================================================


def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall

# =====================================================================


def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves

    fig = plt.figure(figsize=(6, 6))
    fig.canvas.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, fmt='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()


# =====================================================================


if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
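As the docstring notes, URL can point at other data as long as the read_table options are reviewed. A minimal sketch, assuming a hypothetical local file spam_features.csv with comma-separated values, no header row, and binary labels in the last column:

# Sketch only: pointing the template at a hypothetical local CSV instead of
# the Spambase URL; read_table() accepts local paths as well as URLs.
import classifier

classifier.URL = 'spam_features.csv'   # hypothetical file; review sep/header/encoding in download_data()
frame = classifier.download_data()
X_train, X_test, y_train, y_test = classifier.get_features_and_labels(frame)
classifier.plot(list(classifier.evaluate_classifier(X_train, X_test, y_train, y_test)))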
							
								
								
									
dialect_identification/config.ini (new file, 8 lines)
							@@ -0,0 +1,8 @@
[word_based]
fileWordList = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\wordList.csv
fileCombined = D:\\OneDrive\\Research\\rug\\same_utterance\\feature\\combined.csv

[sentence_based]
dirFeature = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature
fileMDB = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\feature\\DialectClassification.accdb
dirData = D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\wav
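These keys are read back with the standard-library configparser; a minimal sketch of the lookup, mirroring what sentence_based.py does further down:

# Sketch: reading config.ini, mirroring the lookup in sentence_based.py.
import configparser

config = configparser.ConfigParser()
config.read('config.ini')
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']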
							
								
								
									
dialect_identification/data_io.py (new file, 74 lines)
							@@ -0,0 +1,74 @@
#
# 2017/09/25
# select samples from the combined.csv for further analysis
#
# HISTORY
# 2017/10/02 modularized.
#
# Aki Kunikoshi
# 428968@gmail.com
#
import numpy as np


def readFile(filename):
    with open(filename, 'r') as fin:
        lines = fin.read()
    linesEach = lines.split('\n')
    return linesEach


def selectSamplesFromCombinedData(word, fileCombined):
    # load combined data
    fin = open(fileCombined, 'r')
    line = fin.readline()

    # load data per region
    dataGroningen = []
    dataLimburg   = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == 6 and lineList[5] == word:
            region = lineList[2]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (dataGroningen, dataLimburg, dataOverijsel)
    #print("{0}: {1} {2} {3}".format(word, len(listGroningen), len(listLimburg), len(listOverijsel)))


def groupSamplesInCSV(fileCSV, idxRegion):
    fin = open(fileCSV, 'r')

    # first line is the header
    line = fin.readline()
    line = line.rstrip()
    header = line.split(',')

    # load data per region
    dataGroningen = []
    dataLimburg   = []
    dataOverijsel = []
    while line:
        line = fin.readline()
        line = line.rstrip()
        lineList = line.split(',')
        if len(lineList) == len(header):
            region = lineList[idxRegion]
            if region == 'Groningen_and_Drenthe':
                dataGroningen.append(lineList)
            elif region == 'Limburg':
                dataLimburg.append(lineList)
            elif region == 'Oost_Overijsel-Gelderland':
                dataOverijsel.append(lineList)
    fin.close()
    return (header, dataGroningen, dataLimburg, dataOverijsel)


def addUserID(featureFile, recordingsCSV):
    dirFeature = config['sentence_based']['dirFeature']
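groupSamplesInCSV expects a CSV with a header row and the region label ('Groningen_and_Drenthe', 'Limburg' or 'Oost_Overijsel-Gelderland') in the given column. A minimal usage sketch with a hypothetical per-sentence file 01.csv; the module is imported under its committed file name, data_io:

# Sketch only: group one hypothetical feature CSV by region.
from data_io import groupSamplesInCSV

header, dataG, dataL, dataO = groupSamplesInCSV('01.csv', idxRegion=1)
print(len(header), 'columns')
print(len(dataG), len(dataL), len(dataO), 'samples per region')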
							
								
								
									
dialect_identification/data_manipulation.py (new file, 41 lines)
							@@ -0,0 +1,41 @@
import numpy as np
from sklearn import manifold
import Levenshtein

# x: ndarray (dnum x dim)
# n: number of samples to extract
# OUTPUT
# index: index of the chosen samples
#
def extractRandomSample(x, n):
    xRowMax = x.shape[0]
    indexOriginal = np.arange(xRowMax)
    indexChosen = np.random.choice(indexOriginal, n, False)
    xChosen = x[indexChosen, :]
    return (xChosen, indexChosen)

# x: 1d string ndarray
def makeLevenshteinMatrix(x):
    xRowMax = x.shape[0]
    xLevenshtein = np.ones((xRowMax, xRowMax), dtype='int')

    for xRow in range(0, xRowMax):
        for xCol in range(0, xRowMax):
            dist = Levenshtein.distance(x[xRow], x[xCol])
            xLevenshtein[xRow, xCol] = dist
    return xLevenshtein

# x: 1d string ndarray
def calcLevenshteinArray(word, x):
    xRowMax = x.shape[0]
    xLevenshtein = np.zeros(x.shape, dtype='int')

    for xRow in range(0, xRowMax):
        dist = Levenshtein.distance(word, x[xRow])
        xLevenshtein[xRow] = dist
    return xLevenshtein

def MDS(x):
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    xmds = mds.fit_transform(x)
    return xmds
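makeLevenshteinMatrix returns a symmetric distance matrix, which MDS embeds directly because the MDS instance is created with dissimilarity="precomputed". A minimal sketch with made-up pronunciation strings (the module is imported under its committed file name):

# Sketch only: pairwise Levenshtein distances between made-up pronunciation
# variants, embedded in 2-D with the precomputed-dissimilarity MDS above.
import numpy as np
import data_manipulation

pronvars = np.array(['hOndart', 'hOndert', 'hondErt', 'hOnderd'])
dist = data_manipulation.makeLevenshteinMatrix(pronvars)  # (4, 4) int matrix
coords = data_manipulation.MDS(dist)                      # (4, 2) embedding
print(coords)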
							
								
								
									
dialect_identification/dialect_identification.pyproj (new file, 70 lines)
							@@ -0,0 +1,70 @@
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>fe1b1358-adbe-4446-affd-a0802d13d15b</ProjectGuid>
    <ProjectTypeGuids>{a41c8ea1-112a-4a2d-9f91-29557995525f};{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
    <ProjectHome>.</ProjectHome>
    <StartupFile>output_confusion_matrix.py</StartupFile>
    <SearchPath>
    </SearchPath>
    <WorkingDirectory>.</WorkingDirectory>
    <OutputPath>.</OutputPath>
    <Name>dialect_identification</Name>
    <RootNamespace>dialect_identification</RootNamespace>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
    <DebugSymbols>true</DebugSymbols>
    <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="manipulate_db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="audio2db.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="classifier.py" />
    <Compile Include="dataManipulation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="output_confusion_matrix.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="sentence_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="speaker_based_functions.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="test_code.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="evaluation.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="word_based.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="dataIO.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
       Visual Studio and specify your pre- and post-build commands in
       the BeforeBuild and AfterBuild targets below. -->
  <!--<Target Name="CoreCompile" />-->
  <Target Name="BeforeBuild">
  </Target>
  <Target Name="AfterBuild">
  </Target>
</Project>
							
								
								
									
dialect_identification/evaluation.py (new file, 40 lines)
							@@ -0,0 +1,40 @@
import numpy as np
import scipy as sp
import scipy.stats
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, confidence):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h

# like cross_val_score, but an accumulated confusion matrix is returned as well
def cross_val_confusion_matrix(model, X, y, cv):
    kf = KFold(n_splits=cv)
    classLabels = np.unique(y)
    classNumMax = classLabels.shape[0]
    confusionMatrixAccumulated = np.zeros((classNumMax, classNumMax))
    scores = []
    for idx_train, idx_test in kf.split(X):
        # split into train/test
        x_train = X[idx_train, :]
        x_test  = X[idx_test, :]
        y_train = y[idx_train]
        y_test  = y[idx_test]
        modelfit = model.fit(x_train, y_train)

        # evaluation
        y_pred = modelfit.predict(x_test)

        score = f1_score(y_test, y_pred, average='micro')
        scores.append(score)
        confusionMatrixAccumulated = confusionMatrixAccumulated + confusion_matrix(y_test, y_pred,
            labels=classLabels)
    scores = np.array(scores)
    return scores, confusionMatrixAccumulated
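A minimal sketch of how the two helpers combine, with placeholder features and labels standing in for the one-hot features and encoded region labels built in sentence_based.py:

# Sketch only: 5-fold evaluation with the helpers above; X and y are placeholders.
import numpy as np
from sklearn.svm import SVC
from evaluation import cross_val_confusion_matrix, mean_confidence_interval

X = np.random.rand(60, 10)        # placeholder feature matrix
y = np.random.randint(0, 3, 60)   # placeholder labels for three regions
scores, cm = cross_val_confusion_matrix(SVC(kernel='linear'), X, y, cv=5)
m, lower, upper = mean_confidence_interval(scores, confidence=0.95)
print('micro F1: {:.2f} [{:.2f}, {:.2f}]'.format(m, lower, upper))
print(cm)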
							
								
								
									
dialect_identification/manipulate_db.py (new file, 48 lines)
							@@ -0,0 +1,48 @@
import sys
import os
import pandas
import datetime
sys.path.append('..')

# these lines are not necessary once forced-alignment is installed as a package.
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
sys.path.append(forced_alignment_module)
from forced_alignment import pronunciations
from forced_alignment.htk_dict import variances_table


#pronunciations.delete_word('kunikoshi')
#pronunciations.delete_all_g2p_entries()


#existing_pronunciations = set(pronunciations.get_all())
## only focus on word


## missing pronunciations
## (1) pronunciation is written in IPA.
## (2) pronunciation variants are made based on (1).
## (3) they are converted into HTK format.
#missing_pronunciations_file = 'D:\\OneDrive\\Research\\rug\\experiments\\same_utterance\\missing_words_in_barbara_dic\\missing_words_pronvarsHTK.txt'

#with open(missing_pronunciations_file) as fin:
#    lines = fin.read()
#    lines = lines.split('\n')

#source = 'generated using ipa transcription by Marita Everhardt.'
#inserts = []
#for line in lines:
#    line = line.split('\t')
#    word = line[0].strip().lower()
#    pronounciation = line[1].strip().split()

#    # surely not in the table
#    #if (word, pronounciation) not in existing_pronunciations:
#    inserts.append("('{}', '{}', '{}', '{}', 0)".format(
#        word,
#        ' '.join(pronounciation),
#        source,
#        datetime.datetime.now(), ))

#sql = """INSERT INTO pronunciations (word, pronunciation, collection, added, automatic) VALUES\n  {};""".format(
#    ',\n  '.join(inserts)
							
								
								
									
dialect_identification/output_confusion_matrix.py (new file, 79 lines)
							@@ -0,0 +1,79 @@
import os
import sys

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))

regionLabels  = ['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
dirOut = currDir + '\\result\\same-utterance_with_cities'


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Note:
    this code is downloaded from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    _fontsize = 24
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    #plt.title(title, fontsize=_fontsize+2)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    #plt.xticks(tick_marks, classes, rotation=45, fontsize=_fontsize-2)
    plt.xticks(tick_marks, classes, fontsize=_fontsize-4)
    plt.yticks(tick_marks, classes, fontsize=_fontsize-4)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=_fontsize)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)
    plt.ylabel('True label', fontsize=_fontsize-4)
    plt.xlabel('Predicted label', fontsize=_fontsize-4)


pred = np.load(dirOut + '\\pred_per_pid_3regions.npy')

#accuracy = accuracy_score(pred[:, 1], pred[:, 2], normalize=True, sample_weight=None)
#print('accuracy: {}%'.format(accuracy * 100))

# confusion matrix
cm = confusion_matrix(pred[:, 1], pred[:, 2], labels=regionLabels)
# human perception (2 regions)
#cm = np.array([[39, 57], [6, 104]])
# human perception (3 regions)
#cm = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
print(cm)

np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(cm, classes=['GD', 'OG', 'LB'], normalize=True)
#plot_confusion_matrix(cm, classes=['GD', 'LB'], normalize=True)

#plt.show()
plt.savefig(dirOut + '\\cm_machine_3regions_normalized.png')
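plot_confusion_matrix only needs a square count matrix and class labels; pasted at the end of the script above (where plt and the helper are already defined), this sketch would render the commented-out 3-region human-perception counts instead of the machine predictions:

# Sketch: the same helper applied to the commented-out human-perception counts.
cm_perception = np.array([[22, 14, 52], [23, 21, 52], [5, 5, 100]])
plt.figure()
plot_confusion_matrix(cm_perception, classes=['GD', 'OG', 'LB'], normalize=True)
plt.show()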
							
								
								
									
dialect_identification/sentence_based.py (new file, 197 lines; diff truncated below)
							@@ -0,0 +1,197 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import configparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					from matplotlib import pyplot
 | 
				
			||||||
 | 
					from sklearn.model_selection import train_test_split
 | 
				
			||||||
 | 
					from sklearn.model_selection import cross_val_score
 | 
				
			||||||
 | 
					from sklearn import preprocessing
 | 
				
			||||||
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# database
 | 
				
			||||||
 | 
					import pypyodbc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# classifier
 | 
				
			||||||
 | 
					from sklearn.neighbors import KNeighborsClassifier
 | 
				
			||||||
 | 
					from sklearn.svm import SVC
 | 
				
			||||||
 | 
					from sklearn.tree import DecisionTreeClassifier
 | 
				
			||||||
 | 
					from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 | 
				
			||||||
 | 
					from sklearn.naive_bayes import GaussianNB
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 | 
				
			||||||
 | 
					from sklearn.metrics import f1_score
 | 
				
			||||||
 | 
					from sklearn.metrics import confusion_matrix
 | 
				
			||||||
 | 
					import pickle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					currDir    = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
 | 
				
			||||||
 | 
					sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
 | 
				
			||||||
 | 
					from dataIO import readFile
 | 
				
			||||||
 | 
					from dataIO import groupSamplesInCSV
 | 
				
			||||||
 | 
					import dataManipulation
 | 
				
			||||||
 | 
					import utility as util
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					configFile = currDir + '\\config.ini'
 | 
				
			||||||
 | 
					# load init file
 | 
				
			||||||
 | 
					config = configparser.ConfigParser()
 | 
				
			||||||
 | 
					config.sections()
 | 
				
			||||||
 | 
					config.read(configFile)
 | 
				
			||||||
 | 
					dirFeature = config['sentence_based']['dirFeature']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					sentenceNumMax = 10
 | 
				
			||||||
 | 
					classifierList = []
 | 
				
			||||||
 | 
					LE_X_decode	   = []
 | 
				
			||||||
 | 
					LE_y = preprocessing.LabelEncoder()
 | 
				
			||||||
 | 
					LE_y.fit(["Groningen_and_Drenthe", "Limburg", "Oost_Overijsel-Gelderland"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					testset_X = []
 | 
				
			||||||
 | 
					testset_y = []
 | 
				
			||||||
 | 
					testset_userID = []
 | 
				
			||||||
 | 
					result_y_test = []
 | 
				
			||||||
 | 
					result_y_prediction = []
 | 
				
			||||||
 | 
					fout = open("comparison.csv", "w")
 | 
				
			||||||
 | 
					for sentenceNum in range(1, sentenceNumMax+1):
 | 
				
			||||||
 | 
						#if sentenceNum != 10:
 | 
				
			||||||
 | 
						#	sentenceNumStr = '0' + str(sentenceNum)
 | 
				
			||||||
 | 
						#else:
 | 
				
			||||||
 | 
						#	sentenceNumStr = str(sentenceNumStr)
 | 
				
			||||||
 | 
						sentenceNumStr = format(sentenceNum, '02')
 | 
				
			||||||
 | 
						fileSentence = dirFeature + '\\\\' + sentenceNumStr + '.csv'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## load combined data 
 | 
				
			||||||
 | 
						fileCSV = fileSentence
 | 
				
			||||||
 | 
						idxRegion = 1
 | 
				
			||||||
 | 
						header, dataGroningen, dataLimburg, dataOverijsel = groupSamplesInCSV(fileCSV, idxRegion)
 | 
				
			||||||
 | 
						sampleNumMax = np.min((len(dataGroningen), len(dataLimburg), len(dataOverijsel)))
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## make balanced dataset
 | 
				
			||||||
 | 
						dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
 | 
				
			||||||
 | 
						dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
 | 
				
			||||||
 | 
						dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						XIndex = np.arange(idxRegion+1, len(header))
 | 
				
			||||||
 | 
						yIndex = 1 # region
 | 
				
			||||||
 | 
						userIDindex = 0 # userID
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						## categorical values into numbers
 | 
				
			||||||
 | 
						X_ = np.r_[dataG[:, XIndex], dataL[:, XIndex], dataO[:, XIndex]]
 | 
				
			||||||
 | 
						y_ = np.r_[dataG[:, yIndex], dataL[:, yIndex], dataO[:, yIndex]]
 | 
				
			||||||
 | 
						userID_ = np.r_[dataG[:, userIDindex], dataL[:, userIDindex], dataO[:, userIDindex]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						#X = np.zeros((X_.shape), 'int')
 | 
				
			||||||
 | 
						for Xindex in XIndex:
 | 
				
			||||||
 | 
							x = X_[:, Xindex-2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## levenshtein distance
 | 
				
			||||||
 | 
							#word_count = Counter(x)
 | 
				
			||||||
 | 
							#frequent_word = max(word_count)
 | 
				
			||||||
 | 
							#X[:, Xindex-2] = dataManipulation.calcLevenshteinArray(frequent_word, x)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# one-hot encoding
 | 
				
			||||||
 | 
							le_x = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
							le_x.fit(np.unique(x))
 | 
				
			||||||
 | 
							x_ = le_x.transform(x)
 | 
				
			||||||
 | 
							LE_X_decode.append(x_.shape[1])
 | 
				
			||||||
 | 
							if Xindex == idxRegion+1:
 | 
				
			||||||
 | 
								X = x_
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								X = np.c_[X, x_]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						y = LE_y.transform(y_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## split into train vs test set
 | 
				
			||||||
 | 
						#[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						# data from each region should be split equally
 | 
				
			||||||
 | 
						lenG = dataG.shape[0]
 | 
				
			||||||
 | 
						lenL = dataL.shape[0]
 | 
				
			||||||
 | 
						lenO = dataO.shape[0]
 | 
				
			||||||
 | 
						indexG = np.arange(0, lenG)
 | 
				
			||||||
 | 
						indexL = np.arange(lenG, lenG+lenL)
 | 
				
			||||||
 | 
						indexO = np.arange(lenG+lenL, lenG+lenL+lenO)
 | 
				
			||||||
 | 
						[XG_train, XG_test, yG_train, yG_test] = train_test_split(X[indexG, :], y[indexG], test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						[XL_train, XL_test, yL_train, yL_test] = train_test_split(X[indexL, :], y[indexL], test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						[XO_train, XO_test, yO_train, yO_test] = train_test_split(X[indexO, :], y[indexO], test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
						X_train = np.r_[XG_train, XL_train, XO_train]
 | 
				
			||||||
 | 
						X_test  = np.r_[XG_test, XL_test, XO_test]
 | 
				
			||||||
 | 
						y_train = np.r_[yG_train, yL_train, yO_train]
 | 
				
			||||||
 | 
						y_test  = np.r_[yG_test, yL_test, yO_test]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						## comparison
 | 
				
			||||||
 | 
						## classifiers
 | 
				
			||||||
 | 
						#names = ["Nearest Neighbors", 
 | 
				
			||||||
 | 
						#		 "Linear SVM",
 | 
				
			||||||
 | 
						#		 "Poly SVM",
 | 
				
			||||||
 | 
						#		 "RBF SVM", 
 | 
				
			||||||
 | 
						#		 "Decision Tree",
 | 
				
			||||||
 | 
						#		 "Random Forest 2", 
 | 
				
			||||||
 | 
						#		 "Random Forest 3", 
 | 
				
			||||||
 | 
						#		 "Random Forest 4", 
 | 
				
			||||||
 | 
						#		 "AdaBoost", 
 | 
				
			||||||
 | 
						#		 #"Naive Bayes", 
 | 
				
			||||||
 | 
						#		 "Linear Discriminant Analysis",
 | 
				
			||||||
 | 
						#		 #"Quadratic Discriminant Analysis"
 | 
				
			||||||
 | 
						#		 ]
 | 
				
			||||||
 | 
						#classifiers = [
 | 
				
			||||||
 | 
						#	KNeighborsClassifier(3),
 | 
				
			||||||
 | 
						#	SVC(kernel="linear", C=0.025),
 | 
				
			||||||
 | 
						#	SVC(kernel="poly", C=0.025),
 | 
				
			||||||
 | 
						#	SVC(gamma=2, C=1),
 | 
				
			||||||
 | 
						#	DecisionTreeClassifier(max_depth=4),
 | 
				
			||||||
 | 
						#	RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
						#	RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
						#	RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
						#	AdaBoostClassifier(),
 | 
				
			||||||
 | 
						#	#GaussianNB(),
 | 
				
			||||||
 | 
						#	LinearDiscriminantAnalysis(),
 | 
				
			||||||
 | 
						#	#QuadraticDiscriminantAnalysis()
 | 
				
			||||||
 | 
						#	]
 | 
				
			||||||
 | 
						#for name, model in zip(names, classifiers):
 | 
				
			||||||
 | 
						#	scores = cross_val_score(model, X, y, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
						#	fout = open("comparison.csv", "a")
 | 
				
			||||||
 | 
						#	fout.write("{0},{1},{2}\n".format(sentenceNum, name, scores.mean()))
 | 
				
			||||||
 | 
						#	print('{0}, {1}: {2}'.format(sentenceNum, name, scores.mean()))
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						# quasi-optimal model
 | 
				
			||||||
 | 
						model = AdaBoostClassifier()
 | 
				
			||||||
 | 
						# cross validation
 | 
				
			||||||
 | 
						scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
						ci_mean, ci_low, ci_high = util.mean_confidence_interval(scores, 0.95)
 | 
				
			||||||
 | 
						modelfit = model.fit(X_train, y_train)
 | 
				
			||||||
 | 
						# f1 on test data
 | 
				
			||||||
 | 
						y_prediction = modelfit.predict(X_test)
 | 
				
			||||||
 | 
						f1score = f1_score(y_test, y_prediction, average='micro')
 | 
				
			||||||
 | 
						fout.write("{0},{1},{2},{3}\n".format(ci_mean, ci_low, ci_high, f1score))
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						## save for the test
 | 
				
			||||||
 | 
						testset_X.append(X_test)
 | 
				
			||||||
 | 
						testset_y.append(y_test)
 | 
				
			||||||
 | 
						testset_userID.append(userID_)
 | 
				
			||||||
 | 
						result_y_test = result_y_test + list(y_test)
 | 
				
			||||||
 | 
						result_y_prediction = result_y_prediction + list(y_prediction)
 | 
				
			||||||
 | 
						fileClassifier = dirFeature + '\\\\' + sentenceNumStr + '.mdl'
 | 
				
			||||||
 | 
						pickle.dump(modelfit, open(fileClassifier, 'wb'))
 | 
				
			||||||
 | 
					fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### confusion matrix
 | 
				
			||||||
 | 
					result_y_test_label = LE_y.inverse_transform(result_y_test)
 | 
				
			||||||
 | 
					result_y_prediction_label = LE_y.inverse_transform(result_y_prediction)
 | 
				
			||||||
 | 
					confusionMatrix = confusion_matrix(result_y_test_label, result_y_prediction_label, labels=[
 | 
				
			||||||
 | 
						'Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland'])
 | 
				
			||||||
 | 
					print(confusionMatrix)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### make userID list 
 | 
				
			||||||
 | 
					#userID = testset_userID[0]
 | 
				
			||||||
 | 
					#for sentenceNum in range(1, sentenceNumMax):
 | 
				
			||||||
 | 
					#	userid = testset_userID[sentenceNum]
 | 
				
			||||||
 | 
					#	userID = np.r_[userID, userid]
 | 
				
			||||||
 | 
					#userIDlist = np.unique(userID)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										326
									
								
								dialect_identification/speaker_based.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,326 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import configparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pypyodbc
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from sklearn.model_selection import train_test_split
 | 
				
			||||||
 | 
					from sklearn.model_selection import cross_val_score
 | 
				
			||||||
 | 
					from sklearn import preprocessing
 | 
				
			||||||
 | 
					from sklearn.metrics import confusion_matrix
 | 
				
			||||||
 | 
					from sklearn.metrics import accuracy_score
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
 | 
				
			||||||
 | 
					sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
 | 
				
			||||||
 | 
					import dataManipulation as mani
 | 
				
			||||||
 | 
					import evaluation as eval
 | 
				
			||||||
 | 
					import speaker_based_functions as sb_func
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#####################
 | 
				
			||||||
 | 
					##   USER DEFINE   ##
 | 
				
			||||||
 | 
					#####################
 | 
				
			||||||
 | 
					sentenceNumMax = 10
 | 
				
			||||||
 | 
					configFile = currDir + '\\config.ini'
 | 
				
			||||||
 | 
					dirOut = currDir + '\\result'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# make train/test set: 1, load: 0
 | 
				
			||||||
 | 
					makeTrainTestSet = 0
 | 
				
			||||||
 | 
					# convert 3 regions to 2 regions: 1, load: 0
 | 
				
			||||||
 | 
					conv3to2region   = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# 3 regions: 0
 | 
				
			||||||
 | 
					# saxon vs limburg: 1
 | 
				
			||||||
 | 
					# groningen vs limburg: 2
 | 
				
			||||||
 | 
					experiment_type = 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					regionLabels  = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# a bit useless error handling.
					#assert (experiment_type in (0, 1, 2)), "experiment type should be 0, 1 or 2."
					if experiment_type == 1:
						regionLabels2 = ['Low_Saxon', 'Limburg']
					else:
						regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					##########################
 | 
				
			||||||
 | 
					##   DATA PREPARATION   ##
 | 
				
			||||||
 | 
					##########################
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## load init file
 | 
				
			||||||
 | 
					config = configparser.ConfigParser()
 | 
				
			||||||
 | 
					config.sections()
 | 
				
			||||||
 | 
					config.read(configFile)
 | 
				
			||||||
 | 
					dirFeature = config['sentence_based']['dirFeature']
 | 
				
			||||||
 | 
					fileMDB = config['sentence_based']['fileMDB']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## database connection
 | 
				
			||||||
 | 
					pypyodbc.lowercase = False
 | 
				
			||||||
 | 
					param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
 | 
				
			||||||
 | 
					conn = pypyodbc.connect(param)
 | 
				
			||||||
 | 
					cursor = conn.cursor()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## get data from Access database
 | 
				
			||||||
 | 
					# data format
 | 
				
			||||||
 | 
					#	0: filename
 | 
				
			||||||
 | 
					#	1: pid
 | 
				
			||||||
 | 
					#	2: region
 | 
				
			||||||
 | 
					#	3: ID (unique word_id)
 | 
				
			||||||
 | 
					#	4: sentence_id
 | 
				
			||||||
 | 
					#	5: word_id
 | 
				
			||||||
 | 
					#	6: word
 | 
				
			||||||
 | 
					#	7: pronunciation
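					# e.g. one row (purely hypothetical values, for illustration only):
					# ['rec_0001.wav', 'p023', 'Limburg', 12, 2, 3, 'huis', 'hoes']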
 | 
				
			||||||
 | 
					SQL_string = """\
 | 
				
			||||||
 | 
					{CALL dataset_with_cities}
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					cursor.execute(SQL_string)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					rows = cursor.fetchall()
 | 
				
			||||||
 | 
					data = np.array(rows)
 | 
				
			||||||
 | 
					#dataNumMax = data.shape[0]
 | 
				
			||||||
 | 
					#uniqueWordIDmax = max(data[:, 3].astype(int))
 | 
				
			||||||
 | 
					del SQL_string, rows
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make a list of LabelBinarizer objects, one per word.
 | 
				
			||||||
 | 
					# for X
 | 
				
			||||||
 | 
					# get pronvarList from Access database 
 | 
				
			||||||
 | 
					# pronvarList format
 | 
				
			||||||
 | 
					#	0: ID (unique word_id)
 | 
				
			||||||
 | 
					#	1: word
 | 
				
			||||||
 | 
					#	2: pronvar
 | 
				
			||||||
 | 
					SQL_string = """\
 | 
				
			||||||
 | 
					{CALL pronunciation_variant}
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					cursor.execute(SQL_string)
 | 
				
			||||||
 | 
					rows = cursor.fetchall()
 | 
				
			||||||
 | 
					pronvarList = np.array(rows)
 | 
				
			||||||
 | 
					del SQL_string, rows
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					LBlist = []
 | 
				
			||||||
 | 
					#uniqueWordIDlist = pronvarList[:, 0].astype(int)
 | 
				
			||||||
 | 
					uniqueWordIDlist = data[:, 3].astype(int)
 | 
				
			||||||
 | 
					uniqueWordIDmax  = max(uniqueWordIDlist)
 | 
				
			||||||
 | 
					for uniqueWordID in range(1, uniqueWordIDmax+1):
 | 
				
			||||||
 | 
						pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
 | 
				
			||||||
 | 
						#pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
 | 
				
			||||||
 | 
						LB = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
						LB.fit(np.unique(pronvar))
 | 
				
			||||||
 | 
						LBlist.append(LB)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# for y (=region)
 | 
				
			||||||
 | 
					LE_y = preprocessing.LabelEncoder()
 | 
				
			||||||
 | 
					LE_y.fit(regionLabels)
 | 
				
			||||||
 | 
					LE_y2 = preprocessing.LabelEncoder()
 | 
				
			||||||
 | 
					LE_y2.fit(regionLabels2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					LB_y = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
					LB_y.fit(regionLabels)
 | 
				
			||||||
 | 
					LB_y2 = preprocessing.LabelBinarizer()
 | 
				
			||||||
 | 
					LB_y2.fit(regionLabels2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					del uniqueWordID, uniqueWordIDmax, pronvar, LB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#################
 | 
				
			||||||
 | 
					##  ITERATION  ##
 | 
				
			||||||
 | 
					#################
 | 
				
			||||||
 | 
					#CM_majority = np.zeros((1, 9)).astype(int)
 | 
				
			||||||
 | 
					#CM_weighted = np.zeros((1, 9)).astype(int)
 | 
				
			||||||
 | 
					#for iter in range(0, 1):
 | 
				
			||||||
 | 
					#	print(iter)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make balanced dataset
 | 
				
			||||||
 | 
					pidlist = np.unique(data[:, (1, 2)], axis=0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# count number of samples
 | 
				
			||||||
 | 
					pidlistCounter = Counter(pidlist[:, 1])
 | 
				
			||||||
 | 
					sampleNumMax = min(pidlistCounter.values())
 | 
				
			||||||
 | 
					del pidlistCounter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make train/eval/test set or load
 | 
				
			||||||
 | 
					if makeTrainTestSet==1:
 | 
				
			||||||
 | 
						pidlist_train = []
 | 
				
			||||||
 | 
						pidlist_eval  = []
 | 
				
			||||||
 | 
						pidlist_test  = []
 | 
				
			||||||
 | 
						for regionNum in range(0, len(regionLabels)):
 | 
				
			||||||
 | 
							regionName = regionLabels[regionNum]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							pidlist_per_region_ = pidlist[pidlist[:, 1]==regionLabels[regionNum], :]
 | 
				
			||||||
 | 
							pidlist_per_region, idx = mani.extractRandomSample(
 | 
				
			||||||
 | 
								pidlist_per_region_, sampleNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# split dataset into train, eval and test.
 | 
				
			||||||
 | 
							[pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
 | 
				
			||||||
 | 
								pidlist_per_region, test_size = 0.2, random_state = 0)
 | 
				
			||||||
 | 
							[pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
 | 
				
			||||||
 | 
								pidlist_per_region_train, test_size = 0.1, random_state = 0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# append numpy arrays
 | 
				
			||||||
 | 
							if regionNum == 0:
 | 
				
			||||||
 | 
								pidlist_train = pidlist_per_region_train
 | 
				
			||||||
 | 
								pidlist_eval  = pidlist_per_region_eval
 | 
				
			||||||
 | 
								pidlist_test  = pidlist_per_region_test
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
 | 
				
			||||||
 | 
								pidlist_eval  = np.r_[pidlist_eval, pidlist_per_region_eval]
 | 
				
			||||||
 | 
								pidlist_test  = np.r_[pidlist_test, pidlist_per_region_test]
 | 
				
			||||||
 | 
						del regionNum, regionName
 | 
				
			||||||
 | 
						del pidlist_per_region_, pidlist_per_region, idx
 | 
				
			||||||
 | 
						del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test
 | 
				
			||||||
 | 
						np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
 | 
				
			||||||
 | 
						np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
 | 
				
			||||||
 | 
						np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
 | 
				
			||||||
 | 
					else:
 | 
				
			||||||
 | 
						pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
 | 
				
			||||||
 | 
						pidlist_eval  = np.load(dirOut + "\\pidlist_eval.npy")
 | 
				
			||||||
 | 
						pidlist_test  = np.load(dirOut + "\\pidlist_test.npy")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## make dataset for 2 regions or load
 | 
				
			||||||
 | 
					if conv3to2region==1:
 | 
				
			||||||
 | 
						pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if experiment_type == 1:
 | 
				
			||||||
 | 
							pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
 | 
				
			||||||
 | 
							pidlist2_test  = sb_func.saxon_vs_limburg(pidlist_test)	
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						elif experiment_type == 2:
 | 
				
			||||||
 | 
							pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
 | 
				
			||||||
 | 
							pidlist2_test  = sb_func.groningen_vs_limburg(pidlist_test)
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
 | 
				
			||||||
 | 
							np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						del pidlist2_train_
 | 
				
			||||||
 | 
					else:
 | 
				
			||||||
 | 
						if experiment_type == 1:
 | 
				
			||||||
 | 
							pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
 | 
				
			||||||
 | 
							pidlist2_test  = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						elif experiment_type == 2:
 | 
				
			||||||
 | 
							pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
 | 
				
			||||||
 | 
							pidlist2_test  = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## train/test data
 | 
				
			||||||
 | 
					if experiment_type == 0:
 | 
				
			||||||
 | 
						# Groningen vs Overijsel vs Limburg
 | 
				
			||||||
 | 
						data_train = sb_func.extractPid(pidlist_train, data)
 | 
				
			||||||
 | 
						data_eval  = sb_func.extractPid(pidlist_eval, data)
 | 
				
			||||||
 | 
						data_test  = sb_func.extractPid(pidlist_test, data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					elif experiment_type == 1 or experiment_type == 2:
 | 
				
			||||||
 | 
						data2 = np.array(data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if experiment_type == 1:
 | 
				
			||||||
 | 
							for row, row2 in zip(data, data2):
 | 
				
			||||||
 | 
								if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
 | 
				
			||||||
 | 
									row2[2] = regionLabels2[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						data2_train = sb_func.extractPid(pidlist2_train, data2)
 | 
				
			||||||
 | 
						data2_test  = sb_func.extractPid(pidlist2_test, data2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#####################################
 | 
				
			||||||
 | 
					##   EXPERIMENTS START FROM HERE   ##
 | 
				
			||||||
 | 
					#####################################
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## actual training
 | 
				
			||||||
 | 
					# train vs eval
 | 
				
			||||||
 | 
					#trainData = data_train
 | 
				
			||||||
 | 
					#testData  = data_eval
 | 
				
			||||||
 | 
					#testPID   = pidlist_eval
 | 
				
			||||||
 | 
					#LB = LB_y
 | 
				
			||||||
 | 
					#LE = LE_y
 | 
				
			||||||
 | 
					#regionLabels = regionLabels3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# train+eval vs test
 | 
				
			||||||
 | 
					if experiment_type == 0:
 | 
				
			||||||
 | 
						trainData = np.r_[data_train, data_eval]
 | 
				
			||||||
 | 
						testData  = data_test
 | 
				
			||||||
 | 
						testPID   = pidlist_test
 | 
				
			||||||
 | 
						LB = LB_y
 | 
				
			||||||
 | 
						LE = LE_y
 | 
				
			||||||
 | 
					elif experiment_type == 1 or experiment_type == 2:
 | 
				
			||||||
 | 
					# 2 region: saxon vs limburg/ groningen vs limburg
 | 
				
			||||||
 | 
						trainData = data2_train
 | 
				
			||||||
 | 
						testData  = data2_test
 | 
				
			||||||
 | 
						testPID   = pidlist2_test
 | 
				
			||||||
 | 
						LB = LB_y2
 | 
				
			||||||
 | 
						LE = LE_y2
 | 
				
			||||||
 | 
						regionLabels = regionLabels2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# check the number of utterances
 | 
				
			||||||
 | 
					allData = np.r_[trainData, testData]
 | 
				
			||||||
 | 
					filenames = np.c_[allData[:, 0], allData[:, 2]]
 | 
				
			||||||
 | 
					filenames_unique = np.unique(filenames, axis=0)
 | 
				
			||||||
 | 
					Counter(filenames_unique[:, 1])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					fileComparison		= dirOut + "\\algorithm_comparison.csv"
 | 
				
			||||||
 | 
					filePerformance		= dirOut + "\\sentence-level.csv"
 | 
				
			||||||
 | 
					fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## compare classification algorithms for the sentence-classifiers.
 | 
				
			||||||
 | 
					#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## train sentence-level classifiers.
 | 
				
			||||||
 | 
					modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
 | 
				
			||||||
 | 
						trainData, LBlist, LE, filePerformance)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## prediction over evaluation data per each sentence-level classifier.
 | 
				
			||||||
 | 
					pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## combine sentence-level classifiers 
 | 
				
			||||||
 | 
					pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## majority vote (weighted)
 | 
				
			||||||
 | 
					#weight = sb_func.calc_weight(confusionMatrixList)
 | 
				
			||||||
 | 
					#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### confusion matrix
 | 
				
			||||||
 | 
					if experiment_type == 0:
 | 
				
			||||||
 | 
						confusionMatrix_majority = confusion_matrix(
 | 
				
			||||||
 | 
							pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
 | 
				
			||||||
 | 
					else:
 | 
				
			||||||
 | 
						confusionMatrix_majority = confusion_matrix(
 | 
				
			||||||
 | 
							pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], labels=['Groningen_and_Drenthe', 'Limburg'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						#confusionMatrix_weighted = confusion_matrix(
 | 
				
			||||||
 | 
					#	pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## output
 | 
				
			||||||
 | 
					accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2], normalize=True, sample_weight=None)
 | 
				
			||||||
 | 
					print('accuracy: {}%'.format(accuracy * 100))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cm = confusionMatrix_majority
 | 
				
			||||||
 | 
					print(cm)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
 | 
				
			||||||
 | 
					np.save(dirOut + "\\confusion_matrix.npy", cm)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#fout = open(fileConfusionMatrix, "w")
 | 
				
			||||||
 | 
					#fout.write('< confusion matrix for majority vote in evaluation set >\n')
 | 
				
			||||||
 | 
					#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
 | 
				
			||||||
 | 
					#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
 | 
				
			||||||
 | 
					#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
 | 
				
			||||||
 | 
					#fout.write('\n')
 | 
				
			||||||
 | 
					#fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					##### iteration finish #####
 | 
				
			||||||
 | 
					conn.close()
 | 
				
			||||||
 | 
					#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',') 
 | 
				
			||||||
 | 
					#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',') 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										383
									
								
								dialect_identification/speaker_based_functions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,383 @@
 | 
				
			|||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
 | 
					import itertools
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from sklearn.neighbors import KNeighborsClassifier
 | 
				
			||||||
 | 
					from sklearn.svm import SVC
 | 
				
			||||||
 | 
					from sklearn.tree import DecisionTreeClassifier
 | 
				
			||||||
 | 
					from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 | 
				
			||||||
 | 
					from sklearn.naive_bayes import GaussianNB
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 | 
				
			||||||
 | 
					from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from sklearn.model_selection import cross_val_score
 | 
				
			||||||
 | 
					from sklearn.metrics import confusion_matrix
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import dataManipulation as mani
 | 
				
			||||||
 | 
					import evaluation as eval
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# extract data that corresponds to pid in the pidlist
 | 
				
			||||||
 | 
					def extractPid(pidlist, data):
 | 
				
			||||||
 | 
						for pidnum in range(0, len(pidlist)):
 | 
				
			||||||
 | 
							pid = pidlist[pidnum, 0]
 | 
				
			||||||
 | 
							x = data[data[:, 1] == pid, :]
 | 
				
			||||||
 | 
							if pidnum == 0:
 | 
				
			||||||
 | 
								data_ = x
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								data_ = np.r_[data_, x]	
 | 
				
			||||||
 | 
						return data_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def OneHotEncoding(data, LB_X, LE_y):
 | 
				
			||||||
 | 
					# one-hot encoding of data using a LabelBinarizer per word (LB_X) and a LabelEncoder for the region (LE_y)
 | 
				
			||||||
 | 
					# INPUT
 | 
				
			||||||
 | 
					#  data
 | 
				
			||||||
 | 
					#	0: filename
 | 
				
			||||||
 | 
					#	1: pid
 | 
				
			||||||
 | 
					#	2: region
 | 
				
			||||||
 | 
					#	3: ID (unique word_id)
 | 
				
			||||||
 | 
					#	4: sentence_id
 | 
				
			||||||
 | 
					#	5: word_id
 | 
				
			||||||
 | 
					#	6: word
 | 
				
			||||||
 | 
					#	7: pronunciation
 | 
				
			||||||
 | 
					#  LB_X: list of LabelBinarizer objects (one per word)
 | 
				
			||||||
 | 
					#  LE_y: LabelEncoder object
 | 
				
			||||||
 | 
					# OUTPUT
 | 
				
			||||||
 | 
					#  X: encoded variable data
 | 
				
			||||||
 | 
					#  y: encoded target data
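					#
					# Illustration (hypothetical pronunciation variants, not from the database):
					# if a word occurs as ['hoes', 'huis', 'huus'], its LabelBinarizer maps each
					# sample to an indicator vector over the sorted classes, e.g. 'huus' -> [0, 0, 1],
					# and np.c_ concatenates these vectors over all words into one row of X per speaker.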
 | 
				
			||||||
 | 
						pidlist			 = data[:, 1]
 | 
				
			||||||
 | 
						regionlist		 = data[:, 2]
 | 
				
			||||||
 | 
						uniqueWordIDlist = data[:, 3].astype(int)
 | 
				
			||||||
 | 
						pronvarlist		 = data[:, 7]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						uniqueWordIDlist_unique = np.unique(uniqueWordIDlist)
 | 
				
			||||||
 | 
						uniqueWordIDlist_unique.sort()
 | 
				
			||||||
 | 
						for uniqueWordIDnum in uniqueWordIDlist_unique:
 | 
				
			||||||
 | 
							x_ = pronvarlist[uniqueWordIDlist == uniqueWordIDnum]	
 | 
				
			||||||
 | 
							lb = LB_X[uniqueWordIDnum-1]
 | 
				
			||||||
 | 
							x  = lb.transform(x_)
 | 
				
			||||||
 | 
							if uniqueWordIDnum == uniqueWordIDlist_unique[0]:
 | 
				
			||||||
 | 
								X = x
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								X = np.c_[X, x]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# pid and region of the speakers
 | 
				
			||||||
 | 
						y_ = regionlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
 | 
				
			||||||
 | 
						y = LE_y.transform(y_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pid = pidlist[uniqueWordIDlist == uniqueWordIDlist_unique[0]]
 | 
				
			||||||
 | 
						return X, y, pid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def outputConfusionMatrix33(foutName, matrixName, regionLabels):
 | 
				
			||||||
 | 
						for r in range(0, len(regionLabels)):
 | 
				
			||||||
 | 
							execString1 = foutName + '.write("{0},{1},{2},{3}\\n".format('
 | 
				
			||||||
 | 
							execString2 = 'regionLabels[' + str(r) + ']'
 | 
				
			||||||
 | 
							execString3 = ''
 | 
				
			||||||
 | 
							for c in range(0, len(regionLabels)):
 | 
				
			||||||
 | 
								execString3 = execString3 + ',' + matrixName + '[' + str(r) + '][' + str(c) + ']'
 | 
				
			||||||
 | 
							execString4 = '))'
 | 
				
			||||||
 | 
							execString  = execString1 + execString2 + execString3 + execString4
 | 
				
			||||||
 | 
							exec(execString)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def compare_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
 | 
				
			||||||
 | 
						""" compare the classification algorithms on sentence-level classifiers. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							data_train: training data.
 | 
				
			||||||
 | 
							LBlist: list of label binarizer, which is used to encode pronunciation variants.
 | 
				
			||||||
 | 
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
							fileCSV: output csv file path.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						fout = open(fileCSV, "w")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sentenceIDlist_train = data_train[:, 4].astype(int)
 | 
				
			||||||
 | 
						sentenceIDmax_train  = max(sentenceIDlist_train)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceIDmax_train+1):
 | 
				
			||||||
 | 
							sentenceIDstr = format(sentenceID, '02')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## categorical values into binary values.
 | 
				
			||||||
 | 
							data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
 | 
				
			||||||
 | 
							X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
 | 
				
			||||||
 | 
							regionCounter = Counter(LE_y.inverse_transform(y_train))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## classifier comparison
 | 
				
			||||||
 | 
							names = [
 | 
				
			||||||
 | 
								"Nearest Neighbors", 
 | 
				
			||||||
 | 
								"Linear SVM",
 | 
				
			||||||
 | 
								"Poly SVM",
 | 
				
			||||||
 | 
								"RBF SVM", 
 | 
				
			||||||
 | 
								"Decision Tree",
 | 
				
			||||||
 | 
								"Random Forest 2", 
 | 
				
			||||||
 | 
								"Random Forest 3", 
 | 
				
			||||||
 | 
								"Random Forest 4", 
 | 
				
			||||||
 | 
								"AdaBoost", 
 | 
				
			||||||
 | 
								"AdaBoost(SVM)",
 | 
				
			||||||
 | 
								"AdaBoost(Random Forest 3)",
 | 
				
			||||||
 | 
								"Naive Bayes", 
 | 
				
			||||||
 | 
								"Linear Discriminant Analysis",
 | 
				
			||||||
 | 
								"Quadratic Discriminant Analysis"
 | 
				
			||||||
 | 
								]
 | 
				
			||||||
 | 
							classifiers = [
 | 
				
			||||||
 | 
								KNeighborsClassifier(3),
 | 
				
			||||||
 | 
								SVC(kernel="linear", C=0.025),
 | 
				
			||||||
 | 
								SVC(kernel="poly", C=0.025),
 | 
				
			||||||
 | 
								SVC(gamma=2, C=1),
 | 
				
			||||||
 | 
								DecisionTreeClassifier(max_depth=4),
 | 
				
			||||||
 | 
								RandomForestClassifier(max_depth=2, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
								RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
								RandomForestClassifier(max_depth=4, n_estimators=10, max_features=1),
 | 
				
			||||||
 | 
								AdaBoostClassifier(),
 | 
				
			||||||
 | 
								AdaBoostClassifier(SVC(probability=True, kernel='linear')),
 | 
				
			||||||
 | 
								AdaBoostClassifier(RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1)),
 | 
				
			||||||
 | 
								GaussianNB(),
 | 
				
			||||||
 | 
								LinearDiscriminantAnalysis(),
 | 
				
			||||||
 | 
								QuadraticDiscriminantAnalysis()
 | 
				
			||||||
 | 
								]
 | 
				
			||||||
 | 
							for name, model in zip(names, classifiers):
 | 
				
			||||||
 | 
								scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
								fout.write("{0},{1},{2},{3}\n".format(sentenceID, name, scores.mean(), scores.var()))
 | 
				
			||||||
 | 
								print('{0}, {1}: {2}'.format(sentenceID, name, scores.mean()))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def train_sentence_level_classifiers(data_train, LBlist, LE_y, fileCSV):
 | 
				
			||||||
 | 
						""" train sentence-level classifiers.
 | 
				
			||||||
 | 
							
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							data_train: training data.
 | 
				
			||||||
 | 
							LBlist: list of label binarizer, which is used to encode pronunciation variants.
 | 
				
			||||||
 | 
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
							fileCSV: output csv file path.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
							modelList (list): list of models (length: sentenceNumMax)
 | 
				
			||||||
 | 
							scoreList (list): list of scores (length: sentenceNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						fout = open(fileCSV, "w")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fout.write('< cross-validation in training set >\n')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sentenceIDlist_train = data_train[:, 4].astype(int)
 | 
				
			||||||
 | 
						sentenceIDmax_train  = max(sentenceIDlist_train)
 | 
				
			||||||
 | 
						modelList = []
 | 
				
			||||||
 | 
						scoreList = []
 | 
				
			||||||
 | 
						confusionMatrixList = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceIDmax_train+1):
 | 
				
			||||||
 | 
							sentenceIDstr = format(sentenceID, '02')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## categorical values into binary values.
 | 
				
			||||||
 | 
							data_sentence = data_train[sentenceIDlist_train == sentenceID, :]
 | 
				
			||||||
 | 
							X_train, y_train, pid_train = OneHotEncoding(data_sentence, LBlist, LE_y)
 | 
				
			||||||
 | 
							regionCounter = Counter(LE_y.inverse_transform(y_train))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## cross-validation with the best classifier
 | 
				
			||||||
 | 
							model = AdaBoostClassifier()
 | 
				
			||||||
 | 
							#model = SVC(kernel="linear", C=0.025)
 | 
				
			||||||
 | 
							#model = LinearDiscriminantAnalysis()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#		#scores = cross_val_score(model, X_train, y_train, cv = 10, scoring = 'f1_micro')
 | 
				
			||||||
 | 
							scores, confusionMatrix = eval.cross_val_confusion_matrix(model, X_train, y_train, 10)
 | 
				
			||||||
 | 
							ci_mean, ci_low, ci_high = eval.mean_confidence_interval(scores, 0.95)
 | 
				
			||||||
 | 
							scoreList.append(scores)
 | 
				
			||||||
 | 
							confusionMatrixList.append(confusionMatrix)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## model fitting
 | 
				
			||||||
 | 
							modelfit = model.fit(X_train, y_train)
 | 
				
			||||||
 | 
							modelList.append(modelfit)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## output 
 | 
				
			||||||
 | 
							fout.write("{},".format(sentenceID))
 | 
				
			||||||
 | 
							#fout.write("{0},{1},{2},".format(
 | 
				
			||||||
 | 
							#	regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg'], regionCounter['Oost_Overijsel-Gelderland']))
 | 
				
			||||||
 | 
							#fout.write("{0},{1},".format(
 | 
				
			||||||
 | 
							#	regionCounter['Low_Saxon'], regionCounter['Limburg']))
 | 
				
			||||||
 | 
							fout.write("{0},{1},".format(
 | 
				
			||||||
 | 
								regionCounter['Groningen_and_Drenthe'], regionCounter['Limburg']))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							fout.write("{0},{1},{2}\n".format(ci_mean, ci_low, ci_high))
 | 
				
			||||||
 | 
						fout.write('\n')
 | 
				
			||||||
 | 
						fout.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return modelList, scoreList, confusionMatrixList
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prediction_per_sentence(data_eval, modelList, LBlist, LE_y):
 | 
				
			||||||
 | 
						""" prediction using sentence-level classifiers.
 | 
				
			||||||
 | 
							
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							data_eval: evaluation data.
 | 
				
			||||||
 | 
							modelList: list of the models.
 | 
				
			||||||
 | 
							LBlist: list of label binarizer, which is used to encode pronunciation variants.
 | 
				
			||||||
 | 
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
						prediction (ndarray): [sentenceID, pid, answer, prediction]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						sentenceIDlist_eval = data_eval[:, 4].astype(int)
 | 
				
			||||||
 | 
						sentenceIDmax_eval  = max(sentenceIDlist_eval)
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceIDmax_eval+1):
 | 
				
			||||||
 | 
							sentenceIDstr = format(sentenceID, '02')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## categorical values into binary values.
 | 
				
			||||||
 | 
							data_sentence = data_eval[sentenceIDlist_eval == sentenceID, :]
 | 
				
			||||||
 | 
							X_eval, y_eval, pid_eval = OneHotEncoding(data_sentence, LBlist, LE_y)
 | 
				
			||||||
 | 
							regionCounter = Counter(LE_y.inverse_transform(y_eval))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							## evaluate model
 | 
				
			||||||
 | 
							modelfit = modelList[sentenceID-1]
 | 
				
			||||||
 | 
							y_pred  = modelfit.predict(X_eval)
 | 
				
			||||||
 | 
							y_pred_label = LE_y.inverse_transform(y_pred)
 | 
				
			||||||
 | 
							y_eval_label = LE_y.inverse_transform(y_eval)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# pid, y, y_pred
 | 
				
			||||||
 | 
							sentenceIDvec = np.ones((y_eval_label.shape[0], 1)).astype(int) * sentenceID
 | 
				
			||||||
 | 
							prediction_   = np.c_[sentenceIDvec, pid_eval, y_eval_label, y_pred_label]
 | 
				
			||||||
 | 
							if sentenceID == 1:
 | 
				
			||||||
 | 
								prediction = prediction_
 | 
				
			||||||
 | 
							else:
 | 
				
			||||||
 | 
								prediction = np.r_[prediction, prediction_]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return prediction
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prediction_per_pid_majority(pidlist_eval, prediction):
 | 
				
			||||||
 | 
						""" make a prediction per pid using majority vote 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
							prediction_per_pid (ndarray): [pid, ans, prediction]
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						prediction_per_pid = []
 | 
				
			||||||
 | 
						for pid_ in range(0, len(pidlist_eval[:, 0])):
 | 
				
			||||||
 | 
							pid = pidlist_eval[pid_, 0]
 | 
				
			||||||
 | 
							ans = pidlist_eval[pid_, 1]
 | 
				
			||||||
 | 
							prediction_ = prediction[prediction[:, 1] == pid, :]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# majority vote
 | 
				
			||||||
 | 
							predCounter = Counter(prediction_[:, -1])
 | 
				
			||||||
 | 
							predMostCommon = predCounter.most_common(1)
 | 
				
			||||||
 | 
							predLabel = predMostCommon[0][0]
 | 
				
			||||||
 | 
							predRatio = predMostCommon[0][1] / prediction_.shape[0] * 100
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							prediction_per_pid.append([pid, ans, predLabel])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return np.array(prediction_per_pid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def calc_weight(confusionMatrixList):
 | 
				
			||||||
 | 
						""" calculate weight (how trustworthy the prediction is) for majority vote.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Note:
 | 
				
			||||||
 | 
							Of all subjects we predicted are GO/OG/LB, what fraction of them actually are (precision) is used as weight.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
						confusionMatrixList: list of confusion matrices of the sentence-level classifiers.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						sentenceID_max = len(confusionMatrixList)
 | 
				
			||||||
 | 
						weight = np.zeros((sentenceID_max, confusionMatrixList[0].shape[0]))
 | 
				
			||||||
 | 
						for sentenceID in range(1, sentenceID_max+1):
 | 
				
			||||||
 | 
							cm = confusionMatrixList[sentenceID-1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# normalized confusion matrix
 | 
				
			||||||
 | 
							#rTotal = np.sum(cm, axis=1)
 | 
				
			||||||
 | 
							#cm_normalized = cm / rTotal
 | 
				
			||||||
 | 
							#weight[sentenceID-1, :] = np.diag(cm_normalized)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							true_positives = np.diag(cm)
 | 
				
			||||||
 | 
							predicted = np.sum(cm, axis=0)
 | 
				
			||||||
 | 
							weight[sentenceID-1, :] = true_positives / predicted
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return weight
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prediction_per_pid_weighted(pidlist_eval, prediction, weight, LB_y, LE_y):
 | 
				
			||||||
 | 
						""" make a prediction per pid using weighted (majority) vote. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Args:
 | 
				
			||||||
 | 
							weight (ndarray): how trustworthy the prediction of each sentence-based classifier is.
 | 
				
			||||||
 | 
						LB_y: label binarizer, which is used to encode region names.
						LE_y: label encoder, which is used to encode region names.
 | 
				
			||||||
 | 
						Returns:
 | 
				
			||||||
 | 
							prediction_per_pid (ndarray): [pid, ans, prediction]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						prediction_per_pid = []
 | 
				
			||||||
 | 
						for pid_ in range(0, len(pidlist_eval[:, 0])):
 | 
				
			||||||
 | 
							pid = pidlist_eval[pid_, 0]
 | 
				
			||||||
 | 
							ans = pidlist_eval[pid_, 1]
 | 
				
			||||||
 | 
							prediction_ = prediction[prediction[:, 1] == pid, :]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							# calculate weighted (majority) vote
 | 
				
			||||||
 | 
							vote_weighted = np.zeros((1, 3))
 | 
				
			||||||
 | 
							for sentenceID_ in range(0, prediction_.shape[0]):
 | 
				
			||||||
 | 
								sentenceID = prediction_[sentenceID_, 0].astype(int)
 | 
				
			||||||
 | 
								w = weight[sentenceID-1, :]
 | 
				
			||||||
 | 
								pred = prediction_[sentenceID_, 3]
 | 
				
			||||||
 | 
								pred_int = LB_y.transform([pred])
 | 
				
			||||||
 | 
								vote_weighted = vote_weighted + w * pred_int
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# choose the label with the most votes
 | 
				
			||||||
 | 
							vote_weighted = vote_weighted[0]
 | 
				
			||||||
 | 
							maxindex = list(vote_weighted).index(max(vote_weighted))
 | 
				
			||||||
 | 
							#predLabel = regionLabels[maxindex]
 | 
				
			||||||
 | 
							predLabel = LE_y.inverse_transform(maxindex)
 | 
				
			||||||
 | 
							prediction_per_pid.append([pid, ans, predLabel])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return np.array(prediction_per_pid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def saxon_vs_limburg(pidlist3):
 | 
				
			||||||
 | 
						"""convert a pidlist for 3 regions into that for 2 regions.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Notes:
 | 
				
			||||||
 | 
							3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
							2 regions include ['Limburg', 'Low_Saxon']
 | 
				
			||||||
 | 
							where Low_Saxon = 'Groningen_and_Drenthe' + 'Oost_Overijsel-Gelderland'
 | 
				
			||||||
 | 
							samples are randomly chosen so that each class has the same amount of data. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						regionLabels  = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
						regionLabels2 = ['Low_Saxon', 'Limburg']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						index_saxon = np.any([pidlist3[:, 1] == regionLabels[0], pidlist3[:, 1] == regionLabels[2]], axis=0)
 | 
				
			||||||
 | 
						pidlist_saxon_  = pidlist3[index_saxon, :]
 | 
				
			||||||
 | 
						pidlist_limburg = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						# extract the same amount of samples as Limburg.
 | 
				
			||||||
 | 
						pidlistCounter3 = Counter(pidlist3[:, 1])
 | 
				
			||||||
 | 
						pidlist_saxon, idx = mani.extractRandomSample(pidlist_saxon_, pidlistCounter3['Limburg'])
 | 
				
			||||||
 | 
						pidlist_saxon[:, 1] = regionLabels2[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pidlist2 = np.r_[pidlist_limburg, pidlist_saxon]
 | 
				
			||||||
 | 
						#pidlistCounter2 = Counter(pidlist2[:, 1])
 | 
				
			||||||
 | 
						return pidlist2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def groningen_vs_limburg(pidlist3):
 | 
				
			||||||
 | 
						"""convert a pidlist for 3 regions into that for 2 regions.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Notes:
 | 
				
			||||||
 | 
							3 regions include ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
							2 regions include ['Groningen_and_Drenthe', 'Limburg']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"""
 | 
				
			||||||
 | 
						regionLabels  = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pidlist_groningen = pidlist3[pidlist3[:, 1] == regionLabels[0], :] 
 | 
				
			||||||
 | 
						pidlist_limburg   = pidlist3[pidlist3[:, 1] == regionLabels[1], :]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pidlist2 = np.r_[pidlist_groningen, pidlist_limburg]
 | 
				
			||||||
 | 
						return pidlist2
 | 
				
			||||||
							
								
								
									
										44
									
								
								dialect_identification/test_code.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,44 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					import Levenshtein
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					a = 'hello'
 | 
				
			||||||
 | 
					b = 'haall'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# approximate
 | 
				
			||||||
 | 
					infinite = 100
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# make distance matrix D
 | 
				
			||||||
 | 
					len_a = len(a)
 | 
				
			||||||
 | 
					len_b = len(b)
 | 
				
			||||||
 | 
					D_ = np.zeros((len_a, len_b)).astype(int)
 | 
				
			||||||
 | 
					for ia in range(0, len_a):
 | 
				
			||||||
 | 
						a_ = a[ia]
 | 
				
			||||||
 | 
						for ib in range(0, len_b):
 | 
				
			||||||
 | 
							b_ = b[ib]
 | 
				
			||||||
 | 
							if a_ == b_:
 | 
				
			||||||
 | 
								D_[ia, ib] = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
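					# Illustration (computed from the hard-coded a='hello', b='haall' above):
					# D_ marks character matches; rows follow the letters of a, columns those of b:
					#   [[1 0 0 0 0]
					#    [0 0 0 0 0]
					#    [0 0 0 1 1]
					#    [0 0 0 1 1]
					#    [0 0 0 0 0]]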
					D = np.zeros((len_a+1, len_b+1)).astype(int)
 | 
				
			||||||
 | 
					D[1:len_a+1, 1:len_b+1] = D_
 | 
				
			||||||
 | 
					D[0, :] = infinite
 | 
				
			||||||
 | 
					D[:, 0] = infinite
 | 
				
			||||||
 | 
					D[0, 0] = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# calculate accumulated distance
 | 
				
			||||||
 | 
					# indexPath needs to be a 2-D array for the "indexPath[ia, ib] = index" assignment below
					indexPath = np.zeros((len_a, len_b)).astype(int)
 | 
				
			||||||
 | 
					for ia in range(0, len_a):
 | 
				
			||||||
 | 
						for ib in range(0, len_b):
 | 
				
			||||||
 | 
							a_ = a[ia]
 | 
				
			||||||
 | 
							b_ = b[ib]
 | 
				
			||||||
 | 
							option = (D[ia, ib]+D[ia+1, ib+1], D[ia, ib+1], D[ia+1, ib])
 | 
				
			||||||
 | 
							Dmin = np.min(option)
 | 
				
			||||||
 | 
							D[ia+1, ib+1] = D[ia+1, ib+1]+Dmin
 | 
				
			||||||
 | 
							index = list(option).index(Dmin)
 | 
				
			||||||
 | 
							indexPath[ia, ib] = index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# back trace
 | 
				
			||||||
 | 
					ia = len_a
 | 
				
			||||||
 | 
					ib = len_b
 | 
				
			||||||
 | 
					#while (ia > 0 or ib > 0):
 | 
				
			||||||
 | 
					#	tb
 | 
				
			||||||
							
								
								
									
										56
									
								
								dialect_identification/word_based.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
							@@ -0,0 +1,56 @@
 | 
				
			|||||||
 | 
					import os
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import configparser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from matplotlib import pyplot
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					currDir    = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
 | 
				
			||||||
 | 
					sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
 | 
				
			||||||
 | 
					from dataIO import readFile
 | 
				
			||||||
 | 
					from dataIO import selectSamplesFromCombinedData
 | 
				
			||||||
 | 
					import dataManipulation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					configFile = currDir + '\\config.ini'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					config = configparser.ConfigParser()
 | 
				
			||||||
 | 
					config.sections()
 | 
				
			||||||
 | 
					config.read(configFile)
 | 
				
			||||||
 | 
					fileWordList = config['word_based']['fileWordList']
 | 
				
			||||||
 | 
					fileCombined = config['word_based']['fileCombined']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					wordList = readFile(fileWordList)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					for wordNum in range(1, len(wordList)):
 | 
				
			||||||
 | 
						word = wordList[wordNum-1] # target word
 | 
				
			||||||
 | 
						#print("=== {} ===".format(word))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						dataGroningen, dataLimburg, dataOverijsel = selectSamplesFromCombinedData(word, fileCombined)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sampleNumMax = 50
 | 
				
			||||||
 | 
						dataG, indexG = dataManipulation.extractRandomSample(np.array(dataGroningen), sampleNumMax)
 | 
				
			||||||
 | 
						dataL, indexL = dataManipulation.extractRandomSample(np.array(dataLimburg), sampleNumMax)
 | 
				
			||||||
 | 
						dataO, indexO = dataManipulation.extractRandomSample(np.array(dataOverijsel), sampleNumMax)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# combine pronunciation from three regions
 | 
				
			||||||
 | 
						# data: (sampleNumMax x 3) x 1
 | 
				
			||||||
 | 
						cPronunciation = 4
 | 
				
			||||||
 | 
						data = np.hstack([dataG[:, cPronunciation], dataL[:, cPronunciation], dataO[:, cPronunciation]])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# MDS
 | 
				
			||||||
 | 
						dataLevenshtein = dataManipulation.makeLevenshteinMatrix(data)
 | 
				
			||||||
 | 
						dataMDS = dataManipulation.MDS(dataLevenshtein)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# plot
 | 
				
			||||||
 | 
						# slice ends are exclusive, so no "-1" is needed; with it the last sample of each region is dropped
						pyplot.scatter(dataMDS[0:sampleNumMax, 0], dataMDS[0:sampleNumMax, 1], s=80, c='red', marker="o", facecolors='none', label="Groningen and Drenthe")
						pyplot.scatter(dataMDS[sampleNumMax:sampleNumMax*2, 0], dataMDS[sampleNumMax:sampleNumMax*2, 1], c='green', marker="^", facecolors='none', label="Limburg")
						pyplot.scatter(dataMDS[sampleNumMax*2:sampleNumMax*3, 0], dataMDS[sampleNumMax*2:sampleNumMax*3, 1], c='blue', marker="+", facecolors='none', label="Oost Overijsel-Gelderland")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pyplot.title(word)
 | 
				
			||||||
 | 
						#ax.set_xlabel('x')
 | 
				
			||||||
 | 
						#ax.set_ylabel('y')
 | 
				
			||||||
 | 
						pyplot.legend(loc='upper right')
 | 
				
			||||||
 | 
						#pyplot.show()
 | 
				
			||||||
 | 
						pyplot.savefig('c:\\cygwin64\\home\\Aki\\rug_cygwin\\_same-utterance\\fig\\' + word + '.png')
 | 
				
			||||||
 | 
						pyplot.gcf().clear()
 | 
				
			||||||