Fix the bug where the lexicon contains characters that cannot be encoded in ASCII.

yemaozi88 2019-02-03 00:34:35 +01:00
parent dc6b7b84b6
commit 22cccfb61d
9 changed files with 199 additions and 103 deletions

Binary file not shown.


@@ -4,8 +4,7 @@
     <SchemaVersion>2.0</SchemaVersion>
     <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
     <ProjectHome>.</ProjectHome>
-    <StartupFile>
-    </StartupFile>
+    <StartupFile>fame_hmm.py</StartupFile>
     <SearchPath>
     </SearchPath>
     <WorkingDirectory>.</WorkingDirectory>


@@ -39,11 +39,11 @@ toolbox_dir = os.path.join(repo_dir, 'toolbox')
 #config_hvite = os.path.join(htk_config_dir, 'config.HVite')
 #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
 #acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')
 
 WSL_dir = r'C:\OneDrive\WSL'
 #fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'd:\_corpus\fame'
+fame_dir = r'c:\OneDrive\Research\rug\_data\FAME'
 fame_s5_dir = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')


@@ -290,15 +290,17 @@ def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
     """
     lex_asr = load_lexicon(lexicon_file_asr)
 
+    def word2htk_(row):
+        return word2htk(row['word'])
     def asr2htk_space_delimited_(row):
         return asr2htk_space_delimited(row['pronunciation'])
 
     lex_htk = pd.DataFrame({
-        'word': lex_asr['word'],
+        'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
        'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
     })
     lex_htk = lex_htk.ix[:, ['word', 'pronunciation']]
 
-    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
 
     return
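A side note on the hunk above: the HTK-side word forms are now produced by running each word through word2htk before upper-casing, and the lexicon is written with an explicit encoding. Below is a minimal, self-contained sketch of the same pattern with toy data and a made-up translation table (not the repository's real fame_asr mapping); note that .ix is deprecated in recent pandas, so the sketch uses plain column selection instead.

import pandas as pd

# hypothetical, simplified stand-in for fame_asr.translation_key_word2htk
toy_word2htk = {'ú': 'u1', 'â': 'a3'}

def word2htk(word):
    # replace characters that cannot be encoded in ascii by ascii-safe codes
    return ''.join(toy_word2htk.get(c, c) for c in word)

# toy lexicon; the pronunciations are made up for illustration
lex_asr = pd.DataFrame({
    'word': ['brúntsje', 'dei'],
    'pronunciation': ['b r u n t s j @', 'd a i']
})

lex_htk = pd.DataFrame({
    'word': lex_asr['word'].apply(word2htk).str.upper(),
    'pronunciation': lex_asr['pronunciation']
})
lex_htk = lex_htk[['word', 'pronunciation']]
lex_htk.to_csv('lex.htk', header=None, index=None, sep='\t', encoding='utf-8')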
@@ -316,20 +318,26 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
     lex2 = load_lexicon(lexicon_file2)
     lex = pd.concat([lex1, lex2])
     lex = lex.sort_values(by='word', ascending=True)
-    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')
 
 
 def fix_single_quote(lexicon_file):
     """ add '\' before all single quote at the beginning of words.
+    convert special characters to ascii compatible characters.
 
     Args:
         lexicon_file (path): lexicon file, which will be overwitten.
 
     """
     lex = load_lexicon(lexicon_file)
+    lex = lex.dropna() # remove N/A.
     for i in lex[lex['word'].str.startswith('\'')].index.values:
         lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
     # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
     #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
 
     return
+
+
+def word2htk(word):
+    return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
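For context, here is a rough standalone sketch of what the updated fix_single_quote is meant to guarantee (toy data, not the repository code): empty rows are dropped before the strings are touched, and a leading apostrophe is escaped so HTK dictionary tools do not misparse the word field.

import pandas as pd

# toy lexicon with a word starting with an apostrophe and an empty row
lex = pd.DataFrame({
    'word': ["'t", 'wetter', None],
    'pronunciation': ['@ t', 'w E t @ r', None]
})

lex = lex.dropna()  # remove N/A rows first
mask = lex['word'].str.startswith("'")
lex.loc[mask, 'word'] = '\\' + lex.loc[mask, 'word']

print(lex['word'].tolist())  # ["\\'t", 'wetter']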


@@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 
 import tempfile
+import shutil
 #import configparser
 #import subprocess
 import time
@@ -11,6 +12,7 @@ import numpy as np
 import pandas as pd
 
 import fame_functions
+from phoneset import fame_ipa, fame_asr
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 import file_handling as fh
@@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']
 
 # procedure
 extract_features = 0
-make_lexicon = 0
+make_lexicon = 1
 make_mlf = 0
 combine_files = 0
 flat_start = 0
@@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
 lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
 lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
 
+global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
+
+
 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
 tmp_dir = os.path.join(default.htk_dir, 'tmp')
 if not os.path.exists(tmp_dir):
     os.makedirs(tmp_dir)
+label_dir = os.path.join(default.htk_dir, 'label')
+if not os.path.exists(label_dir):
+    os.makedirs(label_dir)
 
 
 ## ======================= extract features =======================
 if extract_features:
-    print('==== extract features ====\n')
     for dataset in dataset_list:
-        print('==== dataset: {} ===='.format(dataset))
+        print('==== extract features on dataset {} ====\n'.format(dataset))
 
         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
@@ -89,6 +97,8 @@ if extract_features:
         hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
 
+        os.remove(hcopy_scp.name)
+
 
 ## ======================= make lexicon for HTK =======================
 if make_lexicon:
@@ -114,94 +124,132 @@ if make_lexicon:
     fame_functions.fix_single_quote(lexicon_htk)
 
 
+## ======================= make phonelist =======================
+#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
+#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)
+
+#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser'
+#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt')
+#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic')
+#pyhtk.create_dictionary(
+#    sentence, global_ded, log_txt, dictionary_file, lexicon_htk)
+#pyhtk.create_dictionary_without_log(
+#    sentence, global_ded, dictionary_file, lexicon_htk)
+
+
 ## ======================= make label file =======================
 if make_mlf:
-    print("==== make mlf ====\n")
-    print("generating word level transcription...\n")
     for dataset in dataset_list:
-        hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
-        hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
-        script_list = FAME_dir + '\\data\\' + dataset + '\\text'
-        mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
-        mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+        timer_start = time.time()
+        print("==== generating word level transcription on dataset {}\n".format(dataset))
 
-        # lexicon
-        lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-
-        # list of features
-        with open(hcompv_scp) as fin:
-            features = fin.read()
-            features = features.split('\n')
+        #hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
+        #hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+        script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+        #mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
+        #mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+        wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+        dictionary_file = os.path.join(wav_dir, 'temp.dic')
 
         # list of scripts
         with open(script_list, "rt", encoding="utf-8") as fin:
-            scripts = fin.read()
-            scripts = pd.Series(scripts.split('\n'))
+            scripts = fin.read().split('\n')
 
-        i = 0
-        missing_words = []
-        fscp = open(hcompv_scp2, 'wt')
-        fmlf = open(mlf_word, "wt", encoding="utf-8")
-        fmlf.write("#!MLF!#\n")
-        feature_nr = 1
-        for feature in features:
-            sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
-            sys.stdout.flush()
-            feature_nr += 1
-            file_basename = os.path.basename(feature).replace('.mfc', '')
-
-            # get words from scripts.
-            try:
-                script = scripts[scripts.str.contains(file_basename)]
-            except IndexError:
-                script = []
-
-            if len(script) != 0:
-                script_id = script.index[0]
-                script_txt = script.get(script_id)
-                script_words = script_txt.split(' ')
-                del script_words[0]
-
-                # check if all words can be found in the lexicon.
-                SCRIPT_WORDS = []
-                script_prons = []
-                is_in_lexicon = 1
-                for word in script_words:
-                    WORD = word.upper()
-                    SCRIPT_WORDS.append(WORD)
-                    extracted = lexicon_htk[lexicon_htk['word']==WORD]
-                    if len(extracted) == 0:
-                        missing_words.append(word)
-                    script_prons.append(extracted)
-                    is_in_lexicon *= len(extracted)
-
-                # if all pronunciations are found in the lexicon, update scp and mlf files.
-                if is_in_lexicon:
-                    # add the feature filename into the .scp file.
-                    fscp.write("{}\n".format(feature))
-                    i += 1
-
-                    # add the words to the mlf file.
-                    fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
-                    #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
-                    for word_ in SCRIPT_WORDS:
-                        if word_[0] == '\'':
-                            word_ = '\\' + word_
-                        fmlf.write('{}\n'.format(word_))
-                    fmlf.write('.\n')
-
-        print("\n{0} has {1} samples.\n".format(dataset, i))
-        np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
-        fscp.close()
-        fmlf.close()
+        for line in scripts:
+        #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
+            # sample line:
+            # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
+            filename_ = line.split(' ')[0]
+            filename = '_'.join(filename_.split('_')[1:])
+            sentence = ' '.join(line.split(' ')[1:])
+
+            wav_file = os.path.join(wav_dir, filename + '.wav')
+            if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0:
+                try:
+                    sentence_ascii = bytes(sentence, 'ascii')
+                except UnicodeEncodeError:
+                    print(sentence)
+            #if os.path.exists(wav_file):
+            #    #dictionary_file = os.path.join(wav_dir, filename + '.dic')
+            #    if pyhtk.create_dictionary_without_log(
+            #        sentence, global_ded, dictionary_file, lexicon_htk) == 0:
+            #        # when the file name is too long, HDMan command does not work.
+            #        # therefore first temporary dictionary_file is made, then renamed.
+            #        shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
+            #        label_file = os.path.join(wav_dir, filename + '.lab')
+            #        pyhtk.create_label_file(sentence, label_file)
+            #    else:
+            #        os.remove(dictionary_file)
+        print("elapsed time: {}".format(time.time() - timer_start))
+
+        # lexicon
+        #lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
+
+        # list of features
+        #with open(hcompv_scp) as fin:
+        #    features = fin.read()
+        #    features = features.split('\n')
+
+        #i = 0
+        #missing_words = []
+        #fscp = open(hcompv_scp2, 'wt')
+        #fmlf = open(mlf_word, "wt", encoding="utf-8")
+        #fmlf.write("#!MLF!#\n")
+        #feature_nr = 1
+        #for feature in features:
+        #    sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
+        #    sys.stdout.flush()
+        #    feature_nr += 1
+        #    file_basename = os.path.basename(feature).replace('.mfc', '')
+
+        #    # get words from scripts.
+        #    try:
+        #        script = scripts[scripts.str.contains(file_basename)]
+        #    except IndexError:
+        #        script = []
+
+        #    if len(script) != 0:
+        #        script_id = script.index[0]
+        #        script_txt = script.get(script_id)
+        #        script_words = script_txt.split(' ')
+        #        del script_words[0]
+
+        #        # check if all words can be found in the lexicon.
+        #        SCRIPT_WORDS = []
+        #        script_prons = []
+        #        is_in_lexicon = 1
+        #        for word in script_words:
+        #            WORD = word.upper()
+        #            SCRIPT_WORDS.append(WORD)
+        #            extracted = lexicon_htk[lexicon_htk['word']==WORD]
+        #            if len(extracted) == 0:
+        #                missing_words.append(word)
+        #            script_prons.append(extracted)
+        #            is_in_lexicon *= len(extracted)
+
+        #        # if all pronunciations are found in the lexicon, update scp and mlf files.
+        #        if is_in_lexicon:
+        #            # add the feature filename into the .scp file.
+        #            fscp.write("{}\n".format(feature))
+        #            i += 1
+
+        #            # add the words to the mlf file.
+        #            fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
+        #            #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
+        #            for word_ in SCRIPT_WORDS:
+        #                if word_[0] == '\'':
+        #                    word_ = '\\' + word_
+        #                fmlf.write('{}\n'.format(word_))
+        #            fmlf.write('.\n')
+
+        # print("\n{0} has {1} samples.\n".format(dataset, i))
+        # np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
+        # fscp.close()
+        # fmlf.close()
 
     ## generate phone level transcription
-    print("generating phone level transcription...\n")
-    mkphones = output_dir + '\\label\\mkphones0.txt'
-    subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
-    subprocess.call(subprocessStr, shell=True)
+    # print("generating phone level transcription...\n")
+    # mkphones = output_dir + '\\label\\mkphones0.txt'
+    # subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
+    # subprocess.call(subprocessStr, shell=True)
 
 
 ## ======================= combined scps and mlfs =======================
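The new loop above is essentially a scan for transcriptions that would break ASCII-only HTK tooling. A stripped-down sketch of that check follows; the helper name and the standalone form are mine, not the repository's, and the third input line uses a made-up utterance id with words taken from the diff.

def find_non_ascii_sentences(lines):
    """Yield (utterance_id, sentence) pairs whose sentence cannot be encoded in ascii."""
    for line in lines:
        if not line.strip():
            continue
        utterance_id = line.split(' ')[0]
        sentence = ' '.join(line.split(' ')[1:])
        try:
            bytes(sentence, 'ascii')          # same check as in the commit
        except UnicodeEncodeError:
            yield utterance_id, sentence

lines = [
    'sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik',
    'sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer',
    'toy_fragment_0001 sels brúntsje visser',   # hypothetical id, non-ascii word
]
for utterance_id, sentence in find_non_ascii_sentences(lines):
    print(utterance_id, sentence)   # only the third line is reported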


@@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 from collections import Counter
 import time
+import re
 
 import numpy as np
 import pandas as pd
@@ -82,22 +83,52 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
 ## check if all the phones in lexicon.htk are in fame_asr.py.
-timer_start = time.time()
-phoneset_htk = fame_asr.phoneset_htk
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
-phoneset_lex.remove('')
-print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
-    set(phoneset_htk) - set(phoneset_lex)))
-print("elapsed time: {}".format(time.time() - timer_start))
+#timer_start = time.time()
+#phoneset_htk = fame_asr.phoneset_htk
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+#phoneset_lex.remove('')
+#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+#    set(phoneset_htk) - set(phoneset_lex)))
+#print("elapsed time: {}".format(time.time() - timer_start))
 
-# statistics over the lexicon
-lex_htk = fame_functions.load_lexicon(lexicon_htk)
-phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
-c = Counter(phones_all)
+## statistics over the lexicon
+#lex_htk = fame_functions.load_lexicon(lexicon_htk)
+#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+#c = Counter(phones_all)
 
-lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
-for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
-    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
-# to_csv does not work with space seperator. therefore all tabs should manually be replaced.
-#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+#    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+## to_csv does not work with space seperator. therefore all tabs should manually be replaced.
+##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+## check which letters are not coded in ascii.
+print('asr phones which cannot be coded in ascii:\n')
+for i in fame_asr.phoneset_short:
+    try:
+        i_encoded = i.encode("ascii")
+        #print("{0} --> {1}".format(i, i.encode("ascii")))
+    except UnicodeEncodeError:
+        print(">>> {}".format(i))
+
+print("letters in the scripts which is not coded in ascii:\n")
+for dataset in ['train', 'devel', 'test']:
+    timer_start = time.time()
+
+    script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+    with open(script_list, "rt", encoding="utf-8") as fin:
+        scripts = fin.read().split('\n')
+
+    for line in scripts:
+        sentence = ' '.join(line.split(' ')[1:])
+        sentence_htk = fame_functions.word2htk(sentence)
+
+        #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
+        try:
+            sentence_htk = bytes(sentence_htk, 'ascii')
+        except UnicodeEncodeError:
+            print(sentence)
+            print(sentence_htk)


@@ -103,12 +103,22 @@ translation_key_asr2htk = {
 }
 phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
 
-## check
-#for i in phoneset_short:
-#    try:
-#        print("{0} --> {1}".format(i, i.encode("ascii")))
-#    except UnicodeEncodeError:
-#        print(">>> {}".format(i))
+#not_in_ascii = [
+#    '\'',
+#    'â', 'ê', 'ô', 'û', 'č',
+#    'à', 'í', 'é', 'è', 'ú', 'ć',
+#    'ä', 'ë', 'ï', 'ö', 'ü'
+#]
+translation_key_word2htk = {
+    '\'': '\\\'',
+    'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
+    'à':'a2', 'è':'e2',
+    'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
+    'č':'c4',
+    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
+}
+#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
 
 ## the list of multi character phones.
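To illustrate the mapping above (a quick check added here for the reader, not part of the commit): applying translation_key_word2htk character by character turns a Frisian word with diacritics into a purely ASCII form, which is what word2htk in fame_functions relies on.

translation_key_word2htk = {
    '\'': '\\\'',
    'í': 'i1', 'é': 'e1', 'ú': 'u1', 'ć': 'c1',
    'à': 'a2', 'è': 'e2',
    'â': 'a3', 'ê': 'e3', 'ô': 'o3', 'û': 'u3',
    'č': 'c4',
    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
}

word = 'brúntsje'
word_htk = ''.join(translation_key_word2htk.get(c, c) for c in word)
print(word_htk)                  # bru1ntsje
print(word_htk.encode('ascii'))  # b'bru1ntsje' -- no UnicodeEncodeError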

acoustic_model/test.txt (new file)