based on the recommendation from linguists, the total number of phones is reduced.

2018-04-25 09:07:46 +02:00
parent 5a587e0422
commit bbed340228
7 changed files with 357 additions and 63 deletions
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model.sln
+++ b/acoustic_model.sln
@@ -5,7 +5,21 @@ VisualStudioVersion = 15.0.26730.12
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "acoustic_model", "acoustic_model\acoustic_model.pyproj", "{4D8C8573-32F0-4A62-9E62-3CE5CC680390}"
 EndProject
-Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "forced_alignment", "..\forced_alignment\forced_alignment\forced_alignment.pyproj", "{92E4D819-38D0-467A-ABEE-09662EEAA084}"
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{3DCEA49A-8FD7-4255-A223-573DCD2595E0}"
+	ProjectSection(SolutionItems) = preProject
+		..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
+		..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
+		..\forced_alignment\forced_alignment\defaultfiles.py = ..\forced_alignment\forced_alignment\defaultfiles.py
+		..\forced_alignment\forced_alignment\forced_alignment.pyproj = ..\forced_alignment\forced_alignment\forced_alignment.pyproj
+		..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
+		..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
+		..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
+		..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py
+		..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py
+		..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py
+		..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py
+		..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py
+	EndProjectSection
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -15,8 +29,6 @@ Global
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{92E4D819-38D0-467A-ABEE-09662EEAA084}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{92E4D819-38D0-467A-ABEE-09662EEAA084}.Release|Any CPU.ActiveCfg = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/acoustic_model/pycache/acoustic_model_functions.cpython-36.pyc
+++ b/acoustic_model/pycache/acoustic_model_functions.cpython-36.pyc
--- a/acoustic_model/acoustic_model.py
+++ b/acoustic_model/acoustic_model.py
@@ -3,7 +3,9 @@ import sys
 import tempfile
 import configparser
 import subprocess
+from collections import Counter

+import numpy as np
 import pandas as pd


@@ -12,13 +14,26 @@ repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
 curr_dir = repo_dir + '\\acoustic_model'
 config_ini = curr_dir + '\\config.ini'
 output_dir = 'd:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
-forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
+forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
+
+dataset_list = ['devel', 'test', 'train']
+
+# procedure
+extract_features  = 0
+make_feature_list = 0
+conv_lexicon	  = 0
+check_lexicon	  = 0
+make_mlf		  = 0
+combine_files	  = 0
+flat_start		  = 0
+train_model		  = 1
+forced_alignment  = 0
+

 sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
 sys.path.append(forced_alignment_module)
 from forced_alignment import convert_phone_set

-
 import acoustic_model_functions as am_func


@@ -30,88 +45,294 @@ config.read(config_ini)

 config_hcopy = config['Settings']['config_hcopy']
 config_train = config['Settings']['config_train']
+mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
 FAME_dir	 = config['Settings']['FAME_dir']

-lexicon_file = FAME_dir + '\\lexicon\\lex.asr'
-dataset_list = ['devel', 'test', 'train']
+lex_asr		= FAME_dir + '\\lexicon\\lex.asr'
+lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
+lex_oov		= FAME_dir + '\\lexicon\\lex.oov'
+lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
+#lex_ipa		= FAME_dir + '\\lexicon\\lex.ipa'
+#lex_ipa_	= FAME_dir + '\\lexicon\\lex.ipa_'
+#lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
+lex_htk		= FAME_dir + '\\lexicon\\lex_original.htk'
+lex_htk_	= FAME_dir + '\\lexicon\\lex.htk'
+
+hcompv_scp = output_dir + '\\scp\\combined.scp'
+combined_mlf = output_dir + '\\label\\combined.mlf'
+
+model_dir  = output_dir + '\\model'
+model0_dir = model_dir + '\\hmm0'
+proto_init = model_dir + '\\proto38'
+proto_name = 'proto'
+phonelist  = output_dir + '\\config\\phonelist_friesian.txt'
+hmmdefs_name = 'hmmdefs'
+


 ## ======================= extract features =======================
-##dataset = dataset_list[0]
-#for dataset in dataset_list:
-#	print(dataset)
+if extract_features:
+	print("==== extract features ====\n")
+
+	for dataset in dataset_list:
+		print(dataset)
 	
-	## make a script file for HCopy 
-	#hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
-	#hcopy_scp.close()
+		# a script file for HCopy 
+		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+		hcopy_scp.close()

-	## using the filelist in FAME! corpus
-	#feature_dir = output_dir + '\\mfc\\' + dataset
-	#am_func.make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp.name)
+		# get a list of features (hcopy.scp) from the filelist in FAME! corpus
+		feature_dir = output_dir + '\\mfc\\' + dataset
+		am_func.make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp.name)

-	## extract features
-	#subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
-	#subprocess.call(subprocessStr, shell=True)
-
-	#os.remove(hcopy_scp.name)
+		# extract features
+		subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
+		subprocess.call(subprocessStr, shell=True)


 ## ======================= make a list of features =======================
-##dataset = dataset_list[2]
-#for dataset in dataset_list:
-#	print(dataset)
+if make_feature_list:
+	print("==== make a list of features ====\n")

-#	feature_dir = output_dir + '\\mfc\\' + dataset
-#	hcompv_scp  = output_dir + '\\scp\\' + dataset + '.scp'
+	for dataset in dataset_list:
+		print(dataset)

-#	am_func.make_filelist(feature_dir, hcompv_scp)
+		feature_dir = output_dir + '\\mfc\\' + dataset
+		hcompv_scp  = output_dir + '\\scp\\' + dataset + '.scp'
+
+		am_func.make_filelist(feature_dir, hcompv_scp)


-## ======================= check the phonemes used in the lexicon =======================
-phonelist = am_func.get_phonelist(lexicon_file) # 49
-phonelist_list = list(phonelist)
+## ======================= convert lexicon from ipa to fame_htk =======================
+if conv_lexicon:
+	print('==== convert lexicon from ipa 2 fame ====\n')

-#lines_g1 = am_func.find_phone(lexicon_file, 'g')
-#lines_g2 = am_func.find_phone(lexicon_file, 'ɡ')
+	# lex.asr is a Kaldi-compatible version of lex.ipa.
+	# to check... 
+	#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
+	#with open(lex_ipa_, "w", encoding="utf-8") as fout:
+	#	for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
+	#		# ignore nasalization and '.'
+	#		pronunciation_ = pronunciation.replace(u'ⁿ', '')
+	#		pronunciation_ = pronunciation_.replace('.', '')
+	#		pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
+	#		fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
+
+	# convert each lexicon from ipa description to fame_htk phoneset.
+	am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
+	am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+
+	# combine lexicon
+	# pronunciations that are not found in lex.asr are generated using G2P and listed in lex.oov.
+	# therefore there is no overlap between lex_asr and lex_oov.   
+	am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+
+
+## ======================= check if all the phones are successfully converted =======================
+if check_lexicon:
+	print("==== check if all the phones are successfully converted. ====\n")
+
+	# the phones used in the lexicon.
+	phonelist = am_func.get_phonelist(lex_htk)
+
+	# the lines which include a specific phone.
+	lines = am_func.find_phone(lex_asr, 'g')
+
+	# statistics over the lexicon
+	lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
+	pronunciation = lexicon_htk['pronunciation']
+	phones_all = []
+	for word in pronunciation:
+		phones_all = phones_all + word.split()
+	c = Counter(phones_all)
+
+
+## ======================= 
+## manually make changes to the pronunciation dictionary and save it as lex.htk 
+## =======================
+# (1) Replace all tabs with single space;
+# (2) Put a '\' before any dictionary entry beginning with a single quote
+#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html


 ## ======================= make label file =======================
-dataset = 'train'
-hcompv_scp  = output_dir + '\\scp\\' + dataset + '.scp'
-script_list = FAME_dir + '\\data\\' + dataset + '\\text'
+if make_mlf:
+	print("==== make mlf ====\n")

-lexicon = pd.read_table(lexicon_file, names=['word', 'pronunciation'])
+	print("generating word level transcription...\n")
+	for dataset in dataset_list:
+		hcompv_scp  = output_dir + '\\scp\\' + dataset + '.scp'
+		hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+		script_list = FAME_dir + '\\data\\' + dataset + '\\text'
+		mlf_word	= output_dir + '\\label\\' + dataset + '_word.mlf'
+		mlf_phone   = output_dir + '\\label\\' + dataset + '_phone.mlf'

-with open(hcompv_scp) as fin:
-	features = fin.read()
-	features = features.split('\n')
+		# lexicon
+		lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])

-with open(script_list, "rt", encoding="utf-8") as fin:
-	scripts = fin.read()
-	scripts = pd.Series(scripts.split('\n'))
+		# list of features
+		with open(hcompv_scp) as fin:
+			features = fin.read()
+			features = features.split('\n')
+
+		# list of scripts 
+		with open(script_list, "rt", encoding="utf-8") as fin:
+			scripts = fin.read()
+			scripts = pd.Series(scripts.split('\n'))
+
+		i = 0
+		missing_words = []
+		fscp = open(hcompv_scp2, 'wt')
+		fmlf = open(mlf_word, "wt", encoding="utf-8")
+		fmlf.write("#!MLF!#\n")
+		feature_nr = 1
+		for feature in features:
+			sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
+			sys.stdout.flush()
+			feature_nr += 1
+			file_basename = os.path.basename(feature).replace('.mfc', '')
+
+			# get words from scripts.
+			try:
+				script = scripts[scripts.str.contains(file_basename)]
+			except IndexError:
+				script = []
+
+			if len(script) != 0:
+				script_id  = script.index[0]
+				script_txt = script.get(script_id)
+				script_words = script_txt.split(' ')
+				del script_words[0]
+
+				# check if all words can be found in the lexicon.
+				SCRIPT_WORDS = []
+				script_prons = []
+				is_in_lexicon = 1
+				for word in script_words:
+					WORD = word.upper()
+					SCRIPT_WORDS.append(WORD)
+					extracted = lexicon_htk[lexicon_htk['word']==WORD]
+					if len(extracted) == 0:
+						missing_words.append(word)
+					script_prons.append(extracted)
+					is_in_lexicon *= len(extracted)
+
+				# if all pronunciations are found in the lexicon, update scp and mlf files.
+				if is_in_lexicon:
+					# add the feature filename into the .scp file.
+					fscp.write("{}\n".format(feature))
+					i += 1
+
+					# add the words to the mlf file.
+					fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
+					#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
+					for word_ in SCRIPT_WORDS:
+						if word_[0] == '\'':
+							word_ = '\\' + word_
+						fmlf.write('{}\n'.format(word_))
+					fmlf.write('.\n')
+		print("\n{0} has {1} samples.\n".format(dataset, i))
+		np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
+
+		fscp.close()
+		fmlf.close()


-feature = features[0]
-file_basename = os.path.basename(feature).replace('.mfc', '')
+		## generate phone level transcription 
+		print("generating phone level transcription...\n")
+		mkphones = output_dir + '\\label\\mkphones0.txt'
+		subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
+		subprocess.call(subprocessStr, shell=True)
+	

-# get words from scripts.
-script = scripts[scripts.str.contains(file_basename)]
-script_id  = script.index[0]
-script_txt = script.get(script_id)
-script_words = script_txt.split(' ')
-del script_words[0]
+## ======================= combined scps and mlfs =======================
+if combine_files:
+	print("==== combine scps and mlfs ====\n")

-# make the label file.
-SCRIPT_WORDS = []
-script_prons = []
-all_prons_found = 1
-for word in script_words:
-	SCRIPT_WORDS.append(word.upper())
-	extracted = lexicon[lexicon['word']==word]
-	script_prons.append(extracted)
-	all_prons_found *= len(extracted)
-# make the dict file.
+	fscp = open(hcompv_scp, 'wt')
+	fmlf = open(combined_mlf, 'wt')

-convert_phone_set.ipa2fame(phonelist_list)
-phonelist_list
+	for dataset in dataset_list:
+		fmlf.write("#!MLF!#\n")
+		for dataset in dataset_list:
+			each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
+			each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+		
+		with open(each_mlf, 'r') as fin:
+			lines = fin.read()
+			lines = lines.split('\n')
+		fmlf.write('\n'.join(lines[1:]))

+		with open(each_scp, 'r') as fin:
+			lines = fin.read()
+		fscp.write(lines)
+
+	fscp.close()
+	fmlf.close()
+
+
+## ======================= flat start monophones =======================
+if flat_start:	
+	subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
+	subprocess.call(subprocessStr, shell=True)
+
+	# allocate mean & variance to all phones in the phone list
+	subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name 
+	subprocess.call(subprocessStr, shell=True)
+
+
+## ======================= estimate monophones =======================
+if train_model:
+	iter_num_max = 3
+	for mix_num in [16, 32, 64, 128]:
+		for iter_num in range(1, iter_num_max+1):
+			print("===== mix{}, iter{} =====".format(mix_num, iter_num))
+			iter_num_pre = iter_num - 1
+			modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
+			if not os.path.exists(modelN_dir):
+				os.makedirs(modelN_dir)
+
+			if iter_num == 1 and mix_num == 1:
+				modelN_dir_pre = model0_dir
+			else:
+				modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)
+		
+			## re-estimation
+			subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
+			subprocess.call(subprocessStr, shell=True)
+
+		mix_num_next = mix_num * 2
+		modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
+		if not os.path.exists(modelN_dir_next):
+			os.makedirs(modelN_dir_next)
+	
+		header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
+		with open(header_file, 'w') as fout:
+			fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
+
+		subprocessStr =	'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
+		subprocess.call(subprocessStr, shell=True)
+
+
+### ======================= forced alignment =======================
+#if forced_alignment:
+#	try:
+#		scripts.run_command([
+#			'HVite','-T', '1', '-a', '-C', configHVite,
+#			'-H', AcousticModel, '-m', '-I',
+#			mlf_file, '-i', fa_file, '-S',
+#			script_file, htk_dict_file, filePhoneList
+#		])
+#	except:
+#		print("\033[91mHVite command failed with these input files:\033[0m")
+#		print(_debug_show_file('HVite config', configHVite))
+#		print(_debug_show_file('Accoustic model', AcousticModel))
+#		print(_debug_show_file('Master Label file', mlf_file))
+#		print(_debug_show_file('Output', fa_file))
+#		print(_debug_show_file('Script file', script_file))
+#		print(_debug_show_file('HTK dictionary', htk_dict_file))
+#		print(_debug_show_file('Phoneme list', filePhoneList))
+#		raise
+
+
+##os.remove(hcopy_scp.name)
--- a/acoustic_model/acoustic_model_functions.py
+++ b/acoustic_model/acoustic_model_functions.py
@@ -1,9 +1,18 @@
 import os
 import sys

+import pandas as pd
+
+
+## ======================= user define =======================
 repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
 curr_dir = repo_dir + '\\acoustic_model'
+forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
+
+
 sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
+sys.path.append(forced_alignment_module)
+from forced_alignment import convert_phone_set


 def make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp):
@@ -61,4 +70,33 @@ def find_phone(lexicon_file, phone):
 			pron = line[1]
 			if phone in pron:
 				extracted.append(line)
-	return extracted
+	return extracted
+
+
+def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
+	""" Convert a lexicon file from IPA to HTK format for FAME! corpus. """
+
+	lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
+	with open(lexicon_file_out, "w", encoding="utf-8") as fout:
+		for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
+			pronunciation_no_space = pronunciation.replace(' ', '')
+			pronunciation_famehtk  = convert_phone_set.ipa2famehtk(pronunciation_no_space)
+			if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
+				fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
+
+
+def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
+	""" Combine two lexicon files and sort by words. """
+
+	with open(lexicon_file1, "rt", encoding="utf-8") as fin:
+		lines1 = fin.read()
+		lines1 = lines1.split('\n')
+	with open(lexicon_file2, "rt", encoding="utf-8") as fin:
+		lines2 = fin.read()
+		lines2 = lines2.split('\n')
+	
+	lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
+	lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
+	lex  = pd.concat([lex1, lex2])
+	lex  = lex.sort_values(by='word', ascending=True)
+	lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
--- a/acoustic_model/config.ini
+++ b/acoustic_model/config.ini
@@ -1,4 +1,5 @@
 [Settings]
 config_hcopy = c:\cygwin64\home\Aki\acoustic_model\config\config.HCopy
 config_train = c:\cygwin64\home\Aki\acoustic_model\config\config.train
+mkhmmdefs_pl = c:\cygwin64\home\Aki\acoustic_model\src\acoustic_model\mkhmmdefs.pl
 FAME_dir = d:\OneDrive\Research\rug\experiments\friesian\corpus
--- a/acoustic_model/performance_check.py
+++ b/acoustic_model/performance_check.py
@@ -0,0 +1,22 @@
+### ======================= forced alignment =======================
+#if forced_alignment:
+#	try:
+#		scripts.run_command([
+#			'HVite','-T', '1', '-a', '-C', configHVite,
+#			'-H', AcousticModel, '-m', '-I',
+#			mlf_file, '-i', fa_file, '-S',
+#			script_file, htk_dict_file, filePhoneList
+#		])
+#	except:
+#		print("\033[91mHVite command failed with these input files:\033[0m")
+#		print(_debug_show_file('HVite config', configHVite))
+#		print(_debug_show_file('Accoustic model', AcousticModel))
+#		print(_debug_show_file('Master Label file', mlf_file))
+#		print(_debug_show_file('Output', fa_file))
+#		print(_debug_show_file('Script file', script_file))
+#		print(_debug_show_file('HTK dictionary', htk_dict_file))
+#		print(_debug_show_file('Phoneme list', filePhoneList))
+#		raise
+
+
+##os.remove(hcopy_scp.name)