Fixed a bug related to the encoding of label files.

2019-02-04 13:46:27 +01:00
parent 322a8a0079
commit f6e7c8eefa
5 changed files with 151 additions and 219 deletions
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -1,14 +1,13 @@
 import os
-
-#default_hvite_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'htk', 'config.HVite')
+# add path of the parent directory
+#os.path.dirname(os.path.realpath(__file__))

 #cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

 #htk_dir = r'C:\Aki\htk_fame'
 htk_dir = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk'

-config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
-#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
+
 #config_hvite = os.path.join(cygwin_dir, 'config', 'config.HVite')
 #mkhmmdefs_pl = os.path.join(cygwin_dir, 'src', 'acoustic_model', 'mkhmmdefs.pl')

--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -5,8 +5,6 @@ os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import tempfile
 import shutil
 import glob
-#import configparser
-#import subprocess
 import time

 import numpy as np
@@ -21,45 +19,42 @@ from htk import pyhtk


 ## ======================= user define =======================
-#repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
-#curr_dir = repo_dir + '\\acoustic_model'
-#config_ini = curr_dir + '\\config.ini'
-#output_dir = 'C:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
-#forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
+# procedure
+make_lexicon	  = 0
+make_label		  = 0 # it takes roughly 4800 sec on Surface pro 2.
+make_htk_files    = 0
+extract_features  = 0
+flat_start		  = 0
+train_model_without_sp = 1
+
+
+# pre-defined values.

 dataset_list = ['devel', 'test', 'train']
+hmmdefs_name = 'hmmdefs'

-# procedure
-extract_features  = 0
-make_lexicon	  = 0
-make_dictionary	  = 0 # 4800 sec
-make_htk_files    = 1
-combine_files	  = 0
-flat_start		  = 0
-train_model		  = 0
+lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
+lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
+
+config_dir = os.path.join(default.htk_dir, 'config')
+config_hcopy = os.path.join(config_dir, 'config.HCopy')
+config_train = os.path.join(config_dir, 'config.train')
+global_ded   = os.path.join(config_dir, 'global.ded')
+mkphones_led = os.path.join(config_dir, 'mkphones.led')
+prototype    = os.path.join(config_dir, 'proto39')
+
+model_dir    = os.path.join(default.htk_dir, 'model')


-## ======================= load variables =======================
+# directories / files to be made.

-lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
-lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-lexicon_oov = os.path.join(lexicon_dir, 'lex.oov')
-lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
-lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
-lexicon_htk     = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
+lexicon_dir = os.path.join(default.htk_dir, 'lexicon') 
+lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
+lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
+lexicon_htk     = os.path.join(lexicon_dir, 'lex.htk')

-global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
-
-
-#hcompv_scp = output_dir + '\\scp\\combined.scp'
-#combined_mlf = output_dir + '\\label\\combined.mlf'
-
-#model_dir  = output_dir + '\\model'
-#model0_dir = model_dir + '\\hmm0'
-#proto_init = model_dir + '\\proto38'
-#proto_name = 'proto'
-#phonelist  = output_dir + '\\config\\phonelist_friesian.txt'
-#hmmdefs_name = 'hmmdefs'
+phonelist_txt = os.path.join(config_dir, 'phonelist.txt')
+model0_dir	  = os.path.join(model_dir, 'hmm0')

 feature_dir = os.path.join(default.htk_dir, 'mfc')
 if not os.path.exists(feature_dir):
@@ -72,42 +67,18 @@ if not os.path.exists(label_dir):
 	os.makedirs(label_dir)


-
-## ======================= extract features =======================
-if extract_features:
-	
-	for dataset in dataset_list:
-		print('==== extract features on dataset {} ====\n'.format(dataset))
-
-		# a script file for HCopy 
-		print(">>> making a script file for HCopy... \n")
-		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
-		hcopy_scp.close()
-
-		# get a list of features (hcopy.scp) from the filelist in FAME! corpus
-		feature_dir_ = os.path.join(feature_dir, dataset)
-		if not os.path.exists(feature_dir_):
-			os.makedirs(feature_dir_)
-
-		# extract features
-		print(">>> extracting features... \n")
-		fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
-		pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
-
-		os.remove(hcopy_scp.name)
-
-
 ## ======================= make lexicon for HTK =======================
 if make_lexicon:
-	print('==== make lexicon for HTK ====\n')
+	timer_start = time.time()
+	print('==== making lexicon for HTK ====')

 	# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
-	print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
+	print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
 	fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
 	fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

 	# combine lexicon
-	print('>>> combining lexicon files into one lexicon... \n')
+	print('>>> combining lexicon files into one lexicon...')
 	# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
 	# therefore there is no overlap between lex_asr and lex_oov.   
 	fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
@@ -119,28 +90,26 @@ if make_lexicon:
 	# (2) Put a '\' before any dictionary entry beginning with single quote 
 	#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
 	fame_functions.fix_single_quote(lexicon_htk)
+	print("elapsed time: {}".format(time.time() - timer_start))


-## ======================= make dic files =======================
-if make_dictionary:
+## ======================= make label files =======================
+if make_label:
 	for dataset in dataset_list:
 		timer_start = time.time()
-		print("==== generating HTK dictionary files on dataset {}\n".format(dataset))
+		print("==== making label files on dataset {}".format(dataset))

-		#hcompv_scp  = output_dir + '\\scp\\' + dataset + '.scp'
-		#hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
 		script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
-		#mlf_word	= output_dir + '\\label\\' + dataset + '_word.mlf'
-		#mlf_phone   = output_dir + '\\label\\' + dataset + '_phone.mlf'
-		wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
-		dictionary_file = os.path.join(wav_dir, 'temp.dic')
+		wav_dir_	= os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		label_dir_		= os.path.join(label_dir, dataset)
+		dictionary_file = os.path.join(label_dir_, 'temp.dic')
+		fh.make_new_directory(label_dir_)

 		# list of scripts 
 		with open(script_list, "rt", encoding="utf-8") as fin:
 			scripts = fin.read().split('\n')

 		for line in scripts:
-		#for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
 			# sample line:
 			# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
 			filename_ = line.split(' ')[0]
@@ -148,180 +117,144 @@ if make_dictionary:
 			sentence  = ' '.join(line.split(' ')[1:])
 			sentence_htk = fame_functions.word2htk(sentence)

-			wav_file = os.path.join(wav_dir, filename + '.wav')
-			if os.path.exists(wav_file):
-				#dictionary_file = os.path.join(wav_dir, filename + '.dic')
+			wav_file = os.path.join(wav_dir_, filename + '.wav')
+			if os.path.exists(wav_file) and pyhtk.can_be_ascii(sentence_htk) == 0:
 				if pyhtk.create_dictionary_without_log(
-					sentence, global_ded, dictionary_file, lexicon_htk) == 0:
+					sentence_htk, global_ded, dictionary_file, lexicon_htk) == 0:
 					# when the file name is too long, HDMan command does not work.
 					# therefore first temporary dictionary_file is made, then renamed. 
-					shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
-					label_file = os.path.join(wav_dir, filename + '.lab')
-					pyhtk.create_label_file(sentence, label_file)
+					shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))
+
+					label_file = os.path.join(label_dir_, filename + '.lab')
+					pyhtk.create_label_file(sentence_htk, label_file)
 				else:
 					os.remove(dictionary_file)
 		print("elapsed time: {}".format(time.time() - timer_start))

-		# lexicon
-		#lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-
-		# list of features
-		#with open(hcompv_scp) as fin:
-		#	features = fin.read()
-		#	features = features.split('\n')
-		#i = 0
-		#missing_words = []
-		#fscp = open(hcompv_scp2, 'wt')
-		#fmlf = open(mlf_word, "wt", encoding="utf-8")
-		#fmlf.write("#!MLF!#\n")
-		#feature_nr = 1
-		#for feature in features:
-		#	sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
-		#	sys.stdout.flush()
-		#	feature_nr += 1
-		#	file_basename = os.path.basename(feature).replace('.mfc', '')
-		
-		#	# get words from scripts.
-		#	try:
-		#		script = scripts[scripts.str.contains(file_basename)]
-		#	except IndexError:
-		#		script = []
-
-		#	if len(script) != 0:
-		#		script_id  = script.index[0]
-		#		script_txt = script.get(script_id)
-		#		script_words = script_txt.split(' ')
-		#		del script_words[0]
-
-				# check if all words can be found in the lexicon.
-		#		SCRIPT_WORDS = []
-		#		script_prons = []
-		#		is_in_lexicon = 1
-		#		for word in script_words:
-		#			WORD = word.upper()
-		#			SCRIPT_WORDS.append(WORD)
-		#			extracted = lexicon_htk[lexicon_htk['word']==WORD]
-		#			if len(extracted) == 0:
-	#					missing_words.append(word)
-	#				script_prons.append(extracted)
-	#				is_in_lexicon *= len(extracted)
-
-				# if all pronunciations are found in the lexicon, update scp and mlf files.
-	#			if is_in_lexicon:
-					# add the feature filename into the .scp file.
-	#				fscp.write("{}\n".format(feature))
-	#				i += 1
-
-					# add the words to the mlf file.
-	#				fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
-					#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
-	#				for word_ in SCRIPT_WORDS:
-	#					if word_[0] == '\'':
-	#						word_ = '\\' + word_
-	#					fmlf.write('{}\n'.format(word_))
-	#				fmlf.write('.\n')
-	#	print("\n{0} has {1} samples.\n".format(dataset, i))
-	#	np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
-
-	#	fscp.close()
-	#	fmlf.close()

 ## ======================= make other required files =======================
 if make_htk_files:
-	## phonelist
-	phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
+	timer_start = time.time()
+	print("==== making files required for HTK ====")
+	
+	print(">>> making a phonelist...")
 	pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)

-	## hcomp_v.scp
-	print(">>> making a script file for HCompV... \n")
 	for dataset in dataset_list:
-		#timer_start = time.time()
+		wav_dir_	 = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		feature_dir_ = os.path.join(feature_dir, dataset)
+		label_dir_   = os.path.join(label_dir, dataset)
+		mlf_word  = os.path.join(label_dir, dataset + '_word.mlf')
+		mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')

-		wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		#print(">>> making a script file for {}...".format(dataset))
+		#listdir    = glob.glob(os.path.join(wav_dir_, '*.dic'))
+		#mfc_list   = [filename.replace(wav_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
+		#hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
+		#with open(hcompv_scp, 'wb') as f:
+		#	f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))

-		listdir = glob.glob(os.path.join(wav_dir, '*.dic'))
-		filelist = [filename.replace(wav_dir, feature_dir).replace('.dic', '.fea') for filename in listdir]
+		print(">>> making a mlf file for {}...".format(dataset))
+		lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
+		with open(mlf_word, 'wb') as fmlf:
+			fmlf.write(bytes('#!MLF!#\n', 'ascii'))
+			for label_file in lab_list:
+				filename = os.path.basename(label_file)
+				fmlf.write(bytes('\"*/{}\"\n'.format(filename), 'ascii'))
+				with open(label_file) as flab:
+					lines = flab.read()
+				fmlf.write(bytes(lines + '.\n', 'ascii'))

+		print(">>> generating phone level transcription for {}...".format(dataset))
+		pyhtk.mlf_word2phone(lexicon_htk, mlf_phone, mlf_word, mkphones_led)
+		print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## ======================= extract features =======================
+if extract_features:
+	for dataset in dataset_list:
+		timer_start = time.time()
+		print('==== extract features on dataset {} ===='.format(dataset))
+
+		wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		label_dir_   = os.path.join(label_dir, dataset)
+		feature_dir_ = os.path.join(feature_dir, dataset)
+		fh.make_new_directory(feature_dir_)
+
+		# a script file for HCopy 
+		print(">>> making a script file for HCopy...")
+		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+		hcopy_scp.close()
+
+		# get a list of features (hcopy.scp) 
+		# from the filelist in FAME! corpus.
+		#fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
+		# from the list of label files.
+		lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
+		feature_list = [
+			os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
+			+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
+				  for lab_file in lab_list]
+		with open(hcopy_scp.name, 'wb') as f:
+			f.write(bytes('\n'.join(feature_list), 'ascii'))
+		
+		# extract features.
+		print(">>> extracting features on {}...".format(dataset))
+		pyhtk.wav2mfc(config_hcopy, hcopy_scp.name)
+		os.remove(hcopy_scp.name)
+
+		# make hcompv.scp.
+		print(">>> making a script file for {}...".format(dataset))
+		listdir    = glob.glob(os.path.join(label_dir_, '*.dic'))
+		mfc_list   = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
 		hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
-		with open(hcompv_scp, 'wt', newline='\r\n') as f:
-			f.write('\n'.join(filelist))
+		with open(hcompv_scp, 'wb') as f:
+			f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))

-
-## hcomp_scp
-# a script file for HCompV
-
-	#	print("generating phone level transcription...\n")
-	#	mkphones = output_dir + '\\label\\mkphones0.txt'
-	#	subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
-	#	subprocess.call(subprocessStr, shell=True)
-	
-
-## ======================= combined scps and mlfs =======================
-#if combine_files:
-#	print("==== combine scps and mlfs ====\n")
-
-#	fscp = open(hcompv_scp, 'wt')
-#	fmlf = open(combined_mlf, 'wt')
-
-#	for dataset in dataset_list:
-#		fmlf.write("#!MLF!#\n")
-#		for dataset in dataset_list:
-#			each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
-#			each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
-		
-#		with open(each_mlf, 'r') as fin:
-#			lines = fin.read()
-#			lines = lines.split('\n')
-#		fmlf.write('\n'.join(lines[1:]))
-
-#		with open(each_scp, 'r') as fin:
-#			lines = fin.read()
-#		fscp.write(lines)
-
-#	fscp.close()
-#	fmlf.close()
+		print("elapsed time: {}".format(time.time() - timer_start))


 ## ======================= flat start monophones =======================
 if flat_start:
-	subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
-	subprocess.call(subprocessStr, shell=True)
+	hcompv_scp = os.path.join(tmp_dir, 'test.scp')
+
+	timer_start = time.time()
+	print('==== flat start ====')
+	pyhtk.flat_start(config_train, hcompv_scp, model0_dir, prototype)

 	# allocate mean & variance to all phones in the phone list
-	subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name 
-	subprocess.call(subprocessStr, shell=True)
+	pyhtk.create_hmmdefs(
+		os.path.join(model0_dir, 'proto39'),
+	    os.path.join(model0_dir, 'hmmdefs'), 
+		phonelist_txt)
+	print("elapsed time: {}".format(time.time() - timer_start))


 ## ======================= estimate monophones =======================
-if train_model:
-	iter_num_max = 3
-	for mix_num in [128, 256, 512, 1024]:
-		for iter_num in range(1, iter_num_max+1):
-			print("===== mix{}, iter{} =====".format(mix_num, iter_num))
-			iter_num_pre = iter_num - 1
-			modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
-			if not os.path.exists(modelN_dir):
-				os.makedirs(modelN_dir)
+if train_model_without_sp:
+	hcompv_scp = os.path.join(tmp_dir, 'test.scp')
+	mlf_file = os.path.join(label_dir, 'test_phone.mlf')
+	output_dir = os.path.join(model_dir, 'hmm1')
+	fh.make_new_directory(output_dir)

-			if iter_num == 1 and mix_num == 1:
-				modelN_dir_pre = model0_dir
-			else:
-				modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)
-		
-			## re-estimation
-			subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
-			subprocess.call(subprocessStr, shell=True)
-
-		mix_num_next = mix_num * 2
-		modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
-		if not os.path.exists(modelN_dir_next):
-			os.makedirs(modelN_dir_next)
-	
-		header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
-		with open(header_file, 'w') as fout:
-			fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
-
-		subprocessStr =	'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
-		
-		subprocess.call(subprocessStr, shell=True)
+	print('==== train model without sp ====')
+	if not os.path.exists(os.path.join(output_dir, 'iter0')):
+		shutil.copytree(model0_dir, os.path.join(output_dir, 'iter0'))
+	niter = 1
+	for niter in range(1, 5):
+		timer_start = time.time()
+		hmm_n = 'iter' + str(niter)
+		hmm_n_pre = 'iter' + str(niter-1)
+		modeln_dir	   = os.path.join(output_dir, hmm_n)
+		modeln_dir_pre = os.path.join(output_dir, hmm_n_pre) 
 		
+		# re-estimation
+		fh.make_new_directory(modeln_dir)
+		pyhtk.re_estimation(
+			config_train,
+			os.path.join(modeln_dir_pre, 'proto39'),
+			os.path.join(modeln_dir_pre, hmmdefs_name), 
+			modeln_dir,
+			hcompv_scp, phonelist_txt,
+			mlf_file=mlf_file)
+		print("elapsed time: {}".format(time.time() - timer_start))
--- a/acoustic_model/test.txt
+++ b/acoustic_model/test.txt