acoustic_model/acoustic_model/fame_hmm.py

import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import tempfile
import shutil
import glob
#import configparser
#import subprocess
import time

import numpy as np
import pandas as pd

import fame_functions
from phoneset import fame_ipa, fame_asr
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk


## ======================= user define =======================
#repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
#curr_dir = repo_dir + '\\acoustic_model'
#config_ini = curr_dir + '\\config.ini'
#output_dir = 'C:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
#forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'

# Corpus partitions that every per-dataset stage below iterates over.
dataset_list = ['devel', 'test', 'train']

# Stage switches: a truthy value (1) runs the stage, 0 skips it.
# Each flag guards the `if <flag>:` section of the same name further down.
extract_features = 0   # HCopy feature extraction
make_lexicon = 0       # build the HTK lexicon from lex.asr / lex.oov
make_dictionary = 0    # per-utterance .dic/.lab files (author's note: 4800 sec)
make_htk_files = 1     # phone list and per-dataset HCompV script files
combine_files = 0      # the section this flag guarded is currently commented out
flat_start = 0         # HCompV flat start
train_model = 0        # HERest re-estimation / HHEd mixture splitting


## ======================= load variables =======================

# Source lexicons shipped under the FAME! corpus directory.
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_asr, lexicon_oov = (
	os.path.join(lexicon_dir, basename)
	for basename in ('lex.asr', 'lex.oov'))

# HTK-side lexicons: the two converted sources and the combined lexicon.
lexicon_htk_asr, lexicon_htk_oov, lexicon_htk = (
	os.path.join(default.htk_dir, 'lexicon', basename)
	for basename in ('lex.htk_asr', 'lex.htk_oov', 'lex.htk'))

# Edit script handed to pyhtk.create_dictionary_without_log in the
# dictionary stage.
global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')


#hcompv_scp = output_dir + '\\scp\\combined.scp'
#combined_mlf = output_dir + '\\label\\combined.mlf'

#model_dir  = output_dir + '\\model'
#model0_dir = model_dir + '\\hmm0'
#proto_init = model_dir + '\\proto38'
#proto_name = 'proto'
#phonelist  = output_dir + '\\config\\phonelist_friesian.txt'
#hmmdefs_name = 'hmmdefs'

# Working directories under the HTK root: extracted features, temporary
# files (HCompV script lists are written here) and labels.
feature_dir = os.path.join(default.htk_dir, 'mfc')
tmp_dir = os.path.join(default.htk_dir, 'tmp')
label_dir = os.path.join(default.htk_dir, 'label')

# exist_ok=True replaces the exists()-then-makedirs() pattern, which can
# fail if the directory appears between the check and the creation.
for _work_dir in (feature_dir, tmp_dir, label_dir):
	os.makedirs(_work_dir, exist_ok=True)


## ======================= extract features =======================
if extract_features:
	# For every dataset: build an HCopy script file listing the corpus
	# audio and the target feature paths, then run the feature extraction.

	for dataset in dataset_list:
		print('==== extract features on dataset {} ====\n'.format(dataset))

		# a script file for HCopy
		print(">>> making a script file for HCopy... \n")
		# delete=False + immediate close: only the unique file name is
		# needed here; the helper below writes to it by name.
		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
		hcopy_scp.close()

		try:
			# features of each dataset go into their own sub directory.
			feature_dir_ = os.path.join(feature_dir, dataset)
			os.makedirs(feature_dir_, exist_ok=True)

			# get a list of features (hcopy.scp) from the filelist in
			# FAME! corpus, then extract the features.
			print(">>> extracting features... \n")
			fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
			pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
		finally:
			# always remove the temporary script file, also when one of
			# the steps above raises.
			os.remove(hcopy_scp.name)


## ======================= make lexicon for HTK =======================
if make_lexicon:
	print('==== make lexicon for HTK ====\n')

	# Both source lexicons are converted from the fame_asr phoneset to the
	# fame_htk phoneset, each into its own HTK-side file.
	print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
	for lexicon_in, lexicon_out in (
			(lexicon_asr, lexicon_htk_asr),
			(lexicon_oov, lexicon_htk_oov)):
		fame_functions.lexicon_asr2htk(lexicon_in, lexicon_out)

	# Merge the two converted lexicons into one.
	# Pronunciations that are not found in lex.asr are generated using G2P
	# and listed in lex.oov, therefore lex_asr and lex_oov do not overlap.
	print('>>> combining lexicon files into one lexicon... \n')
	fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

	# Post-process the combined pronunciation dictionary (lex.htk).
	# According to the original author's notes HTK needs:
	#   (1) all tabs replaced with a single space;
	#   (2) a '\' before any dictionary entry beginning with a single quote.
	# http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
	# NOTE(review): presumably fix_single_quote covers these points - confirm
	# against fame_functions.
	fame_functions.fix_single_quote(lexicon_htk)


## ======================= make dic files =======================
if make_dictionary:
	# For every utterance that has a wav file, create an HTK dictionary
	# (<name>.dic) and a label file (<name>.lab) next to the wav file.
	for dataset in dataset_list:
		timer_start = time.time()
		print("==== generating HTK dictionary files on dataset {}\n".format(dataset))

		script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
		wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
		# when the file name is too long, HDMan command does not work.
		# therefore the dictionary is first written to this short temporary
		# name and renamed afterwards.
		dictionary_file = os.path.join(wav_dir, 'temp.dic')

		# list of scripts
		with open(script_list, "rt", encoding="utf-8") as fin:
			scripts = fin.read().split('\n')

		for line in scripts:
			# the trailing newline of the file yields an empty entry.
			if not line.strip():
				continue

			# sample line:
			# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
			# -> first token is the utterance id, the rest is the sentence;
			#    the file name is the id without its first '_'-separated field.
			filename_, _sep, sentence = line.partition(' ')
			filename = filename_.partition('_')[2]
			# NOTE(review): sentence_htk is computed but never used below -
			# confirm whether it (instead of `sentence`) should be passed on.
			sentence_htk = fame_functions.word2htk(sentence)

			wav_file = os.path.join(wav_dir, filename + '.wav')
			if not os.path.exists(wav_file):
				continue

			if pyhtk.create_dictionary_without_log(
				sentence, global_ded, dictionary_file, lexicon_htk) == 0:
				shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
				label_file = os.path.join(wav_dir, filename + '.lab')
				pyhtk.create_label_file(sentence, label_file)
			elif os.path.exists(dictionary_file):
				# a failed run may or may not leave a partial file behind;
				# only remove it when it is actually there.
				os.remove(dictionary_file)
		print("elapsed time: {}".format(time.time() - timer_start))

		# lexicon
		#lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])

		# list of features
		#with open(hcompv_scp) as fin:
		#	features = fin.read()
		#	features = features.split('\n')
		#i = 0
		#missing_words = []
		#fscp = open(hcompv_scp2, 'wt')
		#fmlf = open(mlf_word, "wt", encoding="utf-8")
		#fmlf.write("#!MLF!#\n")
		#feature_nr = 1
		#for feature in features:
		#	sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
		#	sys.stdout.flush()
		#	feature_nr += 1
		#	file_basename = os.path.basename(feature).replace('.mfc', '')

		#	# get words from scripts.
		#	try:
		#		script = scripts[scripts.str.contains(file_basename)]
		#	except IndexError:
		#		script = []

		#	if len(script) != 0:
		#		script_id  = script.index[0]
		#		script_txt = script.get(script_id)
		#		script_words = script_txt.split(' ')
		#		del script_words[0]

				# check if all words can be found in the lexicon.
		#		SCRIPT_WORDS = []
		#		script_prons = []
		#		is_in_lexicon = 1
		#		for word in script_words:
		#			WORD = word.upper()
		#			SCRIPT_WORDS.append(WORD)
		#			extracted = lexicon_htk[lexicon_htk['word']==WORD]
		#			if len(extracted) == 0:
	#					missing_words.append(word)
	#				script_prons.append(extracted)
	#				is_in_lexicon *= len(extracted)

				# if all pronunciations are found in the lexicon, update scp and mlf files.
	#			if is_in_lexicon:
					# add the feature filename into the .scp file.
	#				fscp.write("{}\n".format(feature))
	#				i += 1

					# add the words to the mlf file.
	#				fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
					#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
	#				for word_ in SCRIPT_WORDS:
	#					if word_[0] == '\'':
	#						word_ = '\\' + word_
	#					fmlf.write('{}\n'.format(word_))
	#				fmlf.write('.\n')
	#	print("\n{0} has {1} samples.\n".format(dataset, i))
	#	np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)

	#	fscp.close()
	#	fmlf.close()

## ======================= make other required files =======================
if make_htk_files:
	## phonelist
	phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
	pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)

	## hcomp_v.scp
	# One script file per dataset, listing a feature file for every
	# utterance that has a dictionary (.dic) next to its wav file.
	print(">>> making a script file for HCompV... \n")
	for dataset in dataset_list:
		wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)

		listdir = glob.glob(os.path.join(wav_dir, '*.dic'))
		# Build each path from the base name only: a plain str.replace on
		# the whole path would also rewrite '.dic' (or the wav_dir text)
		# wherever else it happens to occur in the path.
		# NOTE(review): the extraction stage passes feature_dir/<dataset> as
		# its output directory, while these paths point directly into
		# feature_dir with a '.fea' extension - confirm this matches the
		# files HCopy actually writes.
		filelist = [
			os.path.join(
				feature_dir,
				os.path.splitext(os.path.basename(dic_file))[0] + '.fea')
			for dic_file in listdir]

		hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
		# newline='\r\n' makes the text layer write Windows line endings.
		with open(hcompv_scp, 'wt', newline='\r\n') as f:
			f.write('\n'.join(filelist))


## hcomp_scp
# a script file for HCompV

	#	print("generating phone level transcription...\n")
	#	mkphones = output_dir + '\\label\\mkphones0.txt'
	#	subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
	#	subprocess.call(subprocessStr, shell=True)


## ======================= combined scps and mlfs =======================
#if combine_files:
#	print("==== combine scps and mlfs ====\n")

#	fscp = open(hcompv_scp, 'wt')
#	fmlf = open(combined_mlf, 'wt')

#	for dataset in dataset_list:
#		fmlf.write("#!MLF!#\n")
#		for dataset in dataset_list:
#			each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
#			each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'

#		with open(each_mlf, 'r') as fin:
#			lines = fin.read()
#			lines = lines.split('\n')
#		fmlf.write('\n'.join(lines[1:]))

#		with open(each_scp, 'r') as fin:
#			lines = fin.read()
#		fscp.write(lines)

#	fscp.close()
#	fmlf.close()


## ======================= flat start monophones =======================
if flat_start:
	# The file-level `import subprocess` is commented out, so the module is
	# imported here to keep this stage from failing with a NameError.
	import subprocess

	# NOTE(review): config_train, model0_dir, proto_init, mkhmmdefs_pl,
	# phonelist and hmmdefs_name are not defined in the visible part of this
	# script (their former definitions are commented out near the top), and
	# hcompv_scp is only bound when the make_htk_files stage ran in the same
	# process. They must be defined before this stage is enabled.

	# flat start: HCompV writes its output into model0_dir.
	subprocessStr = ' '.join([
		'HCompV', '-T', '1', '-C', config_train,
		'-m', '-v', '0.01',
		'-S', hcompv_scp,
		'-M', model0_dir,
		proto_init])
	subprocess.call(subprocessStr, shell=True)

	# allocate mean & variance to all phones in the phone list
	# (shell=True is required because of the '>' redirection).
	subprocessStr = ' '.join([
		'perl', mkhmmdefs_pl,
		os.path.join(model0_dir, 'proto38'),
		phonelist,
		'>', os.path.join(model0_dir, hmmdefs_name)])
	subprocess.call(subprocessStr, shell=True)


## ======================= estimate monophones =======================
if train_model:
	# The file-level `import subprocess` is commented out, so the module is
	# imported here to keep this stage from failing with a NameError.
	import subprocess

	# NOTE(review): model_dir, model0_dir, config_train, combined_mlf,
	# phonelist and hmmdefs_name are not defined in the visible part of this
	# script (their former definitions are commented out near the top), and
	# hcompv_scp is only bound when the make_htk_files stage ran in the same
	# process. They must be defined before this stage is enabled.

	iter_num_max = 3
	for mix_num in [128, 256, 512, 1024]:
		# iter_num_max rounds of HERest re-estimation for this mixture count;
		# round N reads hmm<mix>-<N-1> and writes hmm<mix>-<N>.
		for iter_num in range(1, iter_num_max+1):
			print("===== mix{}, iter{} =====".format(mix_num, iter_num))
			iter_num_pre = iter_num - 1
			modelN_dir = os.path.join(model_dir, 'hmm{}-{}'.format(mix_num, iter_num))
			os.makedirs(modelN_dir, exist_ok=True)

			# NOTE(review): with the mixture list above mix_num is never 1,
			# so the first round starts from hmm128-0, which has to exist
			# already - confirm this is intended.
			if iter_num == 1 and mix_num == 1:
				modelN_dir_pre = model0_dir
			else:
				modelN_dir_pre = os.path.join(model_dir, 'hmm{}-{}'.format(mix_num, iter_num_pre))

			## re-estimation
			subprocessStr = ' '.join([
				'HERest', '-T', '1', '-C', config_train,
				'-v', '0.01',
				'-I', combined_mlf,
				'-H', os.path.join(modelN_dir_pre, hmmdefs_name),
				'-M', modelN_dir,
				phonelist,
				'-S', hcompv_scp])
			subprocess.call(subprocessStr, shell=True)

		# double the number of mixtures: the last re-estimated model of this
		# round is edited by HHEd into hmm<2*mix>-0.
		mix_num_next = mix_num * 2
		modelN_dir_next = os.path.join(model_dir, 'hmm{}-0'.format(mix_num_next))
		os.makedirs(modelN_dir_next, exist_ok=True)

		# HHEd edit script with the mixture-up command.
		header_file = os.path.join(modelN_dir, 'mix{}.hed'.format(mix_num_next))
		with open(header_file, 'w') as fout:
			fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))

		subprocessStr = ' '.join([
			'HHEd', '-T', '1',
			'-H', os.path.join(modelN_dir, hmmdefs_name),
			'-M', modelN_dir_next,
			header_file,
			phonelist])

		subprocess.call(subprocessStr, shell=True)