acoustic_model/acoustic_model/fame_hmm.py

import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import tempfile
import shutil
import glob
import time

import numpy as np
import pandas as pd

import fame_functions
from phoneset import fame_ipa, fame_asr, fame_phonetics
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
#from scripts import run_command


## ======================= user define =======================
# procedure.
# every flag below switches one stage of this script on (1) or off (0).
combine_all = 1

make_lexicon = 0
make_label = 0  # it takes roughly 4800 sec on Surface pro 2.
make_mlf = 0
extract_features = 0
flat_start = 1
train_monophone_without_sp = 1
add_sp = 1
train_monophone_with_re_aligned_mlf = 1
increase_mixture = 1
train_triphone = 0
train_triphone_tied = 0


# pre-defined values.
dataset_list = ['devel', 'test', 'train']
feature_size = 30  # passed to pyhtk.HTK when the wrapper is created.
improvement_threshold = 0.3  # passed to re_estimation_until_saturated.

# lexicon files shipped with the corpus.
_fame_lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_asr = os.path.join(_fame_lexicon_dir, 'lex.asr')
lexicon_oov = os.path.join(_fame_lexicon_dir, 'lex.oov')

# HTK configuration files.
config_dir = os.path.join(default.htk_dir, 'config')
phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt')
tree_hed = os.path.join(config_dir, 'tree.hed')
quests_hed = os.path.join(config_dir, 'quests.hed')

# one directory per training stage, all located under <htk_dir>/model.
model_dir = os.path.join(default.htk_dir, 'model')
(
	model_mono0_dir,
	model_mono1_dir,
	model_mono1sp_dir,
	model_mono1sp2_dir,
	model_tri1_dir,
	model_tri1tied_dir,
) = [
	os.path.join(model_dir, _name)
	for _name in ('mono0', 'mono1', 'mono1sp', 'mono1sp2', 'tri1', 'tri1tied')
]

# directories / files to be made.
lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk')

feature_dir = os.path.join(default.htk_dir, 'mfc')
tmp_dir = os.path.join(default.htk_dir, 'tmp')
label_dir = os.path.join(default.htk_dir, 'label')
# working directories are requested with existing_dir='leave' on every run.
for _working_dir in (feature_dir, tmp_dir, label_dir):
	fh.make_new_directory(_working_dir, existing_dir='leave')


## training
# the training files are either the merged 'all' set or the 'train' set alone,
# depending on combine_all.
_train_prefix = 'all' if combine_all else 'train'
hcompv_scp_train = os.path.join(tmp_dir, _train_prefix + '.scp')
mlf_file_train = os.path.join(label_dir, _train_prefix + '_phone.mlf')
mlf_file_train_word = os.path.join(label_dir, _train_prefix + '_word.mlf')
mlf_file_train_with_sp = os.path.join(label_dir, _train_prefix + '_phone_with_sp.mlf')
mlf_file_train_aligned = os.path.join(label_dir, _train_prefix + '_phone_aligned.mlf')
triphone_mlf = os.path.join(label_dir, _train_prefix + '_triphone.mlf')
# script file written by update_script_file in the re-alignment stage.
hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')

## testing
htk_stimmen_dir = os.path.join(default.htk_dir, 'stimmen')


## ======================= make lexicon for HTK =======================
if make_lexicon:
	timer_start = time.time()
	print('==== making lexicon for HTK ====')

	# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
	print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
	fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
	fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

	# combine lexicon
	print('>>> combining lexicon files into one lexicon...')
	# pronunciations which are not found in lex.asr are generated using G2P and listed in lex.oov.
	# therefore there is no overlap between lex_asr and lex_oov.
	fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

	## fixing the lexicon for HTK.
	# (1) Replace all tabs with single space;
	# (2) Put a '\' before any dictionary entry beginning with single quote
	# http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
	print('>>> fixing the lexicon...')
	fame_functions.fix_lexicon(lexicon_htk)

	## adding sp to the lexicon for HTK.
	print('>>> adding sp to the lexicon...')
	with open(lexicon_htk) as f:
		# blank lines are dropped: they are not dictionary entries and would
		# otherwise end up as a bare ' sp' line in the output.
		entries = [line for line in f.read().split('\n') if line.strip()]
	# ' sp' is appended per entry (instead of joining on ' sp\n') so that the
	# last entry also receives it when lexicon_htk has no trailing newline.
	# the file is written as bytes, so '\n' is not translated by the platform.
	with open(lexicon_htk_with_sp, 'wb') as f:
		f.write(bytes(''.join(entry + ' sp\n' for entry in entries), 'ascii'))

	print("elapsed time: {}".format(time.time() - timer_start))
	

## initialize the instance for HTK.
# this wrapper object is used by every stage below.
# NOTE(review): it is created even when make_lexicon is 0, so lexicon_htk_with_sp
# presumably has to exist from an earlier run -- confirm against pyhtk.HTK.
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk_with_sp, feature_size)


## ======================= make label files =======================
if make_label:
	for dataset in dataset_list:
		timer_start = time.time()
		print("==== making label files on dataset {}".format(dataset))

		script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
		wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
		label_dir_ = os.path.join(label_dir, dataset)
		dictionary_file = os.path.join(label_dir_, 'temp.dic')
		fh.make_new_directory(label_dir_, existing_dir='leave')

		# list of scripts: one utterance per line.
		with open(script_list, "rt", encoding="utf-8") as fin:
			scripts = fin.read().split('\n')

		for line in scripts:
			# sample line:
			# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
			# -> everything before the first space is the utterance id,
			#    everything after it is the sentence.
			utterance_id, _, sentence = line.partition(' ')
			# the first '_'-separated field of the id is not part of the file name.
			filename = utterance_id.partition('_')[2]
			sentence_htk = fame_functions.word2htk(sentence)

			wav_file = os.path.join(wav_dir_, filename + '.wav')
			if not os.path.exists(wav_file):
				continue
			if chtk.can_be_ascii(sentence_htk) != 0:
				continue

			n_missing = chtk.get_number_of_missing_words(sentence_htk, dictionary_file)
			if n_missing != 0:
				# the utterance is skipped; discard its temporary dictionary.
				os.remove(dictionary_file)
				continue

			# when the file name is too long, HDMan command does not work.
			# therefore first temporary dictionary_file is made, then renamed.
			shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))

			label_file = os.path.join(label_dir_, filename + '.lab')
			chtk.make_label_file(sentence_htk, label_file)

		print("elapsed time: {}".format(time.time() - timer_start))


## ======================= make master label files =======================
if make_mlf:
	timer_start = time.time()
	print("==== making master label files ====")

	# train_2002_gongfansaken_10347.lab is empty. should be removed,
	# together with the dictionary file that belongs to it.
	empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab')
	empty_dic_file = empty_lab_file.replace('.lab', '.dic')
	for _stale_file in (empty_lab_file, empty_dic_file):
		if os.path.exists(_stale_file):
			os.remove(_stale_file)

	for dataset in dataset_list:
		feature_dir_ = os.path.join(feature_dir, dataset)  # not used in this stage.
		label_dir_ = os.path.join(label_dir, dataset)
		mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
		mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
		mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')

		print(">>> generating a word level mlf file for {}...".format(dataset))
		chtk.label2mlf(label_dir_, mlf_word)

		# two phone level mlfs are derived from the word level one:
		# first without sp, then with sp.
		print(">>> generating a phone level mlf file for {}...".format(dataset))
		chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
		chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)

	print("elapsed time: {}".format(time.time() - timer_start))


## ======================= extract features =======================
if extract_features:
	for dataset in dataset_list:
		timer_start = time.time()
		print('==== extract features on dataset {} ===='.format(dataset))

		wav_dir_	 = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
		label_dir_   = os.path.join(label_dir, dataset)
		feature_dir_ = os.path.join(feature_dir, dataset)
		fh.make_new_directory(feature_dir_, existing_dir='delete')

		# a script file for HCopy.
		# the temporary file is created and closed right away; only its name is
		# used, and the file is re-opened below to write the list.
		print(">>> making a script file for HCopy...")
		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
		hcopy_scp.close()
		try:
			# get a list of features (hcopy.scp)
			# from the filelist in FAME! corpus.
			#fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
			# from the list of label files.
			# each line is '<wav file>\t<mfc file>'.
			lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
			feature_list = [
				os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
				+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
				for lab_file in lab_list]

			with open(hcopy_scp.name, 'wb') as f:
				f.write(bytes('\n'.join(feature_list), 'ascii'))

			# extract features.
			print(">>> extracting features on {}...".format(dataset))
			chtk.wav2mfc(hcopy_scp.name)
		finally:
			# the temporary script is removed even when writing it or the
			# feature extraction fails.
			os.remove(hcopy_scp.name)

		# make hcompv.scp.
		# one mfc path per .dic file found in the label directory.
		print(">>> making a script file for {}...".format(dataset))
		listdir    = glob.glob(os.path.join(label_dir_, '*.dic'))
		mfc_list   = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
		hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
		with open(hcompv_scp, 'wb') as f:
			f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))

		print("elapsed time: {}".format(time.time() - timer_start))

	# the stimmen script file does not depend on the dataset, so its features
	# are extracted once here instead of once per dataset.
	timer_start = time.time()
	print(">>> extracting features on stimmen...")
	chtk.wav2mfc(os.path.join(htk_stimmen_dir, 'hcopy.scp'))
	print("elapsed time: {}".format(time.time() - timer_start))


## ======================= combine datasets =======================
if combine_all:
	# (directory, per-dataset file name pattern, combined output file)
	_combine_jobs = [
		(tmp_dir, '{}.scp', hcompv_scp_train),  # script files.
		(label_dir, '{}_phone.mlf', mlf_file_train),  # phone level mlfs.
		(label_dir, '{}_phone_with_sp.mlf', mlf_file_train_with_sp),  # phone level mlfs with sp.
		(label_dir, '{}_word.mlf', mlf_file_train_word),  # word level mlfs.
	]
	for _source_dir, _pattern, _combined_file in _combine_jobs:
		# step 1: devel + test -> combined file.
		fh.concatenate(
			os.path.join(_source_dir, _pattern.format('devel')),
			os.path.join(_source_dir, _pattern.format('test')),
			_combined_file
			)
		# step 2: combined file + train -> combined file.
		fh.concatenate(
			_combined_file,
			os.path.join(_source_dir, _pattern.format('train')),
			_combined_file
			)


## ======================= flat start monophones =======================
if flat_start:
	timer_start = time.time()
	print('==== flat start ====')

	fh.make_new_directory(model_mono0_dir, existing_dir='leave')
	chtk.flat_start(hcompv_scp_train, model_mono0_dir)

	# make macros, but only when a vFloors file is present in the model
	# directory after the flat start.
	vFloors = os.path.join(model_mono0_dir, 'vFloors')
	vfloors_available = os.path.exists(vFloors)
	if vfloors_available:
		chtk.make_macros(vFloors)

	# allocate mean & variance to all phones in the phone list
	print('>>> allocating mean & variance to all phones in the phone list...')
	chtk.make_hmmdefs(model_mono0_dir)

	elapsed = time.time() - timer_start
	print("elapsed time: {}".format(elapsed))


## ======================= train model without short pause =======================
if train_monophone_without_sp:
	print('==== train monophone without sp ====')

	timer_start = time.time()

	# files of the stimmen set which are handed to the re-estimation routine.
	_stimmen_mfc_dir = os.path.join(htk_stimmen_dir, 'mfc')
	_stimmen_lattice = os.path.join(htk_stimmen_dir, 'word_lattice.ltc')
	_stimmen_lexicon = os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')

	# start from the flat-start model (mono0) and write the result into mono1.
	niter = chtk.re_estimation_until_saturated(
		model_mono1_dir,
		model_mono0_dir,
		improvement_threshold,
		hcompv_scp_train,
		_stimmen_mfc_dir,
		'mfc',
		_stimmen_lattice,
		mlf_file=mlf_file_train,
		lexicon=_stimmen_lexicon
		)

	elapsed = time.time() - timer_start
	print("elapsed time: {}".format(elapsed))


## ======================= adding sp to the model =======================
if add_sp:
	print('==== adding sp to the model ====')
	# reference:
	# http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
	timer_start = time.time()

	# make model with sp:
	# the source is the highest iteration found in mono1, the result becomes
	# iter0 of mono1sp.
	print('>>> adding sp state to the last model in the previous step...')
	fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
	niter = chtk.get_niter_max(model_mono1_dir)
	modeln_dir_pre = os.path.join(model_mono1_dir, 'iter' + str(niter))
	modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')
	chtk.add_sp(modeln_dir_pre, modeln_dir)

	print('>>> re-estimation...')
	_stimmen_mfc_dir = os.path.join(htk_stimmen_dir, 'mfc')
	_stimmen_lattice = os.path.join(htk_stimmen_dir, 'word_lattice.ltc')
	_stimmen_lexicon = os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
	# from here on the phone level mlf with sp is used.
	niter = chtk.re_estimation_until_saturated(
		model_mono1sp_dir,
		modeln_dir,
		improvement_threshold,
		hcompv_scp_train,
		_stimmen_mfc_dir,
		'mfc',
		_stimmen_lattice,
		mlf_file=mlf_file_train_with_sp,
		lexicon=_stimmen_lexicon,
		model_type='monophone_with_sp'
		)
	print("elapsed time: {}".format(time.time() - timer_start))
	

## ======================= train model with re-aligned mlf =======================
if train_monophone_with_re_aligned_mlf:
	print('==== traina monophone with re-aligned mlf ====')
	timer_start = time.time()

	# the alignment is made with the highest iteration found in mono1sp.
	print('>>> re-aligning the training data... ')
	niter = chtk.get_niter_max(model_mono1sp_dir)
	modeln_dir = os.path.join(model_mono1sp_dir, 'iter' + str(niter))
	_macros_file = os.path.join(modeln_dir, 'macros')
	_hmmdefs_file = os.path.join(modeln_dir, 'hmmdefs')
	chtk.make_aligned_label(
		_macros_file,
		_hmmdefs_file,
		mlf_file_train_aligned,
		mlf_file_train_word,
		hcompv_scp_train)
	chtk.fix_mlf(mlf_file_train_aligned)

	# writes hcompv_scp_train_updated, which is the script file used for
	# training from this stage on.
	print('>>> updating the script file... ')
	chtk.update_script_file(
		mlf_file_train_aligned,
		mlf_file_train_with_sp,
		hcompv_scp_train,
		hcompv_scp_train_updated)

	print('>>> re-estimation... ')
	# the timer is restarted here, so the elapsed time printed below covers
	# the re-estimation only.
	timer_start = time.time()
	fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
	niter = chtk.get_niter_max(model_mono1sp_dir)
	_initial_model_dir = os.path.join(model_mono1sp_dir, 'iter' + str(niter))
	_stimmen_mfc_dir = os.path.join(htk_stimmen_dir, 'mfc')
	_stimmen_lattice = os.path.join(htk_stimmen_dir, 'word_lattice.ltc')
	_stimmen_lexicon = os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
	niter = chtk.re_estimation_until_saturated(
		model_mono1sp2_dir,
		_initial_model_dir,
		improvement_threshold,
		hcompv_scp_train_updated,
		_stimmen_mfc_dir,
		'mfc',
		_stimmen_lattice,
		mlf_file=mlf_file_train_aligned,
		lexicon=_stimmen_lexicon,
		model_type='monophone_with_sp'
		)
	print("elapsed time: {}".format(time.time() - timer_start))


## ======================= increase mixture =======================
if increase_mixture:
	print('==== increase mixture ====')
	timer_start = time.time()

	# every mixture stage starts from the last iteration of the previous
	# stage; the first one (2 mixtures) starts from mono1sp2.
	modeln_dir_ = model_mono1sp2_dir
	for nmix in [2, 4, 8, 16]:
		modeln_dir	= os.path.join(model_dir, 'mono'+str(nmix))

		print('mixture: {}'.format(nmix))
		fh.make_new_directory(modeln_dir, existing_dir='delete')

		# the model with the increased number of mixtures is written to
		# <modeln_dir>/iter0; the macros file is copied over unchanged.
		niter = chtk.get_niter_max(modeln_dir_)
		chtk.increase_mixture(
			os.path.join(modeln_dir_, 'iter'+str(niter), 'hmmdefs'),
			nmix,
			os.path.join(modeln_dir, 'iter0'),
			model_type='monophone_with_sp')
		shutil.copy2(os.path.join(modeln_dir_, 'iter'+str(niter), 'macros'),
				  os.path.join(modeln_dir, 'iter0', 'macros'))

		# re-estimation has to start from the model that was just written
		# (<modeln_dir>/iter0), not from iter0 of the previous stage, which
		# still has the previous number of mixtures.
		#improvement_threshold = -10
		niter = chtk.re_estimation_until_saturated(
			modeln_dir,
			os.path.join(modeln_dir, 'iter0'),
			improvement_threshold,
			hcompv_scp_train_updated,
			os.path.join(htk_stimmen_dir, 'mfc'),
			'mfc',
			os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
			mlf_file=mlf_file_train_aligned,
			lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
			model_type='monophone_with_sp'
			)
		modeln_dir_ = modeln_dir

	print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train triphone =======================
# triphone_mlf is passed as mlf_file to the re-estimation calls of both
# triphone stages below.
# NOTE(review): this call is not guarded by train_triphone / train_triphone_tied,
# so it runs on every execution of the script -- confirm that this is intended.
print('>>> making triphone list... ')
chtk.make_triphonelist( 
	mlf_file_train_aligned,
	triphone_mlf)

if train_triphone:
	print('==== train triphone model ====')
	timer_start = time.time()

	# the triphone model is initialized from the highest iteration found in
	# mono1sp2 and stored as iter0 of tri1.
	print('>>> init triphone model... ')
	niter = chtk.get_niter_max(model_mono1sp2_dir)
	_tri1_iter0_dir = os.path.join(model_tri1_dir, 'iter0')
	fh.make_new_directory(_tri1_iter0_dir, existing_dir='leave')
	chtk.init_triphone(
		os.path.join(model_mono1sp2_dir, 'iter' + str(niter)),
		_tri1_iter0_dir
		)

	print('>>> re-estimation... ')
	## I wanted to train until saturated, i.e. to call
	# chtk.re_estimation_until_saturated(..., mlf_file=triphone_mlf, model_type='triphone')
	# with model_tri1_dir as the output directory and its iter0 as the initial model,
	# but because the data size is limited, some triphone cannot be trained and received the error:
	#   ERROR [+8231]  GetHCIModel: Cannot find hmm [i:-]r[+???]
	# therefore a fixed number of re-estimation passes (iter1, iter2, iter3) is performed.
	output_dir = model_tri1_dir
	_modeln_dir_pre = os.path.join(output_dir, 'iter0')
	for niter in (1, 2, 3):
		_modeln_dir = os.path.join(output_dir, 'iter' + str(niter))

		fh.make_new_directory(_modeln_dir, 'leave')
		# each pass reads hmmdefs / macros of the previous iteration.
		chtk.re_estimation(
			os.path.join(_modeln_dir_pre, 'hmmdefs'),
			_modeln_dir,
			hcompv_scp_train_updated,
			mlf_file=triphone_mlf,
			macros=os.path.join(_modeln_dir_pre, 'macros'),
			model_type='triphone')
		_modeln_dir_pre = _modeln_dir

	print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train tied-state triphones =======================
if train_triphone_tied:
	print('==== train tied-state triphones ====')
	timer_start = time.time()

	print('>>> making lexicon for triphone... ')
	chtk.make_lexicon_triphone(phonelist_full_txt, lexicon_htk_triphone)
	chtk.combine_phonelists(phonelist_full_txt)

	# the last iteration of the (untied) triphone model is the source for both
	# the stats file and the initial tied-state model.
	niter = chtk.get_niter_max(model_tri1_dir)
	model_tri1_last_dir = os.path.join(model_tri1_dir, 'iter'+str(niter))

	print('>>> making a tree header... ')
	fame_phonetics.make_quests_hed(quests_hed)
	# the stats file is taken from the tri1 model directory configured in this
	# script rather than from a machine-specific absolute path.
	stats = os.path.join(model_tri1_last_dir, 'stats')
	chtk.make_tree_header(tree_hed, quests_hed, stats, config_dir)

	print('>>> init triphone model... ')
	fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')
	chtk.init_triphone(
		model_tri1_last_dir,
		os.path.join(model_tri1tied_dir, 'iter0'),
		tied=True)

	# I wanted to train until saturated:
	#niter = chtk.re_estimation_until_saturated(
	#	model_tri1tied_dir, 
	#	os.path.join(model_tri1tied_dir, 'iter0'), 
	#	improvement_threshold, 
	#	hcompv_scp_train_updated, 
	#	os.path.join(htk_stimmen_dir, 'mfc'), 
	#	'mfc', 
	#	os.path.join(htk_stimmen_dir, 'word_lattice.ltc'), 
	#	mlf_file=triphone_mlf, 
	#	lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), 
	#	model_type='triphone'
	#	)
	#
	# but because the data size is limited, some triphone cannot be trained and received the error:
	#   ERROR [+8231]  GetHCIModel: Cannot find hmm [i:-]r[+???]
	# therefore only 3 times re-estimation is performed.
	output_dir = model_tri1tied_dir
	for niter in range(1, 4):
		hmm_n = 'iter' + str(niter)
		hmm_n_pre = 'iter' + str(niter-1)
		_modeln_dir	    = os.path.join(output_dir, hmm_n)
		_modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)

		fh.make_new_directory(_modeln_dir, 'leave')
		# each pass reads hmmdefs / macros of the previous iteration.
		chtk.re_estimation(
			os.path.join(_modeln_dir_pre, 'hmmdefs'),
			_modeln_dir,
			hcompv_scp_train_updated,
			mlf_file=triphone_mlf,
			macros=os.path.join(_modeln_dir_pre, 'macros'),
			model_type='triphone')

	print("elapsed time: {}".format(time.time() - timer_start))