diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 0c5501e..67b05b0 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model.sln b/acoustic_model.sln
index 7a60eb3..69850c8 100644
--- a/acoustic_model.sln
+++ b/acoustic_model.sln
@@ -5,7 +5,21 @@ VisualStudioVersion = 15.0.26730.12
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "acoustic_model", "acoustic_model\acoustic_model.pyproj", "{4D8C8573-32F0-4A62-9E62-3CE5CC680390}"
 EndProject
-Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "forced_alignment", "..\forced_alignment\forced_alignment\forced_alignment.pyproj", "{92E4D819-38D0-467A-ABEE-09662EEAA084}"
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{3DCEA49A-8FD7-4255-A223-573DCD2595E0}"
+	ProjectSection(SolutionItems) = preProject
+		..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
+		..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
+		..\forced_alignment\forced_alignment\defaultfiles.py = ..\forced_alignment\forced_alignment\defaultfiles.py
+		..\forced_alignment\forced_alignment\forced_alignment.pyproj = ..\forced_alignment\forced_alignment\forced_alignment.pyproj
+		..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
+		..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
+		..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
+		..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py
+		..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py
+		..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py
+		..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py
+		..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py
+	EndProjectSection
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -15,8 +29,6 @@ Global
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{92E4D819-38D0-467A-ABEE-09662EEAA084}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{92E4D819-38D0-467A-ABEE-09662EEAA084}.Release|Any CPU.ActiveCfg = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/acoustic_model/__pycache__/acoustic_model_functions.cpython-36.pyc b/acoustic_model/__pycache__/acoustic_model_functions.cpython-36.pyc
index 7e0973a..99ef1fa 100644
Binary files a/acoustic_model/__pycache__/acoustic_model_functions.cpython-36.pyc and b/acoustic_model/__pycache__/acoustic_model_functions.cpython-36.pyc differ
diff --git a/acoustic_model/acoustic_model.py b/acoustic_model/acoustic_model.py
index 10f01af..f8f6222 100644
--- a/acoustic_model/acoustic_model.py
+++ b/acoustic_model/acoustic_model.py
@@ -3,7 +3,9 @@ import sys
 import tempfile
 import configparser
 import subprocess
+from collections import Counter
+
+import numpy as np
 import pandas as pd
@@ -12,13 +14,26 @@ repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
 curr_dir = repo_dir + '\\acoustic_model'
 config_ini = curr_dir + '\\config.ini'
 output_dir = 'd:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
-forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced-alignment'
+forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
+
+dataset_list = ['devel', 'test', 'train']
+
+# procedure
+extract_features = 0
+make_feature_list = 0
+conv_lexicon = 0
+check_lexicon = 0
+make_mlf = 0
+combine_files = 0
+flat_start = 0
+train_model = 1
+forced_alignment = 0
+
 sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
 sys.path.append(forced_alignment_module)
 from forced_alignment import convert_phone_set
-
 import acoustic_model_functions as am_func
@@ -30,88 +45,294 @@ config.read(config_ini)
 config_hcopy = config['Settings']['config_hcopy']
 config_train = config['Settings']['config_train']
+mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
 FAME_dir = config['Settings']['FAME_dir']
-lexicon_file = FAME_dir + '\\lexicon\\lex.asr'
-dataset_list = ['devel', 'test', 'train']
+lex_asr = FAME_dir + '\\lexicon\\lex.asr'
+lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
+lex_oov = FAME_dir + '\\lexicon\\lex.oov'
+lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
+#lex_ipa = FAME_dir + '\\lexicon\\lex.ipa'
+#lex_ipa_ = FAME_dir + '\\lexicon\\lex.ipa_'
+#lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
+lex_htk = FAME_dir + '\\lexicon\\lex_original.htk'
+lex_htk_ = FAME_dir + '\\lexicon\\lex.htk'
+
+hcompv_scp = output_dir + '\\scp\\combined.scp'
+combined_mlf = output_dir + '\\label\\combined.mlf'
+
+model_dir = output_dir + '\\model'
+model0_dir = model_dir + '\\hmm0'
+proto_init = model_dir + '\\proto38'
+proto_name = 'proto'
+phonelist = output_dir + '\\config\\phonelist_friesian.txt'
+hmmdefs_name = 'hmmdefs'
+
 ## ======================= extract features =======================
-##dataset = dataset_list[0]
-#for dataset in dataset_list:
-#    print(dataset)
+if extract_features:
+    print("==== extract features ====\n")
+
+    for dataset in dataset_list:
+        print(dataset)
 
-    ## make a script file for HCopy
-    #hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
-    #hcopy_scp.close()
+        # a script file for HCopy
+        hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+        hcopy_scp.close()
 
-    ## using the filelist in FAME! corpus
-    #feature_dir = output_dir + '\\mfc\\' + dataset
-    #am_func.make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp.name)
+        # get a list of features (hcopy.scp) from the filelist in FAME! corpus
+        feature_dir = output_dir + '\\mfc\\' + dataset
+        am_func.make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp.name)
 
-    ## extract features
-    #subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
-    #subprocess.call(subprocessStr, shell=True)
-
-    #os.remove(hcopy_scp.name)
+        # extract features
+        subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
+        subprocess.call(subprocessStr, shell=True)
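[Note] For reference, HCopy consumes a script file with one "source target" pair per line. A minimal sketch of the pairs make_hcopy_scp_from_filelist_in_fame is expected to emit — the wav-directory layout under FAME_dir is an assumption for illustration, not taken from this diff:

# Sketch only: the real pairing is produced by
# am_func.make_hcopy_scp_from_filelist_in_fame.
basenames = ['utt0001', 'utt0002']  # hypothetical utterance ids
with open('hcopy_sketch.scp', 'w') as fout:
    for basename in basenames:
        wav_file = FAME_dir + '\\wav\\devel\\' + basename + '.wav'   # assumed layout
        mfc_file = output_dir + '\\mfc\\devel\\' + basename + '.mfc'
        fout.write(wav_file + ' ' + mfc_file + '\n')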
 
 
 ## ======================= make a list of features =======================
-##dataset = dataset_list[2]
-#for dataset in dataset_list:
-#    print(dataset)
+if make_feature_list:
+    print("==== make a list of features ====\n")
 
-#    feature_dir = output_dir + '\\mfc\\' + dataset
-#    hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
+    for dataset in dataset_list:
+        print(dataset)
 
-#    am_func.make_filelist(feature_dir, hcompv_scp)
+        feature_dir = output_dir + '\\mfc\\' + dataset
+        hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
+
+        am_func.make_filelist(feature_dir, hcompv_scp)
 
-## ======================= check the phonemes used in the lexicon =======================
-phonelist = am_func.get_phonelist(lexicon_file) # 49
-phonelist_list = list(phonelist)
+## ======================= convert lexicon from ipa to fame_htk =======================
+if conv_lexicon:
+    print('==== convert lexicon from ipa to fame ====\n')
 
-#lines_g1 = am_func.find_phone(lexicon_file, 'g')
-#lines_g2 = am_func.find_phone(lexicon_file, 'ɡ')
+    # lex.asr is the Kaldi-compatible version of lex.ipa.
+    # to check...
+    #lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
+    #with open(lex_ipa_, "w", encoding="utf-8") as fout:
+    #    for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
+    #        # ignore nasalization and '.'
+    #        pronunciation_ = pronunciation.replace(u'ⁿ', '')
+    #        pronunciation_ = pronunciation_.replace('.', '')
+    #        pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
+    #        fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
+
+    # convert each lexicon from ipa description to fame_htk phoneset.
+    am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
+    am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+
+    # combine lexicons
+    # pronunciations which are not found in lex.asr are generated using G2P and listed in lex.oov,
+    # so there is no overlap between lex_asr and lex_oov.
+    am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
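[Note] convert_phone_set.ipa2famehtk comes from the forced_alignment repo and its phone table is not part of this diff. Conceptually it is a longest-symbol-first substitution from IPA symbols to the FAME/HTK ASCII phone set; a sketch with a toy mapping (the real table is much larger and the output format here is assumed):

# Toy table for illustration only; the real IPA -> FAME/HTK mapping is
# defined in forced_alignment.convert_phone_set.
IPA2HTK_TOY = {'ø': 'oe', 'ɡ': 'g', 'ŋ': 'ng', 'oː': 'oo', 's': 's'}

def ipa2famehtk_sketch(pronunciation):
    """Greedy left-to-right replacement, longest IPA symbol first."""
    symbols = sorted(IPA2HTK_TOY, key=len, reverse=True)
    out = []
    i = 0
    while i < len(pronunciation):
        for symbol in symbols:
            if pronunciation.startswith(symbol, i):
                out.append(IPA2HTK_TOY[symbol])
                i += len(symbol)
                break
        else:
            i += 1  # unknown character: skip it
    return ' '.join(out)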
 
 
+## ======================= check if all the phones are successfully converted =======================
+if check_lexicon:
+    print("==== check if all the phones are successfully converted. ====\n")
+
+    # the phones used in the lexicon.
+    phonelist = am_func.get_phonelist(lex_htk)
+
+    # the lines which include a specific phone.
+    lines = am_func.find_phone(lex_asr, 'g')
+
+    # statistics over the lexicon
+    lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
+    pronunciation = lexicon_htk['pronunciation']
+    phones_all = []
+    for word in pronunciation:
+        phones_all = phones_all + word.split()
+    c = Counter(phones_all)
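[Note] Once c is built, the least frequent phones are the most likely conversion mistakes, so a follow-up like this (not in the original code) is a cheap sanity check:

# Hypothetical follow-up: rare phones usually point at conversion errors
# or lexicon typos worth inspecting by hand.
for phone, count in sorted(c.items(), key=lambda kv: kv[1])[:10]:
    print('{}\t{}'.format(phone, count))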
+ fmlf.write('\"*/{}.lab\"\n'.format(file_basename)) + #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS))) + for word_ in SCRIPT_WORDS: + if word_[0] == '\'': + word_ = '\\' + word_ + fmlf.write('{}\n'.format(word_)) + fmlf.write('.\n') + print("\n{0} has {1} samples.\n".format(dataset, i)) + np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words) + + fscp.close() + fmlf.close() -feature = features[0] -file_basename = os.path.basename(feature).replace('.mfc', '') + ## generate phone level transcription + print("generating phone level transcription...\n") + mkphones = output_dir + '\\label\\mkphones0.txt' + subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word + subprocess.call(subprocessStr, shell=True) + -# get words from scripts. -script = scripts[scripts.str.contains(file_basename)] -script_id = script.index[0] -script_txt = script.get(script_id) -script_words = script_txt.split(' ') -del script_words[0] +## ======================= combined scps and mlfs ======================= +if combine_files: + print("==== combine scps and mlfs ====\n") -# make the label file. -SCRIPT_WORDS = [] -script_prons = [] -all_prons_found = 1 -for word in script_words: - SCRIPT_WORDS.append(word.upper()) - extracted = lexicon[lexicon['word']==word] - script_prons.append(extracted) - all_prons_found *= len(extracted) -# make the dict file. + fscp = open(hcompv_scp, 'wt') + fmlf = open(combined_mlf, 'wt') -convert_phone_set.ipa2fame(phonelist_list) -phonelist_list + for dataset in dataset_list: + fmlf.write("#!MLF!#\n") + for dataset in dataset_list: + each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf' + each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp' + + with open(each_mlf, 'r') as fin: + lines = fin.read() + lines = lines.split('\n') + fmlf.write('\n'.join(lines[1:])) + with open(each_scp, 'r') as fin: + lines = fin.read() + fscp.write(lines) + + fscp.close() + fmlf.close() + + +## ======================= flat start monophones ======================= +if flat_start: + subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init + subprocess.call(subprocessStr, shell=True) + + # allocate mean & variance to all phones in the phone list + subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name + subprocess.call(subprocessStr, shell=True) + + +## ======================= estimate monophones ======================= +if train_model: + iter_num_max = 3 + for mix_num in [16, 32, 64, 128]: + for iter_num in range(1, iter_num_max+1): + print("===== mix{}, iter{} =====".format(mix_num, iter_num)) + iter_num_pre = iter_num - 1 + modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num) + if not os.path.exists(modelN_dir): + os.makedirs(modelN_dir) + + if iter_num == 1 and mix_num == 1: + modelN_dir_pre = model0_dir + else: + modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre) + + ## re-estimation + subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp + subprocess.call(subprocessStr, shell=True) + + mix_num_next = mix_num * 2 + modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0' + if not os.path.exists(modelN_dir_next): + os.makedirs(modelN_dir_next) + + header_file = modelN_dir + '\\mix' + 
 
 
+## ======================= flat start monophones =======================
+if flat_start:
+    subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
+    subprocess.call(subprocessStr, shell=True)
+
+    # allocate mean & variance to all phones in the phone list
+    subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name
+    subprocess.call(subprocessStr, shell=True)
+
+
+## ======================= estimate monophones =======================
+if train_model:
+    iter_num_max = 3
+    for mix_num in [16, 32, 64, 128]:
+        for iter_num in range(1, iter_num_max+1):
+            print("===== mix{}, iter{} =====".format(mix_num, iter_num))
+            iter_num_pre = iter_num - 1
+            modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
+            if not os.path.exists(modelN_dir):
+                os.makedirs(modelN_dir)
+
+            if iter_num == 1 and mix_num == 1:
+                modelN_dir_pre = model0_dir
+            else:
+                modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)
+
+            ## re-estimation
+            subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
+            subprocess.call(subprocessStr, shell=True)
+
+        mix_num_next = mix_num * 2
+        modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
+        if not os.path.exists(modelN_dir_next):
+            os.makedirs(modelN_dir_next)
+
+        header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
+        with open(header_file, 'w') as fout:
+            fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
+
+        subprocessStr = 'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
+        subprocess.call(subprocessStr, shell=True)
+
+
+### ======================= forced alignment =======================
+#if forced_alignment:
+#    try:
+#        scripts.run_command([
+#            'HVite','-T', '1', '-a', '-C', configHVite,
+#            '-H', AcousticModel, '-m', '-I',
+#            mlf_file, '-i', fa_file, '-S',
+#            script_file, htk_dict_file, filePhoneList
+#        ])
+#    except:
+#        print("\033[91mHVite command failed with these input files:\033[0m")
+#        print(_debug_show_file('HVite config', configHVite))
+#        print(_debug_show_file('Acoustic model', AcousticModel))
+#        print(_debug_show_file('Master Label file', mlf_file))
+#        print(_debug_show_file('Output', fa_file))
+#        print(_debug_show_file('Script file', script_file))
+#        print(_debug_show_file('HTK dictionary', htk_dict_file))
+#        print(_debug_show_file('Phoneme list', filePhoneList))
+#        raise
+
+
+##os.remove(hcopy_scp.name)
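[Note] A side note on the training loop above: since mix_num starts at 16, the `iter_num == 1 and mix_num == 1` branch never fires, so an initial hmm16-0 model is assumed to exist before this block runs. A sketch that just prints the directory schedule the loop walks, derived from the code above:

# Prints the HERest/HHEd schedule implied by the loop above.
iter_num_max = 3
prev = 'hmm16-0'  # assumed starting point; never created by this script
for mix_num in [16, 32, 64, 128]:
    for iter_num in range(1, iter_num_max + 1):
        cur = 'hmm{}-{}'.format(mix_num, iter_num)
        print('HERest: {} -> {}'.format(prev, cur))
        prev = cur
    mix_num_next = mix_num * 2
    nxt = 'hmm{}-0'.format(mix_num_next)
    print('HHEd (MU {}): {} -> {}'.format(mix_num_next, prev, nxt))
    prev = nxt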
""" + + with open(lexicon_file1, "rt", encoding="utf-8") as fin: + lines1 = fin.read() + lines1 = lines1.split('\n') + with open(lexicon_file2, "rt", encoding="utf-8") as fin: + lines2 = fin.read() + lines2 = lines2.split('\n') + + lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation']) + lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation']) + lex = pd.concat([lex1, lex2]) + lex = lex.sort_values(by='word', ascending=True) + lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t') \ No newline at end of file diff --git a/acoustic_model/config.ini b/acoustic_model/config.ini index dd7b4fc..e11c611 100644 --- a/acoustic_model/config.ini +++ b/acoustic_model/config.ini @@ -1,4 +1,5 @@ [Settings] config_hcopy = c:\cygwin64\home\Aki\acoustic_model\config\config.HCopy config_train = c:\cygwin64\home\Aki\acoustic_model\config\config.train +mkhmmdefs_pl = c:\cygwin64\home\Aki\acoustic_model\src\acoustic_model\mkhmmdefs.pl FAME_dir = d:\OneDrive\Research\rug\experiments\friesian\corpus \ No newline at end of file diff --git a/acoustic_model/performance_check.py b/acoustic_model/performance_check.py new file mode 100644 index 0000000..a3e66d9 --- /dev/null +++ b/acoustic_model/performance_check.py @@ -0,0 +1,22 @@ +### ======================= forced alignment ======================= +#if forced_alignment: +# try: +# scripts.run_command([ +# 'HVite','-T', '1', '-a', '-C', configHVite, +# '-H', AcousticModel, '-m', '-I', +# mlf_file, '-i', fa_file, '-S', +# script_file, htk_dict_file, filePhoneList +# ]) +# except: +# print("\033[91mHVite command failed with these input files:\033[0m") +# print(_debug_show_file('HVite config', configHVite)) +# print(_debug_show_file('Accoustic model', AcousticModel)) +# print(_debug_show_file('Master Label file', mlf_file)) +# print(_debug_show_file('Output', fa_file)) +# print(_debug_show_file('Script file', script_file)) +# print(_debug_show_file('HTK dictionary', htk_dict_file)) +# print(_debug_show_file('Phoneme list', filePhoneList)) +# raise + + +##os.remove(hcopy_scp.name)