import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import tempfile
import shutil
import glob
import time

import numpy as np
import pandas as pd

import fame_functions
from phoneset import fame_ipa, fame_asr, fame_phonetics
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
#from scripts import run_command


## ======================= user define =======================
# procedure
combine_all = 1

make_lexicon = 0
make_label = 0  # it takes roughly 4800 sec on Surface pro 2.
make_mlf = 0
extract_features = 0
flat_start = 1
train_monophone_without_sp = 1
add_sp = 1
train_monophone_with_re_aligned_mlf = 1
increase_mixture = 1
train_triphone = 0
train_triphone_tied = 0


# pre-defined values.
dataset_list = ['devel', 'test', 'train']
feature_size = 30
improvement_threshold = 0.3

lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')

config_dir = os.path.join(default.htk_dir, 'config')
phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt')
tree_hed = os.path.join(config_dir, 'tree.hed')
quests_hed = os.path.join(config_dir, 'quests.hed')

model_dir = os.path.join(default.htk_dir, 'model')
model_mono0_dir = os.path.join(model_dir, 'mono0')
model_mono1_dir = os.path.join(model_dir, 'mono1')
model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
model_tri1_dir = os.path.join(model_dir, 'tri1')
model_tri1tied_dir = os.path.join(model_dir, 'tri1tied')

# directories / files to be made.
lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk')

feature_dir = os.path.join(default.htk_dir, 'mfc')
fh.make_new_directory(feature_dir, existing_dir='leave')
tmp_dir = os.path.join(default.htk_dir, 'tmp')
fh.make_new_directory(tmp_dir, existing_dir='leave')
label_dir = os.path.join(default.htk_dir, 'label')
fh.make_new_directory(label_dir, existing_dir='leave')

## training
if combine_all:
    hcompv_scp_train = os.path.join(tmp_dir, 'all.scp')
    mlf_file_train = os.path.join(label_dir, 'all_phone.mlf')
    mlf_file_train_word = os.path.join(label_dir, 'all_word.mlf')
    mlf_file_train_with_sp = os.path.join(label_dir, 'all_phone_with_sp.mlf')
    mlf_file_train_aligned = os.path.join(label_dir, 'all_phone_aligned.mlf')
    triphone_mlf = os.path.join(label_dir, 'all_triphone.mlf')
else:
    hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
    mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
    mlf_file_train_word = os.path.join(label_dir, 'train_word.mlf')
    mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
    mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
    triphone_mlf = os.path.join(label_dir, 'train_triphone.mlf')

hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')

## testing
htk_stimmen_dir = os.path.join(default.htk_dir, 'stimmen')


## ======================= make lexicon for HTK =======================
if make_lexicon:
    timer_start = time.time()
    print('==== making lexicon for HTK ====')

    # convert each lexicon from fame_asr phoneset to fame_htk phoneset.
    print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
    fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
    fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

    # combine lexicon
    print('>>> combining lexicon files into one lexicon...')
    # pronunciations that are not found in lex.asr are generated using G2P and listed in lex.oov,
    # so there is no overlap between lex_asr and lex_oov.
    fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

    ## fixing the lexicon for HTK.
    # (1) Replace all tabs with a single space;
    # (2) Put a '\' before any dictionary entry beginning with a single quote.
    # http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
    print('>>> fixing the lexicon...')
    fame_functions.fix_lexicon(lexicon_htk)

    ## adding sp to the lexicon for HTK.
    print('>>> adding sp to the lexicon...')
    with open(lexicon_htk) as f:
        lines = f.read().split('\n')
    with open(lexicon_htk_with_sp, 'wb') as f:
        f.write(bytes(' sp\n'.join(lines), 'ascii'))

    print("elapsed time: {}".format(time.time() - timer_start))
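

## Illustrative sketch only: a plain-Python version of the two lexicon fixes listed
## above (replace tabs with a single space, escape entries beginning with a single quote).
## The pipeline itself relies on fame_functions.fix_lexicon; this helper is an assumption
## about what those fixes amount to and is never called in this script.
def fix_lexicon_sketch(lexicon_lines):
    """Illustrative sketch: tabs replaced by single spaces, leading quotes escaped."""
    fixed = []
    for line in lexicon_lines:
        line = line.replace('\t', ' ')  # (1) replace each tab with a single space
        if line.startswith("'"):
            line = '\\' + line          # (2) put a '\' before a leading single quote
        fixed.append(line)
    return fixed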


## initialize the instance for HTK.
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk_with_sp, feature_size)


## ======================= make label files =======================
if make_label:
    for dataset in dataset_list:
        timer_start = time.time()
        print("==== making label files on dataset {}".format(dataset))

        script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
        wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
        label_dir_ = os.path.join(label_dir, dataset)
        dictionary_file = os.path.join(label_dir_, 'temp.dic')
        fh.make_new_directory(label_dir_, existing_dir='leave')

        # list of scripts
        with open(script_list, "rt", encoding="utf-8") as fin:
            scripts = fin.read().split('\n')

        for line in scripts:
            # sample line:
            # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
            filename_ = line.split(' ')[0]
            filename = '_'.join(filename_.split('_')[1:])
            sentence = ' '.join(line.split(' ')[1:])
            sentence_htk = fame_functions.word2htk(sentence)

            wav_file = os.path.join(wav_dir_, filename + '.wav')
            if os.path.exists(wav_file) and chtk.can_be_ascii(sentence_htk) == 0:
                if chtk.get_number_of_missing_words(sentence_htk, dictionary_file) == 0:
                    # when the file name is too long, the HDMan command does not work,
                    # therefore a temporary dictionary_file is made first and then renamed.
                    shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))

                    label_file = os.path.join(label_dir_, filename + '.lab')
                    chtk.make_label_file(sentence_htk, label_file)
                else:
                    os.remove(dictionary_file)

        print("elapsed time: {}".format(time.time() - timer_start))


## ======================= make master label files =======================
if make_mlf:
    timer_start = time.time()
    print("==== making master label files ====")

    # train_2002_gongfansaken_10347.lab is empty and should be removed.
    empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab')
    empty_dic_file = empty_lab_file.replace('.lab', '.dic')

    if os.path.exists(empty_lab_file):
        os.remove(empty_lab_file)
    if os.path.exists(empty_dic_file):
        os.remove(empty_dic_file)

    for dataset in dataset_list:
        feature_dir_ = os.path.join(feature_dir, dataset)
        label_dir_ = os.path.join(label_dir, dataset)
        mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
        mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
        mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')

        print(">>> generating a word level mlf file for {}...".format(dataset))
        chtk.label2mlf(label_dir_, mlf_word)

        print(">>> generating a phone level mlf file for {}...".format(dataset))
        chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
        chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= extract features =======================
if extract_features:
    for dataset in dataset_list:
        timer_start = time.time()
        print('==== extract features on dataset {} ===='.format(dataset))

        wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
        label_dir_ = os.path.join(label_dir, dataset)
        feature_dir_ = os.path.join(feature_dir, dataset)
        fh.make_new_directory(feature_dir_, existing_dir='delete')

        # a script file for HCopy
        print(">>> making a script file for HCopy...")
        hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
        hcopy_scp.close()

        # get a list of features (hcopy.scp)
        # from the filelist in FAME! corpus.
        #fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
        # from the list of label files.
        lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
        feature_list = [
            os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
            + os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
            for lab_file in lab_list]

        #if os.path.exists(empty_mfc_file):
        #    os.remove(empty_mfc_file)
        with open(hcopy_scp.name, 'wb') as f:
            f.write(bytes('\n'.join(feature_list), 'ascii'))

        # extract features.
        print(">>> extracting features on {}...".format(dataset))
        chtk.wav2mfc(hcopy_scp.name)
        os.remove(hcopy_scp.name)

        # make hcompv.scp.
        print(">>> making a script file for {}...".format(dataset))
        listdir = glob.glob(os.path.join(label_dir_, '*.dic'))
        mfc_list = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc')
                    for filename in listdir]
        hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
        with open(hcompv_scp, 'wb') as f:
            f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))

    print(">>> extracting features on stimmen...")
    chtk.wav2mfc(os.path.join(htk_stimmen_dir, 'hcopy.scp'))

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= combine scp and mlf files =======================
if combine_all:
    # script files.
    fh.concatenate(
        os.path.join(tmp_dir, 'devel.scp'),
        os.path.join(tmp_dir, 'test.scp'),
        hcompv_scp_train
    )
    fh.concatenate(
        hcompv_scp_train,
        os.path.join(tmp_dir, 'train.scp'),
        hcompv_scp_train
    )

    # phone level mlfs.
    fh.concatenate(
        os.path.join(label_dir, 'devel_phone.mlf'),
        os.path.join(label_dir, 'test_phone.mlf'),
        mlf_file_train
    )
    fh.concatenate(
        mlf_file_train,
        os.path.join(label_dir, 'train_phone.mlf'),
        mlf_file_train
    )

    # phone level mlfs with sp.
    fh.concatenate(
        os.path.join(label_dir, 'devel_phone_with_sp.mlf'),
        os.path.join(label_dir, 'test_phone_with_sp.mlf'),
        mlf_file_train_with_sp
    )
    fh.concatenate(
        mlf_file_train_with_sp,
        os.path.join(label_dir, 'train_phone_with_sp.mlf'),
        mlf_file_train_with_sp
    )

    # word level mlfs.
    fh.concatenate(
        os.path.join(label_dir, 'devel_word.mlf'),
        os.path.join(label_dir, 'test_word.mlf'),
        mlf_file_train_word
    )
    fh.concatenate(
        mlf_file_train_word,
        os.path.join(label_dir, 'train_word.mlf'),
        mlf_file_train_word
    )


## ======================= flat start monophones =======================
if flat_start:
    timer_start = time.time()
    print('==== flat start ====')

    fh.make_new_directory(model_mono0_dir, existing_dir='leave')
    chtk.flat_start(hcompv_scp_train, model_mono0_dir)

    # make macros.
    vFloors = os.path.join(model_mono0_dir, 'vFloors')
    if os.path.exists(vFloors):
        chtk.make_macros(vFloors)

    # allocate mean & variance to all phones in the phone list.
    print('>>> allocating mean & variance to all phones in the phone list...')
    chtk.make_hmmdefs(model_mono0_dir)

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train model without short pause =======================
if train_monophone_without_sp:
    print('==== train monophone without sp ====')
    timer_start = time.time()

    niter = chtk.re_estimation_until_saturated(
        model_mono1_dir,
        model_mono0_dir,
        improvement_threshold,
        hcompv_scp_train,
        os.path.join(htk_stimmen_dir, 'mfc'),
        'mfc',
        os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
        mlf_file=mlf_file_train,
        lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
    )

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= adding sp to the model =======================
if add_sp:
    print('==== adding sp to the model ====')
    # reference:
    # http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
    timer_start = time.time()

    # make model with sp.
    print('>>> adding sp state to the last model in the previous step...')
    fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
    niter = chtk.get_niter_max(model_mono1_dir)
    modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter))
    modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')
    chtk.add_sp(modeln_dir_pre, modeln_dir)

    print('>>> re-estimation...')
    niter = chtk.re_estimation_until_saturated(
        model_mono1sp_dir,
        modeln_dir,
        improvement_threshold,
        hcompv_scp_train,
        os.path.join(htk_stimmen_dir, 'mfc'),
        'mfc',
        os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
        mlf_file=mlf_file_train_with_sp,
        lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
        model_type='monophone_with_sp'
    )

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train model with re-aligned mlf =======================
if train_monophone_with_re_aligned_mlf:
    print('==== train a monophone with re-aligned mlf ====')
    timer_start = time.time()

    print('>>> re-aligning the training data...')
    niter = chtk.get_niter_max(model_mono1sp_dir)
    modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter))
    chtk.make_aligned_label(
        os.path.join(modeln_dir, 'macros'),
        os.path.join(modeln_dir, 'hmmdefs'),
        mlf_file_train_aligned,
        mlf_file_train_word,
        hcompv_scp_train)
    chtk.fix_mlf(mlf_file_train_aligned)

    print('>>> updating the script file...')
    chtk.update_script_file(
        mlf_file_train_aligned,
        mlf_file_train_with_sp,
        hcompv_scp_train,
        hcompv_scp_train_updated)

    print('>>> re-estimation...')
    timer_start = time.time()
    fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
    niter = chtk.get_niter_max(model_mono1sp_dir)
    niter = chtk.re_estimation_until_saturated(
        model_mono1sp2_dir,
        os.path.join(model_mono1sp_dir, 'iter'+str(niter)),
        improvement_threshold,
        hcompv_scp_train_updated,
        os.path.join(htk_stimmen_dir, 'mfc'),
        'mfc',
        os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
        mlf_file=mlf_file_train_aligned,
        lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
        model_type='monophone_with_sp'
    )

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= increase mixture =======================
if increase_mixture:
    print('==== increase mixture ====')
    timer_start = time.time()

    for nmix in [2, 4, 8, 16]:
        if nmix == 2:
            modeln_dir_ = model_mono1sp2_dir
        else:
            modeln_dir_ = os.path.join(model_dir, 'mono'+str(nmix_))
        modeln_dir = os.path.join(model_dir, 'mono'+str(nmix))

        print('mixture: {}'.format(nmix))
        fh.make_new_directory(modeln_dir, existing_dir='delete')

        niter = chtk.get_niter_max(modeln_dir_)
        chtk.increase_mixture(
            os.path.join(modeln_dir_, 'iter'+str(niter), 'hmmdefs'),
            nmix,
            os.path.join(modeln_dir, 'iter0'),
            model_type='monophone_with_sp')
        shutil.copy2(os.path.join(modeln_dir_, 'iter'+str(niter), 'macros'),
                     os.path.join(modeln_dir, 'iter0', 'macros'))

        #improvement_threshold = -10
        niter = chtk.re_estimation_until_saturated(
            modeln_dir,
            os.path.join(modeln_dir, 'iter0'),
            improvement_threshold,
            hcompv_scp_train_updated,
            os.path.join(htk_stimmen_dir, 'mfc'),
            'mfc',
            os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
            mlf_file=mlf_file_train_aligned,
            lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
            model_type='monophone_with_sp'
        )

        nmix_ = nmix

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train triphone =======================
print('>>> making triphone list...')
chtk.make_triphonelist(
    mlf_file_train_aligned,
    triphone_mlf)

if train_triphone:
    print('==== train triphone model ====')
    timer_start = time.time()

    print('>>> init triphone model...')
    niter = chtk.get_niter_max(model_mono1sp2_dir)
    fh.make_new_directory(os.path.join(model_tri1_dir, 'iter0'), existing_dir='leave')
    chtk.init_triphone(
        os.path.join(model_mono1sp2_dir, 'iter'+str(niter)),
        os.path.join(model_tri1_dir, 'iter0')
    )

    print('>>> re-estimation...')
    ## I wanted to train until saturated:
    #niter = chtk.re_estimation_until_saturated(
    #    model_tri1_dir,
    #    os.path.join(model_tri1_dir, 'iter0'),
    #    improvement_threshold,
    #    hcompv_scp_train_updated,
    #    os.path.join(htk_stimmen_dir, 'mfc'),
    #    'mfc',
    #    os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
    #    mlf_file=triphone_mlf,
    #    lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
    #    model_type='triphone'
    #    )
    #
    # but because the data size is limited, some triphones cannot be trained and the error
    # ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
    # occurs; therefore re-estimation is performed only three times.
    output_dir = model_tri1_dir
    for niter in range(1, 4):
        hmm_n = 'iter' + str(niter)
        hmm_n_pre = 'iter' + str(niter-1)
        _modeln_dir = os.path.join(output_dir, hmm_n)
        _modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)

        fh.make_new_directory(_modeln_dir, 'leave')
        chtk.re_estimation(
            os.path.join(_modeln_dir_pre, 'hmmdefs'),
            _modeln_dir,
            hcompv_scp_train_updated,
            mlf_file=triphone_mlf,
            macros=os.path.join(_modeln_dir_pre, 'macros'),
            model_type='triphone')

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train tied-state triphones =======================
if train_triphone_tied:
    print('==== train tied-state triphones ====')
    timer_start = time.time()

    print('>>> making lexicon for triphone...')
    chtk.make_lexicon_triphone(phonelist_full_txt, lexicon_htk_triphone)
    chtk.combine_phonelists(phonelist_full_txt)

    print('>>> making a tree header...')
    fame_phonetics.make_quests_hed(quests_hed)
    stats = os.path.join(r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\model\tri1\iter3', 'stats')
    chtk.make_tree_header(tree_hed, quests_hed, stats, config_dir)

    print('>>> init triphone model...')
    niter = chtk.get_niter_max(model_tri1_dir)
    fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')
    chtk.init_triphone(
        os.path.join(model_tri1_dir, 'iter'+str(niter)),
        os.path.join(model_tri1tied_dir, 'iter0'),
        tied=True)

    # I wanted to train until saturated:
    #niter = chtk.re_estimation_until_saturated(
    #    model_tri1tied_dir,
    #    os.path.join(model_tri1tied_dir, 'iter0'),
    #    improvement_threshold,
    #    hcompv_scp_train_updated,
    #    os.path.join(htk_stimmen_dir, 'mfc'),
    #    'mfc',
    #    os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
    #    mlf_file=triphone_mlf,
    #    lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
    #    model_type='triphone'
    #    )
    #
    # but because the data size is limited, some triphones cannot be trained and the error
    # ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
    # occurs; therefore re-estimation is performed only three times.
    output_dir = model_tri1tied_dir
    for niter in range(1, 4):
        hmm_n = 'iter' + str(niter)
        hmm_n_pre = 'iter' + str(niter-1)
        _modeln_dir = os.path.join(output_dir, hmm_n)
        _modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)

        fh.make_new_directory(_modeln_dir, 'leave')
        chtk.re_estimation(
            os.path.join(_modeln_dir_pre, 'hmmdefs'),
            _modeln_dir,
            hcompv_scp_train_updated,
            mlf_file=triphone_mlf,
            macros=os.path.join(_modeln_dir_pre, 'macros'),
            model_type='triphone')

    print("elapsed time: {}".format(time.time() - timer_start))