diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 7c3c3f1..95b20ba 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py
index 9f4e127..c084686 100644
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -345,6 +345,7 @@ def fix_lexicon(lexicon_file):
     for i in lex[lex['word'].str.startswith('\'')].index.values:
         lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
 
+    # to_csv does not work with a space separator, therefore all tabs have to be replaced manually afterwards.
     #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
     lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
 
diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py
index 99ac65e..8f6bc90 100644
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -25,11 +25,11 @@ make_label = 0 # it takes roughly 4800 sec on Surface pro 2.
 make_mlf = 0
 extract_features = 0
 flat_start = 0
-train_model_without_sp = 0
+train_monophone_without_sp = 0
 add_sp = 0
-train_model_with_re_aligned_mlf = 0
-train_triphone = 1
-
+train_monophone_with_re_aligned_mlf = 0
+train_triphone = 0
+train_triphone_tied = 1
 
 
 # pre-defined values.
@@ -46,16 +46,18 @@ lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
 
 config_dir = os.path.join(default.htk_dir, 'config')
 model_dir = os.path.join(default.htk_dir, 'model')
-model0_dir = os.path.join(model_dir, 'hmm0')
-model1_dir = os.path.join(model_dir, 'hmm1')
-model1sp_dir = os.path.join(model_dir, 'hmm1sp')
-model1sp2_dir = os.path.join(model_dir, 'hmm1sp2')
+model_mono0_dir = os.path.join(model_dir, 'mono0')
+model_mono1_dir = os.path.join(model_dir, 'mono1')
+model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
+model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
+model_tri1_dir = os.path.join(model_dir, 'tri1')
 
 # directories / files to be made.
 lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
 lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
 lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
 lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
+#lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
 
 feature_dir = os.path.join(default.htk_dir, 'mfc')
 fh.make_new_directory(feature_dir, existing_dir='leave')
@@ -68,6 +70,7 @@ fh.make_new_directory(label_dir, existing_dir='leave')
 ## training
 hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
 mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
+mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
 mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
 hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')
 
@@ -97,8 +100,17 @@ if make_lexicon:
     # http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
     print('>>> fixing the lexicon...')
     fame_functions.fix_lexicon(lexicon_htk)
-    print("elapsed time: {}".format(time.time() - timer_start))
 
+    ## add sp to the end of each line.
+    #print('>>> adding sp...')
+    #with open(lexicon_htk) as f:
+    #    lines = f.read().split('\n')
+    #lines = [line + ' sp' for line in lines]
+    #with open(lexicon_htk_with_sp, 'wb') as f:
+    #    f.write(bytes('\n'.join(lines), 'ascii'))
+
+    print("elapsed time: {}".format(time.time() - timer_start))
+
 
 ## initialize the instance for HTK.
 chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk, feature_size)
@@ -164,12 +176,15 @@ if make_mlf:
         label_dir_ = os.path.join(label_dir, dataset)
         mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
         mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
+        mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')
 
         print(">>> generating a word level mlf file for {}...".format(dataset))
         chtk.label2mlf(label_dir_, mlf_word)
 
         print(">>> generating a phone level mlf file for {}...".format(dataset))
-        chtk.mlf_word2phone(mlf_phone, mlf_word)
+        chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
+        chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)
+
         print("elapsed time: {}".format(time.time() - timer_start))
 
@@ -224,33 +239,33 @@ if extract_features:
 
 if flat_start:
     timer_start = time.time()
     print('==== flat start ====')
-    fh.make_new_directory(model0_dir, existing_dir='leave')
+    fh.make_new_directory(model_mono0_dir, existing_dir='leave')
 
-    chtk.flat_start(hcompv_scp_train, model0_dir)
+    chtk.flat_start(hcompv_scp_train, model_mono0_dir)
 
     # create macros.
-    vFloors = os.path.join(model0_dir, 'vFloors')
+    vFloors = os.path.join(model_mono0_dir, 'vFloors')
     if os.path.exists(vFloors):
         chtk.create_macros(vFloors)
 
     # allocate mean & variance to all phones in the phone list
     print('>>> allocating mean & variance to all phones in the phone list...')
     chtk.create_hmmdefs(
-        os.path.join(model0_dir, proto_name),
-        os.path.join(model0_dir, 'hmmdefs')
+        os.path.join(model_mono0_dir, proto_name),
+        os.path.join(model_mono0_dir, 'hmmdefs')
         )
 
     print("elapsed time: {}".format(time.time() - timer_start))
 
 
 ## ======================= train model without short pause =======================
-if train_model_without_sp:
-    print('==== train model without sp ====')
+if train_monophone_without_sp:
+    print('==== train monophone without sp ====')
 
     timer_start = time.time()
     niter = chtk.re_estimation_until_saturated(
-        model1_dir,
-        model0_dir, improvement_threshold, hcompv_scp_train,
+        model_mono1_dir,
+        model_mono0_dir, improvement_threshold, hcompv_scp_train,
         os.path.join(htk_stimmen_dir, 'mfc'),
         'mfc',
         os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
@@ -270,32 +285,34 @@ if add_sp:
     # make model with sp.
     print('>>> adding sp state to the last model in the previous step...')
-    fh.make_new_directory(model1sp_dir, existing_dir='leave')
-    niter = chtk.get_niter_max(model1_dir)
-    modeln_dir_pre = os.path.join(model1_dir, 'iter'+str(niter))
-    modeln_dir = os.path.join(model1sp_dir, 'iter0')
+    fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
+    niter = chtk.get_niter_max(model_mono1_dir)
+    modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter))
+    modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')
+
+    #hmmdefs_pre = os.path.join(modeln_dir_pre, 'hmmdefs')
 
     chtk.add_sp(modeln_dir_pre, modeln_dir)
     print("elapsed time: {}".format(time.time() - timer_start))
 
     niter = chtk.re_estimation_until_saturated(
-        model1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train,
+        model_mono1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train,
         os.path.join(htk_stimmen_dir, 'mfc'),
         'mfc',
         os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
-        mlf_file=mlf_file_train,
+        mlf_file=mlf_file_train_with_sp,
         lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
         model_type='monophone_with_sp'
         )
 
 
 ## ======================= train model with re-aligned mlf =======================
-if train_model_with_re_aligned_mlf:
-    print('==== traina model with re-aligned mlf ====')
+if train_monophone_with_re_aligned_mlf:
+    print('==== train a monophone with re-aligned mlf ====')
 
     print('>>> re-aligning the training data... ')
     timer_start = time.time()
-    niter = chtk.get_niter_max(model1sp_dir)
-    modeln_dir = os.path.join(model1sp_dir, 'iter'+str(niter))
+    niter = chtk.get_niter_max(model_mono1sp_dir)
+    modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter))
 
     chtk.make_aligned_label(
         os.path.join(modeln_dir, 'macros'),
         os.path.join(modeln_dir, 'hmmdefs'),
@@ -306,18 +323,18 @@ if train_model_with_re_aligned_mlf:
     print('>>> updating the script file... ')
     chtk.update_script_file(
         mlf_file_train_aligned,
-        mlf_file_train,
+        mlf_file_train_with_sp,
         hcompv_scp_train,
         hcompv_scp_train_updated)
     print("elapsed time: {}".format(time.time() - timer_start))
 
     print('>>> re-estimation... ')
     timer_start = time.time()
-    fh.make_new_directory(model1sp2_dir, existing_dir='leave')
-    niter = chtk.get_niter_max(model1sp_dir)
+    fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
+    niter = chtk.get_niter_max(model_mono1sp_dir)
     niter = chtk.re_estimation_until_saturated(
-        model1sp2_dir,
-        os.path.join(model1sp_dir, 'iter'+str(niter)),
+        model_mono1sp2_dir,
+        os.path.join(model_mono1sp_dir, 'iter'+str(niter)),
         improvement_threshold,
         hcompv_scp_train_updated,
         os.path.join(htk_stimmen_dir, 'mfc'),
@@ -332,25 +349,68 @@ if train_model_with_re_aligned_mlf:
 
 ## ======================= train triphone =======================
 if train_triphone:
-    model_out_dir = os.path.join(model_dir, 'hmm1_tri', 'iter1')
+    print('==== train a triphone model ====')
+    #model_out_dir = os.path.join(model_dir, 'hmm1_tri', 'iter1')
 
-    triphonelist_txt = os.path.join(config_dir, 'triphonelist_txt')
+    triphonelist_txt = os.path.join(config_dir, 'triphonelist.txt')
     triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')
 
+    print('>>> making triphone list... ')
     chtk.make_triphonelist(
         triphonelist_txt,
         triphone_mlf,
         mlf_file_train_aligned)
 
-    #run_command([
-    #    'HERest', '-B',
-    #    '-C', config_train,
-    #    '-I', triphone_mlf,
-    #    '-t', '250.0', '150.0', '1000.0',
-    #    '-s', 'stats'
-    #    '-S', hcompv_scp_train,
-    #    '-H', macros,
-    #    '-H', hmmdefs,
-    #    '-M', model_out_dir,
-    #    os.path.join(config_dir, 'triphonelist.txt')
-    #])
+    print('>>> making triphone header... ')
+    chtk.make_tri_hed(
+        os.path.join(config_dir, 'mktri.hed')
+        )
+
+    print('>>> init triphone model... ')
+    niter = chtk.get_niter_max(model_mono1sp2_dir)
+    fh.make_new_directory(os.path.join(model_tri1_dir, 'iter0'), existing_dir='leave')
+    chtk.init_triphone(
+        os.path.join(model_mono1sp2_dir, 'iter'+str(niter)),
+        os.path.join(model_tri1_dir, 'iter0')
+        )
+
+    print('>>> re-estimation... ')
+    # I wanted to train until saturated:
+    #
+    #niter = chtk.re_estimation_until_saturated(
+    #    model_tri1_dir,
+    #    os.path.join(model_tri1_dir, 'iter0'),
+    #    improvement_threshold,
+    #    hcompv_scp_train_updated,
+    #    os.path.join(htk_stimmen_dir, 'mfc'),
+    #    'mfc',
+    #    os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
+    #    mlf_file=triphone_mlf,
+    #    lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
+    #    model_type='triphone'
+    #    )
+    #
+    # but because the amount of data is limited, some triphones cannot be trained and the following error is raised:
+    # ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
+    # therefore only three rounds of re-estimation are performed.
+    output_dir = model_tri1_dir
+
+    for niter in range(1, 4):
+        hmm_n = 'iter' + str(niter)
+        hmm_n_pre = 'iter' + str(niter-1)
+        _modeln_dir = os.path.join(output_dir, hmm_n)
+        _modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
+
+        fh.make_new_directory(_modeln_dir, 'leave')
+        chtk.re_estimation(
+            os.path.join(_modeln_dir_pre, 'hmmdefs'),
+            _modeln_dir,
+            hcompv_scp_train_updated,
+            mlf_file=triphone_mlf,
+            macros=os.path.join(_modeln_dir_pre, 'macros'),
+            model_type='triphone')
+
+
+## ======================= train tied-state triphone =======================
+if train_triphone_tied:
+    print('==== train a tied-state triphone ====')
\ No newline at end of file
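
The `train_triphone_tied` branch added at the end of this diff is still a stub. For orientation, below is a minimal sketch of what this step typically looks like when following the standard HTK Book recipe: decision-tree state tying with HHEd, then a few rounds of HERest re-estimation on the tied models. The names model_tri1tied_dir, tree.hed, tiedlist, and config.train are hypothetical placeholders that do not appear in this diff; everything else reuses variables defined earlier in fame_hmm.py.

# Hypothetical sketch only, in the style of the HTK Book tied-state recipe.
# tree.hed, tiedlist, config.train and model_tri1tied_dir are placeholder names.
import os
import subprocess

model_tri1tied_dir = os.path.join(model_dir, 'tri1tied')  # hypothetical output directory
tree_hed = os.path.join(config_dir, 'tree.hed')           # hypothetical HHEd edit script (RO/TB/AU/CO/ST commands)
tiedlist = os.path.join(config_dir, 'tiedlist')           # tied model list, written by the CO command in tree.hed

# start from the last trained (untied) triphone model.
niter = chtk.get_niter_max(model_tri1_dir)
modeln_dir_pre = os.path.join(model_tri1_dir, 'iter' + str(niter))
fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')

# cluster the triphone states with a phonetic decision tree and tie them.
subprocess.run([
    'HHEd', '-B',
    '-H', os.path.join(modeln_dir_pre, 'macros'),
    '-H', os.path.join(modeln_dir_pre, 'hmmdefs'),
    '-M', os.path.join(model_tri1tied_dir, 'iter0'),
    tree_hed,
    triphonelist_txt], check=True)

# a few rounds of Baum-Welch re-estimation on the tied models.
for niter in range(1, 4):
    _modeln_dir = os.path.join(model_tri1tied_dir, 'iter' + str(niter))
    _modeln_dir_pre = os.path.join(model_tri1tied_dir, 'iter' + str(niter - 1))
    fh.make_new_directory(_modeln_dir, 'leave')
    subprocess.run([
        'HERest', '-B',
        '-C', os.path.join(config_dir, 'config.train'),   # hypothetical config file name
        '-I', triphone_mlf,
        '-t', '250.0', '150.0', '1000.0',
        '-S', hcompv_scp_train_updated,
        '-H', os.path.join(_modeln_dir_pre, 'macros'),
        '-H', os.path.join(_modeln_dir_pre, 'hmmdefs'),
        '-M', _modeln_dir,
        tiedlist], check=True)

State tying is also the usual remedy for the "GetHCIModel: Cannot find hmm" error noted in the comments above, since the AU command in tree.hed synthesizes models for triphones that never occur in the training data.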