diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 1238af1..712c255 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj index ebdbadc..715d6c0 100644 --- a/acoustic_model/acoustic_model.pyproj +++ b/acoustic_model/acoustic_model.pyproj @@ -4,7 +4,7 @@ 2.0 4d8c8573-32f0-4a62-9e62-3ce5cc680390 . - htk_vs_kaldi.py + fame_hmm.py . diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index 9d3992b..9ca7e0d 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -12,6 +12,10 @@ import defaultfiles as default import convert_phoneset from phoneset import fame_ipa, fame_asr +sys.path.append(default.toolbox_dir) +from htk import pyhtk + + #def read_fileFA(fileFA): # """ # read the result file of HTK forced alignment. @@ -371,4 +375,25 @@ def ipa2htk(ipa): asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) asr_splitted = fame_asr.phone_reduction(asr_splitted) htk_splitted = convert_phoneset.convert_phoneset(asr_splitted, fame_asr.translation_key_asr2htk) - return ''.join(htk_splitted) \ No newline at end of file + return ''.join(htk_splitted) + + +def performance_on_stimmen(stimmen_dir, hmmdefs): + #hmmdefs = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\model_\hmm1\iter20\hmmdefs' + #stimmen_dir = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\stimmen' + lattice_file = os.path.join(stimmen_dir, 'word_lattice.ltc') + hvite_scp = os.path.join(stimmen_dir, 'hvite.scp') + #fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hvite_scp, file_type='mfc') + hresult_scp = os.path.join(stimmen_dir, 'hresult.scp') + #fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hresult_scp, file_type='rec') + lexicon_file = os.path.join(stimmen_dir, 'lexicon_recognition.dic') + chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_file) + + result = chtk.recognition( + lattice_file, + hmmdefs, + hvite_scp + ) + per_sentence, per_word = chtk.calc_recognition_performance(hresult_scp) + + return per_sentence['accuracy'] \ No newline at end of file diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py index 7228c00..4e2f430 100644 --- a/acoustic_model/fame_hmm.py +++ b/acoustic_model/fame_hmm.py @@ -22,30 +22,27 @@ from htk import pyhtk # procedure make_lexicon = 0 make_label = 0 # it takes roughly 4800 sec on Surface pro 2. -make_htk_files = 0 +make_mlf = 0 extract_features = 0 flat_start = 0 train_model_without_sp = 0 add_sp = 0 train_model_with_sp = 0 -train_model_with_sp_align_mlf = 1 +train_model_with_sp_align_mlf = 0 +train_triphone = 0 # pre-defined values. - dataset_list = ['devel', 'test', 'train'] hmmdefs_name = 'hmmdefs' -proto_name = 'proto39' +proto_name = 'proto' lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr') lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov') config_dir = os.path.join(default.htk_dir, 'config') -config_hcopy = os.path.join(config_dir, 'config.HCopy') -config_train = os.path.join(config_dir, 'config.train') -global_ded = os.path.join(config_dir, 'global.ded') -mkphones_led = os.path.join(config_dir, 'mkphones.led') + sil_hed = os.path.join(config_dir, 'sil.hed') prototype = os.path.join(config_dir, proto_name) @@ -53,25 +50,20 @@ model_dir = os.path.join(default.htk_dir, 'model') # directories / files to be made. - lexicon_dir = os.path.join(default.htk_dir, 'lexicon') lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr') lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov') lexicon_htk = os.path.join(lexicon_dir, 'lex.htk') -phonelist_txt = os.path.join(config_dir, 'phonelist.txt') -model0_dir = os.path.join(model_dir, 'hmm0') -model1_dir = os.path.join(model_dir, 'hmm1') + +#model1_dir = os.path.join(model_dir, 'hmm1') feature_dir = os.path.join(default.htk_dir, 'mfc') -if not os.path.exists(feature_dir): - os.makedirs(feature_dir) +fh.make_new_directory(feature_dir, existing_dir='leave') tmp_dir = os.path.join(default.htk_dir, 'tmp') -if not os.path.exists(tmp_dir): - os.makedirs(tmp_dir) +fh.make_new_directory(tmp_dir, existing_dir='leave') label_dir = os.path.join(default.htk_dir, 'label') -if not os.path.exists(label_dir): - os.makedirs(label_dir) +fh.make_new_directory(label_dir, existing_dir='leave') ## training hcompv_scp_train = os.path.join(tmp_dir, 'train.scp') @@ -98,20 +90,21 @@ if make_lexicon: # therefore there is no overlap between lex_asr and lex_oov. fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk) - ## ======================= - ## manually make changes to the pronunciation dictionary and save it as lex.htk - ## ======================= + ## fixing the lexicon for HTK. # (1) Replace all tabs with single space; # (2) Put a '\' before any dictionary entry beginning with single quote - #http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html + # http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html print('>>> fixing the lexicon...') fame_functions.fix_lexicon(lexicon_htk) print("elapsed time: {}".format(time.time() - timer_start)) +## intialize the instance for HTK. +chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk) + + ## ======================= make label files ======================= if make_label: - # train_2002_gongfansaken_10347.lab is empty. should be removed. for dataset in dataset_list: timer_start = time.time() print("==== making label files on dataset {}".format(dataset)) @@ -120,7 +113,7 @@ if make_label: wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset) label_dir_ = os.path.join(label_dir, dataset) dictionary_file = os.path.join(label_dir_, 'temp.dic') - fh.make_new_directory(label_dir_) + fh.make_new_directory(label_dir_, existing_dir='leave') # list of scripts with open(script_list, "rt", encoding="utf-8") as fin: @@ -135,56 +128,48 @@ if make_label: sentence_htk = fame_functions.word2htk(sentence) wav_file = os.path.join(wav_dir_, filename + '.wav') - if os.path.exists(wav_file) and pyhtk.can_be_ascii(sentence_htk) == 0: - if pyhtk.create_dictionary_without_log( - sentence_htk, global_ded, dictionary_file, lexicon_htk) == 0: + if os.path.exists(wav_file) and chtk.can_be_ascii(sentence_htk) == 0: + if chtk.get_number_of_missing_words( + sentence_htk, dictionary_file) == 0: # when the file name is too long, HDMan command does not work. # therefore first temporary dictionary_file is made, then renamed. shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic')) label_file = os.path.join(label_dir_, filename + '.lab') - pyhtk.create_label_file(sentence_htk, label_file) + chtk.create_label_file(sentence_htk, label_file) else: os.remove(dictionary_file) + print("elapsed time: {}".format(time.time() - timer_start)) -## ======================= make other required files ======================= -if make_htk_files: +## ======================= make master label files ======================= +if make_mlf: timer_start = time.time() - print("==== making files required for HTK ====") + print("==== making master label files ====") - print(">>> making a phonelist...") - pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt) + # train_2002_gongfansaken_10347.lab is empty. should be removed. + empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab') + empty_dic_file = empty_lab_file.replace('.lab', '.dic') + + if os.path.exists(empty_lab_file): + os.remove(empty_lab_file) + if os.path.exists(empty_dic_file): + os.remove(empty_dic_file) for dataset in dataset_list: - wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset) + #wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset) feature_dir_ = os.path.join(feature_dir, dataset) label_dir_ = os.path.join(label_dir, dataset) mlf_word = os.path.join(label_dir, dataset + '_word.mlf') mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf') - #print(">>> making a script file for {}...".format(dataset)) - #listdir = glob.glob(os.path.join(wav_dir_, '*.dic')) - #mfc_list = [filename.replace(wav_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir] - #hcompv_scp = os.path.join(tmp_dir, dataset + '.scp') - #with open(hcompv_scp, 'wb') as f: - # f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii')) - - print(">>> making a mlf file for {}...".format(dataset)) - lab_list = glob.glob(os.path.join(label_dir_, '*.lab')) - with open(mlf_word, 'wb') as fmlf: - fmlf.write(bytes('#!MLF!#\n', 'ascii')) - for label_file in lab_list: - filename = os.path.basename(label_file) - fmlf.write(bytes('\"*/{}\"\n'.format(filename), 'ascii')) - with open(label_file) as flab: - lines = flab.read() - fmlf.write(bytes(lines + '.\n', 'ascii')) - - print(">>> generating phone level transcription for {}...".format(dataset)) - pyhtk.mlf_word2phone(lexicon_htk, mlf_phone, mlf_word, mkphones_led) - print("elapsed time: {}".format(time.time() - timer_start)) + print(">>> generating a word level mlf file for {}...".format(dataset)) + chtk.label2mlf(label_dir_, mlf_word) + print(">>> generating a phone level mlf file for {}...".format(dataset)) + chtk.mlf_word2phone(mlf_phone, mlf_word) + + print("elapsed time: {}".format(time.time() - timer_start)) ## ======================= extract features ======================= @@ -196,7 +181,7 @@ if extract_features: wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset) label_dir_ = os.path.join(label_dir, dataset) feature_dir_ = os.path.join(feature_dir, dataset) - fh.make_new_directory(feature_dir_) + fh.make_new_directory(feature_dir_, existing_dir='delete') # a script file for HCopy print(">>> making a script file for HCopy...") @@ -212,12 +197,15 @@ if extract_features: os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t' + os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc')) for lab_file in lab_list] + + if os.path.exists(empty_mfc_file): + os.remove(empty_mfc_file) with open(hcopy_scp.name, 'wb') as f: f.write(bytes('\n'.join(feature_list), 'ascii')) # extract features. print(">>> extracting features on {}...".format(dataset)) - pyhtk.wav2mfc(config_hcopy, hcopy_scp.name) + chtk.wav2mfc(hcopy_scp.name) os.remove(hcopy_scp.name) # make hcompv.scp. @@ -235,21 +223,18 @@ if extract_features: if flat_start: timer_start = time.time() print('==== flat start ====') - pyhtk.flat_start(config_train, hcompv_scp_train, model0_dir, prototype) + feature_size = 39 + model0_dir = os.path.join(model_dir, 'hmm0') + fh.make_new_directory(model0_dir, existing_dir='leave') + + chtk.flat_start(hcompv_scp_train, model0_dir, feature_size) # allocate mean & variance to all phones in the phone list print('>>> allocating mean & variance to all phones in the phone list...') - pyhtk.create_hmmdefs( + chtk.create_hmmdefs( os.path.join(model0_dir, proto_name), - os.path.join(model0_dir, 'hmmdefs'), - phonelist_txt) - - # make macros - print('>>> making macros...') - with open(os.path.join(model0_dir, 'vFloors')) as f: - lines = f.read() - with open(os.path.join(model0_dir, 'macros'), 'wb') as f: - f.write(bytes('~o 39\n' + lines, 'ascii')) + os.path.join(model0_dir, 'hmmdefs') + ) print("elapsed time: {}".format(time.time() - timer_start)) @@ -362,4 +347,24 @@ if train_model_with_sp_align_mlf: hcompv_scp_train, phonelist_txt, mlf_file=mlf_file_train_aligned, macros=os.path.join(modeln_dir_pre, 'macros')) - print("elapsed time: {}".format(time.time() - timer_start)) \ No newline at end of file + print("elapsed time: {}".format(time.time() - timer_start)) + + +# train triphone. +if train_triphone: + triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf') + macros = os.path.join(model_dir, 'hmm1_tri', 'iter0', 'macros') + hmmdefs = os.path.join(model_dir, 'hmm1_tri', 'iter0', 'hmmdefs') + model_out_dir = os.path.join(model_dir, 'hmm1_tri', 'iter1') + run_command([ + 'HERest', '-B', + '-C', config_train, + '-I', triphone_mlf, + '-t', '250.0', '150.0', '1000.0', + '-s', 'stats' + '-S', hcompv_scp_train, + '-H', macros, + '-H', hmmdefs, + '-M', model_out_dir, + os.path.join(config_dir, 'triphonelist.txt') + ]) \ No newline at end of file diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py index c35a42f..5297b79 100644 --- a/acoustic_model/htk_vs_kaldi.py +++ b/acoustic_model/htk_vs_kaldi.py @@ -53,7 +53,7 @@ from htk import pyhtk # procedure make_dic_file = 0 -make_HTK_files = 1 +make_HTK_files = 0 extract_features = 0 #make_htk_dict_files = 0 #do_forced_alignment_htk = 0 @@ -171,7 +171,7 @@ if make_HTK_files: filename = row['filename'].replace('.wav', '.lab') label_file = os.path.join(feature_dir, filename) with open(label_file, 'wb') as f: - label_string = 'START\n' + row['word'].upper() + '\nEND\n' + label_string = 'SILENCE\n' + row['word'].upper() + '\nSILENCE\n' f.write(bytes(label_string, 'ascii')) @@ -249,7 +249,7 @@ with open(hresult_scp, 'wb') as f: # calculate result performance = np.zeros((1, 2)) -for niter in range(1, 50): +for niter in range(50, 60): output = pyhtk.recognition( os.path.join(config_dir, 'config.rec'), lattice_file, @@ -265,6 +265,16 @@ for niter in range(1, 50): + #output = run_command_with_output([ + # 'HVite', '-T', '1', + # '-C', config_rec, + # '-w', lattice_file, + # '-H', hmm, + # dictionary_file, phonelist_txt, + # '-S', HVite_scp + #]) + + ## ======================= forced alignment using HTK ======================= if do_forced_alignment_htk: diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py index 6165d5c..398d2b3 100644 --- a/acoustic_model/phoneset/fame_asr.py +++ b/acoustic_model/phoneset/fame_asr.py @@ -128,7 +128,11 @@ translation_key_word2htk = { 'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue', } #[translation_key_word2htk.get(i, i) for i in not_in_ascii] - +#Stop: p, b, t, d, k, g +#Nasal: m, n, ng(ŋ) +#Fricative: s, z, f, v, h, x +#Liquid: l, r +#Vowel: a, a:, e:, i, i:, i_(i̯), o, o:, u, u:, u_(ṷ), oe(ö), oe:(ö:), ue(ü), ue:(ü:), O(ɔ), O:(ɔ:), Oe(ɔ̈), A(ə), E(ɛ), E:(ɛ:), I(ɪ), I:(ɪ:) ## the list of multi character phones. diff --git a/acoustic_model/stimmen_test.py b/acoustic_model/stimmen_test.py index 93546ca..f7911a7 100644 --- a/acoustic_model/stimmen_test.py +++ b/acoustic_model/stimmen_test.py @@ -77,4 +77,17 @@ for word in word_list: for key, value in zip(c.keys(), c.values()): if value > 3: pronunciations[key] = value - print(pronunciations) \ No newline at end of file + print(pronunciations) + + +monophone_mlf = os.path.join(default.htk_dir, 'label', 'train_phone_aligned.mlf') +triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf') +def filenames_in_mlf(file_mlf): + with open(file_mlf) as f: + lines_ = f.read().split('\n') + lines = [line for line in lines_ if len(line.split(' ')) == 1 and line != '.'] + filenames = [line.replace('"', '').replace('*/', '') for line in lines[1:-1]] + return filenames +filenames_mono = filenames_in_mlf(monophone_mlf) +filenames_tri = filenames_in_mlf(triphone_mlf) +