diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 95b20ba..7b01e60 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj index 715d6c0..487d08f 100644 --- a/acoustic_model/acoustic_model.pyproj +++ b/acoustic_model/acoustic_model.pyproj @@ -51,6 +51,9 @@ + + Code + diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index c084686..77fd931 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -370,7 +370,8 @@ def ipa2asr(ipa): def ipa2htk(ipa): curr_dir = os.path.dirname(os.path.abspath(__file__)) translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) - + #translation_key_ipa2asr = np.load(r'c:\Users\Aki\source\repos\acoustic_model\acoustic_model\phoneset\fame_ipa2asr.npy').item(0) + ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py index 8f6bc90..723448d 100644 --- a/acoustic_model/fame_hmm.py +++ b/acoustic_model/fame_hmm.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd import fame_functions -from phoneset import fame_ipa, fame_asr +from phoneset import fame_ipa, fame_asr, fame_phonetics import defaultfiles as default sys.path.append(default.toolbox_dir) import file_handling as fh @@ -44,6 +44,9 @@ lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr') lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov') config_dir = os.path.join(default.htk_dir, 'config') +phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt') +tree_hed = os.path.join(config_dir, 'tree.hed') +quest_hed = os.path.join(config_dir, 'quests.hed') model_dir = os.path.join(default.htk_dir, 'model') model_mono0_dir = os.path.join(model_dir, 'mono0') @@ -57,7 +60,7 @@ lexicon_dir = os.path.join(default.htk_dir, 'lexicon') lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr') lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov') lexicon_htk = os.path.join(lexicon_dir, 'lex.htk') -#lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk') +lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk') feature_dir = os.path.join(default.htk_dir, 'mfc') fh.make_new_directory(feature_dir, existing_dir='leave') @@ -270,7 +273,7 @@ if train_monophone_without_sp: 'mfc', os.path.join(htk_stimmen_dir, 'word_lattice.ltc'), mlf_file=mlf_file_train, - lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic') + lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic') ) print("elapsed time: {}".format(time.time() - timer_start)) @@ -290,27 +293,27 @@ if add_sp: modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter)) modeln_dir = os.path.join(model_mono1sp_dir, 'iter0') - #hmmdefs_pre = os.path.join(modeln_dir_pre, 'hmmdefs') chtk.add_sp(modeln_dir_pre, modeln_dir) - print("elapsed time: {}".format(time.time() - timer_start)) - + + print('>>> re-estimation...') niter = chtk.re_estimation_until_saturated( model_mono1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train, os.path.join(htk_stimmen_dir, 'mfc'), 'mfc', os.path.join(htk_stimmen_dir, 'word_lattice.ltc'), mlf_file=mlf_file_train_with_sp, - lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), + lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), model_type='monophone_with_sp' ) - + print("elapsed time: {}".format(time.time() - timer_start)) + ## ======================= train model with re-aligned mlf ======================= if train_monophone_with_re_aligned_mlf: print('==== traina monophone with re-aligned mlf ====') + timer_start = time.time() print('>>> re-aligning the training data... ') - timer_start = time.time() niter = chtk.get_niter_max(model_mono1sp_dir) modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter)) chtk.make_aligned_label( @@ -326,7 +329,6 @@ if train_monophone_with_re_aligned_mlf: mlf_file_train_with_sp, hcompv_scp_train, hcompv_scp_train_updated) - print("elapsed time: {}".format(time.time() - timer_start)) print('>>> re-estimation... ') timer_start = time.time() @@ -341,7 +343,7 @@ if train_monophone_with_re_aligned_mlf: 'mfc', os.path.join(htk_stimmen_dir, 'word_lattice.ltc'), mlf_file=mlf_file_train_aligned, - lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), + lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), model_type='monophone_with_sp' ) print("elapsed time: {}".format(time.time() - timer_start)) @@ -350,7 +352,7 @@ if train_monophone_with_re_aligned_mlf: ## ======================= train triphone ======================= if train_triphone: print('==== traina triphone model ====') - #model_out_dir = os.path.join(model_dir, 'hmm1_tri', 'iter1') + timer_start = time.time() triphonelist_txt = os.path.join(config_dir, 'triphonelist.txt') triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf') @@ -385,7 +387,7 @@ if train_triphone: # 'mfc', # os.path.join(htk_stimmen_dir, 'word_lattice.ltc'), # mlf_file=triphone_mlf, - # lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), + # lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'), # model_type='triphone' # ) # @@ -409,8 +411,21 @@ if train_triphone: macros=os.path.join(_modeln_dir_pre, 'macros'), model_type='triphone') + print("elapsed time: {}".format(time.time() - timer_start)) + ## ======================= train triphone ======================= if train_triphone_tied: print('==== traina tied-state triphone ====') - \ No newline at end of file + timer_start = time.time() + + print('>>> making lexicon for triphone... ') + chtk.make_triphone_full(phonelist_full_txt, lexicon_htk_triphone) + + print('>>> making headers... ') + chtk.make_tree_header(tree_hed) + fame_phonetics.make_quests_hed(quest_hed) + + print("elapsed time: {}".format(time.time() - timer_start)) + + diff --git a/acoustic_model/fame_test.py b/acoustic_model/fame_test.py index c1a432e..a096bd3 100644 --- a/acoustic_model/fame_test.py +++ b/acoustic_model/fame_test.py @@ -109,30 +109,30 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr) ## check which letters are not coded in ascii. -print('asr phones which cannot be coded in ascii:\n') -for i in fame_asr.phoneset_short: - try: - i_encoded = i.encode("ascii") - #print("{0} --> {1}".format(i, i.encode("ascii"))) - except UnicodeEncodeError: - print(">>> {}".format(i)) +#print('asr phones which cannot be coded in ascii:\n') +#for i in fame_asr.phoneset_short: +# try: +# i_encoded = i.encode("ascii") +# #print("{0} --> {1}".format(i, i.encode("ascii"))) +# except UnicodeEncodeError: +# print(">>> {}".format(i)) -print("letters in the scripts which is not coded in ascii:\n") -for dataset in ['train', 'devel', 'test']: - timer_start = time.time() +#print("letters in the scripts which is not coded in ascii:\n") +#for dataset in ['train', 'devel', 'test']: +# timer_start = time.time() - script_list = os.path.join(default.fame_dir, 'data', dataset, 'text') - with open(script_list, "rt", encoding="utf-8") as fin: - scripts = fin.read().split('\n') +# script_list = os.path.join(default.fame_dir, 'data', dataset, 'text') +# with open(script_list, "rt", encoding="utf-8") as fin: +# scripts = fin.read().split('\n') - for line in scripts: - sentence = ' '.join(line.split(' ')[1:]) - sentence_htk = fame_functions.word2htk(sentence) +# for line in scripts: +# sentence = ' '.join(line.split(' ')[1:]) +# sentence_htk = fame_functions.word2htk(sentence) - #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0: - try: - sentence_htk = bytes(sentence_htk, 'ascii') - except UnicodeEncodeError: - print(sentence) - print(sentence_htk) +# #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0: +# try: +# sentence_htk = bytes(sentence_htk, 'ascii') +# except UnicodeEncodeError: +# print(sentence) +# print(sentence_htk) diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py index 398d2b3..18a5ff2 100644 --- a/acoustic_model/phoneset/fame_asr.py +++ b/acoustic_model/phoneset/fame_asr.py @@ -80,8 +80,11 @@ def phone_reduction(phones): Args: phones (list): list of phones. """ + if sum([phone in phones for phone in phones_to_be_removed]) != 0: + print('input includes phone(s) which is not defined in fame_asr.') + print('those phone(s) are removed.') return [reduction_key.get(i, i) for i in phones - if not i in phones_to_be_removed] + if i not in phones_to_be_removed] phoneset_short = list(set(phone_reduction(phoneset))) phoneset_short.sort() @@ -96,7 +99,7 @@ translation_key_asr2htk = { 'ṷ': 'u_', # on the analogy of German umlaut, 'e' is used. - 'ö': 'oe', 'ö:': 'oe:', + 'ö': 'oe', 'ö:': 'oe:', '' 'ü': 'ue', 'ü:': 'ue:', # on the analogy of Chinese... diff --git a/acoustic_model/phoneset/fame_ipa.py b/acoustic_model/phoneset/fame_ipa.py index 8859b9f..21645c9 100644 --- a/acoustic_model/phoneset/fame_ipa.py +++ b/acoustic_model/phoneset/fame_ipa.py @@ -61,7 +61,7 @@ phoneset = [ 'ɔⁿ', 'ɔ:', 'ɔ:ⁿ', - #'ɔ̈', # not included in lex.ipa + 'ɔ̈', # not included in lex.ipa 'ɔ̈.', 'ɔ̈:', diff --git a/acoustic_model/phoneset/fame_phonetics.py b/acoustic_model/phoneset/fame_phonetics.py new file mode 100644 index 0000000..067664b --- /dev/null +++ b/acoustic_model/phoneset/fame_phonetics.py @@ -0,0 +1,197 @@ +import sys +import os +os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') + +import fame_functions +from phoneset import fame_ipa, fame_asr +import convert_phoneset + + +## general +stop = 'p, b, t, d, k, g' +nasal = 'm, n, ŋ' +fricative = 's, z, f, v, h, x, j' +liquid = 'l, r' +vowel = 'a, a:, e:, i, i:, i̯, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈, ə, ɛ, ɛ:, ɪ, ɪ:' + +## consonant +c_front = 'p, b, m, f, v' +c_central = 't, d, n, s, z, l, r' +c_back = 'k, g, ŋ, h, x, j' + +fortis = 'p, t, k, f, s' +lenis = 'b, d, g, v, z, j' +neither_fortis_nor_lenis = 'm, n, ŋ, h, l, r, x' + +coronal = 't, d, n, s, z, l, r, j' +non_coronal = 'p, b, m, k, g, ŋ, f, v, h, x' + +anterior = 'p, b, m, t, d, n, f, v, s, z, l' +non_anterior = 'k, g, ŋ, h, x, j, r' + +continuent = 'm, n, ŋ, f, v, s, z, h, l, r' +non_continuent = 'p, b, t, d, k, g, x, j' + +strident = 's, z, j' +non_strident = 'f, v, h' +unstrident = 'p, b, t, d, m, n, ŋ, k, g, r, x' + +glide = 'h, l, r' +syllabic = 'm, l, ŋ' + +unvoiced = 'p, t, k, s, f, x, h' +voiced = 'b, d, g, z, v, m, n, ŋ, l, r, j' + +#affricate: ??? +non_affricate = 's, z, f, v' + +voiced_stop = 'b, d, g' +unvoiced_stop = 'p, t, k' +front_stop = 'p, b' +central_stop = 't, d' +back_stop = 'k, g' + +voiced_fricative = 'z, v' +unvoiced_fricative = 's, f' +front_fricative = 'f, v' +central_fricative = 's, z' +back_fricative = 'j' + + +## vowel +v_front = 'i, i:, i̯, ɪ, ɪ:, e:, ə, ɛ, ɛ:, a, a:' +v_central = 'ə, ɛ, ɛ:, a, a:' +v_back = 'u, u:, ü, ü:, ṷ, ɔ, ɔ:, ɔ̈, ö, ö:, o, o:' + +long = 'a:, e:, i:, o:, u:, ö:, ü:, ɔ:, ɛ:, ɪ:' +short = 'a, i, i̯, o, u, ṷ, ö, ü, ɔ, ɔ̈, ə, ɛ, ɪ' + +#Dipthong: ??? +#Front-Start: ??? +#Fronting: ??? + +high = 'i, i:, i̯, ɪ, ɪ: u, u:, ṷ, ə, e:, o, o:, ö, ö:, ü, ü:' +medium = 'e:, ə, ɛ, ɛ:, ɔ, ɔ:, ɔ̈, o, o:, ö, ö:' +low = 'a, a:, ɛ, ɛ:, ɔ, ɔ:, ɔ̈' + +rounded = 'a, a:, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈' +unrounded = 'i, i:, i̯, e:, ə, ɛ, ɛ:, ɪ, ɪ:' + +i_vowel = 'i, i:, i̯, ɪ, ɪ:' +e_vowel = 'e:,ə, ɛ, ɛ:' +a_vowel = 'a, a:' +o_vowel = 'o, o:, ö, ö:, ɔ, ɔ:, ɔ̈' +u_vowel = 'u, u:, ṷ, ü, ü:' + +## htk phoneset +phoneset = fame_asr.phoneset_htk + +## convert ipa group to htk format for quests.hed. +def _ipa2quest(R_or_L, ipa_text): + assert R_or_L in ['R', 'L'], print('the first argument should be either R or L.') + ipa_list = ipa_text.replace(' ', '').split(',') + if R_or_L == 'R': + quests_list = ['*+' + fame_functions.ipa2htk(ipa) for ipa in ipa_list] + else: + quests_list = [fame_functions.ipa2htk(ipa) + '-*' for ipa in ipa_list] + return ','.join(quests_list) + + +def make_quests_hed(quest_hed): + def _add_quests_item(R_or_L, item_name_, ipa_text): + assert R_or_L in ['R', 'L'], print('the first argument should be either R or L.') + item_name = R_or_L + '_' + item_name_ + with open(quest_hed, 'ab') as f: + f.write(bytes('QS "' + item_name + '"\t{ ' + _ipa2quest(R_or_L, ipa_text) + ' }\n', 'ascii')) + + if os.path.exists(quest_hed): + os.remove(quest_hed) + + for R_or_L in ['R', 'L']: + _add_quests_item(R_or_L, 'NonBoundary', '*') + _add_quests_item(R_or_L, 'Silence', 'sil') + + _add_quests_item(R_or_L, 'Stop', stop) + _add_quests_item(R_or_L, 'Nasal', nasal) + _add_quests_item(R_or_L, 'Fricative', fricative) + _add_quests_item(R_or_L, 'Liquid', liquid) + _add_quests_item(R_or_L, 'Vowel', vowel) + + _add_quests_item(R_or_L, 'C-Front', c_front) + _add_quests_item(R_or_L, 'C-Central', c_central) + _add_quests_item(R_or_L, 'C-Back', c_back) + + _add_quests_item(R_or_L, 'V-Front', v_front) + _add_quests_item(R_or_L, 'V-Central', v_central) + _add_quests_item(R_or_L, 'V-Back', v_back) + + _add_quests_item(R_or_L, 'Front', c_front + v_front) + _add_quests_item(R_or_L, 'Central', c_central + v_central) + _add_quests_item(R_or_L, 'Back', c_front + v_back) + + _add_quests_item(R_or_L, 'Fortis', fortis) + _add_quests_item(R_or_L, 'Lenis', lenis) + _add_quests_item(R_or_L, 'UnFortLenis', neither_fortis_nor_lenis) + + _add_quests_item(R_or_L, 'Coronal', coronal) + _add_quests_item(R_or_L, 'NonCoronal', non_coronal) + + _add_quests_item(R_or_L, 'Anterior', anterior) + _add_quests_item(R_or_L, 'NonAnterior', non_anterior) + + _add_quests_item(R_or_L, 'Continuent', continuent) + _add_quests_item(R_or_L, 'NonContinuent', non_continuent) + + _add_quests_item(R_or_L, 'Strident', strident) + _add_quests_item(R_or_L, 'NonStrident', non_strident) + _add_quests_item(R_or_L, 'UnStrident', unstrident) + + _add_quests_item(R_or_L, 'Glide', glide) + _add_quests_item(R_or_L, 'Syllabic', syllabic) + + _add_quests_item(R_or_L, 'Unvoiced-Cons', unvoiced) + _add_quests_item(R_or_L, 'Voiced-Cons', voiced) + _add_quests_item(R_or_L, 'Unvoiced-All', unvoiced + ', sil') + + _add_quests_item(R_or_L, 'Long', long) + _add_quests_item(R_or_L, 'Short', short) + + #_add_quests_item(R_or_L, 'Dipthong', xxx) + #_add_quests_item(R_or_L, 'Front-Start', xxx) + #_add_quests_item(R_or_L, 'Fronting', xxx) + + _add_quests_item(R_or_L, 'High', high) + _add_quests_item(R_or_L, 'Medium', medium) + _add_quests_item(R_or_L, 'Low', low) + + _add_quests_item(R_or_L, 'Rounded', rounded) + _add_quests_item(R_or_L, 'UnRounded', unrounded) + + #_add_quests_item(R_or_L, 'Affricative', rounded) + _add_quests_item(R_or_L, 'NonAffricative', non_affricate) + + _add_quests_item(R_or_L, 'IVowel', i_vowel) + _add_quests_item(R_or_L, 'EVowel', e_vowel) + _add_quests_item(R_or_L, 'AVowel', a_vowel) + _add_quests_item(R_or_L, 'OVowel', o_vowel) + _add_quests_item(R_or_L, 'UVowel', u_vowel) + + _add_quests_item(R_or_L, 'Voiced-Stop', voiced_stop) + _add_quests_item(R_or_L, 'UnVoiced-Stop', unvoiced_stop) + + _add_quests_item(R_or_L, 'Front-Stop', front_stop) + _add_quests_item(R_or_L, 'Central-Stop', central_stop) + _add_quests_item(R_or_L, 'Back-Stop', back_stop) + + _add_quests_item(R_or_L, 'Voiced-Fric', voiced_fricative) + _add_quests_item(R_or_L, 'UnVoiced-Fric', unvoiced_fricative) + + _add_quests_item(R_or_L, 'Front-Fric', front_fricative) + _add_quests_item(R_or_L, 'Central-Fric', central_fricative) + _add_quests_item(R_or_L, 'Back-Fric', back_fricative) + + for p in phoneset: + _add_quests_item(R_or_L, p, p) + + return + \ No newline at end of file