diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 649e0b1..1238af1 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc index 545949d..7b3cfb7 100644 Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj index 715d6c0..ebdbadc 100644 --- a/acoustic_model/acoustic_model.pyproj +++ b/acoustic_model/acoustic_model.pyproj @@ -4,7 +4,7 @@ 2.0 4d8c8573-32f0-4a62-9e62-3ce5cc680390 . - fame_hmm.py + htk_vs_kaldi.py . diff --git a/acoustic_model/convert_phoneset.py b/acoustic_model/convert_phoneset.py index 7bc39f7..d575c99 100644 --- a/acoustic_model/convert_phoneset.py +++ b/acoustic_model/convert_phoneset.py @@ -38,3 +38,9 @@ def convert_phoneset(word_list, translation_key): translation_key (dict): """ return [translation_key.get(phone, phone) for phone in word_list] + + +def phone_reduction(phones, reduction_key): + multi_character_tokenize(wo.strip(), multi_character_phones) + return [reduction_key.get(i, i) for i in phones + if not i in phones_to_be_removed] \ No newline at end of file diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py index ef0dfd4..1e262f9 100644 --- a/acoustic_model/defaultfiles.py +++ b/acoustic_model/defaultfiles.py @@ -17,6 +17,7 @@ novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi') rug_dir = r'c:\OneDrive\Research\rug' experiments_dir = os.path.join(rug_dir, 'experiments') htk_dir = os.path.join(experiments_dir, 'acoustic_model', 'fame', 'htk') +kaldi_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', '_stimmen') stimmen_dir = os.path.join(experiments_dir, 'stimmen') # data diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index 10f16cd..9d3992b 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -321,9 +321,11 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out): lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8') -def fix_single_quote(lexicon_file): - """ add '\' before all single quote at the beginning of words. - convert special characters to ascii compatible characters. +def fix_lexicon(lexicon_file): + """ fix lexicon + - add '\' before all single quote at the beginning of words. + - convert special characters to ascii compatible characters. + - add silence. Args: lexicon_file (path): lexicon file, which will be overwitten. @@ -331,6 +333,12 @@ def fix_single_quote(lexicon_file): """ lex = load_lexicon(lexicon_file) lex = lex.dropna() # remove N/A. + + # add 'sil' + row = pd.Series(['SILENCE', 'sil'], index=lex.columns) + lex = lex.append(row, ignore_index=True) + lex = lex.sort_values(by='word', ascending=True) + for i in lex[lex['word'].str.startswith('\'')].index.values: lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'') # to_csv does not work with space seperator. therefore all tabs should manually be replaced. @@ -346,10 +354,11 @@ def word2htk(word): def ipa2asr(ipa): curr_dir = os.path.dirname(os.path.abspath(__file__)) translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) - + #ipa_ = fame_asr.phone_reduction(ipa) ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) + asr_splitted = fame_asr.phone_reduction(asr_splitted) return ''.join(asr_splitted) @@ -360,5 +369,6 @@ def ipa2htk(ipa): ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) + asr_splitted = fame_asr.phone_reduction(asr_splitted) htk_splitted = convert_phoneset.convert_phoneset(asr_splitted, fame_asr.translation_key_asr2htk) return ''.join(htk_splitted) \ No newline at end of file diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py index b3d1070..7228c00 100644 --- a/acoustic_model/fame_hmm.py +++ b/acoustic_model/fame_hmm.py @@ -27,7 +27,8 @@ extract_features = 0 flat_start = 0 train_model_without_sp = 0 add_sp = 0 -train_model_with_sp = 1 +train_model_with_sp = 0 +train_model_with_sp_align_mlf = 1 @@ -75,6 +76,7 @@ if not os.path.exists(label_dir): ## training hcompv_scp_train = os.path.join(tmp_dir, 'train.scp') mlf_file_train = os.path.join(label_dir, 'train_phone.mlf') +mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf') ## train without sp niter_max = 10 @@ -102,7 +104,8 @@ if make_lexicon: # (1) Replace all tabs with single space; # (2) Put a '\' before any dictionary entry beginning with single quote #http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html - fame_functions.fix_single_quote(lexicon_htk) + print('>>> fixing the lexicon...') + fame_functions.fix_lexicon(lexicon_htk) print("elapsed time: {}".format(time.time() - timer_start)) @@ -269,11 +272,11 @@ if train_model_without_sp: fh.make_new_directory(modeln_dir) pyhtk.re_estimation( config_train, - os.path.join(modeln_dir_pre, 'macros'), os.path.join(modeln_dir_pre, hmmdefs_name), modeln_dir, hcompv_scp_train, phonelist_txt, - mlf_file=mlf_file_train) + mlf_file=mlf_file_train, + macros=os.path.join(modeln_dir_pre, 'macros')) print("elapsed time: {}".format(time.time() - timer_start)) @@ -321,7 +324,6 @@ if add_sp: ## ======================= train model with short pause ======================= if train_model_with_sp: print('==== train model with sp ====') - #for niter in range(niter_max+1, niter_max*2+1): for niter in range(20, 50): timer_start = time.time() hmm_n = 'iter' + str(niter) @@ -333,9 +335,31 @@ if train_model_with_sp: fh.make_new_directory(modeln_dir) pyhtk.re_estimation( config_train, - os.path.join(modeln_dir_pre, 'macros'), os.path.join(modeln_dir_pre, hmmdefs_name), modeln_dir, hcompv_scp_train, phonelist_txt, - mlf_file=mlf_file_train) + mlf_file=mlf_file_train, + macros=os.path.join(modeln_dir_pre, 'macros')) + print("elapsed time: {}".format(time.time() - timer_start)) + + +## ======================= train model with short pause ======================= +if train_model_with_sp_align_mlf: + print('==== train model with sp with align.mlf ====') + for niter in range(50, 60): + timer_start = time.time() + hmm_n = 'iter' + str(niter) + hmm_n_pre = 'iter' + str(niter-1) + modeln_dir = os.path.join(model1_dir, hmm_n) + modeln_dir_pre = os.path.join(model1_dir, hmm_n_pre) + + # re-estimation + fh.make_new_directory(modeln_dir) + pyhtk.re_estimation( + config_train, + os.path.join(modeln_dir_pre, hmmdefs_name), + modeln_dir, + hcompv_scp_train, phonelist_txt, + mlf_file=mlf_file_train_aligned, + macros=os.path.join(modeln_dir_pre, 'macros')) print("elapsed time: {}".format(time.time() - timer_start)) \ No newline at end of file diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py index 00c699c..c35a42f 100644 --- a/acoustic_model/htk_vs_kaldi.py +++ b/acoustic_model/htk_vs_kaldi.py @@ -11,6 +11,7 @@ import glob import numpy as np import pandas as pd +from collections import Counter #import matplotlib.pyplot as plt #from sklearn.metrics import confusion_matrix @@ -50,11 +51,14 @@ from htk import pyhtk #lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr') #lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk') -## procedure +# procedure +make_dic_file = 0 +make_HTK_files = 1 +extract_features = 0 #make_htk_dict_files = 0 #do_forced_alignment_htk = 0 #eval_forced_alignment_htk = 0 -#make_kaldi_data_files = 0 +make_kaldi_files = 0 #make_kaldi_lexicon_txt = 0 #load_forced_alignment_kaldi = 1 #eval_forced_alignment_kaldi = 1 @@ -66,13 +70,34 @@ from htk import pyhtk #sys.path.append(os.path.join(default.repo_dir, 'toolbox')) #from evaluation import plot_confusion_matrix -config_dir = os.path.join(default.htk_dir, 'config') -model_dir = os.path.join(default.htk_dir, 'model') -lattice_file = os.path.join(config_dir, 'stimmen.ltc') -#pyhtk.create_word_lattice_file( -# os.path.join(config_dir, 'stimmen.net'), -# lattice_file) -hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp') +## HTK related files. +config_dir = os.path.join(default.htk_dir, 'config') +model_dir = os.path.join(default.htk_dir, 'model') +feature_dir = os.path.join(default.htk_dir, 'mfc', 'stimmen') + +config_hcopy = os.path.join(config_dir, 'config.HCopy') + +# files to be made. +lattice_file = os.path.join(config_dir, 'stimmen.ltc') +phonelist_txt = os.path.join(config_dir, 'phonelist.txt') +stimmen_dic = os.path.join(default.htk_dir, 'lexicon', 'stimmen_recognition.dic') +hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hcopy.scp') +hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hvite.scp') +hresult_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_result.scp') + + +## Kaldi related files. +kaldi_data_dir = os.path.join(default.kaldi_dir, 'data') + +# files to be made. +wav_scp = os.path.join(kaldi_data_dir, 'test', 'wav.scp') +text_file = os.path.join(kaldi_data_dir, 'test', 'text') +utt2spk = os.path.join(kaldi_data_dir, 'test', 'utt2spk') +corpus_txt = os.path.join(kaldi_data_dir, 'local', 'corpus.txt') +lexicon_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'lexicon.txt') +nonsilence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'nonsilence_phones.txt') +silence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'silence_phones.txt') +optional_silence_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'optional_silence.txt') ## ======================= load test data ====================== @@ -85,392 +110,468 @@ df = stimmen_functions.add_row_htk(df) word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = sorted(word_list) -# pronunciation variants + +## ======================= make dic file to check pronunciation variants ====================== +# dic file should be manually modified depends on the task - recognition / forced-alignemnt. +if make_dic_file: + # for HTK. + with open(stimmen_dic, mode='wb') as f: + for word in word_list: + df_ = df[df['word']==word] + pronunciations = list(np.unique(df_['htk'])) + pronunciations_ = [word.upper() + ' sil ' + ' '.join(convert_phoneset.split_word( + htk, fame_asr.multi_character_phones_htk)) + ' sil' + for htk in pronunciations] + f.write(bytes('\n'.join(pronunciations_) + '\n', 'ascii')) + f.write(bytes('SILENCE sil\n', 'ascii')) + + # for Kaldi. + fh.make_new_directory(os.path.join(kaldi_data_dir, 'local', 'dict')) + with open(lexicon_txt, mode='wb') as f: + f.write(bytes('!SIL sil\n', 'utf-8')) + f.write(bytes(' spn\n', 'utf-8')) + for word in word_list: + df_ = df[df['word']==word] + pronunciations = list(np.unique(df_['asr'])) + pronunciations_ = [word.lower() + ' ' + ' '.join(convert_phoneset.split_word( + asr, fame_asr.multi_character_phones)) + for asr in pronunciations] + f.write(bytes('\n'.join(pronunciations_) + '\n', 'utf-8')) + + +## ======================= test data for recognition ====================== +# only target pronunciation variants. +df_rec = pd.DataFrame(index=[], columns=list(df.keys())) for word in word_list: - df_ = df[df['word']==word] - print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))) + variants = [htk.replace(' ', '') + for htk in stimmen_functions.load_pronunciations(word.upper(), stimmen_dic)] + df_ = df[df['word'] == word] + for index, row in df_.iterrows(): + if row['htk'] in variants: + df_rec = df_rec.append(row, ignore_index=True) -#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') -#output = pyhtk.recognition( -# os.path.join(default.htk_dir, 'config', 'config.rec', -# lattice_file, -# os.path.join(model_dir, 'hmm1', 'iter13'), -# dictionary_file, -# os.path.join(config_dir, 'phonelist.txt'), -# hvite_scp) +## ======================= make files required for HTK ====================== +if make_HTK_files: + # make a word lattice file. + pyhtk.create_word_lattice_file( + os.path.join(config_dir, 'stimmen.net'), + lattice_file) - #pyhtk.create_label_file( - # row['word'], - # os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) + # extract features. + with open(hcopy_scp, 'wb') as f: + filelist = [os.path.join(stimmen_test_dir, filename) + '\t' + + os.path.join(feature_dir, os.path.basename(filename).replace('.wav', '.mfc')) + for filename in df['filename']] + f.write(bytes('\n'.join(filelist), 'ascii')) + pyhtk.wav2mfc(config_hcopy, hcopy_scp) + + # make label files. + for index, row in df.iterrows(): + filename = row['filename'].replace('.wav', '.lab') + label_file = os.path.join(feature_dir, filename) + with open(label_file, 'wb') as f: + label_string = 'START\n' + row['word'].upper() + '\nEND\n' + f.write(bytes(label_string, 'ascii')) + + +## ======================= make files required for Kaldi ======================= +if make_kaldi_files: + fh.make_new_directory(os.path.join(kaldi_data_dir, 'test')) + fh.make_new_directory(os.path.join(kaldi_data_dir, 'test', 'local')) + fh.make_new_directory(os.path.join(kaldi_data_dir, 'conf')) + + # remove previous files. + if os.path.exists(wav_scp): + os.remove(wav_scp) + if os.path.exists(text_file): + os.remove(text_file) + if os.path.exists(utt2spk): + os.remove(utt2spk) + + f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n') + f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n') + f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n') + + # make wav.scp, text, and utt2spk files. + for i, row in df_rec.iterrows(): + filename = row['filename'] + print('=== {0}: {1} ==='.format(i, filename)) + + wav_file = os.path.join(stimmen_test_dir, filename) + #if os.path.exists(wav_file): + speaker_id = 'speaker_' + str(i).zfill(4) + utterance_id = filename.replace('.wav', '') + utterance_id = utterance_id.replace(' ', '_') + utterance_id = speaker_id + '-' + utterance_id + + # output + f_wav_scp.write('{0} {1}\n'.format( + utterance_id, + wav_file.replace('c:/', '/mnt/c/').replace('\\', '/'))) # convert path to unix format. + f_text_file.write('{0}\t{1}\n'.format(utterance_id, df_rec['word'][i].lower())) + f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id)) + + f_wav_scp.close() + f_text_file.close() + f_utt2spk.close() + + with open(corpus_txt, 'wb') as f: + f.write(bytes('\n'.join([word.lower() for word in word_list]) + '\n', 'utf-8')) + + with open(nonsilence_phones_txt, 'wb') as f: + f.write(bytes('\n'.join(fame_asr.phoneset_short) + '\n', 'utf-8')) + + with open(silence_phones_txt, 'wb') as f: + f.write(bytes('sil\nspn\n', 'utf-8')) + + with open(optional_silence_txt, 'wb') as f: + f.write(bytes('sil\n', 'utf-8')) + + with open(os.path.join(kaldi_data_dir, 'conf', 'decode.config'), 'wb') as f: + f.write(bytes('first_beam=10.0\n', 'utf-8')) + f.write(bytes('beam=13.0\n', 'utf-8')) + f.write(bytes('lattice_beam=6.0\n', 'utf-8')) + + with open(os.path.join(kaldi_data_dir, 'conf', 'mfcc.conf'), 'wb') as f: + f.write(bytes('--use-energy=false', 'utf-8')) + + +## ======================= recognition ====================== + +listdir = glob.glob(os.path.join(feature_dir, '*.mfc')) +with open(hvite_scp, 'wb') as f: + f.write(bytes('\n'.join(listdir), 'ascii')) + +with open(hresult_scp, 'wb') as f: + f.write(bytes('\n'.join(listdir).replace('.mfc', '.rec'), 'ascii')) + + +# calculate result +performance = np.zeros((1, 2)) +for niter in range(1, 50): + output = pyhtk.recognition( + os.path.join(config_dir, 'config.rec'), + lattice_file, + os.path.join(default.htk_dir, 'model', 'hmm1', 'iter' + str(niter), 'hmmdefs'), + stimmen_dic, phonelist_txt, hvite_scp) + + output = pyhtk.calc_recognition_performance( + stimmen_dic, hresult_scp) + per_sentence, per_word = pyhtk.load_recognition_output_all(output) + performance_ = np.array([niter, per_sentence['accuracy']]).reshape(1, 2) + performance = np.r_[performance, performance_] + print('{0}: {1}[%]'.format(niter, per_sentence['accuracy'])) -## ======================= make a HTK dic file ====================== -#if make_htk_dic_file: -# output_type = 3 -dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic') -#for word in word_list: -word = word_list[2] -# pronunciation variant of the target word. -pronunciations = df_test['asr'][df_test['word'].str.match(word)] - # make dic file. - #am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type) - ## ======================= forced alignment using HTK ======================= if do_forced_alignment_htk: - - #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: - for hmm_num in [256, 512, 1024]: - hmm_num_str = str(hmm_num) - acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs') + + #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: + for hmm_num in [256, 512, 1024]: + hmm_num_str = str(hmm_num) + acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs') - predictions = pd.DataFrame({'filename': [''], - 'word': [''], - 'xsampa': [''], - 'ipa': [''], - 'famehtk': [''], - 'prediction': ['']}) - for i, filename in enumerate(df['filename']): - print('=== {0}/{1} ==='.format(i, len(df))) - if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): - wav_file = os.path.join(wav_dir, filename) - if os.path.exists(wav_file): - word = df['word'][i] - WORD = word.upper() - fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str) - - #if not os.path.exists(fa_file): - # make label file. - label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab')) - with open(label_file, 'w') as f: - lines = f.write(WORD) + predictions = pd.DataFrame({'filename': [''], + 'word': [''], + 'xsampa': [''], + 'ipa': [''], + 'famehtk': [''], + 'prediction': ['']}) + for i, filename in enumerate(df['filename']): + print('=== {0}/{1} ==='.format(i, len(df))) + if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): + wav_file = os.path.join(wav_dir, filename) + if os.path.exists(wav_file): + word = df['word'][i] + WORD = word.upper() + fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str) + + #if not os.path.exists(fa_file): + # make label file. + label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab')) + with open(label_file, 'w') as f: + lines = f.write(WORD) - htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') + htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') - pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, - default.phonelist, acoustic_model) - os.remove(label_file) + pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, + default.phonelist, acoustic_model) + os.remove(label_file) - prediction = am_func.read_fileFA(fa_file) + prediction = am_func.read_fileFA(fa_file) - print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction)) - else: - prediction = '' - print('!!!!! file not found.') + print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction)) + else: + prediction = '' + print('!!!!! file not found.') - line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i) - predictions = predictions.append(line) - else: - prediction = '' - print('!!!!! invalid entry.') + line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i) + predictions = predictions.append(line) + else: + prediction = '' + print('!!!!! invalid entry.') - predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) + predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) -## ======================= make files which is used for forced alignment by Kaldi ======================= -if make_kaldi_data_files: - wav_scp = os.path.join(kaldi_data_dir, 'wav.scp') - text_file = os.path.join(kaldi_data_dir, 'text') - utt2spk = os.path.join(kaldi_data_dir, 'utt2spk') - - # remove previous files. - if os.path.exists(wav_scp): - os.remove(wav_scp) - if os.path.exists(text_file): - os.remove(text_file) - if os.path.exists(utt2spk): - os.remove(utt2spk) - - f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n') - f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n') - f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n') - - # make wav.scp, text, and utt2spk files. - for i in df.index: - filename = df['filename'][i] - print('=== {0}: {1} ==='.format(i, filename)) - - #if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): - wav_file = os.path.join(wav_dir, filename) - if os.path.exists(wav_file): - speaker_id = 'speaker_' + str(i).zfill(4) - utterance_id = filename.replace('.wav', '') - utterance_id = utterance_id.replace(' ', '_') - utterance_id = speaker_id + '-' + utterance_id - - # wav.scp file - wav_file_unix = wav_file.replace('\\', '/') - wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/') - - f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix)) - - # text file - word = df['word'][i].lower() - f_text_file.write('{0}\t{1}\n'.format(utterance_id, word)) - - # utt2spk - f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id)) - - f_wav_scp.close() - f_text_file.close() - f_utt2spk.close() ## ======================= make lexicon txt which is used by Kaldi ======================= if make_kaldi_lexicon_txt: - option_num = 6 + option_num = 6 - # remove previous file. - if os.path.exists(lexicon_txt): - os.remove(lexicon_txt) - lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt') - if os.path.exists(lexiconp_txt): - os.remove(lexiconp_txt) - - # output lexicon.txt - f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n') - pronvar_list_all = [] - for word in word_list: + # remove previous file. + if os.path.exists(lexicon_txt): + os.remove(lexicon_txt) + lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt') + if os.path.exists(lexiconp_txt): + os.remove(lexiconp_txt) + + # output lexicon.txt + f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n') + pronvar_list_all = [] + for word in word_list: - # pronunciation variant of the target word. - pronunciation_variants = df['ipa'][df['word'].str.match(word)] + # pronunciation variant of the target word. + pronunciation_variants = df['ipa'][df['word'].str.match(word)] - c = Counter(pronunciation_variants) - total_num = sum(c.values()) + c = Counter(pronunciation_variants) + total_num = sum(c.values()) - #with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f: - # for key in c.keys(): - # f.write("{0},{1}\n".format(key,c[key])) + #with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f: + # for key in c.keys(): + # f.write("{0},{1}\n".format(key,c[key])) - for key, value in c.most_common(option_num): - # make possible pronunciation variant list. - pronvar_list = am_func.fame_pronunciation_variant(key) + for key, value in c.most_common(option_num): + # make possible pronunciation variant list. + pronvar_list = am_func.fame_pronunciation_variant(key) - for pronvar_ in pronvar_list: - split_ipa = convert_phone_set.split_fame_ipa(pronvar_) - pronvar_out = ' '.join(split_ipa) - pronvar_list_all.append([word, pronvar_out]) + for pronvar_ in pronvar_list: + split_ipa = convert_phone_set.split_fame_ipa(pronvar_) + pronvar_out = ' '.join(split_ipa) + pronvar_list_all.append([word, pronvar_out]) - pronvar_list_all = np.array(pronvar_list_all) - pronvar_list_all = np.unique(pronvar_list_all, axis=0) + pronvar_list_all = np.array(pronvar_list_all) + pronvar_list_all = np.unique(pronvar_list_all, axis=0) - - # output - f_lexicon_txt.write('\tSPN\n') - for line in pronvar_list_all: - f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1])) + + # output + f_lexicon_txt.write('\tSPN\n') + for line in pronvar_list_all: + f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1])) - f_lexicon_txt.close() + f_lexicon_txt.close() ## ======================= load kaldi forced alignment result ======================= if load_forced_alignment_kaldi: - phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt') - merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt') - - #filenames = np.load(data_dir + '\\filenames.npy') - #words = np.load(data_dir + '\\words.npy') - #pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy') - #pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy') - #word_list = np.unique(words) + phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt') + merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt') + + #filenames = np.load(data_dir + '\\filenames.npy') + #words = np.load(data_dir + '\\words.npy') + #pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy') + #pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy') + #word_list = np.unique(words) - # load the mapping between phones and ids. - with open(phones_txt, 'r', encoding="utf-8") as f: - mapping_phone2id = f.read().split('\n') + # load the mapping between phones and ids. + with open(phones_txt, 'r', encoding="utf-8") as f: + mapping_phone2id = f.read().split('\n') - phones = [] - phone_ids = [] # ID of phones - for m in mapping_phone2id: - m = m.split(' ') - if len(m) > 1: - phones.append(m[0]) - phone_ids.append(int(m[1])) + phones = [] + phone_ids = [] # ID of phones + for m in mapping_phone2id: + m = m.split(' ') + if len(m) > 1: + phones.append(m[0]) + phone_ids.append(int(m[1])) - # load the result of FA. - with open(merged_alignment_txt, 'r') as f: - lines = f.read() - lines = lines.split('\n') + # load the result of FA. + with open(merged_alignment_txt, 'r') as f: + lines = f.read() + lines = lines.split('\n') - predictions = pd.DataFrame({'filename': [''], - 'word': [''], - 'xsampa': [''], - 'ipa': [''], - 'famehtk': [''], - 'prediction': ['']}) - #fa_filenames = [] - #fa_pronunciations = [] - utterance_id_ = '' - pronunciation = [] - for line in lines: - line = line.split(' ') - if len(line) == 5: - utterance_id = line[0] - if utterance_id == utterance_id_: - phone_id = int(line[4]) - #if not phone_id == 1: - phone_ = phones[phone_ids.index(phone_id)] - phone = re.sub(r'_[A-Z]', '', phone_) - if not phone == 'SIL': - pronunciation.append(phone) - else: - filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_) - prediction = ''.join(pronunciation) - df_ = df[df['filename'].str.match(filename)] - df_idx = df_.index[0] - prediction_ = pd.Series([#filename, - #df_['word'][df_idx], - #df_['xsampa'][df_idx], - #df_['ipa'][df_idx], - #df_['famehtk'][df_idx], - df_.iloc[0,1], - df_.iloc[0,3], - df_.iloc[0,4], - df_.iloc[0,2], - df_.iloc[0,0], - prediction], - index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], - name=df_idx) - predictions = predictions.append(prediction_) - #fa_filenames.append() - #fa_pronunciations.append(' '.join(pronunciation)) - pronunciation = [] + predictions = pd.DataFrame({'filename': [''], + 'word': [''], + 'xsampa': [''], + 'ipa': [''], + 'famehtk': [''], + 'prediction': ['']}) + #fa_filenames = [] + #fa_pronunciations = [] + utterance_id_ = '' + pronunciation = [] + for line in lines: + line = line.split(' ') + if len(line) == 5: + utterance_id = line[0] + if utterance_id == utterance_id_: + phone_id = int(line[4]) + #if not phone_id == 1: + phone_ = phones[phone_ids.index(phone_id)] + phone = re.sub(r'_[A-Z]', '', phone_) + if not phone == 'SIL': + pronunciation.append(phone) + else: + filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_) + prediction = ''.join(pronunciation) + df_ = df[df['filename'].str.match(filename)] + df_idx = df_.index[0] + prediction_ = pd.Series([#filename, + #df_['word'][df_idx], + #df_['xsampa'][df_idx], + #df_['ipa'][df_idx], + #df_['famehtk'][df_idx], + df_.iloc[0,1], + df_.iloc[0,3], + df_.iloc[0,4], + df_.iloc[0,2], + df_.iloc[0,0], + prediction], + index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], + name=df_idx) + predictions = predictions.append(prediction_) + #fa_filenames.append() + #fa_pronunciations.append(' '.join(pronunciation)) + pronunciation = [] - utterance_id_ = utterance_id - predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) + utterance_id_ = utterance_id + predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) ## ======================= evaluate the result of forced alignment ======================= if eval_forced_alignment_htk: - htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short') + htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short') - compare_hmm_num = 1 + compare_hmm_num = 1 - if compare_hmm_num: - f_result = open(os.path.join(result_dir, 'result.csv'), 'w') - f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n") + if compare_hmm_num: + f_result = open(os.path.join(result_dir, 'result.csv'), 'w') + f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n") - for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: - #for hmm_num in [256]: - hmm_num_str = str(hmm_num) - if compare_hmm_num: - f_result.write("{},".format(hmm_num_str)) + for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: + #for hmm_num in [256]: + hmm_num_str = str(hmm_num) + if compare_hmm_num: + f_result.write("{},".format(hmm_num_str)) - #match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy') - #prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy')) - #prediction = pd.Series(prediction, index=df.index, name='prediction') - #result = pd.concat([df, prediction], axis=1) - result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) + #match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy') + #prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy')) + #prediction = pd.Series(prediction, index=df.index, name='prediction') + #result = pd.concat([df, prediction], axis=1) + result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) - # load pronunciation variants - for word in word_list: - htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') - with open(htk_dict_file, 'r') as f: - lines = f.read().split('\n')[:-1] - pronunciation_variants = [line.split('\t')[1] for line in lines] + # load pronunciation variants + for word in word_list: + htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') + with open(htk_dict_file, 'r') as f: + lines = f.read().split('\n')[:-1] + pronunciation_variants = [line.split('\t')[1] for line in lines] - # see only words which appears in top 3. - result_ = result[result['word'].str.match(word)] - result_ = result_[result_['famehtk'].isin(pronunciation_variants)] + # see only words which appears in top 3. + result_ = result[result['word'].str.match(word)] + result_ = result_[result_['famehtk'].isin(pronunciation_variants)] - match_num = sum(result_['famehtk'] == result_['prediction']) - total_num = len(result_) + match_num = sum(result_['famehtk'] == result_['prediction']) + total_num = len(result_) - print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100)) - if compare_hmm_num: - f_result.write("{0},{1},".format(match_num, total_num)) - else: - # output confusion matrix - cm = confusion_matrix(result_['famehtk'], result_['prediction']) + print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100)) + if compare_hmm_num: + f_result.write("{0},{1},".format(match_num, total_num)) + else: + # output confusion matrix + cm = confusion_matrix(result_['famehtk'], result_['prediction']) - plt.figure() - plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) - plt.savefig(result_dir + '\\cm_' + word + '.png') + plt.figure() + plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) + plt.savefig(result_dir + '\\cm_' + word + '.png') - if compare_hmm_num: - f_result.write('\n') + if compare_hmm_num: + f_result.write('\n') - if compare_hmm_num: - f_result.close() + if compare_hmm_num: + f_result.close() ## ======================= evaluate the result of forced alignment of kaldi ======================= if eval_forced_alignment_kaldi: - result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) + result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) - f_result = open(os.path.join(result_dir, 'result.csv'), 'w') - f_result.write("word,total,valid,match,[%]\n") + f_result = open(os.path.join(result_dir, 'result.csv'), 'w') + f_result.write("word,total,valid,match,[%]\n") - # load pronunciation variants - with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f: - lines = f.read().split('\n')[:-1] - pronunciation_variants_all = [line.split('\t') for line in lines] + # load pronunciation variants + with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f: + lines = f.read().split('\n')[:-1] + pronunciation_variants_all = [line.split('\t') for line in lines] - word_list = np.delete(word_list, [0], 0) # remove 'Oog' - for word in word_list: + word_list = np.delete(word_list, [0], 0) # remove 'Oog' + for word in word_list: - # load pronunciation variant of the word. - pronunciation_variants = [] - for line in pronunciation_variants_all: - if line[0] == word.lower(): - pronunciation_variants.append(line[1].replace(' ', '')) + # load pronunciation variant of the word. + pronunciation_variants = [] + for line in pronunciation_variants_all: + if line[0] == word.lower(): + pronunciation_variants.append(line[1].replace(' ', '')) - # see only words which appears in top 3. - result_ = result[result['word'].str.match(word)] - result_tolerant = pd.DataFrame({ - 'filename': [''], - 'word': [''], - 'xsampa': [''], - 'ipa': [''], - 'prediction': [''], - 'match': ['']}) + # see only words which appears in top 3. + result_ = result[result['word'].str.match(word)] + result_tolerant = pd.DataFrame({ + 'filename': [''], + 'word': [''], + 'xsampa': [''], + 'ipa': [''], + 'prediction': [''], + 'match': ['']}) - for i in range(0, len(result_)): - line = result_.iloc[i] + for i in range(0, len(result_)): + line = result_.iloc[i] - # make a list of all possible pronunciation variants of ipa description. - # i.e. possible answers from forced alignment. - ipa = line['ipa'] - pronvar_list = [ipa] - pronvar_list_ = am_func.fame_pronunciation_variant(ipa) - if not pronvar_list_ is None: - pronvar_list += list(pronvar_list_) + # make a list of all possible pronunciation variants of ipa description. + # i.e. possible answers from forced alignment. + ipa = line['ipa'] + pronvar_list = [ipa] + pronvar_list_ = am_func.fame_pronunciation_variant(ipa) + if not pronvar_list_ is None: + pronvar_list += list(pronvar_list_) - # only focus on pronunciations which can be estimated from ipa. - if len(set(pronvar_list) & set(pronunciation_variants)) > 0: - if line['prediction'] in pronvar_list: - ismatch = True - else: - ismatch = False + # only focus on pronunciations which can be estimated from ipa. + if len(set(pronvar_list) & set(pronunciation_variants)) > 0: + if line['prediction'] in pronvar_list: + ismatch = True + else: + ismatch = False - line_df = pd.DataFrame(result_.iloc[i]).T - df_idx = line_df.index[0] - result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'], - line_df.loc[df_idx, 'word'], - line_df.loc[df_idx, 'xsampa'], - line_df.loc[df_idx, 'ipa'], - line_df.loc[df_idx, 'prediction'], - ismatch], - index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'], - name=df_idx) - result_tolerant = result_tolerant.append(result_tolerant_) - # remove the first entry (dummy) - result_tolerant = result_tolerant.drop(0, axis=0) + line_df = pd.DataFrame(result_.iloc[i]).T + df_idx = line_df.index[0] + result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'], + line_df.loc[df_idx, 'word'], + line_df.loc[df_idx, 'xsampa'], + line_df.loc[df_idx, 'ipa'], + line_df.loc[df_idx, 'prediction'], + ismatch], + index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'], + name=df_idx) + result_tolerant = result_tolerant.append(result_tolerant_) + # remove the first entry (dummy) + result_tolerant = result_tolerant.drop(0, axis=0) - total_num = len(result_) - valid_num = len(result_tolerant) - match_num = np.sum(result_tolerant['match']) + total_num = len(result_) + valid_num = len(result_tolerant) + match_num = np.sum(result_tolerant['match']) - print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num)) - f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100)) + print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num)) + f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100)) - f_result.close() - ## output confusion matrix - #cm = confusion_matrix(result_['ipa'], result_['prediction']) + f_result.close() + ## output confusion matrix + #cm = confusion_matrix(result_['ipa'], result_['prediction']) - #plt.figure() - #plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) - #plt.savefig(result_dir + '\\cm_' + word + '.png') + #plt.figure() + #plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) + #plt.savefig(result_dir + '\\cm_' + word + '.png') diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py index b11359b..6165d5c 100644 --- a/acoustic_model/phoneset/fame_asr.py +++ b/acoustic_model/phoneset/fame_asr.py @@ -68,14 +68,21 @@ phoneset = [ # the phones which seldom occur are replaced with another more popular phones. # replacements are based on the advice from Martijn Wieling. reduction_key = { - 'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g' + 'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g', + # aki added because this is used in stimmen_project. + 'ɔ̈:':'ɔ:' } # already removed beforehand in phoneset. Just to be sure. -phones_to_be_removed = ['ú', 's:', 'ɔ̈:'] +phones_to_be_removed = ['ú', 's:'] def phone_reduction(phones): + """ + Args: + phones (list): list of phones. + """ return [reduction_key.get(i, i) for i in phones if not i in phones_to_be_removed] + phoneset_short = list(set(phone_reduction(phoneset))) phoneset_short.sort() @@ -96,7 +103,8 @@ translation_key_asr2htk = { 'ŋ': 'ng', # refer to Xsampa. - 'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe', + 'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe', + #'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen. 'ɛ': 'E', 'ɛ:': 'E:', 'ɪ': 'I', 'ɪ:': 'I:', diff --git a/acoustic_model/stimmen_functions.py b/acoustic_model/stimmen_functions.py index a272d42..cfdac62 100644 --- a/acoustic_model/stimmen_functions.py +++ b/acoustic_model/stimmen_functions.py @@ -81,3 +81,25 @@ def add_row_asr(df): for index, row in df.iterrows(): asr.append(fame_functions.ipa2asr(row['ipa'])) return df.assign(asr=asr) + + +def load_pronunciations(WORD, htk_dic): + """ load pronunciation variants from HTK dic file. + + Args: + WORD (str): word in capital letters. + htk_dic (path): HTK dict file. + + Returns: + (pronunciations) (list): pronunciation variants of WORD. + + Notes: + Because this function loads all contents from htk_dic file, + it is not recommended to use for large lexicon. + + """ + with open(htk_dic) as f: + lines = f.read().replace(' sil', '') + lines = lines.split('\n') + return [' '.join(line.split(' ')[1:]) + for line in lines if line.split(' ')[0]==WORD] \ No newline at end of file diff --git a/acoustic_model/stimmen_test.py b/acoustic_model/stimmen_test.py index 60e96eb..93546ca 100644 --- a/acoustic_model/stimmen_test.py +++ b/acoustic_model/stimmen_test.py @@ -2,8 +2,9 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') import sys import shutil +from collections import Counter -#import numpy as np +import numpy as np import pandas as pd import defaultfiles as default @@ -62,3 +63,18 @@ for ipa in df['ipa']: if ':' in ipa_splitted: print(ipa_splitted) + +## check pronunciation variants +df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir) +df_clean = stimmen_functions.add_row_asr(df_clean) +df_clean = stimmen_functions.add_row_htk(df_clean) + +for word in word_list: +#word = word_list[1] + df_ = df_clean[df_clean['word']==word] + c = Counter(df_['htk']) + pronunciations = dict() + for key, value in zip(c.keys(), c.values()): + if value > 3: + pronunciations[key] = value + print(pronunciations) \ No newline at end of file