diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 3ce8f85..45e0fe2 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc index 869323d..a74cd44 100644 Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj index 8faedc8..5319301 100644 --- a/acoustic_model/acoustic_model.pyproj +++ b/acoustic_model/acoustic_model.pyproj @@ -4,8 +4,7 @@ 2.0 4d8c8573-32f0-4a62-9e62-3ce5cc680390 . - - + fame_hmm.py . diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py index 7c4a8cf..b10d247 100644 --- a/acoustic_model/defaultfiles.py +++ b/acoustic_model/defaultfiles.py @@ -39,11 +39,11 @@ toolbox_dir = os.path.join(repo_dir, 'toolbox') #config_hvite = os.path.join(htk_config_dir, 'config.HVite') #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo') #acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo' -#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt') +phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt') WSL_dir = r'C:\OneDrive\WSL' #fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame') -fame_dir = r'd:\_corpus\fame' +fame_dir = r'c:\OneDrive\Research\rug\_data\FAME' fame_s5_dir = os.path.join(fame_dir, 's5') fame_corpus_dir = os.path.join(fame_dir, 'corpus') diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index 5fe60e5..cb87620 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -290,15 +290,17 @@ def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk): """ lex_asr = load_lexicon(lexicon_file_asr) + def word2htk_(row): + return word2htk(row['word']) def asr2htk_space_delimited_(row): return asr2htk_space_delimited(row['pronunciation']) lex_htk = pd.DataFrame({ - 'word': lex_asr['word'], + 'word': lex_asr.apply(word2htk_, axis=1).str.upper(), 'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1) }) lex_htk = lex_htk.ix[:, ['word', 'pronunciation']] - lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t') + lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8') return @@ -316,20 +318,26 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out): lex2 = load_lexicon(lexicon_file2) lex = pd.concat([lex1, lex2]) lex = lex.sort_values(by='word', ascending=True) - lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t') + lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8') def fix_single_quote(lexicon_file): """ add '\' before all single quote at the beginning of words. + convert special characters to ascii compatible characters. Args: lexicon_file (path): lexicon file, which will be overwitten. """ lex = load_lexicon(lexicon_file) + lex = lex.dropna() # remove N/A. for i in lex[lex['word'].str.startswith('\'')].index.values: lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'') - # to_csv does not work with space seperator. therefore all tabs should manually be replaced. 
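The reworked lexicon_asr2htk above now pushes every orthographic word through word2htk and upper-cases it before writing the HTK lexicon. A compact sketch of that flow is shown here with toy stand-in translation tables (the real ones live in phoneset/fame_asr.py); note that DataFrame.ix, used in the hunk above, is deprecated in newer pandas, so the sketch selects columns with .loc instead.

import pandas as pd

# hypothetical stand-ins; the real tables are defined in phoneset/fame_asr.py
translation_key_asr2htk = {'ü': 'ue'}      # toy ASR-phone -> HTK-phone mapping
translation_key_word2htk = {'ú': 'u1'}     # subset of translation_key_word2htk further down this diff

def asr2htk_space_delimited(pronunciation):
    # map each space-delimited ASR phone to its HTK-safe symbol
    return ' '.join(translation_key_asr2htk.get(p, p) for p in pronunciation.split(' '))

def word2htk(word):
    # map accented characters in the orthographic word to HTK-safe replacements
    return ''.join(translation_key_word2htk.get(c, c) for c in word)

lex_asr = pd.DataFrame({'word': ['brúntsje'], 'pronunciation': ['b r ü n t s j e']})
lex_htk = pd.DataFrame({
    'word': lex_asr['word'].apply(word2htk).str.upper(),
    'pronunciation': lex_asr['pronunciation'].apply(asr2htk_space_delimited),
}).loc[:, ['word', 'pronunciation']]
lex_htk.to_csv('lex.htk', header=None, index=None, sep='\t', encoding='utf-8')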
- #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\') - lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t') + # to_csv does not work with space seperator. therefore all tabs should manually be replaced. + #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\') + lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8') return + + +def word2htk(word): + return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word]) diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py index ba2732c..9ce920b 100644 --- a/acoustic_model/fame_hmm.py +++ b/acoustic_model/fame_hmm.py @@ -3,6 +3,7 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') import tempfile +import shutil #import configparser #import subprocess import time @@ -11,6 +12,7 @@ import numpy as np import pandas as pd import fame_functions +from phoneset import fame_ipa, fame_asr import defaultfiles as default sys.path.append(default.toolbox_dir) import file_handling as fh @@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train'] # procedure extract_features = 0 -make_lexicon = 0 +make_lexicon = 1 make_mlf = 0 combine_files = 0 flat_start = 0 @@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr') lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov') lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk') +global_ded = os.path.join(default.htk_dir, 'config', 'global.ded') + + #hcompv_scp = output_dir + '\\scp\\combined.scp' #combined_mlf = output_dir + '\\label\\combined.mlf' @@ -60,14 +65,17 @@ if not os.path.exists(feature_dir): tmp_dir = os.path.join(default.htk_dir, 'tmp') if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) +label_dir = os.path.join(default.htk_dir, 'label') +if not os.path.exists(label_dir): + os.makedirs(label_dir) + ## ======================= extract features ======================= if extract_features: - print('==== extract features ====\n') - + for dataset in dataset_list: - print('==== dataset: {} ===='.format(dataset)) + print('==== extract features on dataset {} ====\n'.format(dataset)) # a script file for HCopy print(">>> making a script file for HCopy... 
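The new word2htk helper added above replaces, character by character, everything that HTK tools cannot handle. A small usage sketch, using a subset of the translation_key_word2htk mapping defined in phoneset/fame_asr.py later in this diff:

# subset of translation_key_word2htk from phoneset/fame_asr.py
translation_key_word2htk = {
    '\'': '\\\'',
    'ú': 'u1', 'é': 'e1',
    'â': 'a3', 'ê': 'e3',
    'ä': 'ao', 'ü': 'ue',
}

def word2htk(word):
    # unknown characters pass through unchanged
    return ''.join(translation_key_word2htk.get(c, c) for c in word)

print(word2htk('brúntsje'))   # -> bru1ntsje
print(word2htk('\'t'))        # -> \'t  (leading quote escaped, as in fix_single_quote)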
\n") @@ -89,6 +97,8 @@ if extract_features: hcompv_scp = os.path.join(tmp_dir, dataset + '.scp') fh.make_filelist(feature_dir_, hcompv_scp, '.mfc') + os.remove(hcopy_scp.name) + ## ======================= make lexicon for HTK ======================= if make_lexicon: @@ -114,94 +124,132 @@ if make_lexicon: fame_functions.fix_single_quote(lexicon_htk) +## ======================= make phonelist ======================= +#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt') +#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt) +#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser' +#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt') +#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic') +#pyhtk.create_dictionary( +# sentence, global_ded, log_txt, dictionary_file, lexicon_htk) +#pyhtk.create_dictionary_without_log( +# sentence, global_ded, dictionary_file, lexicon_htk) + + ## ======================= make label file ======================= if make_mlf: - print("==== make mlf ====\n") - - print("generating word level transcription...\n") for dataset in dataset_list: - hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp' - hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp' - script_list = FAME_dir + '\\data\\' + dataset + '\\text' - mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf' - mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf' + timer_start = time.time() + print("==== generating word level transcription on dataset {}\n".format(dataset)) - # lexicon - lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation']) - - # list of features - with open(hcompv_scp) as fin: - features = fin.read() - features = features.split('\n') + #hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp' + #hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp' + script_list = os.path.join(default.fame_dir, 'data', dataset, 'text') + #mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf' + #mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf' + wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset) + dictionary_file = os.path.join(wav_dir, 'temp.dic') # list of scripts with open(script_list, "rt", encoding="utf-8") as fin: - scripts = fin.read() - scripts = pd.Series(scripts.split('\n')) + scripts = fin.read().split('\n') - i = 0 - missing_words = [] - fscp = open(hcompv_scp2, 'wt') - fmlf = open(mlf_word, "wt", encoding="utf-8") - fmlf.write("#!MLF!#\n") - feature_nr = 1 - for feature in features: - sys.stdout.write("\r%d/%d" % (feature_nr, len(features))) - sys.stdout.flush() - feature_nr += 1 - file_basename = os.path.basename(feature).replace('.mfc', '') + for line in scripts: + #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']: + # sample line: + # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik + filename_ = line.split(' ')[0] + filename = '_'.join(filename_.split('_')[1:]) + sentence = ' '.join(line.split(' ')[1:]) - # get words from scripts. 
- try: - script = scripts[scripts.str.contains(file_basename)] - except IndexError: - script = [] + wav_file = os.path.join(wav_dir, filename + '.wav') + if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0: + try: + sentence_ascii = bytes(sentence, 'ascii') + except UnicodeEncodeError: + print(sentence) + #if os.path.exists(wav_file): + # #dictionary_file = os.path.join(wav_dir, filename + '.dic') + # if pyhtk.create_dictionary_without_log( + # sentence, global_ded, dictionary_file, lexicon_htk) == 0: + # # when the file name is too long, HDMan command does not work. + # # therefore first temporary dictionary_file is made, then renamed. + # shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic')) + # label_file = os.path.join(wav_dir, filename + '.lab') + # pyhtk.create_label_file(sentence, label_file) + # else: + # os.remove(dictionary_file) + print("elapsed time: {}".format(time.time() - timer_start)) + # lexicon + #lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation']) - if len(script) != 0: - script_id = script.index[0] - script_txt = script.get(script_id) - script_words = script_txt.split(' ') - del script_words[0] + # list of features + #with open(hcompv_scp) as fin: + # features = fin.read() + # features = features.split('\n') + #i = 0 + #missing_words = [] + #fscp = open(hcompv_scp2, 'wt') + #fmlf = open(mlf_word, "wt", encoding="utf-8") + #fmlf.write("#!MLF!#\n") + #feature_nr = 1 + #for feature in features: + # sys.stdout.write("\r%d/%d" % (feature_nr, len(features))) + # sys.stdout.flush() + # feature_nr += 1 + # file_basename = os.path.basename(feature).replace('.mfc', '') + + # # get words from scripts. + # try: + # script = scripts[scripts.str.contains(file_basename)] + # except IndexError: + # script = [] + + # if len(script) != 0: + # script_id = script.index[0] + # script_txt = script.get(script_id) + # script_words = script_txt.split(' ') + # del script_words[0] # check if all words can be found in the lexicon. - SCRIPT_WORDS = [] - script_prons = [] - is_in_lexicon = 1 - for word in script_words: - WORD = word.upper() - SCRIPT_WORDS.append(WORD) - extracted = lexicon_htk[lexicon_htk['word']==WORD] - if len(extracted) == 0: - missing_words.append(word) - script_prons.append(extracted) - is_in_lexicon *= len(extracted) + # SCRIPT_WORDS = [] + # script_prons = [] + # is_in_lexicon = 1 + # for word in script_words: + # WORD = word.upper() + # SCRIPT_WORDS.append(WORD) + # extracted = lexicon_htk[lexicon_htk['word']==WORD] + # if len(extracted) == 0: + # missing_words.append(word) + # script_prons.append(extracted) + # is_in_lexicon *= len(extracted) # if all pronunciations are found in the lexicon, update scp and mlf files. - if is_in_lexicon: + # if is_in_lexicon: # add the feature filename into the .scp file. - fscp.write("{}\n".format(feature)) - i += 1 + # fscp.write("{}\n".format(feature)) + # i += 1 # add the words to the mlf file. 
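The block disabled above checked every word of a transcription against the HTK lexicon before the utterance was admitted to the .scp and .mlf files. A compact sketch of that membership check with toy data:

import pandas as pd

# toy lexicon in the lex.htk layout (word TAB pronunciation), words upper-cased
lexicon_htk = pd.DataFrame({'word': ['EN', 'DAN'], 'pronunciation': ['e n', 'd a n']})

script_words = ['en', 'dan', 'begjinne']
missing_words = []
is_in_lexicon = True
for word in script_words:
    WORD = word.upper()
    extracted = lexicon_htk[lexicon_htk['word'] == WORD]
    if len(extracted) == 0:
        missing_words.append(word)
        is_in_lexicon = False

# only utterances whose words are all in the lexicon were written to the .scp/.mlf
print(is_in_lexicon, missing_words)   # False ['begjinne']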
- fmlf.write('\"*/{}.lab\"\n'.format(file_basename)) + # fmlf.write('\"*/{}.lab\"\n'.format(file_basename)) #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS))) - for word_ in SCRIPT_WORDS: - if word_[0] == '\'': - word_ = '\\' + word_ - fmlf.write('{}\n'.format(word_)) - fmlf.write('.\n') - print("\n{0} has {1} samples.\n".format(dataset, i)) - np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words) + # for word_ in SCRIPT_WORDS: + # if word_[0] == '\'': + # word_ = '\\' + word_ + # fmlf.write('{}\n'.format(word_)) + # fmlf.write('.\n') + # print("\n{0} has {1} samples.\n".format(dataset, i)) + # np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words) - fscp.close() - fmlf.close() + # fscp.close() + # fmlf.close() ## generate phone level transcription - print("generating phone level transcription...\n") - mkphones = output_dir + '\\label\\mkphones0.txt' - subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word - subprocess.call(subprocessStr, shell=True) + # print("generating phone level transcription...\n") + # mkphones = output_dir + '\\label\\mkphones0.txt' + # subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word + # subprocess.call(subprocessStr, shell=True) ## ======================= combined scps and mlfs ======================= diff --git a/acoustic_model/fame_test.py b/acoustic_model/fame_test.py index d330e7f..c7b2e59 100644 --- a/acoustic_model/fame_test.py +++ b/acoustic_model/fame_test.py @@ -3,6 +3,7 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') from collections import Counter import time +import re import numpy as np import pandas as pd @@ -82,22 +83,52 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr) ## check if all the phones in lexicon.htk are in fame_asr.py. -timer_start = time.time() -phoneset_htk = fame_asr.phoneset_htk -phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk) -phoneset_lex.remove('') -print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format( - set(phoneset_htk) - set(phoneset_lex))) -print("elapsed time: {}".format(time.time() - timer_start)) +#timer_start = time.time() +#phoneset_htk = fame_asr.phoneset_htk +#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk) +#phoneset_lex.remove('') +#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format( +# set(phoneset_htk) - set(phoneset_lex))) +#print("elapsed time: {}".format(time.time() - timer_start)) -# statistics over the lexicon -lex_htk = fame_functions.load_lexicon(lexicon_htk) -phones_all = (' '.join(lex_htk['pronunciation'])).split(' ') -c = Counter(phones_all) +## statistics over the lexicon +#lex_htk = fame_functions.load_lexicon(lexicon_htk) +#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ') +#c = Counter(phones_all) + +#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2' +#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values: +# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'') +## to_csv does not work with space seperator. therefore all tabs should manually be replaced. +##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\') +#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t') + + +## check which letters are not coded in ascii. 
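For reference, the disabled writer above produced HTK word-level master label files. A minimal sketch of the MLF layout it emitted (the utterance id and words here are illustrative):

# minimal word-level MLF writer matching the disabled code above (toy data)
entries = {
    'sp0457m_test_1968_plakkenfryslanterhorne_2168': ['EN', 'DAN', 'BEGJINNE'],
}

with open('test_word.mlf', 'wt', encoding='utf-8') as fmlf:
    fmlf.write('#!MLF!#\n')
    for file_basename, words in entries.items():
        fmlf.write('"*/{}.lab"\n'.format(file_basename))
        for word_ in words:
            if word_[0] == '\'':
                word_ = '\\' + word_      # escape a leading single quote
            fmlf.write('{}\n'.format(word_))
        fmlf.write('.\n')                 # '.' closes each utterance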
+print('asr phones which cannot be coded in ascii:\n') +for i in fame_asr.phoneset_short: + try: + i_encoded = i.encode("ascii") + #print("{0} --> {1}".format(i, i.encode("ascii"))) + except UnicodeEncodeError: + print(">>> {}".format(i)) + +print("letters in the scripts which is not coded in ascii:\n") +for dataset in ['train', 'devel', 'test']: + timer_start = time.time() + + script_list = os.path.join(default.fame_dir, 'data', dataset, 'text') + with open(script_list, "rt", encoding="utf-8") as fin: + scripts = fin.read().split('\n') + + for line in scripts: + sentence = ' '.join(line.split(' ')[1:]) + sentence_htk = fame_functions.word2htk(sentence) + + #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0: + try: + sentence_htk = bytes(sentence_htk, 'ascii') + except UnicodeEncodeError: + print(sentence) + print(sentence_htk) -lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2' -for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values: - lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'') -# to_csv does not work with space seperator. therefore all tabs should manually be replaced. -#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\') -lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t') diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py index 8408646..a9f47a7 100644 --- a/acoustic_model/phoneset/fame_asr.py +++ b/acoustic_model/phoneset/fame_asr.py @@ -103,12 +103,22 @@ translation_key_asr2htk = { } phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short] -## check -#for i in phoneset_short: -# try: -# print("{0} --> {1}".format(i, i.encode("ascii"))) -# except UnicodeEncodeError: -# print(">>> {}".format(i)) +#not_in_ascii = [ +# '\'', +# 'â', 'ê', 'ô', 'û', 'č', +# 'à', 'í', 'é', 'è', 'ú', 'ć', +# 'ä', 'ë', 'ï', 'ö', 'ü' +#] +translation_key_word2htk = { + '\'': '\\\'', + 'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1', + 'à':'a2', 'è':'e2', + 'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3', + 'č':'c4', + 'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue', +} +#[translation_key_word2htk.get(i, i) for i in not_in_ascii] + ## the list of multi character phones. diff --git a/acoustic_model/test.txt b/acoustic_model/test.txt new file mode 100644 index 0000000..e69de29
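The new test loop in fame_test.py flags any sentence that still fails ASCII encoding after word2htk. The same check can be run directly over the mapping table added to phoneset/fame_asr.py to confirm that every replacement value is itself ASCII-safe:

# sanity check: every replacement in translation_key_word2htk must itself be ASCII
translation_key_word2htk = {
    '\'': '\\\'',
    'í': 'i1', 'é': 'e1', 'ú': 'u1', 'ć': 'c1',
    'à': 'a2', 'è': 'e2',
    'â': 'a3', 'ê': 'e3', 'ô': 'o3', 'û': 'u3',
    'č': 'c4',
    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
}

for char, replacement in translation_key_word2htk.items():
    try:
        replacement.encode('ascii')
    except UnicodeEncodeError:
        print('>>> {} maps to non-ascii {}'.format(char, replacement))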