From 87abbbb95aeb0b72cd5b553cbec8e0901d472e4e Mon Sep 17 00:00:00 2001
From: yemaozi88 <428968@gmail.com>
Date: Sun, 27 Jan 2019 23:52:33 +0100
Subject: [PATCH] Automatically obtain the correspondence between lex.asr and
 lex.ipa; add docstring headers to the functions in fame_functions.py.

---
 .vs/acoustic_model/v15/.suo                 | Bin 89600 -> 95232 bytes
 _tmp/phone_to_be_searched.npy               | Bin 500 -> 0 bytes
 _tmp/translation_key.npy                    | Bin 368 -> 0 bytes
 .../__pycache__/defaultfiles.cpython-36.pyc | Bin 1260 -> 1260 bytes
 acoustic_model/convert_phone_set.py         |  10 +-
 acoustic_model/defaultfiles.py              |   1 -
 acoustic_model/fame_functions.py            | 234 +++++++++---------
 acoustic_model/fame_hmm.py                  |  83 +++++--
 acoustic_model/fame_phoneset.py             |  60 ++++-
 9 files changed, 244 insertions(+), 144 deletions(-)
 delete mode 100644 _tmp/phone_to_be_searched.npy
 delete mode 100644 _tmp/translation_key.npy

diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index ccccfcff5af26b0250002bbe105ee51ea032efa5..0b78f7c18ba1bfbd4d3cd5bf2b32eb15230e5e5c 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/_tmp/phone_to_be_searched.npy b/_tmp/phone_to_be_searched.npy
deleted file mode 100644
Binary files a/_tmp/phone_to_be_searched.npy and /dev/null differ
diff --git a/_tmp/translation_key.npy b/_tmp/translation_key.npy
deleted file mode 100644
index 96c1125fc128e683beffe38d4b916e6769a9989f..0000000000000000000000000000000000000000
Binary files a/_tmp/translation_key.npy and /dev/null differ
diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc
Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ
diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py
@@ ... @@
-#        if len(line) > 1:
-#            pronunciation = line[1]
-#            if phone in pronunciation:
-#                extracted.append(line)
-#    return extracted
-
 #def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
 #    """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """
@@ -128,25 +110,6 @@ import convert_phone_set
 #    return ipa
 
 
-def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
-    """ Make a script file for HCopy using the filelist in FAME! corpus. """
-
-    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
-    with open(filelist_txt) as fin:
-        filelist = fin.read()
-        filelist = filelist.split('\n')
-
-    with open(hcopy_scp, 'w') as fout:
-        for filename_ in filelist:
-            filename = filename_.replace('.TextGrid', '')
-
-            if len(filename) > 3: # remove '.', '..' and ''
-                wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
-                mfc_file = os.path.join(feature_dir, filename + '.mfc')
-
-                fout.write(wav_file + '\t' + mfc_file + '\n')
-
-
 #def make_filelist(input_dir, output_txt):
 #    """ Make a list of files in the input_dir. """
 #    filenames = os.listdir(input_dir)
@@ -191,98 +154,147 @@ def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_s
 #            f.write('{0}\t{1}\n'.format(WORD, key))
 
 
+def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
+    """ Make a script file for HCopy using the filelist in the FAME! corpus.
+
+    Args:
+        fame_dir (path): the directory of the FAME! corpus.
+        dataset (str): 'devel', 'test' or 'train'.
+        feature_dir (path): the directory where the features will be stored.
+        hcopy_scp (path): the script file for HCopy to be made.
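+
+    Example (a minimal sketch; the corpus root below is the one used
+        elsewhere in this repository, the feature and scp paths are
+        hypothetical):
+        >>> make_hcopy_scp_from_filelist_in_fame(
+        ...     r'd:\_corpus\FAME', 'devel', r'd:\_feature\devel', 'hcopy.scp')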
+
+    """
+    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
+    with open(filelist_txt) as fin:
+        filelist = fin.read()
+        filelist = filelist.split('\n')
+
+    with open(hcopy_scp, 'w') as fout:
+        for filename_ in filelist:
+            filename = filename_.replace('.TextGrid', '')
+
+            if len(filename) > 3:  # remove '.', '..' and ''
+                wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
+                mfc_file = os.path.join(feature_dir, filename + '.mfc')
+
+                fout.write(wav_file + '\t' + mfc_file + '\n')
 
 
 def load_lexicon(lexicon_file):
+    """ Load a lexicon file as a pandas DataFrame.
+
+    Args:
+        lexicon_file (path): the lexicon in the format 'word' \t 'pronunciation'.
+
+    Returns:
+        lex (DataFrame): the lexicon with the columns 'word' and 'pronunciation'.
+
+    """
     lex = pd.read_csv(lexicon_file, delimiter='\t', header=None, encoding="utf-8")
     lex.rename(columns={0: 'word', 1: 'pronunciation'}, inplace=True)
     return lex
 
 
-def get_phonelist(lexicon_asr):
-    """ Make a list of phones which appears in the lexicon. """
-
-    #with open(lexicon_file, "rt", encoding="utf-8") as fin:
-    #    lines = fin.read()
-    #    lines = lines.split('\n')
-    #    phonelist = set([])
-    #    for line in lines:
-    #        line = line.split('\t')
-    #        if len(line) > 1:
-    #            pronunciation = set(line[1].split())
-    #            phonelist = phonelist | pronunciation
-    lex = load_lexicon(lexicon_asr)
-    return set(' '.join(lex['pronunciation']).split(' '))
-
-
-def extract_unknown_phones(word_list, known_phones):
-    return [i for i in word_list if not i in known_phones]
-
-
-if __name__ == '__main__':
-    import time
-    timer_start = time.time()
-
-    #def get_translation_key():
-    dir_tmp = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
-    lexicon_ipa = r'd:\_corpus\FAME\lexicon\lex.ipa'
-    lexicon_asr = r'd:\_corpus\FAME\lexicon\lex.asr'
-
-    lex_ipa = load_lexicon(lexicon_ipa)
-    lex_asr = load_lexicon(lexicon_asr)
-    if 1:
-        phone_to_be_searched = fame_phoneset.phoneset_ipa[:]
-        translation_key = dict()
-        for word in lex_ipa['word']:
-            if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
-                ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
-                asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+def get_phoneset_from_lexicon(lexicon_file, phoneset='asr'):
+    """ Make a set of the phones which appear in the lexicon.
 
-                ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
-                asr_list = asr.split(' ')
+    Args:
+        lexicon_file (path): the lexicon in the format 'word' \t 'pronunciation'.
+        phoneset (str): the phoneset with which lexicon_file is written, 'asr' (default) or 'ipa'.
 
-                # if there are phones which is not in phone_to_be_searched
-                #if len([True for i in asr_list if i in phone_to_be_searched]) > 0:
-                if(len(ipa_list) == len(asr_list)):
-                    print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
-                    for ipa_, asr_ in zip(ipa_list, asr_list):
-                        if ipa_ in phone_to_be_searched:
-                            translation_key[ipa_] = asr_
-                            phone_to_be_searched.remove(ipa_)
+    Returns:
+        (set): the set of phones included in lexicon_file.
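+
+    Example (a minimal sketch; the lexicon path is the one used elsewhere in
+        this repository):
+        >>> phones = get_phoneset_from_lexicon(
+        ...     r'd:\_corpus\FAME\lexicon\lex.asr', phoneset='asr')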
-        print("elapsed time: {}".format(time.time() - timer_start))
+    """
+    assert phoneset in ['asr', 'ipa'], "phoneset should be 'asr' or 'ipa'"
 
-        np.save(os.path.join(dir_tmp, 'translation_key.npy'), translation_key)
-        np.save(os.path.join(dir_tmp, 'phone_to_be_searched.npy'), phone_to_be_searched)
-    else:
-        translation_key = np.load(os.path.join(dir_tmp, 'translation_key.npy')).item()
-        phone_to_be_searched = np.load(os.path.join(dir_tmp, 'phone_to_be_searched.npy')).item()
+    lex = load_lexicon(lexicon_file)
+    if phoneset == 'asr':
+        return set(' '.join(lex['pronunciation']).split(' '))
+    elif phoneset == 'ipa':
+        join_pronunciations = ''.join(lex['pronunciation'])
+        return set(convert_phone_set.split_word(join_pronunciations, fame_phoneset.multi_character_phones_ipa))
 
-    #phone_unknown = list(phone_to_be_searched)
-    ##phone_unknown.remove('')
-    #phone_known = list(translation_key.keys())
+def extract_unknown_phones(ipa, known_phones):
+    """ Extract the phones which are not yet known from a pronunciation written in IPA.
 
-    #p = phone_unknown[0]
+    Args:
+        ipa (str): a pronunciation written in IPA.
+        known_phones (list): the list of phones which are already known.
+
+    Returns:
+        (list): the phones which are not included in known_phones.
+
+    """
+    ipa_split = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+    return [i for i in ipa_split if i not in known_phones]
+
+
+def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
+    """ Get the correspondence between lexicon_file_ipa and lexicon_file_asr.
+
+    Args:
+        lexicon_file_ipa (path): the lexicon in the format 'word' \t 'pronunciation (IPA)'.
+        lexicon_file_asr (path): the lexicon in the format 'word' \t 'pronunciation (asr)',
+            in which each phone of 'pronunciation' is delimited by ' '.
+
+    Returns:
+        translation_key (dict): the translation key from IPA phones to asr phones.
+        phone_unknown (list): the IPA phones for which no counterpart was found in lexicon_file_asr.
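+
+    Example (a minimal sketch; the lexicon paths are the ones used elsewhere
+        in this repository):
+        >>> translation_key, phone_unknown = get_translation_key(
+        ...     r'd:\_corpus\FAME\lexicon\lex.ipa',
+        ...     r'd:\_corpus\FAME\lexicon\lex.asr')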
+
+    """
+    lex_ipa = load_lexicon(lexicon_file_ipa)
+    lex_asr = load_lexicon(lexicon_file_asr)
+    phone_unknown = fame_phoneset.phoneset_ipa[:]
+    translation_key = dict()
+    for word in lex_ipa['word']:
+        if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
+            ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+            asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
 
-    ### extract lines which contains 'unknown' phone.
-    #lex_ipa_ = lex_ipa[lex_ipa['pronunciation'].str.count(p)>0]
-    ##phone_unknown_ = phone_unknown[:]
-    ##phone_unknown_.remove(p)
-    #phone_known_ = phone_known[:]
-    #phone_known_.append(p)
-    #for index, row in lex_ipa_.iterrows():
-    #    ipa = row['pronunciation']
-    #    phone_extract_unknown_phones(asr_list, phone_known_):
+            ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+            asr_list = asr.split(' ')
 
-    #    # check the number of phones in phone_unknown_
-    #    if len([True for i in asr_list if i in phone_unknown_]) == 0:
-    #        word = row['word']
-    #        ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
-    #        print("{0}: {1} --> {2}".format(word, ipa, asr))
-    #        #print("{0}:{1}".format(index, row['pronunciation']))
+            # make a correspondence only when the two pronunciations align phone by phone.
+            if len(ipa_list) == len(asr_list):
+                print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+                for ipa_, asr_ in zip(ipa_list, asr_list):
+                    if ipa_ in phone_unknown:
+                        translation_key[ipa_] = asr_
+                        phone_unknown.remove(ipa_)
+    return translation_key, list(phone_unknown)
 
+
+def find_phone(lexicon_file, phone, phoneset='ipa'):
+    """ Extract the rows in which the phone is used from the lexicon_file.
 
-
-    
\ No newline at end of file
+    Args:
+        lexicon_file (path): the lexicon in the format 'word' \t 'pronunciation'.
+        phone (str): the phone to be searched.
+        phoneset (str): the phoneset with which lexicon_file is written, 'asr' or 'ipa' (default).
+
+    Returns:
+        extracted (DataFrame): the rows in which the phone is used.
+
+    ToDo:
+        * implement the case phoneset == 'asr'.
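+
+    Example (a minimal sketch; the lexicon path is the one used elsewhere in
+        this repository, 'ⁿ' is the nasalisation diacritic searched for):
+        >>> extracted = find_phone(r'd:\_corpus\FAME\lexicon\lex.ipa', 'ⁿ')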
+
+    """
+    assert phoneset in ['asr', 'ipa'], "phoneset should be 'asr' or 'ipa'"
+
+    lex = load_lexicon(lexicon_file)
+
+    # to reduce the calculation time, only check the rows which contain 'phone' at least once.
+    lex_ = lex[lex['pronunciation'].str.count(phone) > 0]
+
+    extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
+    for index, row in lex_.iterrows():
+        if phoneset == 'ipa':
+            pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_phoneset.multi_character_phones_ipa)
+            if phone in pronunciation:
+                extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
+                extracted = extracted.append(extracted_, ignore_index=True)
+    return extracted
\ No newline at end of file
diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py
index d6327e7..fe319d0 100644
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -6,6 +6,7 @@ import tempfile
 #import configparser
 #import subprocess
 #from collections import Counter
+import time
 
-#import numpy as np
+import numpy as np
 #import pandas as pd
@@ -27,8 +28,8 @@ from htk import pyhtk
 dataset_list = ['devel', 'test', 'train']
 
 # procedure
-extract_features = 1
-#conv_lexicon = 0
+extract_features = 0
+conv_lexicon = 1
 #check_lexicon = 0
 #make_mlf = 0
 #combine_files = 0
@@ -84,16 +85,14 @@ if not os.path.exists(tmp_dir):
 ## ======================= extract features =======================
 if extract_features:
     for dataset in dataset_list:
-    #for dataset in ['test']:
         print('==== {} ===='.format(dataset))
 
         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
         hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
         hcopy_scp.close()
-        #hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'HCopy.scp')
 
-        ## get a list of features (hcopy.scp) from the filelist in FAME! corpus
+        # get a list of features (hcopy.scp) from the filelist in FAME! corpus
         feature_dir_ = os.path.join(feature_dir, dataset)
         if not os.path.exists(feature_dir_):
             os.makedirs(feature_dir_)
@@ -101,32 +100,70 @@ if extract_features:
         # extract features
         print(">>> extracting features... \n")
         fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
-
-        #subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
-        #subprocess.call(subprocessStr, shell=True)
         pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
 
         # a script file for HCompV
         print(">>> making a script file for HCompV... \n")
-
-
-## ======================= make a list of features =======================
-#if make_feature_list:
-#    print("==== make a list of features ====\n")
-
-#    for dataset in dataset_list:
-#        print(dataset)
-
-        #feature_dir = output_dir + '\\mfc\\' + dataset
         hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
-
-        #am_func.make_filelist(feature_dir, hcompv_scp)
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
 
 
 ## ======================= convert lexicon from ipa to fame_htk =======================
 if conv_lexicon:
     print('==== convert lexicon from ipa 2 fame ====\n')
+
+    #dir_out = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
+    lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
+    lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
+    lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+
+    # get the correspondence between lex.ipa and lex.asr.
+    lex_asr = fame_functions.load_lexicon(lexicon_asr)
+    lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
+    if 1:
+        timer_start = time.time()
+        translation_key, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
+        print("elapsed time: {}".format(time.time() - timer_start))
+
+        np.save('translation_key_ipa2asr.npy', translation_key)
+        np.save('phone_unknown.npy', phone_unknown)
+    else:
+        translation_key = np.load('translation_key_ipa2asr.npy').item()
+        phone_unknown = np.load('phone_unknown.npy')
+        phone_unknown = list(phone_unknown)
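+
+    # a minimal sketch of applying the obtained key to one IPA pronunciation
+    # (the input 'sɔ:n' is a hypothetical example; phones without a
+    # correspondence are left unchanged):
+    #ipa_list = convert_phone_set.split_word('sɔ:n', fame_phoneset.multi_character_phones_ipa)
+    #asr_list = [translation_key.get(p, p) for p in ipa_list]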
+
+
+    ## manually check the correspondence for the phones in phone_unknown.
+    #p = phone_unknown[0]
+    #lex_ipa_ = fame_functions.find_phone(lexicon_ipa, p, phoneset='ipa')
+
+    #for word in lex_ipa_['word']:
+    #    ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+    #    if np.sum(lex_asr['word'] == word) > 0:
+    #        asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+
+    #        ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+    #        asr_list = asr.split(' ')
+    #        if p in ipa_list and (len(ipa_list) == len(asr_list)):
+    #            print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+    #            for ipa_, asr_ in zip(ipa_list, asr_list):
+    #                if ipa_ in phone_unknown:
+    #                    translation_key[ipa_] = asr_
+    #                    phone_unknown.remove(ipa_)
+
+
+    ## check if all the phones in lexicon_ipa are in fame_phoneset.py.
+    #timer_start = time.time()
+    #phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
+    #print("elapsed time: {}".format(time.time() - timer_start))
+
+    #phoneset_py = fame_phoneset.phoneset_ipa
+    #set(phoneset_lex) - set(phoneset_py)
+
+    ##timer_start = time.time()
+    ##extracted = fame_functions.find_phone(lexicon_ipa, 'ⁿ')
+    ##print("elapsed time: {}".format(time.time() - timer_start))
+
 
 # lex.asr is Kaldi compatible version of lex.ipa.
 # to check...
@@ -140,13 +177,13 @@ if conv_lexicon:
 #    fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
 
 # convert each lexicon from ipa description to fame_htk phoneset.
-    am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-    am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+    #am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
+    #am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
 
 # combine lexicon
 # pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
 # therefore there is no overlap between lex_asr and lex_oov.
-    am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+    #am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
 
 
 ## ======================= check if all the phones are successfully converted =======================
diff --git a/acoustic_model/fame_phoneset.py b/acoustic_model/fame_phoneset.py
index b1a07de..2c2387a 100644
--- a/acoustic_model/fame_phoneset.py
+++ b/acoustic_model/fame_phoneset.py
@@ -1,41 +1,79 @@
+""" Definition of the phones to be used. """
+
+## phones in IPA.
 phoneset_ipa = [
     # vowels
     'i̯',
+    'i̯ⁿ',
     'y',
     'i',
+    'i.',
+    'iⁿ',
     'i:',
+    'i:ⁿ',
     'ɪ',
-    'ɪ:',
+    'ɪⁿ',
+    'ɪ.',
+    #'ɪ:', # not included in lex.ipa
+    'ɪ:ⁿ',
     'e',
     'e:',
+    'e:ⁿ',
     'ə',
+    'əⁿ',
     'ə:',
     'ɛ',
+    'ɛ.',
+    'ɛⁿ',
     'ɛ:',
+    'ɛ:ⁿ',
     'a',
+    'aⁿ',
+    'a.',
     'a:',
+    'a:ⁿ',
     'ṷ',
-    'ú',
+    'ṷ.',
+    'ṷⁿ',
+    #'ú', # only appears in the words 'feeste' (út) and 'gaste' (út), which are 'f e: s t ə' and 'yn' in lex_asr.
     'u',
+    'uⁿ',
+    'u.',
     'u:',
+    'u:ⁿ',
     'ü',
+    'ü.',
+    'üⁿ',
     'ü:',
+    'ü:ⁿ',
     'o',
+    'oⁿ',
+    'o.',
     'o:',
+    'o:ⁿ',
     'ö',
+    'ö.',
+    'öⁿ',
     'ö:',
+    'ö:ⁿ',
     'ɔ',
+    'ɔ.',
+    'ɔⁿ',
     'ɔ:',
-    'ɔ̈',
+    'ɔ:ⁿ',
+    #'ɔ̈', # not included in lex.ipa
+    'ɔ̈.',
     'ɔ̈:',
 
     # plosives
     'p',
     'b',
     't',
+    'tⁿ',
     'd',
     'k',
     'g',
+    'ɡ', # LATIN SMALL LETTER SCRIPT G (U+0261), a variant spelling of 'g' in lex.ipa.
 
     # nasals
     'm',
@@ -48,8 +86,22 @@ phoneset_ipa = [
     's',
     's:',
     'z',
+    'zⁿ',
     'x',
     'h',
+
+    # taps and flaps
+    'r',
+    'r.', # only appears in the words 'mearpartijestelsel' (which does not exist in lex_asr) and 'tenoarpartij'.
+    'r:', # only appears in the words 'mûsearflearmûs' and 'sjochdêr'.
+
+    # approximants
+    'j',
+    'j.',
+    'l'
 ]
 
+## the list of multi-character phones.
+# e.g. 'i̯ⁿ' has string length 3, but in the code it is treated as a single phone.
 multi_character_phones_ipa = [i for i in phoneset_ipa if len(i) > 1]
+# sort by length in descending order, so that longer phones are matched first.
+multi_character_phones_ipa.sort(key=len, reverse=True)
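+
+# a minimal self-check sketch: after the sort above, longer phones come first,
+# so that (presumably) the greedy matching in convert_phone_set.split_word()
+# tries e.g. 'i̯ⁿ' before its prefix 'i̯'.
+if __name__ == '__main__':
+    assert all(len(a) >= len(b) for a, b
+               in zip(multi_character_phones_ipa, multi_character_phones_ipa[1:]))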