From 22cccfb61dd734764c5b6ce7812f2fde6fa4ddfe Mon Sep 17 00:00:00 2001
From: yemaozi88 <428968@gmail.com>
Date: Sun, 3 Feb 2019 00:34:35 +0100
Subject: [PATCH] fix the bug that the lexicon contains characters which
 cannot be encoded in ASCII.

---
 .vs/acoustic_model/v15/.suo                  | Bin 102912 -> 107520 bytes
 .../__pycache__/defaultfiles.cpython-36.pyc  | Bin 1260 -> 1328 bytes
 acoustic_model/acoustic_model.pyproj         |   3 +-
 acoustic_model/defaultfiles.py               |   4 +-
 acoustic_model/fame_functions.py             |  20 +-
 acoustic_model/fame_hmm.py                   | 188 +++++++++++-------
 acoustic_model/fame_test.py                  |  65 ++++--
 acoustic_model/phoneset/fame_asr.py          |  22 +-
 acoustic_model/test.txt                      |   0
 9 files changed, 199 insertions(+), 103 deletions(-)
 create mode 100644 acoustic_model/test.txt

diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 3ce8f85377cd6cde6370d2052d18b3560c2969ef..45e0fe26cbf1e65248b4054b2288526087176549 100644
GIT binary patch
[base85-encoded binary deltas for .vs/acoustic_model/v15/.suo and
 acoustic_model/__pycache__/defaultfiles.cpython-36.pyc omitted]

diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj
index 8faedc8..5319301 100644
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -4,8 +4,7 @@
   2.0
   4d8c8573-32f0-4a62-9e62-3ce5cc680390
   .
-  
-  
+  fame_hmm.py
   .
diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py
index 7c4a8cf..b10d247 100644
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -39,11 +39,11 @@ toolbox_dir = os.path.join(repo_dir, 'toolbox')
 #config_hvite = os.path.join(htk_config_dir, 'config.HVite')
 #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
 #acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')
 
 WSL_dir = r'C:\OneDrive\WSL'
 #fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'd:\_corpus\fame'
+fame_dir = r'c:\OneDrive\Research\rug\_data\FAME'
 
 fame_s5_dir = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')
diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py
index 5fe60e5..cb87620 100644
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -290,15 +290,17 @@ def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
     """
     lex_asr = load_lexicon(lexicon_file_asr)
 
+    def word2htk_(row):
+        return word2htk(row['word'])
     def asr2htk_space_delimited_(row):
         return asr2htk_space_delimited(row['pronunciation'])
 
     lex_htk = pd.DataFrame({
-        'word': lex_asr['word'],
+        'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
         'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
         })
     lex_htk = lex_htk.ix[:, ['word', 'pronunciation']]
 
-    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
     return
 
 
@@ -316,20 +318,26 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
     lex2 = load_lexicon(lexicon_file2)
     lex = pd.concat([lex1, lex2])
     lex = lex.sort_values(by='word', ascending=True)
-    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')
 
 
 def fix_single_quote(lexicon_file):
     """ add '\' before all single quote at the beginning of words.
+    convert special characters to ascii compatible characters.
 
     Args:
        lexicon_file (path): lexicon file, which will be overwitten.
 
    """
    lex = load_lexicon(lexicon_file)
+    lex = lex.dropna()  # remove N/A.
    for i in lex[lex['word'].str.startswith('\'')].index.values:
        lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
-    # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
-    #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+    # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
+    #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+    lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
     return
+
+
+def word2htk(word):
+    return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py
index ba2732c..9ce920b 100644
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 
 import tempfile
+import shutil
 #import configparser
 #import subprocess
 import time
@@ -11,6 +12,7 @@ import numpy as np
 import pandas as pd
 
 import fame_functions
+from phoneset import fame_ipa, fame_asr
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 import file_handling as fh
@@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']
 
 # procedure
 extract_features = 0
-make_lexicon = 0
+make_lexicon = 1
 make_mlf = 0
 combine_files = 0
 flat_start = 0
@@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
 lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
 lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
 
+global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
+
+
 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'
 
@@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
 tmp_dir = os.path.join(default.htk_dir, 'tmp')
 if not os.path.exists(tmp_dir):
     os.makedirs(tmp_dir)
+label_dir = os.path.join(default.htk_dir, 'label')
+if not os.path.exists(label_dir):
+    os.makedirs(label_dir)
+
 
 
 ## ======================= extract features =======================
 if extract_features:
-    print('==== extract features ====\n')
-
+
     for dataset in dataset_list:
-        print('==== dataset: {} ===='.format(dataset))
+        print('==== extract features on dataset {} ====\n'.format(dataset))
 
         # a script file for HCopy
         print(">>> making a script file for HCopy...\n")
\n") @@ -89,6 +97,8 @@ if extract_features: hcompv_scp = os.path.join(tmp_dir, dataset + '.scp') fh.make_filelist(feature_dir_, hcompv_scp, '.mfc') + os.remove(hcopy_scp.name) + ## ======================= make lexicon for HTK ======================= if make_lexicon: @@ -114,94 +124,132 @@ if make_lexicon: fame_functions.fix_single_quote(lexicon_htk) +## ======================= make phonelist ======================= +#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt') +#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt) +#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser' +#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt') +#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic') +#pyhtk.create_dictionary( +# sentence, global_ded, log_txt, dictionary_file, lexicon_htk) +#pyhtk.create_dictionary_without_log( +# sentence, global_ded, dictionary_file, lexicon_htk) + + ## ======================= make label file ======================= if make_mlf: - print("==== make mlf ====\n") - - print("generating word level transcription...\n") for dataset in dataset_list: - hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp' - hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp' - script_list = FAME_dir + '\\data\\' + dataset + '\\text' - mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf' - mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf' + timer_start = time.time() + print("==== generating word level transcription on dataset {}\n".format(dataset)) - # lexicon - lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation']) - - # list of features - with open(hcompv_scp) as fin: - features = fin.read() - features = features.split('\n') + #hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp' + #hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp' + script_list = os.path.join(default.fame_dir, 'data', dataset, 'text') + #mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf' + #mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf' + wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset) + dictionary_file = os.path.join(wav_dir, 'temp.dic') # list of scripts with open(script_list, "rt", encoding="utf-8") as fin: - scripts = fin.read() - scripts = pd.Series(scripts.split('\n')) + scripts = fin.read().split('\n') - i = 0 - missing_words = [] - fscp = open(hcompv_scp2, 'wt') - fmlf = open(mlf_word, "wt", encoding="utf-8") - fmlf.write("#!MLF!#\n") - feature_nr = 1 - for feature in features: - sys.stdout.write("\r%d/%d" % (feature_nr, len(features))) - sys.stdout.flush() - feature_nr += 1 - file_basename = os.path.basename(feature).replace('.mfc', '') + for line in scripts: + #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']: + # sample line: + # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik + filename_ = line.split(' ')[0] + filename = '_'.join(filename_.split('_')[1:]) + sentence = ' '.join(line.split(' ')[1:]) - # get words from scripts. 
-            try:
-                script = scripts[scripts.str.contains(file_basename)]
-            except IndexError:
-                script = []
+            wav_file = os.path.join(wav_dir, filename + '.wav')
+            if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0:
+                try:
+                    sentence_ascii = bytes(sentence, 'ascii')
+                except UnicodeEncodeError:
+                    print(sentence)
+            #if os.path.exists(wav_file):
+            #    #dictionary_file = os.path.join(wav_dir, filename + '.dic')
+            #    if pyhtk.create_dictionary_without_log(
+            #        sentence, global_ded, dictionary_file, lexicon_htk) == 0:
+            #        # when the file name is too long, HDMan command does not work.
+            #        # therefore first temporary dictionary_file is made, then renamed.
+            #        shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
+            #        label_file = os.path.join(wav_dir, filename + '.lab')
+            #        pyhtk.create_label_file(sentence, label_file)
+            #    else:
+            #        os.remove(dictionary_file)
+        print("elapsed time: {}".format(time.time() - timer_start))
 
+        # lexicon
+        #lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
 
-            if len(script) != 0:
-                script_id = script.index[0]
-                script_txt = script.get(script_id)
-                script_words = script_txt.split(' ')
-                del script_words[0]
+        # list of features
+        #with open(hcompv_scp) as fin:
+        #    features = fin.read()
+        #    features = features.split('\n')
 
+        #i = 0
+        #missing_words = []
+        #fscp = open(hcompv_scp2, 'wt')
+        #fmlf = open(mlf_word, "wt", encoding="utf-8")
+        #fmlf.write("#!MLF!#\n")
+        #feature_nr = 1
+        #for feature in features:
+        #    sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
+        #    sys.stdout.flush()
+        #    feature_nr += 1
+        #    file_basename = os.path.basename(feature).replace('.mfc', '')
+
+        #    # get words from scripts.
+        #    try:
+        #        script = scripts[scripts.str.contains(file_basename)]
+        #    except IndexError:
+        #        script = []
+
+        #    if len(script) != 0:
+        #        script_id = script.index[0]
+        #        script_txt = script.get(script_id)
+        #        script_words = script_txt.split(' ')
+        #        del script_words[0]
 
             # check if all words can be found in the lexicon.
-            SCRIPT_WORDS = []
-            script_prons = []
-            is_in_lexicon = 1
-            for word in script_words:
-                WORD = word.upper()
-                SCRIPT_WORDS.append(WORD)
-                extracted = lexicon_htk[lexicon_htk['word']==WORD]
-                if len(extracted) == 0:
-                    missing_words.append(word)
-                script_prons.append(extracted)
-                is_in_lexicon *= len(extracted)
+        #    SCRIPT_WORDS = []
+        #    script_prons = []
+        #    is_in_lexicon = 1
+        #    for word in script_words:
+        #        WORD = word.upper()
+        #        SCRIPT_WORDS.append(WORD)
+        #        extracted = lexicon_htk[lexicon_htk['word']==WORD]
+        #        if len(extracted) == 0:
+        #            missing_words.append(word)
+        #        script_prons.append(extracted)
+        #        is_in_lexicon *= len(extracted)
 
             # if all pronunciations are found in the lexicon, update scp and mlf files.
-            if is_in_lexicon:
+        #    if is_in_lexicon:
                 # add the feature filename into the .scp file.
-                fscp.write("{}\n".format(feature))
-                i += 1
+        #        fscp.write("{}\n".format(feature))
+        #        i += 1
 
                 # add the words to the mlf file.
- fmlf.write('\"*/{}.lab\"\n'.format(file_basename)) + # fmlf.write('\"*/{}.lab\"\n'.format(file_basename)) #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS))) - for word_ in SCRIPT_WORDS: - if word_[0] == '\'': - word_ = '\\' + word_ - fmlf.write('{}\n'.format(word_)) - fmlf.write('.\n') - print("\n{0} has {1} samples.\n".format(dataset, i)) - np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words) + # for word_ in SCRIPT_WORDS: + # if word_[0] == '\'': + # word_ = '\\' + word_ + # fmlf.write('{}\n'.format(word_)) + # fmlf.write('.\n') + # print("\n{0} has {1} samples.\n".format(dataset, i)) + # np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words) - fscp.close() - fmlf.close() + # fscp.close() + # fmlf.close() ## generate phone level transcription - print("generating phone level transcription...\n") - mkphones = output_dir + '\\label\\mkphones0.txt' - subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word - subprocess.call(subprocessStr, shell=True) + # print("generating phone level transcription...\n") + # mkphones = output_dir + '\\label\\mkphones0.txt' + # subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word + # subprocess.call(subprocessStr, shell=True) ## ======================= combined scps and mlfs ======================= diff --git a/acoustic_model/fame_test.py b/acoustic_model/fame_test.py index d330e7f..c7b2e59 100644 --- a/acoustic_model/fame_test.py +++ b/acoustic_model/fame_test.py @@ -3,6 +3,7 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') from collections import Counter import time +import re import numpy as np import pandas as pd @@ -82,22 +83,52 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr) ## check if all the phones in lexicon.htk are in fame_asr.py. -timer_start = time.time() -phoneset_htk = fame_asr.phoneset_htk -phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk) -phoneset_lex.remove('') -print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format( - set(phoneset_htk) - set(phoneset_lex))) -print("elapsed time: {}".format(time.time() - timer_start)) +#timer_start = time.time() +#phoneset_htk = fame_asr.phoneset_htk +#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk) +#phoneset_lex.remove('') +#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format( +# set(phoneset_htk) - set(phoneset_lex))) +#print("elapsed time: {}".format(time.time() - timer_start)) -# statistics over the lexicon -lex_htk = fame_functions.load_lexicon(lexicon_htk) -phones_all = (' '.join(lex_htk['pronunciation'])).split(' ') -c = Counter(phones_all) +## statistics over the lexicon +#lex_htk = fame_functions.load_lexicon(lexicon_htk) +#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ') +#c = Counter(phones_all) + +#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2' +#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values: +# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'') +## to_csv does not work with space seperator. therefore all tabs should manually be replaced. +##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\') +#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t') + + +## check which letters are not coded in ascii. 
+print('asr phones which cannot be coded in ascii:\n')
+for i in fame_asr.phoneset_short:
+    try:
+        i_encoded = i.encode("ascii")
+        #print("{0} --> {1}".format(i, i.encode("ascii")))
+    except UnicodeEncodeError:
+        print(">>> {}".format(i))
+
+print("letters in the scripts which are not coded in ascii:\n")
+for dataset in ['train', 'devel', 'test']:
+    timer_start = time.time()
+
+    script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+    with open(script_list, "rt", encoding="utf-8") as fin:
+        scripts = fin.read().split('\n')
+
+    for line in scripts:
+        sentence = ' '.join(line.split(' ')[1:])
+        sentence_htk = fame_functions.word2htk(sentence)
+
+        #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
+        try:
+            sentence_htk = bytes(sentence_htk, 'ascii')
+        except UnicodeEncodeError:
+            print(sentence)
+            print(sentence_htk)
-lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
-for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
-    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
-# to_csv does not work with space seperator. therefore all tabs should manually be replaced.
-#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py
index 8408646..a9f47a7 100644
--- a/acoustic_model/phoneset/fame_asr.py
+++ b/acoustic_model/phoneset/fame_asr.py
@@ -103,12 +103,22 @@ translation_key_asr2htk = {
     }
 phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
 
-## check
-#for i in phoneset_short:
-#    try:
-#        print("{0} --> {1}".format(i, i.encode("ascii")))
-#    except UnicodeEncodeError:
-#        print(">>> {}".format(i))
+#not_in_ascii = [
+#    '\'',
+#    'â', 'ê', 'ô', 'û', 'č',
+#    'à', 'í', 'é', 'è', 'ú', 'ć',
+#    'ä', 'ë', 'ï', 'ö', 'ü'
+#]
+translation_key_word2htk = {
+    '\'': '\\\'',
+    'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
+    'à':'a2', 'è':'e2',
+    'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
+    'č':'c4',
+    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
+}
+#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
+
 
 ## the list of multi character phones.
diff --git a/acoustic_model/test.txt b/acoustic_model/test.txt
new file mode 100644
index 0000000..e69de29
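
For reference, the intended effect of the patch can be sketched outside the diff. The following is a minimal, standalone Python illustration of the new translation_key_word2htk mapping and the word2htk() helper introduced in phoneset/fame_asr.py and fame_functions.py; the mapping is copied from the patch, while the example words and the small test harness are illustrative only and are not part of the repository.

# Standalone sketch (not part of the patch): make lexicon words ASCII-safe for HTK.
translation_key_word2htk = {
    '\'': '\\\'',
    'í': 'i1', 'é': 'e1', 'ú': 'u1', 'ć': 'c1',
    'à': 'a2', 'è': 'e2',
    'â': 'a3', 'ê': 'e3', 'ô': 'o3', 'û': 'u3',
    'č': 'c4',
    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
}


def word2htk(word):
    # character-wise substitution; characters without an entry pass through unchanged.
    return ''.join([translation_key_word2htk.get(c, c) for c in word])


if __name__ == '__main__':
    # 'brúntsje' occurs in the patch; 'natuerlik' is already ASCII and passes through;
    # "'t" shows the escaping of a leading single quote.
    for w in ['brúntsje', 'natuerlik', "'t"]:
        htk_word = word2htk(w).upper()   # lexicon_asr2htk also upper-cases the HTK word.
        htk_word.encode('ascii')         # raises UnicodeEncodeError if a character was missed.
        print('{} -> {}'.format(w, htk_word))

As in lexicon_asr2htk, the substitution runs before the word is upper-cased and written to lex.htk, so every entry that reaches the HTK tools can be encoded in plain ASCII.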