From c185072d5b9e6614f4d19142edc0484018bb5957 Mon Sep 17 00:00:00 2001 From: yemaozi88 <428968@gmail.com> Date: Thu, 14 Feb 2019 00:21:28 +0100 Subject: [PATCH] label alignment using HVite is added. --- .vs/acoustic_model/v15/.suo | Bin 102400 -> 96256 bytes .../__pycache__/defaultfiles.cpython-36.pyc | Bin 1051 -> 1110 bytes acoustic_model/acoustic_model.pyproj | 2 +- acoustic_model/convert_phoneset.py | 6 + acoustic_model/defaultfiles.py | 1 + acoustic_model/fame_functions.py | 18 +- acoustic_model/fame_hmm.py | 38 +- acoustic_model/htk_vs_kaldi.py | 747 ++++++++++-------- acoustic_model/phoneset/fame_asr.py | 14 +- acoustic_model/stimmen_functions.py | 22 + acoustic_model/stimmen_test.py | 18 +- 11 files changed, 527 insertions(+), 339 deletions(-) diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 649e0b19014b39935d8f606a8be47e0e105fde18..1238af17ce169a064f0f0a3453aecbdaee497dce 100644 GIT binary patch delta 5873 zcmd6r33OD|8OPuEGJzyynLrkn2q6#%WD;hQ0TPxllMPvffZ&3uNq`9?gv^jk5RyO! zq*Y6VWG?a$**Zl^ZMB#Ogrh72C?Zvsil?+y1jJUX3PCtWJjL|?B`-KCmD<`K@0{P< z?|yfA_uc!y%ZzJFC)XAK24i1Y(t=)+`oh$KFI!t%+3N@5U!2lQ4s~SIz1ed3G|gsr zm8h1h=`C?xepD`&MYUgbCpX*~@!|`=xVmtdnP4#J571Uy2!-7p+y(MMFAxKUfdJ4^ zt`2q(=nN9TPrz7^43>e%K|jFdA4J>-pzm$z2uT_*-9*L^I0gU{=%HmUMQjAoU=rvC z?gt^De)uo8yz23p>##&6T<$o#*45=3{gIR4qLP@x34x^ajfT=W|Ha z?BR%uG@GZOfZu@&%+#EN5T|H%u@+|_p5HhndSW>YJE+hyxMC&jDlNSb@gmKx*5Vq( zwO|QI0_ngC+;Ul{L)qiHiYdPv>Dx?s-Fmn);yLrHH|YYKho5^tQuDJOxGx-74`#qx z;BL~h8P6czpxMG1RNt4Khu}C2 zUIDLanXe%}3SJkcF5M!I!#b(uoI?BuAS_+>GzB|}-V{|K_v&~K#j3*z;#x@m|8F1k zPQP#Up__}|5LFR%^m}m~-v>oiWL@weehc1P5Zt*(bv)qhUCx~xNW4;c$lE;5v6Dqs zxXIs%DtpC^ZZ+R-u8U7|YU@CfJqfa>w=8lq^s0W-6>?iXr$%ujZjgwxp7oez&j8sy z^)R=pUYi9|F;7|;2svG<)L&#+F4Jd7;Dl6`tmUg*Mp<+e{Q z9{BUkOUr{|1JUVQN^F5LY<`kd(@l~d6QgqudopCV;f%guL$5tuCoQeaDZl5nE}6?) z#lcMx>OTxR3LS@4)q*pp983Rc5?_vuQ;m0*l78wtahVrhxO(_ZpQW=_Zfd0)SYF+RN>LQjj=Wt0dcNJ>>Uau#y9&1(MYN*fVyo`CBBoP{l(H>w_NT4v6Y+T`SX zr`=vQ)4r%9PIfGIRM<;v<*b>)nl#28mmRM7#-X1|H4F6}D>jb2twCe}lvh)YQKSqY0>BClDw5M_@I18y)$c zN%DNI$os*gU|RM8r$;>$=4+Hi#g-u@72lKs5`qKeZ5)LX{(prwGZe{pi?5h zBHPF-WH1ukO9L1J@<28)0ux99sbCZkCo4vg5FwSHd5UGFB9%s{Bb-ObUGamo>0X{C z4o?`Zx|X~AJxgVG!`(W*b?_AUm*DBA+cqmtcw7LlWZoy+W=9d%1;9%H<(PdZq+PY4sY*S_Nq!}$*h9;_Sv?w{8{$$ zg|-T3fn%<{+*WCGig1TPmu4`dc&1pf%jnZNYFhV-9Kd&oDOw-gTXpK^>jV7UtGrRh z_OPTjb!eJln7KB$sI1afER4H*M|Ldq;*v63!Q7(q*<~f=bMTnUPx9tA=sB~17#<1KG zUdP53$ec7Uni_aKMf)i8!uaY}40j!_DO=p28o5bkHQ)$HwiscEN$ z#Hsg&h%HwGmCHjYOgyzKLDVb=q8jwPGC0+2Fr~M@=rTC6#k`M0#M)g0Lrg}KA$4eb z&MY1+@wZ+5-OVdR{OX91)S+fWN=kd?Vr2HBMZ7ic4j?bJEqr}3g*J`CqkplWuVReE zVFf-jpahuvw)1dYiiK^b9}G!?H;E-F|idj!g2b z&LbY(u7WyG&*!0pn6tcd3oF$HXiKLgKJ?mumUN2>dgL@PGNQx1S`|R#RE@jBQ z6ryB2L;)cqaR%A(DU)VOcK7;MZYw@L4!GlJ6LZ!^p|Ebs9ek@Aa@&tF2k!fJ+>yCl zeirVf`M6G2N;cJX$VH((;xXr#)X*bWNimJ2K}zyNlpt&;rYn7yP!AYZ_erm?yle3+ zl06<-bbT*IymNM|>b>TY#g(%b)%7l>!*$*zycb*{=W#sp(GZAxInR4)(>WIj%lj_( z1+VJ8X(7)sooBo3zGRNF;%9{sA1mO?5aftUy(qf8S+NIGxM=t|OnH-NxHxs(B3zd& zij}C|x{@^OUX(2-s_rl;=QrF%SQbmE1}W7>4D_=Qn|G9#2i z(e>JGenhvQeD{H#cTFyL7Jg8Dx;Y_fTw-1QGgtA(w+@pJmEpnkgn0LIzpq|DF^u-n zI$nj}ts4<}Tp#jZOX9nyUtB!#y(XTQt5~wHtFknYg2bwE!MDA~$ysK%y+LpM;KH8h zU7O`Ecf8uGI%vsaKE{KwSHh_VkVjWXFq-tL&TEmBz+L22UPr|E4e z{n)G#`O_IsQCJSA@%=M)b4TXe{6CXc;~_9znj{F$y)@Y6nyt! zcZ_;ov?R^b>T*;AdV_4Pgx^26U zDEajL5Z-mA!j{N_mR$!V@!;iO-P%S(?A1T^d6-XuwqWv>!RCK0f&qS^aS)9Tldulx zu2_art+K{MWr`(=Ms!M^R^q5kR$Ni!5(jq=Vg>!r-(}OEj%7YX^J-* zrxGpgzP2YNM1OUgOP*8WESXboui*bR#DPmzcb>({H<^xG%@VyNy@y+l7e6jbQ06R# zJs#<*zLFGyUjtm0gYUt6e>0^}D=EvuDd$e(%75~7gj{WmzvsBN9G}J~PL#M}3}~*% zq*a}N6ax?`H%DdAN2FXBO5g0DBIi+QbLm7{>!-+36s47HDxxyp^P92fFDI>#h1QbN zHIHJOV@v5=pm2Yd+?+g#PWuU+Ym{{>V@o{ zO=kJqWd@RRx|%ll^B-aj%K21Upu{YpSpVrfDaz>SbXZvO9t%7~7keB#WV$p_oZ1(w Rc!%ROw-`y^d~yaE>7QAj-{k-R delta 6514 zcmd^@dt6k<702({U0%ZCDu~N_*&wX4z{2uW)a9kb8pEnlYZQS9kqC&e7zGp72-QSu z+?~`>lUQ97)8=84-F!mYR9wJXZM8|Pscl|GL)s?x(+{?>H8zd?%{~Z8n@^jjpZ?Jf ze0R>AJ9qBPoHJ+UUh7DJ^yWb!c}L)0=ye8mU9RnlhFA)cK{zM_831kGP$LfpiQum& z$NBW)n%yUNBG-agFdN(qmVpGY8O#MKU@AbLB^$^BqK)yA)Fpirg=}B|L0}pX7dwy} z!DR3t(1CI=4v6a_RY9Hu0zr&VKg*Yk>vurT98(s$G?WX#Dlk#Zp_o3AZnK#qD9L7y zU0E~cmajuz4&uk?MD2S}*7$T`$Y+4npbo47qK-%nknaL(K_gg?rxdh8umDlvZeK3s zO+Gmtc_GLLdEj1<3fRZg5^kDz=%J%9?yXS#1LID87S0m9UxqA<0{g{g%*6oVcR0Gh#AQ_n-$Z|ZfBD}cDyZ|ZJeSv)uja+j|>#?oCL z-)O=Gzp1zQDv5?7AWsKHz;9}y7pBey!qg=|0V;towJ`5YFdvAz{xs27_FMaFDBmJ2 zk|aF@VZbCHjA#Z60WL|4!4e=ES&IC2FxDYfpu7@PgBBp_)!WRI1MWm29YXq8E4KM+ zh+4v$BHw^~qfdSSzNfIFU>~Pe>>-#tZd`#zbeN;C7wiN3!NcGY@F-xGaXYj6Aq;>) zZ~};%P9pyiI0a7obU#6U2ApL{p%L_nt>$HCn1v{lwJR4XLhxv|>$Hx2s7(IvJzr&! z|K0N=tn?~tkL;pX*j4=gkhM?h3M@jumxjBpM+eHBWv56jrc*4Ub|9uvj9I{j$~DgV zYeX-wk0Lt1upxFmTIbRxyhF}un1LU&_Ou=LC@s^@(+02iwJIjVWtq2=LeQ)yZHG-) z?!aZgVvkSWg?s_n4Q>LH00R$!Z-M&&2cm=fknabN0MX8)$Pa*TgC}j~w1B4|ykIlm z#9o~rnfrYd`hfq5B0r9@8}x$#FbGb7AA*wrhW(8rMdpb4XEQZ-3JWm(OqK9@^@87} z?7q3=z8_qBr0T4R^=l$sADa}P&dpYeb$yhXs&HwNhsco^==_d}Sar9CSH@E~d-K8Z z^39zFX8o>}n=&b!S5}h3HF4JO$*#Rdhg=_NgUG(vLO*r2FWy01mO&_{ZViz{&=Jedqp?iB zl?}#ZjkDmMA3zCpc!^ed3DM{ssNP3l=y#=;G?8k~fM9^i{N`=Y+?P2g3cpR{YDs>Z{gYZ0X0vCAMW=;$dgExG6Yc=|th$lI!`ZTgj0lW7aUOOS^2LOxSJt4mq=~LHW6NJO<2j$8oUnd!kNqr@tb zNeyl5nloB9O~vVHx?y@}j@??J5a$H3ngSRm-~~npW;25bwPS@h3jHMNv_Ia>ZtB*?<`n3& zd{;X$(kZvK)wNc(uB&fo+{lVGvmGzVnEOyLt9~im71ei=l;XG{lE_0@`tiy30XMzJ z`UlcT#(p}GKnGcI$yS=do?j8qLI)R#;F`_*R?tNLKq)1&HBY|9lDoC6{;B=;2TuH> z875XWXmwdnS;?_eHh#<`7aQ^lT;uIm$hnsSdA6A%<7Z&UU5CF$sTRK*B@^5P{~q?U z;uVao?L*LE#YvrfRtBk=`OIxW2GM`vuV-i5&$?aib0%^e9JP-)pV!9HyHVoI=})0Pq3js9sc8p$ z^}`6BWu{4AdA^x`Si_^^>4K(Qn8}lp-18xeGqdOd_5@F2nZxm6q4hOQb*m6M5f@l- zikh3!X)`~cKzgs?5a++DeR@9*q=GM)X5qUz{5#bY#!Mf$LziOP#UTLi9{t{F!7+ok zC`rdc-pyy}DG{TV5bG|OO&LyQm$5omUAoglST1iOJdgcP&6-Tuqj@X}yiEn68PCb3-q-FRoXAad5{`r-$1Al|% ze|SIb!a`loRn-{Ito;oqzWe^Rg{q(5U;b9@_IAV)C{6&d}J~e zj+VJB4vVNz^x2n8A{UEjD({b_VzyvJ_b;s^rn-EmIH3yHDIm{d=X#%ZKetAc2`(nq zr$34JJc;jAQ#zY)M$O$Nq~oSUnjW|cF`-W4)!C*)S zqv@W_2LCq3%p;S$ZsquZ{->Aa|M9B2@D-NECVfp^cK&CVM!;s_UCee_D^_Vtu;1U9 z`EZ_WZmDo<{zx^Vse1#dSmg)Lea7t@>9$_Vqe(JusGuT#1QpqcA-K0yPrKyeS_<7D z-fF;pkzpov?|cKjAU{$-GXC2#x~uoD652!jQaUXb+ey^utwEoyB7boi$z*J(FQHd6 zpN8bzo*Et$e+Ka>esitVg5A zPFl*7vS=nZS*U~+rRtzj@!|qn8(M%*j#>D|l`rM-%uGs*HsS-;YMkk_q0Yv)KJQkA zmk&`tjZdr~O|PbwhLn8BOeG=tsFDQ?p1b3}=x%Y++U@Pt!AvTPUxSdf7GFA7OAXQ; zko`|dLcI<#szu_Y7|t%vRHLsmJbewxMlW;GDI-tMq{2WW>UK+P%dCZL@oasl*ipS- z+ietM;%6M@C^|0VP(PkhwFGo}ty;6CZHJQ>anY&CI&u{z<*Uu?R=<)6cOG{ zwQY=VYP>7P=vf$f`23yi@p*pcW;{TT2D`lt`Zwkjc*z8)*LeJyEqWsoic}~heq|vg N{=5FG*nm&Ue*-Y&8L0pO diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc index 545949dc9f1ff2f1c8bc72b5858babe7242ef981..7b3cfb76d56ee6cf0925267fe709b794b4bbd1bd 100644 GIT binary patch delta 264 zcmbQuag9USn3tF9jDKQ`96JNUV+JI^24p(`aq+u}${qC_Df}q{DT2)`QM{=PSxi~X zsoW_-%}h~zP##x`aEeGXa}~G=vj9*=fKh}I0NgM-=Kufz delta 168 zcmcb{F`Gl#n3tF9+>D5r>ud}Rj~S2vkk8-%#Km_eDtAP3r|>s3Me(LGWHDtir*fqT zqzE=MNAaZyMe#$$cvFN^M1W!fP(F8xXbMk?Sc-TvYm^{Vj59?dMKVPyMH(n3l)@Oy rpeZw1jPdGZcBb=;!jrEvZJ9icc^0F>WJ4B7E2.0 4d8c8573-32f0-4a62-9e62-3ce5cc680390 . - fame_hmm.py + htk_vs_kaldi.py . diff --git a/acoustic_model/convert_phoneset.py b/acoustic_model/convert_phoneset.py index 7bc39f7..d575c99 100644 --- a/acoustic_model/convert_phoneset.py +++ b/acoustic_model/convert_phoneset.py @@ -38,3 +38,9 @@ def convert_phoneset(word_list, translation_key): translation_key (dict): """ return [translation_key.get(phone, phone) for phone in word_list] + + +def phone_reduction(phones, reduction_key): + multi_character_tokenize(wo.strip(), multi_character_phones) + return [reduction_key.get(i, i) for i in phones + if not i in phones_to_be_removed] \ No newline at end of file diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py index ef0dfd4..1e262f9 100644 --- a/acoustic_model/defaultfiles.py +++ b/acoustic_model/defaultfiles.py @@ -17,6 +17,7 @@ novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi') rug_dir = r'c:\OneDrive\Research\rug' experiments_dir = os.path.join(rug_dir, 'experiments') htk_dir = os.path.join(experiments_dir, 'acoustic_model', 'fame', 'htk') +kaldi_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', '_stimmen') stimmen_dir = os.path.join(experiments_dir, 'stimmen') # data diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index 10f16cd..9d3992b 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -321,9 +321,11 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out): lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8') -def fix_single_quote(lexicon_file): - """ add '\' before all single quote at the beginning of words. - convert special characters to ascii compatible characters. +def fix_lexicon(lexicon_file): + """ fix lexicon + - add '\' before all single quote at the beginning of words. + - convert special characters to ascii compatible characters. + - add silence. Args: lexicon_file (path): lexicon file, which will be overwitten. @@ -331,6 +333,12 @@ def fix_single_quote(lexicon_file): """ lex = load_lexicon(lexicon_file) lex = lex.dropna() # remove N/A. + + # add 'sil' + row = pd.Series(['SILENCE', 'sil'], index=lex.columns) + lex = lex.append(row, ignore_index=True) + lex = lex.sort_values(by='word', ascending=True) + for i in lex[lex['word'].str.startswith('\'')].index.values: lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'') # to_csv does not work with space seperator. therefore all tabs should manually be replaced. @@ -346,10 +354,11 @@ def word2htk(word): def ipa2asr(ipa): curr_dir = os.path.dirname(os.path.abspath(__file__)) translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) - + #ipa_ = fame_asr.phone_reduction(ipa) ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) + asr_splitted = fame_asr.phone_reduction(asr_splitted) return ''.join(asr_splitted) @@ -360,5 +369,6 @@ def ipa2htk(ipa): ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) + asr_splitted = fame_asr.phone_reduction(asr_splitted) htk_splitted = convert_phoneset.convert_phoneset(asr_splitted, fame_asr.translation_key_asr2htk) return ''.join(htk_splitted) \ No newline at end of file diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py index b3d1070..7228c00 100644 --- a/acoustic_model/fame_hmm.py +++ b/acoustic_model/fame_hmm.py @@ -27,7 +27,8 @@ extract_features = 0 flat_start = 0 train_model_without_sp = 0 add_sp = 0 -train_model_with_sp = 1 +train_model_with_sp = 0 +train_model_with_sp_align_mlf = 1 @@ -75,6 +76,7 @@ if not os.path.exists(label_dir): ## training hcompv_scp_train = os.path.join(tmp_dir, 'train.scp') mlf_file_train = os.path.join(label_dir, 'train_phone.mlf') +mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf') ## train without sp niter_max = 10 @@ -102,7 +104,8 @@ if make_lexicon: # (1) Replace all tabs with single space; # (2) Put a '\' before any dictionary entry beginning with single quote #http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html - fame_functions.fix_single_quote(lexicon_htk) + print('>>> fixing the lexicon...') + fame_functions.fix_lexicon(lexicon_htk) print("elapsed time: {}".format(time.time() - timer_start)) @@ -269,11 +272,11 @@ if train_model_without_sp: fh.make_new_directory(modeln_dir) pyhtk.re_estimation( config_train, - os.path.join(modeln_dir_pre, 'macros'), os.path.join(modeln_dir_pre, hmmdefs_name), modeln_dir, hcompv_scp_train, phonelist_txt, - mlf_file=mlf_file_train) + mlf_file=mlf_file_train, + macros=os.path.join(modeln_dir_pre, 'macros')) print("elapsed time: {}".format(time.time() - timer_start)) @@ -321,7 +324,6 @@ if add_sp: ## ======================= train model with short pause ======================= if train_model_with_sp: print('==== train model with sp ====') - #for niter in range(niter_max+1, niter_max*2+1): for niter in range(20, 50): timer_start = time.time() hmm_n = 'iter' + str(niter) @@ -333,9 +335,31 @@ if train_model_with_sp: fh.make_new_directory(modeln_dir) pyhtk.re_estimation( config_train, - os.path.join(modeln_dir_pre, 'macros'), os.path.join(modeln_dir_pre, hmmdefs_name), modeln_dir, hcompv_scp_train, phonelist_txt, - mlf_file=mlf_file_train) + mlf_file=mlf_file_train, + macros=os.path.join(modeln_dir_pre, 'macros')) + print("elapsed time: {}".format(time.time() - timer_start)) + + +## ======================= train model with short pause ======================= +if train_model_with_sp_align_mlf: + print('==== train model with sp with align.mlf ====') + for niter in range(50, 60): + timer_start = time.time() + hmm_n = 'iter' + str(niter) + hmm_n_pre = 'iter' + str(niter-1) + modeln_dir = os.path.join(model1_dir, hmm_n) + modeln_dir_pre = os.path.join(model1_dir, hmm_n_pre) + + # re-estimation + fh.make_new_directory(modeln_dir) + pyhtk.re_estimation( + config_train, + os.path.join(modeln_dir_pre, hmmdefs_name), + modeln_dir, + hcompv_scp_train, phonelist_txt, + mlf_file=mlf_file_train_aligned, + macros=os.path.join(modeln_dir_pre, 'macros')) print("elapsed time: {}".format(time.time() - timer_start)) \ No newline at end of file diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py index 00c699c..c35a42f 100644 --- a/acoustic_model/htk_vs_kaldi.py +++ b/acoustic_model/htk_vs_kaldi.py @@ -11,6 +11,7 @@ import glob import numpy as np import pandas as pd +from collections import Counter #import matplotlib.pyplot as plt #from sklearn.metrics import confusion_matrix @@ -50,11 +51,14 @@ from htk import pyhtk #lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr') #lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk') -## procedure +# procedure +make_dic_file = 0 +make_HTK_files = 1 +extract_features = 0 #make_htk_dict_files = 0 #do_forced_alignment_htk = 0 #eval_forced_alignment_htk = 0 -#make_kaldi_data_files = 0 +make_kaldi_files = 0 #make_kaldi_lexicon_txt = 0 #load_forced_alignment_kaldi = 1 #eval_forced_alignment_kaldi = 1 @@ -66,13 +70,34 @@ from htk import pyhtk #sys.path.append(os.path.join(default.repo_dir, 'toolbox')) #from evaluation import plot_confusion_matrix -config_dir = os.path.join(default.htk_dir, 'config') -model_dir = os.path.join(default.htk_dir, 'model') -lattice_file = os.path.join(config_dir, 'stimmen.ltc') -#pyhtk.create_word_lattice_file( -# os.path.join(config_dir, 'stimmen.net'), -# lattice_file) -hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp') +## HTK related files. +config_dir = os.path.join(default.htk_dir, 'config') +model_dir = os.path.join(default.htk_dir, 'model') +feature_dir = os.path.join(default.htk_dir, 'mfc', 'stimmen') + +config_hcopy = os.path.join(config_dir, 'config.HCopy') + +# files to be made. +lattice_file = os.path.join(config_dir, 'stimmen.ltc') +phonelist_txt = os.path.join(config_dir, 'phonelist.txt') +stimmen_dic = os.path.join(default.htk_dir, 'lexicon', 'stimmen_recognition.dic') +hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hcopy.scp') +hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hvite.scp') +hresult_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_result.scp') + + +## Kaldi related files. +kaldi_data_dir = os.path.join(default.kaldi_dir, 'data') + +# files to be made. +wav_scp = os.path.join(kaldi_data_dir, 'test', 'wav.scp') +text_file = os.path.join(kaldi_data_dir, 'test', 'text') +utt2spk = os.path.join(kaldi_data_dir, 'test', 'utt2spk') +corpus_txt = os.path.join(kaldi_data_dir, 'local', 'corpus.txt') +lexicon_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'lexicon.txt') +nonsilence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'nonsilence_phones.txt') +silence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'silence_phones.txt') +optional_silence_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'optional_silence.txt') ## ======================= load test data ====================== @@ -85,392 +110,468 @@ df = stimmen_functions.add_row_htk(df) word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = sorted(word_list) -# pronunciation variants + +## ======================= make dic file to check pronunciation variants ====================== +# dic file should be manually modified depends on the task - recognition / forced-alignemnt. +if make_dic_file: + # for HTK. + with open(stimmen_dic, mode='wb') as f: + for word in word_list: + df_ = df[df['word']==word] + pronunciations = list(np.unique(df_['htk'])) + pronunciations_ = [word.upper() + ' sil ' + ' '.join(convert_phoneset.split_word( + htk, fame_asr.multi_character_phones_htk)) + ' sil' + for htk in pronunciations] + f.write(bytes('\n'.join(pronunciations_) + '\n', 'ascii')) + f.write(bytes('SILENCE sil\n', 'ascii')) + + # for Kaldi. + fh.make_new_directory(os.path.join(kaldi_data_dir, 'local', 'dict')) + with open(lexicon_txt, mode='wb') as f: + f.write(bytes('!SIL sil\n', 'utf-8')) + f.write(bytes(' spn\n', 'utf-8')) + for word in word_list: + df_ = df[df['word']==word] + pronunciations = list(np.unique(df_['asr'])) + pronunciations_ = [word.lower() + ' ' + ' '.join(convert_phoneset.split_word( + asr, fame_asr.multi_character_phones)) + for asr in pronunciations] + f.write(bytes('\n'.join(pronunciations_) + '\n', 'utf-8')) + + +## ======================= test data for recognition ====================== +# only target pronunciation variants. +df_rec = pd.DataFrame(index=[], columns=list(df.keys())) for word in word_list: - df_ = df[df['word']==word] - print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))) + variants = [htk.replace(' ', '') + for htk in stimmen_functions.load_pronunciations(word.upper(), stimmen_dic)] + df_ = df[df['word'] == word] + for index, row in df_.iterrows(): + if row['htk'] in variants: + df_rec = df_rec.append(row, ignore_index=True) -#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') -#output = pyhtk.recognition( -# os.path.join(default.htk_dir, 'config', 'config.rec', -# lattice_file, -# os.path.join(model_dir, 'hmm1', 'iter13'), -# dictionary_file, -# os.path.join(config_dir, 'phonelist.txt'), -# hvite_scp) +## ======================= make files required for HTK ====================== +if make_HTK_files: + # make a word lattice file. + pyhtk.create_word_lattice_file( + os.path.join(config_dir, 'stimmen.net'), + lattice_file) - #pyhtk.create_label_file( - # row['word'], - # os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) + # extract features. + with open(hcopy_scp, 'wb') as f: + filelist = [os.path.join(stimmen_test_dir, filename) + '\t' + + os.path.join(feature_dir, os.path.basename(filename).replace('.wav', '.mfc')) + for filename in df['filename']] + f.write(bytes('\n'.join(filelist), 'ascii')) + pyhtk.wav2mfc(config_hcopy, hcopy_scp) + + # make label files. + for index, row in df.iterrows(): + filename = row['filename'].replace('.wav', '.lab') + label_file = os.path.join(feature_dir, filename) + with open(label_file, 'wb') as f: + label_string = 'START\n' + row['word'].upper() + '\nEND\n' + f.write(bytes(label_string, 'ascii')) + + +## ======================= make files required for Kaldi ======================= +if make_kaldi_files: + fh.make_new_directory(os.path.join(kaldi_data_dir, 'test')) + fh.make_new_directory(os.path.join(kaldi_data_dir, 'test', 'local')) + fh.make_new_directory(os.path.join(kaldi_data_dir, 'conf')) + + # remove previous files. + if os.path.exists(wav_scp): + os.remove(wav_scp) + if os.path.exists(text_file): + os.remove(text_file) + if os.path.exists(utt2spk): + os.remove(utt2spk) + + f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n') + f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n') + f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n') + + # make wav.scp, text, and utt2spk files. + for i, row in df_rec.iterrows(): + filename = row['filename'] + print('=== {0}: {1} ==='.format(i, filename)) + + wav_file = os.path.join(stimmen_test_dir, filename) + #if os.path.exists(wav_file): + speaker_id = 'speaker_' + str(i).zfill(4) + utterance_id = filename.replace('.wav', '') + utterance_id = utterance_id.replace(' ', '_') + utterance_id = speaker_id + '-' + utterance_id + + # output + f_wav_scp.write('{0} {1}\n'.format( + utterance_id, + wav_file.replace('c:/', '/mnt/c/').replace('\\', '/'))) # convert path to unix format. + f_text_file.write('{0}\t{1}\n'.format(utterance_id, df_rec['word'][i].lower())) + f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id)) + + f_wav_scp.close() + f_text_file.close() + f_utt2spk.close() + + with open(corpus_txt, 'wb') as f: + f.write(bytes('\n'.join([word.lower() for word in word_list]) + '\n', 'utf-8')) + + with open(nonsilence_phones_txt, 'wb') as f: + f.write(bytes('\n'.join(fame_asr.phoneset_short) + '\n', 'utf-8')) + + with open(silence_phones_txt, 'wb') as f: + f.write(bytes('sil\nspn\n', 'utf-8')) + + with open(optional_silence_txt, 'wb') as f: + f.write(bytes('sil\n', 'utf-8')) + + with open(os.path.join(kaldi_data_dir, 'conf', 'decode.config'), 'wb') as f: + f.write(bytes('first_beam=10.0\n', 'utf-8')) + f.write(bytes('beam=13.0\n', 'utf-8')) + f.write(bytes('lattice_beam=6.0\n', 'utf-8')) + + with open(os.path.join(kaldi_data_dir, 'conf', 'mfcc.conf'), 'wb') as f: + f.write(bytes('--use-energy=false', 'utf-8')) + + +## ======================= recognition ====================== + +listdir = glob.glob(os.path.join(feature_dir, '*.mfc')) +with open(hvite_scp, 'wb') as f: + f.write(bytes('\n'.join(listdir), 'ascii')) + +with open(hresult_scp, 'wb') as f: + f.write(bytes('\n'.join(listdir).replace('.mfc', '.rec'), 'ascii')) + + +# calculate result +performance = np.zeros((1, 2)) +for niter in range(1, 50): + output = pyhtk.recognition( + os.path.join(config_dir, 'config.rec'), + lattice_file, + os.path.join(default.htk_dir, 'model', 'hmm1', 'iter' + str(niter), 'hmmdefs'), + stimmen_dic, phonelist_txt, hvite_scp) + + output = pyhtk.calc_recognition_performance( + stimmen_dic, hresult_scp) + per_sentence, per_word = pyhtk.load_recognition_output_all(output) + performance_ = np.array([niter, per_sentence['accuracy']]).reshape(1, 2) + performance = np.r_[performance, performance_] + print('{0}: {1}[%]'.format(niter, per_sentence['accuracy'])) -## ======================= make a HTK dic file ====================== -#if make_htk_dic_file: -# output_type = 3 -dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic') -#for word in word_list: -word = word_list[2] -# pronunciation variant of the target word. -pronunciations = df_test['asr'][df_test['word'].str.match(word)] - # make dic file. - #am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type) - ## ======================= forced alignment using HTK ======================= if do_forced_alignment_htk: - - #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: - for hmm_num in [256, 512, 1024]: - hmm_num_str = str(hmm_num) - acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs') + + #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: + for hmm_num in [256, 512, 1024]: + hmm_num_str = str(hmm_num) + acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs') - predictions = pd.DataFrame({'filename': [''], - 'word': [''], - 'xsampa': [''], - 'ipa': [''], - 'famehtk': [''], - 'prediction': ['']}) - for i, filename in enumerate(df['filename']): - print('=== {0}/{1} ==='.format(i, len(df))) - if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): - wav_file = os.path.join(wav_dir, filename) - if os.path.exists(wav_file): - word = df['word'][i] - WORD = word.upper() - fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str) - - #if not os.path.exists(fa_file): - # make label file. - label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab')) - with open(label_file, 'w') as f: - lines = f.write(WORD) + predictions = pd.DataFrame({'filename': [''], + 'word': [''], + 'xsampa': [''], + 'ipa': [''], + 'famehtk': [''], + 'prediction': ['']}) + for i, filename in enumerate(df['filename']): + print('=== {0}/{1} ==='.format(i, len(df))) + if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): + wav_file = os.path.join(wav_dir, filename) + if os.path.exists(wav_file): + word = df['word'][i] + WORD = word.upper() + fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str) + + #if not os.path.exists(fa_file): + # make label file. + label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab')) + with open(label_file, 'w') as f: + lines = f.write(WORD) - htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') + htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') - pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, - default.phonelist, acoustic_model) - os.remove(label_file) + pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, + default.phonelist, acoustic_model) + os.remove(label_file) - prediction = am_func.read_fileFA(fa_file) + prediction = am_func.read_fileFA(fa_file) - print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction)) - else: - prediction = '' - print('!!!!! file not found.') + print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction)) + else: + prediction = '' + print('!!!!! file not found.') - line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i) - predictions = predictions.append(line) - else: - prediction = '' - print('!!!!! invalid entry.') + line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i) + predictions = predictions.append(line) + else: + prediction = '' + print('!!!!! invalid entry.') - predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) + predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) -## ======================= make files which is used for forced alignment by Kaldi ======================= -if make_kaldi_data_files: - wav_scp = os.path.join(kaldi_data_dir, 'wav.scp') - text_file = os.path.join(kaldi_data_dir, 'text') - utt2spk = os.path.join(kaldi_data_dir, 'utt2spk') - - # remove previous files. - if os.path.exists(wav_scp): - os.remove(wav_scp) - if os.path.exists(text_file): - os.remove(text_file) - if os.path.exists(utt2spk): - os.remove(utt2spk) - - f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n') - f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n') - f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n') - - # make wav.scp, text, and utt2spk files. - for i in df.index: - filename = df['filename'][i] - print('=== {0}: {1} ==='.format(i, filename)) - - #if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): - wav_file = os.path.join(wav_dir, filename) - if os.path.exists(wav_file): - speaker_id = 'speaker_' + str(i).zfill(4) - utterance_id = filename.replace('.wav', '') - utterance_id = utterance_id.replace(' ', '_') - utterance_id = speaker_id + '-' + utterance_id - - # wav.scp file - wav_file_unix = wav_file.replace('\\', '/') - wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/') - - f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix)) - - # text file - word = df['word'][i].lower() - f_text_file.write('{0}\t{1}\n'.format(utterance_id, word)) - - # utt2spk - f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id)) - - f_wav_scp.close() - f_text_file.close() - f_utt2spk.close() ## ======================= make lexicon txt which is used by Kaldi ======================= if make_kaldi_lexicon_txt: - option_num = 6 + option_num = 6 - # remove previous file. - if os.path.exists(lexicon_txt): - os.remove(lexicon_txt) - lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt') - if os.path.exists(lexiconp_txt): - os.remove(lexiconp_txt) - - # output lexicon.txt - f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n') - pronvar_list_all = [] - for word in word_list: + # remove previous file. + if os.path.exists(lexicon_txt): + os.remove(lexicon_txt) + lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt') + if os.path.exists(lexiconp_txt): + os.remove(lexiconp_txt) + + # output lexicon.txt + f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n') + pronvar_list_all = [] + for word in word_list: - # pronunciation variant of the target word. - pronunciation_variants = df['ipa'][df['word'].str.match(word)] + # pronunciation variant of the target word. + pronunciation_variants = df['ipa'][df['word'].str.match(word)] - c = Counter(pronunciation_variants) - total_num = sum(c.values()) + c = Counter(pronunciation_variants) + total_num = sum(c.values()) - #with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f: - # for key in c.keys(): - # f.write("{0},{1}\n".format(key,c[key])) + #with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f: + # for key in c.keys(): + # f.write("{0},{1}\n".format(key,c[key])) - for key, value in c.most_common(option_num): - # make possible pronunciation variant list. - pronvar_list = am_func.fame_pronunciation_variant(key) + for key, value in c.most_common(option_num): + # make possible pronunciation variant list. + pronvar_list = am_func.fame_pronunciation_variant(key) - for pronvar_ in pronvar_list: - split_ipa = convert_phone_set.split_fame_ipa(pronvar_) - pronvar_out = ' '.join(split_ipa) - pronvar_list_all.append([word, pronvar_out]) + for pronvar_ in pronvar_list: + split_ipa = convert_phone_set.split_fame_ipa(pronvar_) + pronvar_out = ' '.join(split_ipa) + pronvar_list_all.append([word, pronvar_out]) - pronvar_list_all = np.array(pronvar_list_all) - pronvar_list_all = np.unique(pronvar_list_all, axis=0) + pronvar_list_all = np.array(pronvar_list_all) + pronvar_list_all = np.unique(pronvar_list_all, axis=0) - - # output - f_lexicon_txt.write('\tSPN\n') - for line in pronvar_list_all: - f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1])) + + # output + f_lexicon_txt.write('\tSPN\n') + for line in pronvar_list_all: + f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1])) - f_lexicon_txt.close() + f_lexicon_txt.close() ## ======================= load kaldi forced alignment result ======================= if load_forced_alignment_kaldi: - phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt') - merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt') - - #filenames = np.load(data_dir + '\\filenames.npy') - #words = np.load(data_dir + '\\words.npy') - #pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy') - #pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy') - #word_list = np.unique(words) + phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt') + merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt') + + #filenames = np.load(data_dir + '\\filenames.npy') + #words = np.load(data_dir + '\\words.npy') + #pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy') + #pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy') + #word_list = np.unique(words) - # load the mapping between phones and ids. - with open(phones_txt, 'r', encoding="utf-8") as f: - mapping_phone2id = f.read().split('\n') + # load the mapping between phones and ids. + with open(phones_txt, 'r', encoding="utf-8") as f: + mapping_phone2id = f.read().split('\n') - phones = [] - phone_ids = [] # ID of phones - for m in mapping_phone2id: - m = m.split(' ') - if len(m) > 1: - phones.append(m[0]) - phone_ids.append(int(m[1])) + phones = [] + phone_ids = [] # ID of phones + for m in mapping_phone2id: + m = m.split(' ') + if len(m) > 1: + phones.append(m[0]) + phone_ids.append(int(m[1])) - # load the result of FA. - with open(merged_alignment_txt, 'r') as f: - lines = f.read() - lines = lines.split('\n') + # load the result of FA. + with open(merged_alignment_txt, 'r') as f: + lines = f.read() + lines = lines.split('\n') - predictions = pd.DataFrame({'filename': [''], - 'word': [''], - 'xsampa': [''], - 'ipa': [''], - 'famehtk': [''], - 'prediction': ['']}) - #fa_filenames = [] - #fa_pronunciations = [] - utterance_id_ = '' - pronunciation = [] - for line in lines: - line = line.split(' ') - if len(line) == 5: - utterance_id = line[0] - if utterance_id == utterance_id_: - phone_id = int(line[4]) - #if not phone_id == 1: - phone_ = phones[phone_ids.index(phone_id)] - phone = re.sub(r'_[A-Z]', '', phone_) - if not phone == 'SIL': - pronunciation.append(phone) - else: - filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_) - prediction = ''.join(pronunciation) - df_ = df[df['filename'].str.match(filename)] - df_idx = df_.index[0] - prediction_ = pd.Series([#filename, - #df_['word'][df_idx], - #df_['xsampa'][df_idx], - #df_['ipa'][df_idx], - #df_['famehtk'][df_idx], - df_.iloc[0,1], - df_.iloc[0,3], - df_.iloc[0,4], - df_.iloc[0,2], - df_.iloc[0,0], - prediction], - index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], - name=df_idx) - predictions = predictions.append(prediction_) - #fa_filenames.append() - #fa_pronunciations.append(' '.join(pronunciation)) - pronunciation = [] + predictions = pd.DataFrame({'filename': [''], + 'word': [''], + 'xsampa': [''], + 'ipa': [''], + 'famehtk': [''], + 'prediction': ['']}) + #fa_filenames = [] + #fa_pronunciations = [] + utterance_id_ = '' + pronunciation = [] + for line in lines: + line = line.split(' ') + if len(line) == 5: + utterance_id = line[0] + if utterance_id == utterance_id_: + phone_id = int(line[4]) + #if not phone_id == 1: + phone_ = phones[phone_ids.index(phone_id)] + phone = re.sub(r'_[A-Z]', '', phone_) + if not phone == 'SIL': + pronunciation.append(phone) + else: + filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_) + prediction = ''.join(pronunciation) + df_ = df[df['filename'].str.match(filename)] + df_idx = df_.index[0] + prediction_ = pd.Series([#filename, + #df_['word'][df_idx], + #df_['xsampa'][df_idx], + #df_['ipa'][df_idx], + #df_['famehtk'][df_idx], + df_.iloc[0,1], + df_.iloc[0,3], + df_.iloc[0,4], + df_.iloc[0,2], + df_.iloc[0,0], + prediction], + index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], + name=df_idx) + predictions = predictions.append(prediction_) + #fa_filenames.append() + #fa_pronunciations.append(' '.join(pronunciation)) + pronunciation = [] - utterance_id_ = utterance_id - predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) + utterance_id_ = utterance_id + predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) ## ======================= evaluate the result of forced alignment ======================= if eval_forced_alignment_htk: - htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short') + htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short') - compare_hmm_num = 1 + compare_hmm_num = 1 - if compare_hmm_num: - f_result = open(os.path.join(result_dir, 'result.csv'), 'w') - f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n") + if compare_hmm_num: + f_result = open(os.path.join(result_dir, 'result.csv'), 'w') + f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n") - for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: - #for hmm_num in [256]: - hmm_num_str = str(hmm_num) - if compare_hmm_num: - f_result.write("{},".format(hmm_num_str)) + for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: + #for hmm_num in [256]: + hmm_num_str = str(hmm_num) + if compare_hmm_num: + f_result.write("{},".format(hmm_num_str)) - #match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy') - #prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy')) - #prediction = pd.Series(prediction, index=df.index, name='prediction') - #result = pd.concat([df, prediction], axis=1) - result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) + #match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy') + #prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy')) + #prediction = pd.Series(prediction, index=df.index, name='prediction') + #result = pd.concat([df, prediction], axis=1) + result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) - # load pronunciation variants - for word in word_list: - htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') - with open(htk_dict_file, 'r') as f: - lines = f.read().split('\n')[:-1] - pronunciation_variants = [line.split('\t')[1] for line in lines] + # load pronunciation variants + for word in word_list: + htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') + with open(htk_dict_file, 'r') as f: + lines = f.read().split('\n')[:-1] + pronunciation_variants = [line.split('\t')[1] for line in lines] - # see only words which appears in top 3. - result_ = result[result['word'].str.match(word)] - result_ = result_[result_['famehtk'].isin(pronunciation_variants)] + # see only words which appears in top 3. + result_ = result[result['word'].str.match(word)] + result_ = result_[result_['famehtk'].isin(pronunciation_variants)] - match_num = sum(result_['famehtk'] == result_['prediction']) - total_num = len(result_) + match_num = sum(result_['famehtk'] == result_['prediction']) + total_num = len(result_) - print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100)) - if compare_hmm_num: - f_result.write("{0},{1},".format(match_num, total_num)) - else: - # output confusion matrix - cm = confusion_matrix(result_['famehtk'], result_['prediction']) + print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100)) + if compare_hmm_num: + f_result.write("{0},{1},".format(match_num, total_num)) + else: + # output confusion matrix + cm = confusion_matrix(result_['famehtk'], result_['prediction']) - plt.figure() - plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) - plt.savefig(result_dir + '\\cm_' + word + '.png') + plt.figure() + plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) + plt.savefig(result_dir + '\\cm_' + word + '.png') - if compare_hmm_num: - f_result.write('\n') + if compare_hmm_num: + f_result.write('\n') - if compare_hmm_num: - f_result.close() + if compare_hmm_num: + f_result.close() ## ======================= evaluate the result of forced alignment of kaldi ======================= if eval_forced_alignment_kaldi: - result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) + result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) - f_result = open(os.path.join(result_dir, 'result.csv'), 'w') - f_result.write("word,total,valid,match,[%]\n") + f_result = open(os.path.join(result_dir, 'result.csv'), 'w') + f_result.write("word,total,valid,match,[%]\n") - # load pronunciation variants - with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f: - lines = f.read().split('\n')[:-1] - pronunciation_variants_all = [line.split('\t') for line in lines] + # load pronunciation variants + with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f: + lines = f.read().split('\n')[:-1] + pronunciation_variants_all = [line.split('\t') for line in lines] - word_list = np.delete(word_list, [0], 0) # remove 'Oog' - for word in word_list: + word_list = np.delete(word_list, [0], 0) # remove 'Oog' + for word in word_list: - # load pronunciation variant of the word. - pronunciation_variants = [] - for line in pronunciation_variants_all: - if line[0] == word.lower(): - pronunciation_variants.append(line[1].replace(' ', '')) + # load pronunciation variant of the word. + pronunciation_variants = [] + for line in pronunciation_variants_all: + if line[0] == word.lower(): + pronunciation_variants.append(line[1].replace(' ', '')) - # see only words which appears in top 3. - result_ = result[result['word'].str.match(word)] - result_tolerant = pd.DataFrame({ - 'filename': [''], - 'word': [''], - 'xsampa': [''], - 'ipa': [''], - 'prediction': [''], - 'match': ['']}) + # see only words which appears in top 3. + result_ = result[result['word'].str.match(word)] + result_tolerant = pd.DataFrame({ + 'filename': [''], + 'word': [''], + 'xsampa': [''], + 'ipa': [''], + 'prediction': [''], + 'match': ['']}) - for i in range(0, len(result_)): - line = result_.iloc[i] + for i in range(0, len(result_)): + line = result_.iloc[i] - # make a list of all possible pronunciation variants of ipa description. - # i.e. possible answers from forced alignment. - ipa = line['ipa'] - pronvar_list = [ipa] - pronvar_list_ = am_func.fame_pronunciation_variant(ipa) - if not pronvar_list_ is None: - pronvar_list += list(pronvar_list_) + # make a list of all possible pronunciation variants of ipa description. + # i.e. possible answers from forced alignment. + ipa = line['ipa'] + pronvar_list = [ipa] + pronvar_list_ = am_func.fame_pronunciation_variant(ipa) + if not pronvar_list_ is None: + pronvar_list += list(pronvar_list_) - # only focus on pronunciations which can be estimated from ipa. - if len(set(pronvar_list) & set(pronunciation_variants)) > 0: - if line['prediction'] in pronvar_list: - ismatch = True - else: - ismatch = False + # only focus on pronunciations which can be estimated from ipa. + if len(set(pronvar_list) & set(pronunciation_variants)) > 0: + if line['prediction'] in pronvar_list: + ismatch = True + else: + ismatch = False - line_df = pd.DataFrame(result_.iloc[i]).T - df_idx = line_df.index[0] - result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'], - line_df.loc[df_idx, 'word'], - line_df.loc[df_idx, 'xsampa'], - line_df.loc[df_idx, 'ipa'], - line_df.loc[df_idx, 'prediction'], - ismatch], - index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'], - name=df_idx) - result_tolerant = result_tolerant.append(result_tolerant_) - # remove the first entry (dummy) - result_tolerant = result_tolerant.drop(0, axis=0) + line_df = pd.DataFrame(result_.iloc[i]).T + df_idx = line_df.index[0] + result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'], + line_df.loc[df_idx, 'word'], + line_df.loc[df_idx, 'xsampa'], + line_df.loc[df_idx, 'ipa'], + line_df.loc[df_idx, 'prediction'], + ismatch], + index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'], + name=df_idx) + result_tolerant = result_tolerant.append(result_tolerant_) + # remove the first entry (dummy) + result_tolerant = result_tolerant.drop(0, axis=0) - total_num = len(result_) - valid_num = len(result_tolerant) - match_num = np.sum(result_tolerant['match']) + total_num = len(result_) + valid_num = len(result_tolerant) + match_num = np.sum(result_tolerant['match']) - print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num)) - f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100)) + print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num)) + f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100)) - f_result.close() - ## output confusion matrix - #cm = confusion_matrix(result_['ipa'], result_['prediction']) + f_result.close() + ## output confusion matrix + #cm = confusion_matrix(result_['ipa'], result_['prediction']) - #plt.figure() - #plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) - #plt.savefig(result_dir + '\\cm_' + word + '.png') + #plt.figure() + #plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) + #plt.savefig(result_dir + '\\cm_' + word + '.png') diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py index b11359b..6165d5c 100644 --- a/acoustic_model/phoneset/fame_asr.py +++ b/acoustic_model/phoneset/fame_asr.py @@ -68,14 +68,21 @@ phoneset = [ # the phones which seldom occur are replaced with another more popular phones. # replacements are based on the advice from Martijn Wieling. reduction_key = { - 'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g' + 'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g', + # aki added because this is used in stimmen_project. + 'ɔ̈:':'ɔ:' } # already removed beforehand in phoneset. Just to be sure. -phones_to_be_removed = ['ú', 's:', 'ɔ̈:'] +phones_to_be_removed = ['ú', 's:'] def phone_reduction(phones): + """ + Args: + phones (list): list of phones. + """ return [reduction_key.get(i, i) for i in phones if not i in phones_to_be_removed] + phoneset_short = list(set(phone_reduction(phoneset))) phoneset_short.sort() @@ -96,7 +103,8 @@ translation_key_asr2htk = { 'ŋ': 'ng', # refer to Xsampa. - 'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe', + 'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe', + #'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen. 'ɛ': 'E', 'ɛ:': 'E:', 'ɪ': 'I', 'ɪ:': 'I:', diff --git a/acoustic_model/stimmen_functions.py b/acoustic_model/stimmen_functions.py index a272d42..cfdac62 100644 --- a/acoustic_model/stimmen_functions.py +++ b/acoustic_model/stimmen_functions.py @@ -81,3 +81,25 @@ def add_row_asr(df): for index, row in df.iterrows(): asr.append(fame_functions.ipa2asr(row['ipa'])) return df.assign(asr=asr) + + +def load_pronunciations(WORD, htk_dic): + """ load pronunciation variants from HTK dic file. + + Args: + WORD (str): word in capital letters. + htk_dic (path): HTK dict file. + + Returns: + (pronunciations) (list): pronunciation variants of WORD. + + Notes: + Because this function loads all contents from htk_dic file, + it is not recommended to use for large lexicon. + + """ + with open(htk_dic) as f: + lines = f.read().replace(' sil', '') + lines = lines.split('\n') + return [' '.join(line.split(' ')[1:]) + for line in lines if line.split(' ')[0]==WORD] \ No newline at end of file diff --git a/acoustic_model/stimmen_test.py b/acoustic_model/stimmen_test.py index 60e96eb..93546ca 100644 --- a/acoustic_model/stimmen_test.py +++ b/acoustic_model/stimmen_test.py @@ -2,8 +2,9 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') import sys import shutil +from collections import Counter -#import numpy as np +import numpy as np import pandas as pd import defaultfiles as default @@ -62,3 +63,18 @@ for ipa in df['ipa']: if ':' in ipa_splitted: print(ipa_splitted) + +## check pronunciation variants +df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir) +df_clean = stimmen_functions.add_row_asr(df_clean) +df_clean = stimmen_functions.add_row_htk(df_clean) + +for word in word_list: +#word = word_list[1] + df_ = df_clean[df_clean['word']==word] + c = Counter(df_['htk']) + pronunciations = dict() + for key, value in zip(c.keys(), c.values()): + if value > 3: + pronunciations[key] = value + print(pronunciations) \ No newline at end of file