From 8f89f60538c5dba41e713c9a02e7dbcf6b7461af Mon Sep 17 00:00:00 2001 From: yemaozi88 <428968@gmail.com> Date: Fri, 8 Feb 2019 14:10:32 +0100 Subject: [PATCH] dataset is made. --- .vs/acoustic_model/v15/.suo | Bin 106496 -> 102400 bytes acoustic_model/fame_functions.py | 10 ++++ acoustic_model/htk_vs_kaldi.py | 85 ++++++++-------------------- acoustic_model/stimmen_functions.py | 49 +++++++++++++++- acoustic_model/stimmen_test.py | 31 +++++----- 5 files changed, 97 insertions(+), 78 deletions(-) diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index d1feede8ad1b869ad42871b1c365587eef4327fc..649e0b19014b39935d8f606a8be47e0e105fde18 100644 GIT binary patch delta 6370 zcmeI03slrq9>?eR2c`iLM?^(l!iYG+h{OA#!k~Z(h(MZ;KtKVdPlu7Y{p~77A znx?@-rB+sqE7qpsBeR#9wZ-mwJmzU@dx}<@?yj=mI}c@T>(sXG>Fn8i&WHQE_kaKQ zfA8=6yN^NNCD*sgo5v6CCzHwS!Oh;@USu)?17FDWlw30AEuH+_k#}i6YV?(?MyjE= z**DT^s$tc3wF9@ITCK$&Bu8+AB{&SN9|mg@@CPm+5d;JD-s_HB0fvJcD0^A!#WRD| zu0^f_o**7f1+#$LptX{Seg-{?yb{7Z);$aQ)FfbHs25OK5 zoPc;P(mdG7Kn^^t^>Nl*JYN8NY@f2I3qm;>6w$?=ao6I z=K}9Ob)xkOlzpvr&dA4s0#FDR0ntXJV%YbA`#}jP!&joqVN?Rq-~nqc>}sn$26+sK z0+C=P7zNnhsnkW6(zazEWS)+S@5(%S2dZr{i#|o_k2)csn5qu8s2hfX)C;A4sWyqIX?G5Y$$1ufOlF7CL z=H=vHZ-60&CD;P{VQZNq-v%B5+rbX-DA);hfyY2AXah#@IA{k?fG5FIU^mzU_JXIu zGvHb991!1oo*>H$D0Bc5*bjaK4uDQ@5WEOp0>1?>gG1mjI0CxBQE&_#2Pc5&_$2aI z!71>XweAh%Z-Te1Wy_>lvQv3WX@=27GB~;Y-!r#D;byPRGfGJ~$F-8qvMSelI>Wxf z-&a`Gp!$B};mTQJp+ddhAd)&%*X(FNyFRe?ha0orkn5sdXs>ilX8LPb4bkhYD&ir7 zyNVg-s1#-BunMqE@2gp*&vZ6vUIl@4R5+3+C`rTSe4&(TwM-wR=dHEmVsc5jN(LWa=DS{P zFP6B)SL@Bba+nF%w6W{YqzKPBG@*2oRX3)x#73>dI+WJ2>zTE5+Ei6=i1?^5J1kOc||J zm_@)JQ*`txGR}}fgwI!7`?2|bEz0XaqqSa0-DI`l#&Oh&!-451R?@32nWG#};n8aV zkN(K?E-KdU;$@0UKSo9`$yg#8(sB}CeaG>u{LA*I8^<4edF@v?y_*j8W^Iv>B9;y zxzQl~wAWR(*K5#Ct+lHa$X8lzJ(+Ht9m=)y5u&;xIAnO>u0XG(lZ$qJ_H4$~ zvUBr=L!1j`MY4P>@eLDiEW=S4X`P#X51>uAF@>S5P6^inXp-q>riP1hfOLsE!S zTtu=MrlH;5bZwi$jwPoc0H++J8Ae07_RLc+9VuBFQ?SO^tVP3HYoHxBr?ZHE+w!#K zez?pO6c*)Il~m?dE-sxE6BS%pSW#I~SXo)TWMM^xaD&FsE;pqg?&xRSDjE0L8=FXO zj@Oko@0&x?#jdzCpCtqZNm`9bzp;x*%UU+qIfq4rYGN#MEf6oT0UH(UP;+?y(1;LC zNZ1{r@546}DOq8&+UZv4>u-OjS3_=6U9F)-FKM+*cdx%^c(f+OxZ{b_B>PG!B%2#ck4{L3wpc$J-9rIICaGDiU(}-9BjN~Z#*qwtQ~(2 zr+!Ohi*c=Af|ZiZ;y*cRd{5$}paonwgPr+O%?mVib)fhiVoCm8Suo^0Er`Nd?w8s8 zNH7gxv#zQ;gM*3Fj~Or52atyM44}`L-&fweHGt&u{EBic9hMN_&UFb?$G5nVrsI5< z!}rXg<65}GokJJfXE=Yen4G!JmAah7)j`~{9b`Ns2=Ri!;+TtKZ++e*;vkBR{j&7H zVA2uKRFao%!pctl5Yl5NKfwus2Rx{YcuNShdFyjE_e-OZog=--6Wyq(z~-Xdzd(s51_a?RbQvB2qAs2^TQT*VQ$+Gho zEUl`z9T)PXh$wC33ZdJGe~t;R_a(L3u4a+PnuCAx-_~#K1hFt}{=4v^dg8w1KJLN> zbD76`)6@|*4{EjAG2m8OQKz*nP2NT{n0I+&7UGcU(UHr~jHF1?+5HcMMU+@OoqJR? z?5D?pyjqiA7#hWOgQ(Y`PH4pwoJqy8TX-mDpRwrA{H{hylP8(Q9@>$e;G{vMIEYrN8fszU=toDfBFN%cXLDDuse+6~`-OT$h3oHKbBDS1zPF zb~@UbuS_L5-!ct*LufokYW{rsG)(ST5?*diBQJKOTh1@0q1nY$D&dvsWT0iDzX%?j zj)r(IlAoPU%Pk{L<(HCZ#xLLn%*&nEZ=@K$at6-MU*F8D#V;BU$zYb4_hisYi<_Ig zo%z`e_^+Xe$~!M+QZ~_({7Nz{8kBJA03Nc4cJPkn2-0@*QeZ3I!EeD)YCKf-S$KwOOF(<^)>NF{ zp;O8C=jyWU&N2V$yz0TTs%awcu7Tv0E1M z9!$yNapDD`BRuNLPuG!`U8^jS(;$i$n`D7d*4*PpQtq?FRA{o2ZOr{K`BUE)T=?85RC$iK2dO?-ecHi@XKum};^6_!L0bTt~A zNEP-Pu8D~;o3vrlX(nMK*ffuoJeuZVv}2phG|iBXowhSXjgz!$)6nnk{!5sNO_S7S zI=#-vIsg67e;?<3=iKYMk4Cut5#1$m$tsmf4SX>=Itrm8Ap6;>sZwL&4-61$Vz5xt;hoSID>51~1~>%V3fu;W z_oa|YfEh>w5&$jG4#We(CI|9i;6{~dhiVp-WxxzTyzGFC1f~Jo0I{~&zyv_N7wr+~ zselD&1H_`IKt2Sd1J?s_fD8!x+aZN_gr2C#vVhDT9nFQZPU*~toTTW&ShNZ}F9Zxq z|Gb{6$>9ZfsP3tnvSz%G?m*{qV2xsFf{X>`12$kiumPw6Y5_a25fHO9D6&zJqOTbf zm|aT8-Q!+lDS8oPCNLf71SSHz0QNaKqlYI?@BR#dUy6>a1pX+xLxC5QiKanchyY?^ zC3qHrp96^ZBAB7T{}6gPUXMBsk}j0Ij8iNK3Vw?Kvh z?}RP_e;pufL{N)?13)P7P0BkF`1=)I1U^d97b#MF9}ZmvJ`WIqpC>3M0$&Lf0>VfH z{$@bz{3gipfnT85EP>1bLSYr(iN@p!DHDTg@NZpD)x5>^%2#(PaueicMbCm9AHNo* zf1)Dqg50X;JJ?4Pay4RI0xT&ijhVH2wn$sgtlAfPoL!zM!`Hp`>LjmT@)%s?VXcN8 zbT9i1e@j?v+zwqU=J=i1I-0P{koqeYky=E9?SCbMxfXVL&62lL^2jMHC3Pk%k2Nt@ z>hcLBHYx%YF#9z1ltv8ItaNC1Q7~M1_Kg;h)SqxEO!GFLMDwO!eKir=@$!R7zAN)9 zi0PeKlZ|l<6__L_B@9kupG<}P((mEIhQnNG?+**HurCHZ}Nfn81>l{DnL zGQFLsl(o*-=gpU*7o!k-BB@kOAn$|3zF3&!lNb5PTP$%YpJXOm&&+wp*mu|#ZhUm{ zA5RS&9X=q+N`Mh!ho8hCQNvJ+P#yQO(i>m#CP@8F-DFx?{#xFw_Zp`i89DO)l-e2k z@gJQ9sb_-m_kb%I$e_6(-cQ(x^c$A$gxNjH@Ou<_FXUc81ouA3nLrmHrk@HqSJ8VE z>47{bbY{`!GWQY}y@EyYpERLu->L&LV5%;C;9ihf*viRS(QbySQi+K}ll zOPW+Is%(`LWwf|G`LW2@xaT(;vVN@hJs!qR95Jx%FYofMl2}EV-lsouoC@B1|3v*W z_qk-#^UnEUFO-U092-uDp&L82r4u_n`^OLRYtW_&9goB1zkwbaq+aDxZBkZx+<#=I@3ECTi9Ml9@|7(cqW%<@ z5^~Q17v1e^U9gY5H%a}ODUvVcwqc5IP{>nVw|=vIowMH2WN)#%SgdoNwkS8#6l9o< z83&T%T@FX%2FKRwmYU`Ur>iB~+19bmXk>1upIKg>rG>R(zEzmdu5&ck*lX9VZ*17u zwAtR|s{Trcv#s8>Nlc~7$tlV-2Ity|j}6{xDK@mh-HIpo^;Au*5-th7=vH(F8H)S^ z&;?t1A-SSI2q~Cz7znXOFy>LVQ;3S)| z^eq?u*2YHj4Ji?rzdQcxknP@AzB!sVA>|ZePJZ6vWZmRdqP; zT2+lImv3|zC0LVa~u&G;)ssl=2i#bgwd+??m*rr%? zLT*=dH{`b)GEgGLkiN4mNfF*%l6S4--7fX-{*<(0woz;vHz(g{4N0E;ezx;s1{>N^ zl4LS{8T2}s%(}J3-dx%2sB37nGx>m9ClVB8B5=g856>qiCgOl_wk;${)zIQ-0)^(_ z{3|vVZ!>%EMIl|pEm4%oT=Md8YQT4a$M9RRvL1Fhbq-4{N@cHFjNx7Ad5N`IZq|s? zK%5Ms)`ssz_g;3HjnZ*;`QT`zm>$hH`Ych}tm+m=D=1mrYHQF*7HW8CZ>H(9ND5C}JO=<9R~uyY!GI%=5ZdZ8cSxeKD_pLLw*uF8D6((MVoz)V1-N@AA(g)t-Zwf}W$f zzRUJV$$Lm*-aebpcKj%L9(wiPE-upBr}q9-wYR)e&D$mL@mFH)fiwcXf3N1MmS-E)?ITmBN(OkIT#?FrH z>@l(_q+>rjp9*Tkadiwv@0Q4kxzgCC5i`9KnmUQ?J~oMM9SO{C^`4S=tVE0W`RlQY zuo%i1JUo$ZC-(7(hWqrC(*J=!?5lq2zhVx*nt!w(eBXPWsJCS06jfNv)|?yj3XJ)k zy1~pCdY#ybb4Y~|joM{zaj}=ql>~Ow5lc6%!qq@&@{2PR&3bKpAEgPLL8wF7Y|F(& z#Kb^j8hh?ydHiRgHK=Cz{|;T$8tWK8k^-z41S_Hl*C@-Fu}US5Ex{&hAfmI?;;ie9 zp?TjZ++5aDZHNGqV*_V%U29W~tHIGE4&KFBZ7I9&n2yFYw(R5bxK${Cp=N`Qi-*c8 zX0(4jdwQd3HXLu@PBrQIn|exMkB(f7$Ci|4@{1dd$ieZOA0wl$;T$LK&`~Vw9*JQo z7yqWmZQ`mc{`z%_L{Q^sHS@kY(mf!Pzt=?alkRvTHsqAK+eB{OoJ+-n#bFddT#l#W z2sOYuvz$xd z{;9bBemF9P=f)CSHAiuQN~Q77Q}pkdhVwlyC+ZJ|oq7Aho~XXJCN2EK>JJhYiQ9x^ zEab9|h%m2c5ff0%b)hJ4ran z1h%_-A;G2 z=iKQSHHt4Ur5;+vV^@-e7nV{Y)r}3K=iyP50;@`XzKkB>)-rmU50_Fc-4(R)%UDd8 z(6{hAWl9n2Jt)kQxYLF~#s6fP{89#g+5%^+D5VZwy__aVo-vQq^Ea1KJU?4U&4a_s zshWa&E8r$4j31v0{vkBYT7{jOuJa% zS_*uT#BF6@%^PdLAHy12LJxCGC$+IFC8>kO>*y)PTwjN+WN)D4!Lv3hrhsEnfD!n7 z1ip6MJmHzie7FXFSZTvFYiqD^EbK_Y!~C{dN(_X7?O&|phwPYbXfw?Iu#wV*MHe^M zQ6FobU&$|&U<@qP_5V+lK@jEF<724_Z-ZCM_>B2f%4G``vd$Ny_~-SM4POO9T2@aB z1Oe;0rvWL@9SB-)8+ae+l($kvP*b<#b%46Jh@fQ(mmZaNtJu!675qp$<>T8x`XqHy zV%P?igUKtWfkDf-qn0}VSLu`(DSm{)@AtgQMn$}`16k|ZhIExXsZVS?p?5ocakPM} ze54kYNaXid*;mgOG*Bh~?fvkPTe#sCLD)%r+5MD4wsEB1vp$iZ+lR{)N?a)|<-b>x zNHYunWFH+s5CWg-_M>?14t|Bglp!u+qQu1p3+JD^P@hs~Bki*5M49*s@1GI%viN(9 dcbdr2{Hu7c{`+%D1z#COBit56!^|i}{}UNq>~sJC diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index 295ed79..10f16cd 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -343,6 +343,16 @@ def word2htk(word): return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word]) +def ipa2asr(ipa): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) + + ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) + ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) + asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) + return ''.join(asr_splitted) + + def ipa2htk(ipa): curr_dir = os.path.dirname(os.path.abspath(__file__)) translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py index 9095b3c..00c699c 100644 --- a/acoustic_model/htk_vs_kaldi.py +++ b/acoustic_model/htk_vs_kaldi.py @@ -9,7 +9,7 @@ import sys import shutil import glob -#import numpy as np +import numpy as np import pandas as pd #import matplotlib.pyplot as plt #from sklearn.metrics import confusion_matrix @@ -75,24 +75,22 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc') hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp') -## ======================= make test data ====================== -# copy wav files which is in the stimmen data. +## ======================= load test data ====================== stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test' -fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') -df = stimmen_functions.load_transcriptions() +df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir) +df = stimmen_functions.add_row_asr(df) +df = stimmen_functions.add_row_htk(df) + word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = sorted(word_list) +# pronunciation variants +for word in word_list: + df_ = df[df['word']==word] + print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))) -# after manually removed files which does not contain clear sound, -# update df as df_test. -wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav')) -df_test = pd.DataFrame(index=[], columns=list(df.keys())) -for wav_file in wav_file_list: - filename = os.path.basename(wav_file) - df_ = df[df['filename'].str.match(filename)] - df_test = pd.concat([df_test, df_]) +#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') #output = pyhtk.recognition( # os.path.join(default.htk_dir, 'config', 'config.rec', @@ -102,58 +100,21 @@ for wav_file in wav_file_list: # os.path.join(config_dir, 'phonelist.txt'), # hvite_scp) -htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']] + #pyhtk.create_label_file( + # row['word'], + # os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) -ipa = 'e:χ' -fame_functions.ipa2htk(ipa) - - - -# Filename, Word, Self Xsampa -df = pd.read_excel(xls, 'original') - -ipas = [] -famehtks = [] -for xsampa in df['Self Xsampa']: - if not isinstance(xsampa, float): # 'NaN' - # typo? - xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t') - xsampa = xsampa.replace(';', ':') - - ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa) - ipa = ipa.replace('ː', ':') - ipa = ipa.replace(' ', '') - ipas.append(ipa) - famehtk = convert_phone_set.ipa2famehtk(ipa) - famehtks.append(famehtk) - else: - ipas.append('') - famehtks.append('') - -# extract interesting cols. -df = pd.DataFrame({'filename': df['Filename'], - 'word': df['Word'], - 'xsampa': df['Self Xsampa'], - 'ipa': pd.Series(ipas), - 'famehtk': pd.Series(famehtks)}) -# cleansing. -df = df[~df['famehtk'].isin(['/', ''])] - -word_list = np.unique(df['word']) - - -## ======================= make dict files used for HTK. ====================== -if make_htk_dict_files: - output_type = 3 - - for word in word_list: - htk_dict_file = htk_dict_dir + '\\' + word + '.dic' - - # pronunciation variant of the target word. - pronvar_ = df['famehtk'][df['word'].str.match(word)] +## ======================= make a HTK dic file ====================== +#if make_htk_dic_file: +# output_type = 3 +dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic') +#for word in word_list: +word = word_list[2] +# pronunciation variant of the target word. +pronunciations = df_test['asr'][df_test['word'].str.match(word)] # make dic file. - am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type) + #am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type) ## ======================= forced alignment using HTK ======================= diff --git a/acoustic_model/stimmen_functions.py b/acoustic_model/stimmen_functions.py index 9d28093..a272d42 100644 --- a/acoustic_model/stimmen_functions.py +++ b/acoustic_model/stimmen_functions.py @@ -1,13 +1,15 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') +import glob import pandas as pd import convert_xsampa2ipa import defaultfiles as default +import fame_functions -def load_transcriptions(): +def _load_transcriptions(): stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx) df = pd.read_excel(stimmen_transcription, 'original') @@ -34,5 +36,48 @@ def load_transcriptions(): 'word': df['Word'], 'xsampa': df['Self Xsampa'], 'ipa': pd.Series(ipas)}) - df_ = df_[~df_['ipa'].str.contains('/')] + + # not valid inputs, but seperator. + df_ = df_[~df_['ipa'].str.contains('/')] return df_.dropna() + + +def load_transcriptions(): + """ in default.stimmen_transcription_xlsx + rows of which wav files can be easily found""" + df = _load_transcriptions() + df_ = pd.DataFrame(index=[], columns=list(df.keys())) + for index, row in df.iterrows(): + filename = row['filename'] + if isinstance(filename, str): + wav_file = os.path.join(default.stimmen_wav_dir, filename) + if os.path.exists(wav_file): + df_ = df_.append(row, ignore_index=True) + return df_ + + +def load_transcriptions_clean(clean_wav_dir): + df = _load_transcriptions() + wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav')) + df_clean = pd.DataFrame(index=[], columns=list(df.keys())) + for wav_file in wav_file_list: + filename = os.path.basename(wav_file) + df_ = df[df['filename'].str.match(filename)] + df_clean = pd.concat([df_clean, df_]) + return df_clean + + +def add_row_htk(df): + """ df['htk'] is made from df['ipa'] and added. """ + htk = [] + for index, row in df.iterrows(): + htk.append(fame_functions.ipa2htk(row['ipa'])) + return df.assign(htk=htk) + + +def add_row_asr(df): + """ df['asr'] is made from df['ipa'] and added. """ + asr = [] + for index, row in df.iterrows(): + asr.append(fame_functions.ipa2asr(row['ipa'])) + return df.assign(asr=asr) diff --git a/acoustic_model/stimmen_test.py b/acoustic_model/stimmen_test.py index 8cbdace..60e96eb 100644 --- a/acoustic_model/stimmen_test.py +++ b/acoustic_model/stimmen_test.py @@ -1,9 +1,7 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') import sys - import shutil -import glob #import numpy as np import pandas as pd @@ -24,23 +22,27 @@ from htk import pyhtk ## ======================= make test data ====================== -# copy wav files which is in the stimmen data. stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test' -fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') +## copy wav files which is in the stimmen data. df = stimmen_functions.load_transcriptions() -#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] -#word_list = sorted(word_list) - #for index, row in df.iterrows(): # filename = row['filename'] -# if isinstance(filename, str): -# wav_file = os.path.join(default.stimmen_wav_dir, filename) -# if os.path.exists(wav_file): -# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename)) -# pyhtk.create_label_file( -# row['word'], -# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) +# wav_file = os.path.join(default.stimmen_wav_dir, filename) +# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename)) + +# after manually removed files which has too much noise and multiple words... +# update the info. +df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir) + +# count how many files are removed due to the quality. +word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] +word_list = sorted(word_list) +for word in word_list: + df_ = df[df['word']==word] + df_clean_ = df_clean[df_clean['word']==word] + print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format( + word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100)) ## check phones included in stimmen but not in FAME! @@ -59,3 +61,4 @@ for ipa in df['ipa']: ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) if ':' in ipa_splitted: print(ipa_splitted) +