From d6e005b1cbc1afa561ece42a2dca69d94f3c1c84 Mon Sep 17 00:00:00 2001
From: yemaozi88 <428968@gmail.com>
Date: Mon, 7 Jan 2019 11:50:24 +0100
Subject: [PATCH] find pronunciation variants whose phones are all in novo70.

---
 .vs/acoustic_model/v15/.suo                  | Bin 77312 -> 78336 bytes
 .../acoustic_model_functions.cpython-36.pyc  | Bin 5904 -> 5850 bytes
 acoustic_model/acoustic_model.pyproj         |   3 +
 acoustic_model/acoustic_model_functions.py   |   2 +-
 acoustic_model/check_novoapi.py              |  82 +++++++++++--
 acoustic_model/forced_alignment_novo.py      | 112 ++++++++++++++++++
 6 files changed, 187 insertions(+), 12 deletions(-)
 create mode 100644 acoustic_model/forced_alignment_novo.py

diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 4e9f3ed1ef89718ac33640b3435dcb74dd09b21d..88f58eb1a02007ca1a7b8925fd996cd673a84a9c 100644
GIT binary patch
[binary delta data (delta 4012 / delta 4411) omitted]
[binary patch data for .../acoustic_model_functions.cpython-36.pyc omitted]
diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj
index 27625c9..0ec4c9b 100644
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -35,6 +35,9 @@
     <Compile Include="...">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="...">
+      <SubType>Code</SubType>
+    </Compile>
     <Compile Include="...">
       <SubType>Code</SubType>
     </Compile>
diff --git a/acoustic_model/acoustic_model_functions.py b/acoustic_model/acoustic_model_functions.py
index 662c82b..4fced38 100644
--- a/acoustic_model/acoustic_model_functions.py
+++ b/acoustic_model/acoustic_model_functions.py
@@ -199,4 +199,4 @@ def make_fame2ipa_variants(fame):
     ipa.append(fame.replace('ɔ̈', 'ɒ'))
     ipa.append(fame.replace('ɔ̈:', 'ɒ'))
 
-    return ipa
\ No newline at end of file
+    return ipa
diff --git a/acoustic_model/check_novoapi.py b/acoustic_model/check_novoapi.py
index 4af8368..cf6e7c6 100644
--- a/acoustic_model/check_novoapi.py
+++ b/acoustic_model/check_novoapi.py
@@ -55,14 +55,15 @@ phoneset_novo70 = np.unique(phoneset_novo70)
 # [iː] iy
 # [œː] uh
 # [ɛː] eh
-david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː']
+# [w] wv (written as ʋ in IPA).
+david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
 
 
-## ======================= convert phones ======================
+## ======================= extract words that are written only with novo70 phones ======================
 mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
 
 stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
-df = pd.read_excel(stimmen_transcription_, 'check')
+df = pd.read_excel(stimmen_transcription_, 'frequency')
 #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
 #    ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
 #    if not ipa_converted == ipa:
@@ -70,11 +71,13 @@ df = pd.read_excel(stimmen_transcription_, 'check')
 transcription_ipa = list(df['IPA'])
 
 # transcription mistake?
-transcription_ipa = [ipa.replace(';', ':') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
+transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
 transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
 
 
 not_in_novo70 = []
+all_in_novo70 = []
 for ipa in transcription_ipa:
+    ipa = ipa.replace(':', 'ː')
     ipa = convert_phone_set.split_ipa(ipa)
     not_in_novo70_ = [phone for phone in ipa
@@ -83,19 +86,76 @@ for ipa in transcription_ipa:
     not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
     not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]
 
+    if len(not_in_novo70_) == 0:
+        all_in_novo70.append(''.join(ipa))
+    #translation_key.get(phone, phone)
     not_in_novo70.extend(not_in_novo70_)
 
 not_in_novo70_list = list(set(not_in_novo70))
 
+## check which phones are used in stimmen but not in novo70
+# 'ʀ', 'ʁ',
+# 'ɒ', 'ɐ',
+# 'o', 'a' (o:, a:?)
+# [e] 'nyːver mɑntsjə' (1)
+# [ɾ] 'ɪːɾ'(1)
+# [ɹ] 'iːjəɹ' (1), 'ɪ:ɹ' (1)
+# [ø] 'gʀøtəpi:r'(1), 'grøtəpi:r'(1)
+# [æ] 'røːzəʀæt'(2), 'røːzəræt'(1)
+# [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
+# [χ] --> can be x??
 def search_phone_ipa(x, phone_list):
-    return [phone for phone in phone_list if x in convert_phone_set.split_ipa(phone)]
+    x_in_item = []
+    for ipa in phone_list:
+        ipa_original = ipa
+        ipa = ipa.replace(':', 'ː')
+        ipa = convert_phone_set.split_ipa(ipa)
+        if x in ipa and not x+':' in ipa:
+            x_in_item.append(ipa_original)
+    return x_in_item
+#search_phone_ipa('ø', transcription_ipa)
 
-# 'ɐ', 'ɒ', 'w', 'æ', 'ʀ', 'ʁ',
-# 'œː', 'ɾ',
-# 'o', 'a'
-# [e] 'nyːver mɑntsjə' (1)
-# [ɹ] 'iːjəɹ' (2)
-search_phone_ipa('ˑ', transcription_ipa)
\ No newline at end of file
+
+df = pd.read_excel(stimmen_transcription_, 'original')
+
+ipas = []
+famehtks = []
+for xsampa in df['Self Xsampa']:
+    if not isinstance(xsampa, float):  # 'NaN'
+        # typo?
+        xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
+        xsampa = xsampa.replace(';', ':')
+
+        ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
+        ipa = ipa.replace('ː', ':')
+        ipa = ipa.replace(' ', '')
+        ipas.append(ipa)
+    else:
+        ipas.append('')
+
+# extract interesting columns.
+df = pd.DataFrame({'filename': df['Filename'],
+                   'word': df['Word'],
+                   'xsampa': df['Self Xsampa'],
+                   'ipa': pd.Series(ipas)})
+
+# find pronunciation variants whose phones are all in novo70.
+#word_list = list(set(df['word']))
+#word_list = [word for word in word_list if not pd.isnull(word)]
+#word = word_list[1]
+
+## pronunciation variants of 'word'
+#df_ = df[df['word'] == word]['xsampa']
+##pronunciation_variant = list(set(df_))
+
+cols = ['word', 'ipa', 'frequency']
+df_samples = pd.DataFrame(index=[], columns=cols)
+for ipa in all_in_novo70:
+    ipa = ipa.replace('ː', ':')
+    samples = df[df['ipa'] == ipa]
+    word = list(set(samples['word']))[0]
+    samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns)
+    df_samples = df_samples.append(samples_Series, ignore_index=True)
\ No newline at end of file
diff --git a/acoustic_model/forced_alignment_novo.py b/acoustic_model/forced_alignment_novo.py
new file mode 100644
index 0000000..243f275
--- /dev/null
+++ b/acoustic_model/forced_alignment_novo.py
@@ -0,0 +1,112 @@
+#
+# forced alignment using novo-api.
+#
+# *** IMPORTANT ***
+# This file should be treated as confidential.
+# This file should not be copied or uploaded to public sites.
+#
+# NOTES:
+# Usage of the novo api: https://bitbucket.org/novolanguage/python-novo-api
+# I couldn't make it work as I described in the mail to Martijn Bartelds on
+# 2018/12/03.
+# As per his advice, I modified testgrammer.py and made it a function.
+#
+# In order to run on Python 3.6, the following changes were made in novo-api.
+# (1) backend/__init__.py
+#     - #import session
+#       from . import session
+# (2) backend/session.py
+#     - #except Exception, e:
+#       except Exception as e:
+#     - #print self.last_message
+#       print(self.last_message)
+# (3) asr/segment/praat.py
+#     - def print_tier(output, title, begin, end, segs, (format, formatter))
+#       def print_tier(output, title, begin, end, segs, format, formatter):
+# (4) asr/spraaklab/__init__.py
+#     - #import session
+#       from . import session
+# (5) asr/spraaklab/schema.py
+#     - #print data, "validated not OK", e.message
+#       print("{0} validated not OK {1}".format(data, e.message))
+#     - #print data, "validated OK"
+#       print("{} validated OK".format(data))
+#     - #if isinstance(object, basestring):
+#       if isinstance(object, str):
+#
+# Aki Kunikoshi
+# 428968@gmail.com
+#
+import argparse
+import json
+
+from novoapi.backend import session
+
+# username / password cannot be passed as arguments...
+p = argparse.ArgumentParser()
+#p.add_argument("--user", default=None)
+#p.add_argument("--password", default=None)
+p.add_argument("--user", default='martijn.wieling')
+p.add_argument("--password", default='fa0Thaic')
+args = p.parse_args()
+
+wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
+
+rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
+grammar = {
+    "type": "confusion_network",
+    "version": "1.0",
+    "data": {
+        "kind": "sequence",
+        "elements": [{
+            "kind": "word",
+            "pronunciation": [{
+                "phones": ["wv",
+                           "a1",
+                           "n"],
+                "id": 0
+            },
+            {
+                "phones": ["wv",
+                           "uh1",
+                           "n"],
+                "id": 1
+            }],
+            "label": "one"
+        },
+        {
+            "kind": "word",
+            "pronunciation": [{
+                "phones": ["t",
+                           "uw1"],
+                "id": 0
+            }],
+            "label": "two"
+        },
+        {
+            "kind": "word",
+            "pronunciation": [{
+                "phones": ["t",
+                           "r",
+                           "iy1"],
+                "id": 0
+            },
+            {
+                "phones": ["s",
+                           "r",
+                           "iy1"],
+                "id": 1
+            }],
+            "label": "three"
+        }]
+    },
+    "return_objects": ["grammar"],
+    "phoneset": "novo70"
+}
+
+res = rec.setgrammar(grammar)
+#print("Set grammar result", res)
+
+#res = rec.recognize_wav("test/onetwothree.wav")
+res = rec.recognize_wav(wav_file)
+#print("Recognition result:", json.dumps(res.export(), indent=4))
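
Note on how the two scripts fit together: the sketch below illustrates how the pronunciation variants collected in check_novoapi.py (the word/ipa pairs in df_samples) could be packed into the confusion-network grammar that forced_alignment_novo.py sends to the novo api. The grammar layout and the Recognizer/setgrammar/recognize_wav calls are taken from the patch above; the ipa2novo70 table is hypothetical and incomplete (only the pairs named in the comments of check_novoapi.py are filled in, and stress digits such as iy1 are ignored), and make_novo70_grammar is a helper name introduced only for this illustration, not part of the patch.

# Illustrative sketch only -- not part of the patch above.
# Assumptions: convert_phone_set.split_ipa is the repository helper used in
# check_novoapi.py, and ipa2novo70 is a hypothetical, incomplete IPA-to-novo70
# mapping (only the pairs listed in the comments are known here).
import convert_phone_set

ipa2novo70 = {'ɔː': 'ao', 'ɪː': 'ih', 'iː': 'iy', 'œː': 'uh', 'ɛː': 'eh', 'w': 'wv'}


def ipa_to_novo70(ipa):
    # split an IPA transcription into phones and map each one to a novo70 label;
    # phones missing from the table are passed through unchanged
    return [ipa2novo70.get(phone, phone) for phone in convert_phone_set.split_ipa(ipa)]


def make_novo70_grammar(word, ipa_variants):
    # one "word" element with one pronunciation entry per variant, mirroring
    # the grammar structure defined in forced_alignment_novo.py
    pronunciations = [{'phones': ipa_to_novo70(ipa), 'id': i}
                      for i, ipa in enumerate(ipa_variants)]
    return {'type': 'confusion_network',
            'version': '1.0',
            'data': {'kind': 'sequence',
                     'elements': [{'kind': 'word',
                                   'pronunciation': pronunciations,
                                   'label': word}]},
            'return_objects': ['grammar'],
            'phoneset': 'novo70'}


# Usage with the objects defined above, e.g. for one word from df_samples:
#   variants = list(df_samples[df_samples['word'] == word]['ipa'])
#   rec.setgrammar(make_novo70_grammar(word, variants))
#   result = rec.recognize_wav(wav_file)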