From dd9e3d820bbf2fcb373df85ea54078a78ac3c467 Mon Sep 17 00:00:00 2001 From: yemaozi88 <428968@gmail.com> Date: Mon, 31 Dec 2018 13:04:33 +0100 Subject: [PATCH] started to check which words in stimmen transcription consists of only phones in novo70 phoneset. --- .vs/acoustic_model/v15/.suo | Bin 74752 -> 77312 bytes .../__pycache__/defaultfiles.cpython-36.pyc | Bin 995 -> 997 bytes acoustic_model/check_novoapi.py | 47 ++++++++++++++++-- acoustic_model/defaultfiles.py | 2 +- 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index e9ddb3028de24f584b466c6f213232c9135e52a2..4e9f3ed1ef89718ac33640b3435dcb74dd09b21d 100644 GIT binary patch delta 4612 zcmeI0du&tJ8NhvR9uO9S$pccJAq^!X1iy2yArxQV>qq>4#Li<(;P~x0vGLo6Hts{4 zG`0$Sx@Ov{b-i1)qV5lfw=s3BwFFeD+P#2|jxwq1V`3ZHpep}ND(t(7T}4}}wbK4z zT2J&l-*>+A`p$RHLr!tNzId(v_+UquKp?0C?%ch57sP_zecHZ}XgU7+&1dd@?=sSW zRprJ7qJT#6ZQ?Ps73b@YpaQ;Kw{>P6R5k(7dm)KI?*aA$tAKW3J+KY<2wEsXt_GTc z4L}>P2?zlX0BeEG06*9lK(W#dlPwI9&A3KwOUsjNrInk+S`)@s|)xR za0ECCJOs=De9I!p5)o~=+4EpxGSK>Vco~$`@s$}UZcWK{7TqjX= zKz|I)Sw1AnAG(_ncYuH)xB$V863}AqaUoF_24LK2f>RNkZy^s6T!P@fRu9g}@vqKq zDLdk)P?;#)L0qbldzj#k65OK%_mvjz+XVMyEB7KouVHlZ1%&VuC%3G=3;w|VSipqu zylkq265fB|`}oFL7vA=Z`?30km$;{ivg)NGQ7)b;A}pLA!hic^D}Mf!?f9=3-(5{Z zcR}%1$H7}ad0CC$x^$fTE>WJm+<|b*#r1go?=Iu+@5IaTA0Kc0Y%`Z(W&*1!#ZNJ9 z`K?RO)#iIw!k=xyp+%4QONGBI`1_lGHuxmob41Qk45c7t{W7Um+>dE_C*E=`gjH|8 zxW*qn5K8e^gJ9AV^@Z`5!rSqef7{wU`fL7ufYd2K1RPUdtu33^A#Q&>zeTm6@!4Xt z|Ktn93{Ya3QvA?Sk2jW1rouH}N`L=()1-fT-vmDve*E0v%9;as{v5SRv}W-wsu$oj zSI$eXB6g1${*K64MnlV$bU&qK$$q(1q3EZjQdvK&m1S6|LaETeT+UdOJBK9cR5&)> zHI(vCB|lw9+%>-uKYz7*tv5EE@h4ITsWSawc2&d)&8szZ_f41>=rMi zqhYt&>d1QBA zpf!_=WHZ#5-NhRXUQKA+o26+sX@(-n(q3QYu#WGb_d8tKek7jrj?*r+#-(*kYFzCaPZ^}K$e1A^7pMJweR6VA<}R?}kUgQ4TdjT{MJZ&WNjX`N zdKt~|q|`zOC#fkrYd42N+K5phGEXQonpkRxO{8O*q&Ak(6x^Bd37cocH0-r2(>W&Y zHpS&L(Y{G>UsPjLD@55L+Z5?=Gkm2vJ|%Vy`;2jiF)SVlGxoeJ>q=2tCX*%IPD^gW z;fd?ynHZZ*dv(ElWRxK%3VKTJx1}Wdd~!lB(uGE1mSn0xORO=cHZ-2j*$-=jR5Fm3 zYtt#(*EdWVotl}kDKbOq4C1kA6B7-NnLM;N;!;}V?wpeKc_^#8Kw2Z&eAa164vmh< zU{NM=I0x~}Nc~n>XhiQCmnd^yDixM0T)xa`!56oT=*4O_LK(s)Yr-}f$oj2m<&-1n z2w6v68e4)K3abNSevenJ)X#X-7DvXemj?#=qCvC9k=5l4HZqvd#11Qyp@`B|=yU7h zQ$t~SX4ISY6{23hEm$zo>2X`wsmoFWN<*+OoRyd}S%o-e(s|Rd5Gl!1A&EY#mKc(h zE1)$^XPsWlv^wM)bDKtzdC5@Ls2DXXbb8NBR+&zVO@1e1_W8zXnKW7;3lXb5XZEV| zMv2qrl37KL$h0Rc4NOKX1xlnak>qgRZVD?Kvj6+TSn8_ai&u8zgO@hrL%$K?{&&97L4F)FNrjIY z|5K8McaSMv?ZuBi*I8CweFou2dN$X41u+44t?}~{{rQQ$7f$qYn_0hPGY?hu+GcKu z!T%2Y*mJFIaZ4Y5a~t=;N?dSu+`=m6gG`|jCdD{92o z-|IZ{bR+WBVd?v9nZ16t;S(A(;EDI!&$PFo8!Ki%Sc@WOB0}_3(-~Das-W3;7gGHb D9_Z{A delta 2102 zcmZvdYfKbZ6oBVkmSvZREbrICJ_KA9frYJ$h%Spr(HNsu8$&3Fi1k5nbwDCP=OEha{yr94ckN;*l<>y1~;|_ zoq~68HptYmf}DDSvejV<=&MO;h*%-u@dL|J?qz3GfB$p8e zBAW;&I6jS}bwRJ<`Mj1USplbNpr1sV#{_IeOMLmHzeXex5yWzW7k7|k9#KfFCpHjk zaYpx1L^Wyqh#JDty)!21ZBmXBiSKZ%No&G8dZ$m+FUx(laufHkoQ&Qb zY7gCN4_#^xPvUII>hM+Mv=F%jX6cL_i-oyCB?Qk8kX+iJ(14vAv>4B_bjdu2bZ##9 zhkYp|ZNwUa`@^=lfD9qY4~_a0uR0bR#6djj@S(m*kObT_63=ffV{ z8SG4|C7Xlb=Y1&P=Sqk1V(|L(F0!-|ZNw>}op_JvAUMiP(igh7gs+qc^uUN@B86bv zBBmvCE9uLLbRvVuBvt@!FH4TgCN+oPG&{*W)(2~x;0tdCGq!|xKq}7CZvwVNbZC#$ z)vGkhT*Q#ja0(8D+RzbZh+j{6MZ^YTBjF;7LD^m=)Jp5rN(E}AEUI)`ROpqtn6@A& z#Fm6kd8?31g>2OLru?QyU)f5omyO?osieEN?7wZ3ByCYcOMN|ma5OOLjK%Qp7&LtR zd?>ydz{2HFK2e|)q~8W?d>W7O_xtgaYsJ1XudXC#*X}}lz8zfyD^WWTiT<%-U*aac z`W^B2i07`@l;*^_0_xQT==M0|T%DXO{3jB35t>T3v`$biov`O(4*Jr7M~clTZ(laT7=rzQA-g;$~NK4Y6hdylGO=;v@D_Q zC>x-Q#`?<5ih29EfNGr0?tm(6$>~IIycK(6jmk#bE#O=y=LWG^vF?2Ya-opl5a_KB zSHkOm2W~qx|A3I+6!IYtESzU$8@`J9ktfa?gCuFy`!?*#p;oJV&ihI^z&>6mmxwV>D2g?D@R zD6X3$LjF|9{X#ykf0fKH(m#?TmL_+h+t`E-Pa0Q1_howJaOAS>!>ZG`Qjypcl zjn~Fvuxqk%=$;?&VqGeJGZ=~W$7AV@LgR!LI44!{KOk@AL<1#94~=^P%Lc6I9LS?W zqtS-iLv^)Bs~e7;I9B61ob72X$Fa$BP=B%R@g@@=77N4f=fi;qZ|r;q(ZHyu%hByf z#Iz?id^nMz(MhH1Qk5-NlOck4H_1B_S3UFPx02!+xlDKe(2H^3&BgJFAnjXl@s7%) zl-GnKQ)|#WI;Y~4Jmu?@gB;%ey!xBeJ_vx_R|OAky`P=Xa*PdP1|~g z1SBfO(-#0QOk3n9TD37@k3Tc>NwPTjSmQfbawig2y=x53&(C+-3$oWbYhT?JeIRI` zvi6vhz^atK{5rVL0K6ZdG*YOk8<6Rs4DRW%=d?o6DJa8QGzn$z9Bm0AQOAi2wiq delta 44 vcmaFL{+ON9n3tDJ!b>wIb|Ysh6K85(6<>00shQ>G3MO7gHVAui4|5~{4tESP diff --git a/acoustic_model/check_novoapi.py b/acoustic_model/check_novoapi.py index 93ec540..4af8368 100644 --- a/acoustic_model/check_novoapi.py +++ b/acoustic_model/check_novoapi.py @@ -16,7 +16,7 @@ import acoustic_model_functions as am_func import convert_xsampa2ipa import defaultfiles as default -from forced_alignment import pyhtk +from forced_alignment import pyhtk, convert_phone_set import novoapi @@ -35,7 +35,7 @@ translation_key = dict() phoneset_ipa = [] phoneset_novo70 = [] -with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin: +with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin: lines = fin.read() lines = lines.split('\n') for line in lines: @@ -49,6 +49,14 @@ with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin: phoneset_ipa = np.unique(phoneset_ipa) phoneset_novo70 = np.unique(phoneset_novo70) +# As per Nederlandse phoneset_aki.xlsx recieved from David +# [ɔː] oh / ohr +# [ɪː] ih / ihr +# [iː] iy +# [œː] uh +# [ɛː] eh +david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː'] + ## ======================= convert phones ====================== mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir) @@ -56,7 +64,38 @@ mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_ stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx) df = pd.read_excel(stimmen_transcription_, 'check') #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']): -# #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_) # ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa) # if not ipa_converted == ipa: -# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa)) \ No newline at end of file +# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa)) +transcription_ipa = list(df['IPA']) + +# transcription mistake? +transcription_ipa = [ipa.replace(';', ':') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)] +transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case. + +not_in_novo70 = [] +for ipa in transcription_ipa: + ipa = convert_phone_set.split_ipa(ipa) + + not_in_novo70_ = [phone for phone in ipa + if not phone in phoneset_ipa and not phone in david_suggestion] + not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_] + not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_] + not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_] + + #translation_key.get(phone, phone) + not_in_novo70.extend(not_in_novo70_) +not_in_novo70_list = list(set(not_in_novo70)) + + +def search_phone_ipa(x, phone_list): + return [phone for phone in phone_list if x in convert_phone_set.split_ipa(phone)] + + +# 'ɐ', 'ɒ', 'w', 'æ', 'ʀ', 'ʁ', +# 'œː', 'ɾ', +# 'o', 'a' +# [e] 'nyːver mɑntsjə' (1) +# [ɹ] 'iːjəɹ' (2) + +search_phone_ipa('ˑ', transcription_ipa) \ No newline at end of file diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py index 726f23a..f464b9f 100644 --- a/acoustic_model/defaultfiles.py +++ b/acoustic_model/defaultfiles.py @@ -42,4 +42,4 @@ phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi') #novo_api_dir = r'c:\Python36-32\Lib\site-packages\novoapi' -cmu69_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'en', 'cmu69.phoneset') \ No newline at end of file +novo70_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'nl', 'novo70.phoneset') \ No newline at end of file