From a8dbb51d0c53878d4db797ac71a9531cf8e3489c Mon Sep 17 00:00:00 2001 From: yemaozi88 <428968@gmail.com> Date: Wed, 13 Jun 2018 07:22:53 +0900 Subject: [PATCH] to be sure. --- .vs/acoustic_model/v15/.suo | Bin 44544 -> 49664 bytes acoustic_model.sln | 1 + .../convert_xsampa2ipa.cpython-36.pyc | Bin 0 -> 2324 bytes acoustic_model/acoustic_model.py | 23 ---- acoustic_model/acoustic_model.pyproj | 8 +- acoustic_model/convert_xsampa2ipa.py | 128 ++++++++++++++++++ .../how_to_use_ipa-xsampa_converer.txt | 15 ++ acoustic_model/performance_check.py | 47 ++++++- 8 files changed, 197 insertions(+), 25 deletions(-) create mode 100644 acoustic_model/__pycache__/convert_xsampa2ipa.cpython-36.pyc create mode 100644 acoustic_model/convert_xsampa2ipa.py create mode 100644 acoustic_model/how_to_use_ipa-xsampa_converer.txt diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 67b05b0096282eae98c5f71ba2fc9630420f18dc..3e2e944e73e3fea5cf76e82c0511c132ec7b26b5 100644 GIT binary patch delta 6630 zcmeHLX>eQB6@FKe6?v0m;w`qk;RHL5tw`QQ1D0dQAzow~WigQ8$cmhZ*p?$p0V{yz zhG7`8Ky)aCr38f(iVZ1_SWAZpaGAC+gtq)>r%Zoz(jN@WuuRf{P;kHVBn3N(1D!w_ zIvmZXd+vRAJ@0Jy`QKIeZ&TjATx3pOk)jZJ^hZTOa#HU2Lm&KE)JULZOLR(M ziI+;y5C_n{(Gt-`UZqvFC|XqygHdbfv@&mPHrVMI^d|x0Fr_0 z0WH8oc^NKqP#S>^v^`jExrWmrKVo2lFe~O2@qC44x zlF||uiHFIQP(OMQ)klEGfkVKf01x>E%EzMoFHt@Li~|$ElfYBJ5#U$AB=9WoJP-n& z0eFt*qU8%HUySnH&WIA-l@K?%NhN+D+YD$u!cAW7sMt=Yv>h6jdLh6@rRHOS^`)$) z*A&L2tI)oe%>@)2XW=JbKUN(XOQ(xZ3xDWTNumho8sntJn2GKjEhCSvlcp4gxF8x{ zrq6V(G?iiyTgjU%FF*!`%BbUYLEVx~8j$*>Hk5*^>E)~YuL)Cm9i9_cf26I2a)GZ&W z6GyR2+ad>|pJx@};0a__s#FT};X#$Z2E(?>v<@Dm6Q{mfYDU?Blegl36j2YEBUxzY_rl(ZmlGB`mDQn{hblX?$PO0zb?vCI(_ms}$RtiI{KKop_uA;~ zd_y4rp45cheV>1D>zliFod_Mh?32H~A+ZnF*j?UXkJH{mkBpV3JE1N%sK`xF7gjdr z?NtBND*;`T>Qc=4O_erIR$BU^vN>ML#W)Tg^8Q=NTLsnD8bu48s_M{`;sHk&`Qd8v zRwKZ1bjpghC9>Up#7c)g)61G7;#k5pS^9__4psblVr*rUn~w#L*91N5bVV*^}$-)vIqvDnhvQ4|65VAnnp$)wtjx{ z*lXtW?pI2SQ*V0t9$B8sZB7rpJzmvyUQx;xxI>=bBmZx}d~B67+aj~Sb;sa=rKN|S 
zE3USj{9@$9+w$yGxU86Z^UQ&glfqA@N-WfY?9y&C!f^KLp2=(e87ELkN%q5?l zHGzTUo*My9EP8-VfDPyc?7(KA58(b=P|`y=1*FQJPjBWJl((TS^l)c4YxsQFJs$#% z&B4nxLS|99BUm^8?=P0DIMV#it6!=8tshZ1SRb0s6{7rXAKwyUAW5I_Tr8IT&Nz| zE5EtW?RuXeg+AjVPD0a_D^+YwY=vUBC5?(1#uGD-S;k}BxtRPd*-D%A->G@!uJ=ll z@7mvEyXMB_|1P`p;Qva^E5>wWp2&_r0QY{mv_V=6^M@P0kg>;~G4?q;&*)rUXLOdR z&hqz*Ao6qlePL7)s#$Nr%k+K6XpM?R!1_5?e9n0q9m4WvVS0`}o44Q7;~5yLtZAyH zl)9q`a@XTP(=pppG4rvojtqMo1Nuf+f4{@-ak>VF^{X6%4!6^;Z*s~FHuuP;itDb6 zcC>oj&cQ8Z1p~u&m%HEDTUM~n;U31gE2?OHa7klrt*zQ#U1Ky=RyhoHrj2E>fiqXQ z$~L^!W9#jA%sJ2K+4F4V;5;w7mZs4)Fl2K(+!~(A>hw7zrKAa5=rGYkt=#Ul^*cQy zu?aRA&wn!Qxko#9OLE#0&zTI9oYH)odzifiJ#`05^yi3$S5AZ#8Ko+OFC%_>1i@4a!H&O)(90ViQzmha%W;8;_KpsdIe1KTJ{rDNXu6McB8=wW1#4;2{CMcH z6}#vCpj=%j+*+WZLzV&|QG=yE=CaJP%c|+>&v)s&BEf8p%hf-uU*jAa)_*hq#dZiE z7Mif`{r~Vv>72^Nh_hrS**hht+W00MX_08V9|6!;O7HkHL^6%}9u!BYaNIK6n*}iO!kcF5z``#G2d)T%ChKIAL=0&ZDR{4rpWDs2m{i7>OXhUPZ z`is8y)caq$coFM%#O@LJ4DeliBBA*$Wt7+6w0JXeq&{RwUZgx8a88u`1ul~QL~h)V z(CEQ~bb8`x3g~rsPZxhbmEYB$#s|x1e^BT-^DXDw`t0$2-3M2U@4*Ke@#IM1*9!Q| z{oOoq7@E_NS5GG<%;H{ZxPN5Xg-26(@WaV#b}mn9$!>gK``)vW+b&)rvQkrZ>@&@U dr20k2(Abxsq?&?nyxx8Hnv}?AAxdgD{|DZ!wPFAO delta 4475 zcmeHLdu)@}75}d780SIaBu*et9EfOW9QSifN|GkTjvW(1Ab|jdI?IFD0k0v1*zO>Z z#&ig=t#rhv>Q+{!kanGbDbwOf)w-!M4ecJ92Gu%sYB#iLNYjuu6)E{+nMn3KeneTv zfUQ;kK)YXl-~HXkckeyto^$SX0w>C%*hGG18-3MmP#-|mr39K&7Ir0qE+EYVN#Myq4v+w7fNUTW z;6BS^d3r48x+d_nSb0S(&qrPu<8(wW7PWLbSZPROP5s>YvVaLg4Y2%qm3)G zwxG}qYz4LfEx`Qf_RNG`AgR$_-wZ06V`6{&`#&EuLF@{&KB(BsR*Q`2K$=>XkLBRC zaMH!fS;*G|EFagaVmWJ*%bd!igVZd~MWqQS0L=8BoJluTa`#@8HUW*mW}p?gt8^G@ zJKA;tZNP3IGCp2Ug6{!%c~2pC0`0&TfS59l=uV*U9KfC8AYD?ixIk4&bu^Qx&5Do5 z`RnNVUG6@*-Mjo9kavNfMl1gGnGP0A^@;uDPaP4xG@II@VZ$f_*c{cJAUgyNrEMl# zmMNtK#XQzN(tq)t+JXLgI+|N?iq4tW(pY~X-B_PZub7L$iz1K}s(aN5i54o}(XiF0 z0+Ge>e(Y1y+~Y%{mHZpNMgf(Id4%)MyF)k3!HS_-%WLA@<8;xz08+?db+ z;-RDZCYn~5gq>#fEt*GA$Ho%*+E)~4wV@Cb`d_v$zKv?MGiO(1SB<-93OKtXFPPnr z&63@c?D7vA;~#TgkIJQGRu`A!M#=X5pNx_$rsB#^T3ttMfG*_Sz&@Y{Z~;pvqE}3M 
z?Eoqd1A~AUpkJC~RRAPdY$tn}Og>$!O2G{XO9|frnV`SxwrDcIIFPZP#8vWVZA%oa zre)DO!!oucq|ky{2(qkxv9xAqrNcSz{BZt*zg|4`pr zVE#033(e%WYM_mUN6|gHDFu-&4XzCQujWMaISq+Ii2*qVMoi&(ke`K)<~E5$u?D;} z){y)X$W;L8{x%rQm=PyKvl+=E=(_qD8)EVjL!=9(ITSwA6xwI}wg_6L6T~WBA9hG2 z&}_+wP?3LKi-sd7noR`S=62n)^DjEFk& z+eT8@UO(n`q)%z`D`F-ux!k?o{ytx)@8CdXrR<=z#>rTuD;zZ}r+H^`uvv#cLfDJ4 zSXr^#5VAB|6!)`zn%r6(dTra8WGYkOdLhwmXF+5J!_yinuc)N(Q{D7(|ElP0T5f`K zQNXD)oE1@7Bk!Ew-V9!^9CBR#TVk*A*4W9VZGZI5Q`lLgMLuqO<9hl;t~B=Zw1sK< zjk7AdPZCmoH=@cR$&aw%lRV&^ATyCj2IBW+eN8IKjPNmY0|B99Gc_MMJ(^k&y73Xd8J?l}g1Y08?+=M-Ox z5~OLmee7kr_H-$IqsK!}kT>M*JtOv96!vNv&oOJcqhZ7P22)x6hV|w$lgTcZSsP5& zGFi6Dwkp}-aM;Yl$7{BFygd$2pWD0fxV^f)rQdBI^gQEk-{JPUy9av?v=93CxAz=! zxv7QLBn%vF+_{D4OoK*6T&o_-;#$Nq`rtsP!XB@v6-_ky#Y)olq$?iX9H%taXG_nC ze=Zrqh2Z+oOkawi>mC)g(&M4(@e9h(N5@`Q?zMkDB0ZvsNN2?RzKU4;;9a9S3!Q#u zj|k@49xytqqtTh>(&yrF_kn_8@{(Y9p8%1!G(db>$JT`Y@ajK=if_}Qk)1-pM3RhQ zKH@nx^PPd07EOz+U)qSPL=s%38du8}sb6w~5901pcdnFfp34?Vp(_)0A~LW8z`T-+Zx-FVL*ZB5=NFj!el)FEPEaysSz_U^$ z(Upm*SHl58;Smdkzi$anzA0WLQ^jFAdpVzewOUOR=hY-v^wH$5jiI-8dzJqWP5&O= z9IUJviv1@D%e%CEvEw9e=KSJq(_fmY z&{z!fO(D~>VR4#Hgpab(YGub>!Q+mo(Jbq z==QT`ACfRQ-lo(?qXP?f_Xt6o9;>1wQ>X9c@0oY;cY3e0{x^+2wd`duych~lG9>f{i*V*gu@9}vK^?Py5wApU_sHA?v@&4P3 WR$?8Nwfyf7PK%YJ{W4kBg#QT-XLXbS diff --git a/acoustic_model.sln b/acoustic_model.sln index 69850c8..264d7db 100644 --- a/acoustic_model.sln +++ b/acoustic_model.sln @@ -9,6 +9,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution ProjectSection(SolutionItems) = preProject ..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py ..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py + ..\ipa-xsama-converter\converter.py = ..\ipa-xsama-converter\converter.py ..\forced_alignment\forced_alignment\defaultfiles.py = ..\forced_alignment\forced_alignment\defaultfiles.py ..\forced_alignment\forced_alignment\forced_alignment.pyproj = 
..\forced_alignment\forced_alignment\forced_alignment.pyproj ..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py diff --git a/acoustic_model/__pycache__/convert_xsampa2ipa.cpython-36.pyc b/acoustic_model/__pycache__/convert_xsampa2ipa.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40bdc3562eb97ad27fdcfa391f0b1f07f2ed4cf0 GIT binary patch literal 2324 zcmZ`)&2Jk;6rY{_@b1QO+$2p?zO)2{YG|EOkU*uXh}wbxDU^UnStw@XnZ)ZjyUy&m zb*#Pn;TJ*vFKKj91N3o7;fg0UyfuiEuwDDc!YH)DL9KDln#>%V=yKc zx&}#dv3^XohxgiHPs(9&d3}8^mA%nUyPF@Z57KOJvi@J9;jeInY<6wE;CBY&ag1vi z#T#%}JH+fgyR`3}Q+A&yGPO#_P!=dxIm+Fq;-a*eGqo(TO|oYlBI$l8KD3DbaO3J5 zX){l+ACZjSA}J|7jM^S4eT)EfD4o9B_sm?A{#4`7$-wMGMOkf1na_gz^s8oBmvjBP zs)NJdCM`{AjoJHj_=B?aTF$Ha2bNeoMwnxBQ#!RCkq#b8kDpV)b#nWuvN2v& z4d$IC*wxDg=BZFETp@>lZGJvr;y|Uq4HfHNk&|YR=jW-EfW#s__q~KR$x0EzrD>F}&Yx>nJy2 zgJqC^Qnj*~4WVha|5VP9dHwmR_IB$NTAnHw*?hUg8s+l%yjsNRv*l@3>o?SC zM0-W8JfOuBY@uAfNz{^Bd4-fq59lF<2g|s5SuJIhg{rAe85@4Xc@wGMoZKAb2}4qI ztGqX9#GqTn!gqmQ8}nkj-P z^xtm(YN0|VOStIFE;&qH>0b>{T_EiVU__zTnBg9QmEjZaUzd zPUYl7o>fjiPxTB-Do4k>3R1)x;8lt0$Z|{%<DlEDp9ZI9i+KuZc8MMbR z!RhuA7=qvq5y;Bgj~$@HBAj&ck*vJk1nx7TTXSXKNe15&@igc^&$K@|^5*Ctx&%RV z$f}{E(mD2Y0n9}Y6LCVDS|L8)s#^`r4acz@ieWo+nXX#Pv}t+PGQJS^O{#fUjl{}H z+IDc}=|LAQNE;3hD}~f570*Cd*U=jBo5S!hOvn8e1Uu@lQ&QUKjIOdvZ#+>RDp8a* zRH8K{;*xUvRMm7pJ$!7|JH!sNOCKGf2F{f-kapiu!PFAhm9GNSsegDK)#w9*;s&y1 zsKVlN=`!r3*RL`6zHRU{b~wXEf3*9)LI;N>E-KlZd>IC_;OSQkg}5R1ipyaT9Q{X^ zqi&)fwP#n-`ac=CYTD&av8y@Nap)^XHk)kE|pe* zZXuC)RV^(HP~{=Lc*ew84Ru;iKBmep?iDB&$+!^W6i!-mVnyHTXHq<=Ssd*{9K3(y z=8dh558mCXJS62%=cQ3vRIXX$`R;%x2nLD-wumPn8+$rpNL`(-vuRC3h7Y3~V_%%Z zL;{EUmV2.0 4d8c8573-32f0-4a62-9e62-3ce5cc680390 . - acoustic_model.py + performance_check.py . 
""" Conversion between IPA and Xsampa.

Note: this code is based on ipa-xsama-converter/converter.py.
https://github.com/lingz/ipa-xsama-converter/

Compared with the upstream script, the JSON maps are opened with an explicit
UTF-8 encoding, the Python 2 ``unicode``/``encode`` calls are gone, and the
input is passed as function arguments instead of being read from stdin.
"""
import json
import os
import sys


# Notations understood by load_converter().
_CHOICES = ("ipa", "xsampa", "sassc")


def _load_json(path):
    """Return the decoded content of the UTF-8 encoded JSON file at *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def load_converter(source, sink, ipa_xsampa_converter_dir):
    """Build the symbol table that converts ``source`` notation into ``sink``.

    Args:
        source: notation of the input; one of "ipa", "xsampa" or "sassc".
        sink: notation of the output; same choices as ``source``.
        ipa_xsampa_converter_dir: directory that holds ``ipa_xsampa_map.json``.

    Returns:
        A dict mapping each source symbol to its sink string. When neither
        side is IPA the conversion goes through IPA character by character,
        and a symbol maps to ``None`` if any of its IPA characters has no
        sink equivalent. The dict is empty when both sides are "ipa".

    Raises:
        SystemExit: with status 1 when ``source`` or ``sink`` is not one of
            the supported notations.
    """
    if source not in _CHOICES or sink not in _CHOICES:
        print("source and destination should be one of [ipa xsampa sassc].")
        sys.exit(1)
    if source == sink:
        # Only a warning: loading continues and a mapping is still returned.
        print("source and destination format are the same.")

    # Each entry is indexed as pair[0] = IPA, pair[1] = X-SAMPA.
    ipa_xsampa = _load_json(
        os.path.join(ipa_xsampa_converter_dir, "ipa_xsampa_map.json"))

    # Each entry is indexed as pair[0] = SASSC, pair[1] = IPA.
    sassc_ipa = []
    if "sassc" in (source, sink):
        # NOTE(review): the SASSC map is looked up next to this module while
        # the X-SAMPA map comes from ipa_xsampa_converter_dir -- confirm that
        # this asymmetry is intended.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        sassc_ipa = _load_json(os.path.join(script_dir, "sassc_ipa.json"))

    source_to_ipa = {}
    if source == "xsampa":
        source_to_ipa = {pair[1]: pair[0] for pair in ipa_xsampa}
    elif source == "sassc":
        source_to_ipa = {pair[0]: pair[1] for pair in sassc_ipa}

    ipa_to_sink = {}
    if sink == "xsampa":
        ipa_to_sink = {pair[0]: pair[1] for pair in ipa_xsampa}
    elif sink == "sassc":
        ipa_to_sink = {pair[1]: pair[0] for pair in sassc_ipa}

    # One side already is IPA: a single table is enough.
    if source == "ipa":
        return ipa_to_sink
    if sink == "ipa":
        return source_to_ipa

    # Neither side is IPA: chain source -> IPA -> sink.
    mapping = {}
    for symbol, ipas in source_to_ipa.items():
        converted = [ipa_to_sink.get(ipa) for ipa in ipas]
        mapping[symbol] = "".join(converted) if all(converted) else None
    return mapping


def conversion(source, sink, mapping, line):
    """Convert *line* with *mapping*, one character at a time.

    Surrounding whitespace is stripped and inner whitespace is copied
    through unchanged. Every other character is looked up on its own, so a
    multi-character symbol is converted piecewise (e.g. "e:" is handled as
    "e" followed by ":"). Characters without a usable entry are dropped and
    reported on stderr.

    Args:
        source: notation of *line*; unused here, kept so the signature
            mirrors load_converter().
        sink: notation of the result. For "sassc", characters that do not
            occur in any key of *mapping* are removed before the lookup.
        mapping: symbol table, can be obtained with load_converter().
        line: the text to convert.

    Returns:
        The converted string.
    """
    # The set of accepted characters used to live only inside
    # load_converter(); it is rebuilt here from the keys of the mapping.
    # NOTE(review): for ipa -> sassc this equals the IPA characters that
    # occur in the SASSC table; for other sources it is the source alphabet
    # instead -- confirm this is acceptable.
    accepted = None
    if sink == "sassc":
        accepted = {char for key in mapping for char in key}

    output = []
    for token in line.strip():
        if token.isspace():
            output.append(token)
            continue
        if accepted is not None:
            token = "".join(char for char in token if char in accepted)
        mapped = mapping.get(token)
        if not mapped:
            print("WARNING: could not map token ", token, file=sys.stderr)
        else:
            output.append(mapped)

    return "".join(output)
import os
import sys
import csv
import subprocess

import numpy as np

import convert_xsampa2ipa


## ======================= user define =======================
forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"


# The forced_alignment checkout has to be on sys.path before it can be imported.
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set


# X-SAMPA -> IPA symbol table, e.g. for converting a word such as 'e:j@X'.
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)

filenames = []
words = []
pronunciations = []
# newline='' is what the csv module asks for when it is given a file object.
with open(csvfile, encoding="utf-8", newline='') as fin:
    lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
    next(lines, None)  # skip the headers

    for line in lines:
        # Test the length first so that short or blank rows are skipped
        # instead of raising IndexError, and compare the word by value.
        if len(line) > 5 and line[1] != '':
            filenames.append(line[0])
            words.append(line[1])
            # Column 3 is treated as the X-SAMPA pronunciation of the word.
            word_xsampa = line[3]
            word_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, word_xsampa)
            # The IPA length mark is replaced by a plain colon before the
            # phone-set conversion (presumably what ipa2famehtk expects --
            # TODO confirm).
            word_ipa = word_ipa.replace('ː', ':')
            word_famehtk = convert_phone_set.ipa2famehtk(word_ipa)
            pronunciations.append(word_famehtk)

# Distinct space-separated tokens over all pronunciations; kept in a name so
# that the result is not thrown away.
phonelist = ' '.join(pronunciations)
phoneset = np.unique(phonelist.split(' '))



## ======================= forced alignment =======================
#if forced_alignment:
#    try:
#        scripts.run_command([