From 87abbbb95aeb0b72cd5b553cbec8e0901d472e4e Mon Sep 17 00:00:00 2001
From: yemaozi88 <428968@gmail.com>
Date: Sun, 27 Jan 2019 23:52:33 +0100
Subject: [PATCH] Automatically obtain the correspondence between lex.asr and
 lex.ipa; add docstring headers to the functions in fame_functions.py.

---
 .vs/acoustic_model/v15/.suo                 | Bin 89600 -> 95232 bytes
 _tmp/phone_to_be_searched.npy               | Bin 500 -> 0 bytes
 _tmp/translation_key.npy                    | Bin 368 -> 0 bytes
 .../__pycache__/defaultfiles.cpython-36.pyc | Bin 1260 -> 1260 bytes
 acoustic_model/convert_phone_set.py         |  10 +-
 acoustic_model/defaultfiles.py              |   1 -
 acoustic_model/fame_functions.py            | 234 +++++++++---------
 acoustic_model/fame_hmm.py                  |  83 +++++--
 acoustic_model/fame_phoneset.py             |  60 ++++-
 9 files changed, 244 insertions(+), 144 deletions(-)
 delete mode 100644 _tmp/phone_to_be_searched.npy
 delete mode 100644 _tmp/translation_key.npy

diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index ccccfcff5af26b0250002bbe105ee51ea032efa5..0b78f7c18ba1bfbd4d3cd5bf2b32eb15230e5e5c 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/_tmp/phone_to_be_searched.npy b/_tmp/phone_to_be_searched.npy
deleted file mode 100644
Binary files a/_tmp/phone_to_be_searched.npy and /dev/null differ
diff --git a/_tmp/translation_key.npy b/_tmp/translation_key.npy
deleted file mode 100644
index 96c1125fc128e683beffe38d4b916e6769a9989f..0000000000000000000000000000000000000000
Binary files a/_tmp/translation_key.npy and /dev/null differ
diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc
Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ
diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py
@@ ... @@
-#        if len(line) > 1:
-#            pronunciation = line[1]
-#            if phone in pronunciation:
-#                extracted.append(line)
-#    return extracted
-
 #def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
 #    """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """
@@ -128,25 +110,6 @@ import convert_phone_set
 #    return ipa
 
 
-def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
-    """ Make a script file for HCopy using the filelist in FAME! corpus. """
-
-    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
-    with open(filelist_txt) as fin:
-        filelist = fin.read()
-        filelist = filelist.split('\n')
-
-    with open(hcopy_scp, 'w') as fout:
-        for filename_ in filelist:
-            filename = filename_.replace('.TextGrid', '')
-
-            if len(filename) > 3: # remove '.', '..' and ''
-                wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
-                mfc_file = os.path.join(feature_dir, filename + '.mfc')
-
-                fout.write(wav_file + '\t' + mfc_file + '\n')
-
-
 #def make_filelist(input_dir, output_txt):
 #    """ Make a list of files in the input_dir. """
 #    filenames = os.listdir(input_dir)
@@ -191,98 +154,147 @@ def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_s
 #            f.write('{0}\t{1}\n'.format(WORD, key))
 
 
+def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
+    """ Make a script file for HCopy using the filelist in the FAME! corpus.
+
+    Args:
+        fame_dir (path): the directory of the FAME! corpus.
+        dataset (str): 'devel', 'test' or 'train'.
+        feature_dir (path): the directory where the features will be stored.
+        hcopy_scp (path): the script file for HCopy to be made.
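+
+    Example (a minimal sketch; the corpus root below is the one used
+        elsewhere in this repository, the feature and scp paths are
+        hypothetical):
+        >>> make_hcopy_scp_from_filelist_in_fame(
+        ...     r'd:\_corpus\FAME', 'devel', r'd:\_feature\devel', 'hcopy.scp')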
+
+    """
+    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
+    with open(filelist_txt) as fin:
+        filelist = fin.read()
+        filelist = filelist.split('\n')
+
+    with open(hcopy_scp, 'w') as fout:
+        for filename_ in filelist:
+            filename = filename_.replace('.TextGrid', '')
+
+            if len(filename) > 3:  # remove '.', '..' and ''
+                wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
+                mfc_file = os.path.join(feature_dir, filename + '.mfc')
+
+                fout.write(wav_file + '\t' + mfc_file + '\n')
 
 
 def load_lexicon(lexicon_file):
+    """ Load a lexicon file as a pandas DataFrame.
+
+    Args:
+        lexicon_file (path): the lexicon in the format 'word' \t 'pronunciation'.
+
+    Returns:
+        lex (DataFrame): the lexicon with the columns 'word' and 'pronunciation'.
+
+    """
     lex = pd.read_csv(lexicon_file, delimiter='\t', header=None, encoding="utf-8")
     lex.rename(columns={0: 'word', 1: 'pronunciation'}, inplace=True)
     return lex
 
 
-def get_phonelist(lexicon_asr):
-    """ Make a list of phones which appears in the lexicon. """
-
-    #with open(lexicon_file, "rt", encoding="utf-8") as fin:
-    #    lines = fin.read()
-    #    lines = lines.split('\n')
-    #    phonelist = set([])
-    #    for line in lines:
-    #        line = line.split('\t')
-    #        if len(line) > 1:
-    #            pronunciation = set(line[1].split())
-    #            phonelist = phonelist | pronunciation
-    lex = load_lexicon(lexicon_asr)
-    return set(' '.join(lex['pronunciation']).split(' '))
-
-
-def extract_unknown_phones(word_list, known_phones):
-    return [i for i in word_list if not i in known_phones]
-
-
-if __name__ == '__main__':
-    import time
-    timer_start = time.time()
-
-    #def get_translation_key():
-    dir_tmp = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
-    lexicon_ipa = r'd:\_corpus\FAME\lexicon\lex.ipa'
-    lexicon_asr = r'd:\_corpus\FAME\lexicon\lex.asr'
-
-    lex_ipa = load_lexicon(lexicon_ipa)
-    lex_asr = load_lexicon(lexicon_asr)
-    if 1:
-        phone_to_be_searched = fame_phoneset.phoneset_ipa[:]
-        translation_key = dict()
-        for word in lex_ipa['word']:
-            if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
-                ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
-                asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+def get_phoneset_from_lexicon(lexicon_file, phoneset='asr'):
+    """ Make a set of the phones which appear in the lexicon.
 
-                ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
-                asr_list = asr.split(' ')
+    Args:
+        lexicon_file (path): the lexicon in the format 'word' \t 'pronunciation'.
+        phoneset (str): the phoneset with which lexicon_file is written, 'asr' (default) or 'ipa'.
 
-                # if there are phones which is not in phone_to_be_searched
-                #if len([True for i in asr_list if i in phone_to_be_searched]) > 0:
-                if(len(ipa_list) == len(asr_list)):
-                    print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
-                    for ipa_, asr_ in zip(ipa_list, asr_list):
-                        if ipa_ in phone_to_be_searched:
-                            translation_key[ipa_] = asr_
-                            phone_to_be_searched.remove(ipa_)
+    Returns:
+        (set): the set of phones included in lexicon_file.
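+
+    Example (a minimal sketch; the lexicon path is the one used elsewhere in
+        this repository):
+        >>> phones = get_phoneset_from_lexicon(
+        ...     r'd:\_corpus\FAME\lexicon\lex.asr', phoneset='asr')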
-        print("elapsed time: {}".format(time.time() - timer_start))
+    """
+    assert phoneset in ['asr', 'ipa'], "phoneset should be 'asr' or 'ipa'"
 
-        np.save(os.path.join(dir_tmp, 'translation_key.npy'), translation_key)
-        np.save(os.path.join(dir_tmp, 'phone_to_be_searched.npy'), phone_to_be_searched)
-    else:
-        translation_key = np.load(os.path.join(dir_tmp, 'translation_key.npy')).item()
-        phone_to_be_searched = np.load(os.path.join(dir_tmp, 'phone_to_be_searched.npy')).item()
+    lex = load_lexicon(lexicon_file)
+    if phoneset == 'asr':
+        return set(' '.join(lex['pronunciation']).split(' '))
+    elif phoneset == 'ipa':
+        join_pronunciations = ''.join(lex['pronunciation'])
+        return set(convert_phone_set.split_word(join_pronunciations, fame_phoneset.multi_character_phones_ipa))
 
-    #phone_unknown = list(phone_to_be_searched)
-    ##phone_unknown.remove('')
-    #phone_known = list(translation_key.keys())
+def extract_unknown_phones(ipa, known_phones):
+    """ Extract the phones which are not yet known from a pronunciation written in IPA.
 
-    #p = phone_unknown[0]
+    Args:
+        ipa (str): a pronunciation written in IPA.
+        known_phones (list): the list of phones which are already known.
+
+    Returns:
+        (list): the phones which are not included in known_phones.
+
+    """
+    ipa_split = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+    return [i for i in ipa_split if i not in known_phones]
+
+
+def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
+    """ Get the correspondence between lexicon_file_ipa and lexicon_file_asr.
+
+    Args:
+        lexicon_file_ipa (path): the lexicon in the format 'word' \t 'pronunciation (IPA)'.
+        lexicon_file_asr (path): the lexicon in the format 'word' \t 'pronunciation (asr)',
+            in which each phone of 'pronunciation' is delimited by ' '.
+
+    Returns:
+        translation_key (dict): the translation key from IPA phones to asr phones.
+        phone_unknown (list): the IPA phones for which no counterpart was found in lexicon_file_asr.
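+
+    Example (a minimal sketch; the lexicon paths are the ones used elsewhere
+        in this repository):
+        >>> translation_key, phone_unknown = get_translation_key(
+        ...     r'd:\_corpus\FAME\lexicon\lex.ipa',
+        ...     r'd:\_corpus\FAME\lexicon\lex.asr')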
+
+    """
+    lex_ipa = load_lexicon(lexicon_file_ipa)
+    lex_asr = load_lexicon(lexicon_file_asr)
+    phone_unknown = fame_phoneset.phoneset_ipa[:]
+    translation_key = dict()
+    for word in lex_ipa['word']:
+        if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
+            ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+            asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
 
-    ### extract lines which contains 'unknown' phone.
-    #lex_ipa_ = lex_ipa[lex_ipa['pronunciation'].str.count(p)>0]
-    ##phone_unknown_ = phone_unknown[:]
-    ##phone_unknown_.remove(p)
-    #phone_known_ = phone_known[:]
-    #phone_known_.append(p)
-    #for index, row in lex_ipa_.iterrows():
-    #    ipa = row['pronunciation']
-    #    phone_extract_unknown_phones(asr_list, phone_known_):
+            ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+            asr_list = asr.split(' ')
 
-    #    # check the number of phones in phone_unknown_
-    #    if len([True for i in asr_list if i in phone_unknown_]) == 0:
-    #        word = row['word']
-    #        ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
-    #        print("{0}: {1} --> {2}".format(word, ipa, asr))
-    #        #print("{0}:{1}".format(index, row['pronunciation']))
+            # make a correspondence only when the two pronunciations align phone by phone.
+            if len(ipa_list) == len(asr_list):
+                print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+                for ipa_, asr_ in zip(ipa_list, asr_list):
+                    if ipa_ in phone_unknown:
+                        translation_key[ipa_] = asr_
+                        phone_unknown.remove(ipa_)
+    return translation_key, list(phone_unknown)
 
+
+def find_phone(lexicon_file, phone, phoneset='ipa'):
+    """ Extract the rows in which the phone is used from the lexicon_file.
 
-
-    
\ No newline at end of file
+    Args:
+        lexicon_file (path): the lexicon in the format 'word' \t 'pronunciation'.
+        phone (str): the phone to be searched.
+        phoneset (str): the phoneset with which lexicon_file is written, 'asr' or 'ipa' (default).
+
+    Returns:
+        extracted (DataFrame): the rows in which the phone is used.
+
+    ToDo:
+        * implement the case phoneset == 'asr'.
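+
+    Example (a minimal sketch; the lexicon path is the one used elsewhere in
+        this repository, 'ⁿ' is the nasalisation diacritic searched for):
+        >>> extracted = find_phone(r'd:\_corpus\FAME\lexicon\lex.ipa', 'ⁿ')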
+
+    """
+    assert phoneset in ['asr', 'ipa'], "phoneset should be 'asr' or 'ipa'"
+
+    lex = load_lexicon(lexicon_file)
+
+    # to reduce the calculation time, only check the rows which contain 'phone' at least once.
+    lex_ = lex[lex['pronunciation'].str.count(phone) > 0]
+
+    extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
+    for index, row in lex_.iterrows():
+        if phoneset == 'ipa':
+            pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_phoneset.multi_character_phones_ipa)
+            if phone in pronunciation:
+                extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
+                extracted = extracted.append(extracted_, ignore_index=True)
+    return extracted
\ No newline at end of file
diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py
index d6327e7..fe319d0 100644
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -6,6 +6,7 @@ import tempfile
 #import configparser
 #import subprocess
 #from collections import Counter
+import time
 
-#import numpy as np
+import numpy as np
 #import pandas as pd
@@ -27,8 +28,8 @@ from htk import pyhtk
 dataset_list = ['devel', 'test', 'train']
 
 # procedure
-extract_features = 1
-#conv_lexicon = 0
+extract_features = 0
+conv_lexicon = 1
 #check_lexicon = 0
 #make_mlf = 0
 #combine_files = 0
@@ -84,16 +85,14 @@ if not os.path.exists(tmp_dir):
 ## ======================= extract features =======================
 if extract_features:
     for dataset in dataset_list:
-    #for dataset in ['test']:
         print('==== {} ===='.format(dataset))
 
         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
         hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
         hcopy_scp.close()
-        #hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'HCopy.scp')
 
-        ## get a list of features (hcopy.scp) from the filelist in FAME! corpus
+        # get a list of features (hcopy.scp) from the filelist in FAME! corpus
         feature_dir_ = os.path.join(feature_dir, dataset)
         if not os.path.exists(feature_dir_):
             os.makedirs(feature_dir_)
@@ -101,32 +100,70 @@ if extract_features:
         # extract features
         print(">>> extracting features... \n")
         fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
-
-        #subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
-        #subprocess.call(subprocessStr, shell=True)
         pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
 
         # a script file for HCompV
         print(">>> making a script file for HCompV... \n")
-
-
-## ======================= make a list of features =======================
-#if make_feature_list:
-#    print("==== make a list of features ====\n")
-
-#    for dataset in dataset_list:
-#        print(dataset)
-
-        #feature_dir = output_dir + '\\mfc\\' + dataset
         hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
-
-        #am_func.make_filelist(feature_dir, hcompv_scp)
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
 
 
 ## ======================= convert lexicon from ipa to fame_htk =======================
 if conv_lexicon:
     print('==== convert lexicon from ipa 2 fame ====\n')
+
+    #dir_out = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
+    lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
+    lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
+    lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+
+    # get the correspondence between lex.ipa and lex.asr.
+    lex_asr = fame_functions.load_lexicon(lexicon_asr)
+    lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
+    if 1:
+        timer_start = time.time()
+        translation_key, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
+        print("elapsed time: {}".format(time.time() - timer_start))
+
+        np.save('translation_key_ipa2asr.npy', translation_key)
+        np.save('phone_unknown.npy', phone_unknown)
+    else:
+        translation_key = np.load('translation_key_ipa2asr.npy').item()
+        phone_unknown = np.load('phone_unknown.npy')
+        phone_unknown = list(phone_unknown)
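+
+    # a minimal sketch of applying the obtained key to one IPA pronunciation
+    # (the input 'sɔ:n' is a hypothetical example; phones without a
+    # correspondence are left unchanged):
+    #ipa_list = convert_phone_set.split_word('sɔ:n', fame_phoneset.multi_character_phones_ipa)
+    #asr_list = [translation_key.get(p, p) for p in ipa_list]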
+
+
+    ## manually check the correspondence for the phones in phone_unknown.
+    #p = phone_unknown[0]
+    #lex_ipa_ = fame_functions.find_phone(lexicon_ipa, p, phoneset='ipa')
+
+    #for word in lex_ipa_['word']:
+    #    ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+    #    if np.sum(lex_asr['word'] == word) > 0:
+    #        asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+
+    #        ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+    #        asr_list = asr.split(' ')
+    #        if p in ipa_list and (len(ipa_list) == len(asr_list)):
+    #            print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+    #            for ipa_, asr_ in zip(ipa_list, asr_list):
+    #                if ipa_ in phone_unknown:
+    #                    translation_key[ipa_] = asr_
+    #                    phone_unknown.remove(ipa_)
+
+
+    ## check if all the phones in lexicon_ipa are in fame_phoneset.py.
+    #timer_start = time.time()
+    #phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
+    #print("elapsed time: {}".format(time.time() - timer_start))
+
+    #phoneset_py = fame_phoneset.phoneset_ipa
+    #set(phoneset_lex) - set(phoneset_py)
+
+    ##timer_start = time.time()
+    ##extracted = fame_functions.find_phone(lexicon_ipa, 'ⁿ')
+    ##print("elapsed time: {}".format(time.time() - timer_start))
+
 
 # lex.asr is Kaldi compatible version of lex.ipa.
 # to check...
@@ -140,13 +177,13 @@ if conv_lexicon:
 #    fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
 
 # convert each lexicon from ipa description to fame_htk phoneset.
-    am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-    am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+    #am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
+    #am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
 
 # combine lexicon
 # pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
 # therefore there is no overlap between lex_asr and lex_oov.
-    am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+    #am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
 
 
 ## ======================= check if all the phones are successfully converted =======================
diff --git a/acoustic_model/fame_phoneset.py b/acoustic_model/fame_phoneset.py
index b1a07de..2c2387a 100644
--- a/acoustic_model/fame_phoneset.py
+++ b/acoustic_model/fame_phoneset.py
@@ -1,41 +1,79 @@
+""" Definition of the phones to be used. """
+
+## phones in IPA.
 phoneset_ipa = [
     # vowels
     'i̯',
+    'i̯ⁿ',
     'y',
     'i',
+    'i.',
+    'iⁿ',
     'i:',
+    'i:ⁿ',
     'ɪ',
-    'ɪ:',
+    'ɪⁿ',
+    'ɪ.',
+    #'ɪ:', # not included in lex.ipa
+    'ɪ:ⁿ',
     'e',
     'e:',
+    'e:ⁿ',
     'ə',
+    'əⁿ',
     'ə:',
     'ɛ',
+    'ɛ.',
+    'ɛⁿ',
     'ɛ:',
+    'ɛ:ⁿ',
     'a',
+    'aⁿ',
+    'a.',
     'a:',
+    'a:ⁿ',
     'ṷ',
-    'ú',
+    'ṷ.',
+    'ṷⁿ',
+    #'ú', # only appears in the words 'feeste' (út) and 'gaste' (út), which are 'f e: s t ə' and 'yn' in lex_asr.
     'u',
+    'uⁿ',
+    'u.',
     'u:',
+    'u:ⁿ',
     'ü',
+    'ü.',
+    'üⁿ',
     'ü:',
+    'ü:ⁿ',
     'o',
+    'oⁿ',
+    'o.',
     'o:',
+    'o:ⁿ',
     'ö',
+    'ö.',
+    'öⁿ',
     'ö:',
+    'ö:ⁿ',
     'ɔ',
+    'ɔ.',
+    'ɔⁿ',
     'ɔ:',
-    'ɔ̈',
+    'ɔ:ⁿ',
+    #'ɔ̈', # not included in lex.ipa
+    'ɔ̈.',
     'ɔ̈:',
 
     # plosives
     'p',
     'b',
     't',
+    'tⁿ',
     'd',
     'k',
     'g',
+    'ɡ', # LATIN SMALL LETTER SCRIPT G (U+0261), a variant spelling of 'g' in lex.ipa.
 
     # nasals
     'm',
@@ -48,8 +86,22 @@ phoneset_ipa = [
     's',
     's:',
     'z',
+    'zⁿ',
     'x',
     'h',
+
+    # taps and flaps
+    'r',
+    'r.', # only appears in the words 'mearpartijestelsel' (which does not exist in lex_asr) and 'tenoarpartij'.
+    'r:', # only appears in the words 'mûsearflearmûs' and 'sjochdêr'.
+
+    # approximants
+    'j',
+    'j.',
+    'l'
 ]
 
+## the list of multi-character phones.
+# e.g. 'i̯ⁿ' has string length 3, but in the code it is treated as a single phone.
 multi_character_phones_ipa = [i for i in phoneset_ipa if len(i) > 1]
+# sort by length in descending order, so that longer phones are matched first.
+multi_character_phones_ipa.sort(key=len, reverse=True)
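+
+# a minimal self-check sketch: after the sort above, longer phones come first,
+# so that (presumably) the greedy matching in convert_phone_set.split_word()
+# tries e.g. 'i̯ⁿ' before its prefix 'i̯'.
+if __name__ == '__main__':
+    assert all(len(a) >= len(b) for a, b
+               in zip(multi_character_phones_ipa, multi_character_phones_ipa[1:]))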