lexicon is made.

yemaozi88 2019-01-29 21:52:11 +01:00
parent 8cda93de75
commit dc6b7b84b6
11 changed files with 241 additions and 424 deletions

Binary file not shown.

View File

@@ -23,7 +23,7 @@
   </PropertyGroup>
   <ItemGroup>
     <Compile Include="check_novoapi.py" />
-    <Compile Include="convert_phone_set.py">
+    <Compile Include="convert_phoneset.py">
       <SubType>Code</SubType>
     </Compile>
     <Compile Include="convert_xsampa2ipa.py">
@@ -32,8 +32,6 @@
     <Compile Include="defaultfiles.py">
       <SubType>Code</SubType>
     </Compile>
-    <Compile Include="fame_asr.py" />
-    <Compile Include="fame_ipa.py" />
     <Compile Include="fame_test.py">
       <SubType>Code</SubType>
     </Compile>
@@ -52,9 +50,20 @@
       <SubType>Code</SubType>
     </Compile>
     <Compile Include="fame_hmm.py" />
+    <Compile Include="phoneset\fame_asr.py" />
+    <Compile Include="phoneset\fame_ipa.py" />
   </ItemGroup>
   <ItemGroup>
     <Content Include="config.ini" />
+    <Content Include="phoneset\fame_ipa2asr.npy" />
+    <Content Include="phoneset\output_get_translation_key_phone_unknown.npy" />
+    <Content Include="phoneset\output_get_translation_key_translation_key.npy" />
+    <Content Include="phoneset\__pycache__\fame_asr.cpython-36.pyc" />
+    <Content Include="phoneset\__pycache__\fame_ipa.cpython-36.pyc" />
+  </ItemGroup>
+  <ItemGroup>
+    <Folder Include="phoneset\" />
+    <Folder Include="phoneset\__pycache__\" />
   </ItemGroup>
   <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
   <!-- Uncomment the CoreCompile target to enable the Build command in

View File

@@ -26,4 +26,15 @@ def split_word(word, multi_character_phones):
         (word_separated) (list): the word split into the given phoneset.
     """
-    return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]
+    return [phone
+            for phone in multi_character_tokenize(word.strip(), multi_character_phones)
+            ]
+
+
+def convert_phoneset(word_list, translation_key):
+    """ convert a list of phones from one phoneset to another.
+
+    Args:
+        word_list (list): a list of phones written in the given phoneset.
+        translation_key (dict): a mapping from source phones to target phones;
+            phones not in the key are kept as they are.
+    """
+    return [translation_key.get(phone, phone) for phone in word_list]
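Taken together, the two helpers split a written form into phones and re-express them in another phoneset. A minimal usage sketch (the inventory and mapping below are illustrative, not the FAME! sets):

from convert_phoneset import split_word, convert_phoneset

# illustrative inventory: 'a:' is a multi-character phone and must be
# matched before plain 'a'.
multi_character_phones = ['a:']
translation_key = {'a:': 'A:'}  # hypothetical source-to-target mapping

phones = split_word('ba:t', multi_character_phones)
print(phones)                                     # expected: ['b', 'a:', 't']
print(convert_phoneset(phones, translation_key))  # expected: ['b', 'A:', 't']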

View File

@@ -1,127 +0,0 @@
-""" definition of the phones to be used. """
-
-# phonese in {FAME}/lexicon/lex.asr
-phoneset = [
-    # vowels
-    'a',
-    'a:',
-    'e',
-    'e:',
-    'i',
-    'i:',
-    'i̯',
-    'o',
-    'o:',
-    'ö',
-    'ö:',
-    'u',
-    'u:',
-    'ü',
-    'ü:',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
-    'ṷ',
-    'y',
-    'ɔ',
-    'ɔ:',
-    'ɔ̈',
-    'ɔ̈:',
-    'ə',
-    'ɛ',
-    'ɛ:',
-    'ɪ',
-    'ɪ:',
-
-    # plosives
-    'p',
-    'b',
-    't',
-    'd',
-    'k',
-    'g',
-    'ɡ', # = 'g'
-
-    # nasals
-    'm',
-    'n',
-    'ŋ',
-
-    # fricatives
-    'f',
-    'v',
-    's',
-    's:',
-    'z',
-    'x',
-    'h',
-
-    # tap and flip
-    'r',
-    'r:',
-
-    # approximant
-    'j',
-    'l'
-]
-
-
-## reduce the number of phones.
-# the phones which seldom occur are replaced with another more popular phones.
-# replacements are based on the advice from Martijn Wieling.
-reduction_key = {
-    'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
-}
-# already removed beforehand in phoneset. Just to be sure.
-phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
-
-phoneset_short = [reduction_key.get(i, i) for i in phoneset
-                  if not i in phones_to_be_removed]
-phoneset_short = list(set(phoneset_short))
-phoneset_short.sort()
-
-
-## translation_key to htk format (ascii).
-# phones which gives UnicodeEncodeError when phone.encode("ascii")
-# are replaced with other characters.
-translation_key_asr2htk = {
-    'i̯': 'i_',
-    'ṷ': 'u_',
-
-    # on the analogy of German umlaut, 'e' is used.
-    'ö': 'oe', 'ö:': 'oe:',
-    'ü': 'ue', 'ü:': 'ue:',
-
-    # on the analogy of Chinese...
-    'ŋ': 'ng',
-
-    # refer to Xsampa.
-    'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
-    'ɛ': 'E', 'ɛ:': 'E:',
-    'ɪ': 'I', 'ɪ:': 'I:',
-
-    # it is @ in Xsampa, but that is not handy on HTK.
-    'ə': 'A'
-}
-phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
-
-## check
-#for i in phoneset_short:
-#    try:
-#        print("{0} --> {1}".format(i, i.encode("ascii")))
-#    except UnicodeEncodeError:
-#        print(">>> {}".format(i))
-
-
-## the list of multi character phones.
-# for example, the length of 'a:' is 3, but in the codes it is treated as one letter.
-# original.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
-
-# phonset reduced.
-multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
-multi_character_phones_short.sort(key=len, reverse=True)
-
-# htk compatible.
-multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
-multi_character_phones_htk.sort(key=len, reverse=True)
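The multi_character_phones lists are sorted longest-first because the tokenizer has to try 'a:' before plain 'a' when it scans a word greedily. The multi_character_tokenize used by split_word is not shown in this commit; a plausible sketch of such a greedy matcher, assuming a longest-first phone list, would be:

def multi_character_tokenize(word, multi_character_phones):
    """Greedy left-to-right tokenizer: at each position, emit the longest
    matching multi-character phone, otherwise a single character.
    NOTE: hypothetical reimplementation; the repo's own version may differ."""
    tokens = []
    i = 0
    while i < len(word):
        for phone in multi_character_phones:  # sorted longest-first
            if word.startswith(phone, i):
                tokens.append(phone)
                i += len(phone)
                break
        else:
            tokens.append(word[i])
            i += 1
    return tokens

# e.g. multi_character_tokenize('ba:t', ['a:']) -> ['b', 'a:', 't']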

View File

@@ -1,4 +1,5 @@
 import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import sys
 from collections import Counter
@@ -8,38 +9,8 @@ import numpy as np
 import pandas as pd

 import defaultfiles as default
-from phoneset import fame_ipa
-import convert_phone_set
+import convert_phoneset
+from phoneset import fame_ipa, fame_asr

-#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
-#    """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """
-#    lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
-#    with open(lexicon_file_out, "w", encoding="utf-8") as fout:
-#        for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
-#            pronunciation_no_space = pronunciation.replace(' ', '')
-#            pronunciation_famehtk = convert_phone_set.ipa2famehtk(pronunciation_no_space)
-#            if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
-#                fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
-
-
-#def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
-#    """ Combine two lexicon files and sort by words. """
-#    with open(lexicon_file1, "rt", encoding="utf-8") as fin:
-#        lines1 = fin.read()
-#        lines1 = lines1.split('\n')
-#    with open(lexicon_file2, "rt", encoding="utf-8") as fin:
-#        lines2 = fin.read()
-#        lines2 = lines2.split('\n')
-
-#    lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
-#    lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
-#    lex = pd.concat([lex1, lex2])
-#    lex = lex.sort_values(by='word', ascending=True)
-#    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')


 #def read_fileFA(fileFA):
 #    """
@@ -291,4 +262,74 @@ def find_phone(lexicon_file, phone, phoneset_name='ipa'):
             if phone in pronunciation:
                 extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
                 extracted = extracted.append(extracted_, ignore_index=True)
     return extracted
+
+
+def asr2htk_space_delimited(pronunciation):
+    """ convert a pronunciation from the asr phoneset to the htk phoneset.
+
+    Args:
+        pronunciation (str): space-delimited phones in the asr phoneset.
+
+    Returns:
+        (str): space-delimited phones in the htk phoneset (ascii).
+
+    """
+    pronunciation_short = [fame_asr.reduction_key.get(i, i) for i in pronunciation.split(' ')
+                           if i not in fame_asr.phones_to_be_removed]
+    return ' '.join(convert_phoneset.convert_phoneset(
+        pronunciation_short, fame_asr.translation_key_asr2htk))
+
+
+def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
+    """ Convert a lexicon file from the asr phoneset to the htk phoneset (ascii).
+
+    Args:
+        lexicon_file_asr (path): a lexicon file written in the asr phoneset, e.g. fame/lexicon/lex.asr.
+        lexicon_file_htk (path): the output lexicon file, written in the htk phoneset (ascii).
+
+    """
+    lex_asr = load_lexicon(lexicon_file_asr)
+
+    def asr2htk_space_delimited_(row):
+        return asr2htk_space_delimited(row['pronunciation'])
+
+    lex_htk = pd.DataFrame({
+        'word': lex_asr['word'],
+        'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
+    })
+    lex_htk = lex_htk.loc[:, ['word', 'pronunciation']]  # ensure the column order: word, pronunciation
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+    return
+
+
+def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
+    """ Combine two lexicon files and sort by words.
+
+    Args:
+        lexicon_file1, lexicon_file2 (path): input lexicon files.
+        lexicon_out (path): the output file, in which lexicon_file1 and lexicon_file2 are combined and sorted by word.
+
+    """
+    lex1 = load_lexicon(lexicon_file1)
+    lex2 = load_lexicon(lexicon_file2)
+    lex = pd.concat([lex1, lex2])
+    lex = lex.sort_values(by='word', ascending=True)
+    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+
+def fix_single_quote(lexicon_file):
+    """ add '\' before each single quote at the beginning of a word.
+
+    Args:
+        lexicon_file (path): lexicon file, which will be overwritten.
+
+    """
+    lex = load_lexicon(lexicon_file)
+    for i in lex[lex['word'].str.startswith('\'')].index.values:
+        lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
+    # to_csv does not work with a space separator, therefore all tabs should be replaced manually afterwards.
+    #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+    return
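The new conversion chain is easiest to see on a single pronunciation: reduction_key first folds rare phones into frequent ones, then translation_key_asr2htk maps anything non-ascii to an HTK-safe spelling. A minimal check (the input string is made up, but uses phones from the asr set):

import fame_functions

# 'ö:' becomes ascii 'oe:' via translation_key_asr2htk, and 'r:' is first
# reduced to 'r' via reduction_key.
print(fame_functions.asr2htk_space_delimited('ö: r: t'))  # expected: 'oe: r t'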

View File

@@ -5,7 +5,6 @@ os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import tempfile
 #import configparser
 #import subprocess
-#from collections import Counter
 import time

 import numpy as np
@@ -29,44 +28,21 @@ dataset_list = ['devel', 'test', 'train']

 # procedure
 extract_features = 0
-conv_lexicon = 1
-#check_lexicon = 0
-#make_mlf = 0
-#combine_files = 0
-#flat_start = 0
-#train_model = 1
-
-#sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
-#sys.path.append(forced_alignment_module)
-#from forced_alignment import convert_phone_set
+make_lexicon = 0
+make_mlf = 0
+combine_files = 0
+flat_start = 0
+train_model = 0


 ## ======================= load variables =======================
-#config = configparser.ConfigParser()
-#config.sections()
-#config.read(config_ini)
-
-#config_hcopy = config['Settings']['config_hcopy']
-#config_train = config['Settings']['config_train']
-#mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
-#FAME_dir = config['Settings']['FAME_dir']
-
-#lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
-#lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
-#lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
-#lex_asr = FAME_dir + '\\lexicon\\lex.asr'
-#lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
-#lex_oov = FAME_dir + '\\lexicon\\lex.oov'
-#lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
-##lex_ipa = FAME_dir + '\\lexicon\\lex.ipa'
-##lex_ipa_ = FAME_dir + '\\lexicon\\lex.ipa_'
-##lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
-#lex_htk = FAME_dir + '\\lexicon\\lex_original.htk'
-#lex_htk_ = FAME_dir + '\\lexicon\\lex.htk'
+lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
+lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_oov = os.path.join(lexicon_dir, 'lex.oov')
+lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
+lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -88,8 +64,10 @@ if not os.path.exists(tmp_dir):

 ## ======================= extract features =======================
 if extract_features:
+    print('==== extract features ====\n')
+
     for dataset in dataset_list:
-        print('==== {} ===='.format(dataset))
+        print('==== dataset: {} ===='.format(dataset))

         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
@@ -112,48 +90,28 @@ if extract_features:
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')


-## ======================= convert lexicon from ipa to fame_htk =======================
-if conv_lexicon:
-    print('==== convert lexicon from ipa 2 fame ====\n')
-
-    # convert each lexicon from ipa description to fame_htk phoneset.
-    #am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-    #am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+## ======================= make lexicon for HTK =======================
+if make_lexicon:
+    print('==== make lexicon for HTK ====\n')
+
+    # convert each lexicon from fame_asr phoneset to fame_htk phoneset.
+    print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
+    fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
+    fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

     # combine lexicon
+    print('>>> combining lexicon files into one lexicon... \n')
     # pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
     # therefore there is no overlap between lex_asr and lex_oov.
-    #am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+    fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

-
-## ======================= check if all the phones are successfully converted =======================
-if check_lexicon:
-    print("==== check if all the phones are successfully converted. ====\n")
-
-    # the phones used in the lexicon.
-    phonelist_asr = am_func.get_phonelist(lex_asr)
-    phonelist_oov = am_func.get_phonelist(lex_oov)
-    phonelist_htk = am_func.get_phonelist(lex_htk)
-    phonelist = phonelist_asr.union(phonelist_oov)
-
-    # the lines which include a specific phone.
-    lines = am_func.find_phone(lex_asr, 'g')
-
-    # statistics over the lexicon
-    lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-    pronunciation = lexicon_htk['pronunciation']
-    phones_all = []
-    for word in pronunciation:
-        phones_all = phones_all + word.split()
-    c = Counter(phones_all)
-
-## =======================
-## manually make changes to the pronunciation dictionary and save it as lex.htk
-## =======================
-# (1) Replace all tabs with single space;
-# (2) Put a '\' before any dictionary entry beginning with single quote
-#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+    ## =======================
+    ## manually make changes to the pronunciation dictionary and save it as lex.htk
+    ## =======================
+    # (1) Replace all tabs with single space;
+    # (2) Put a '\' before any dictionary entry beginning with single quote
+    #http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+    fame_functions.fix_single_quote(lexicon_htk)


 ## ======================= make label file =======================
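For reference, the "script file for HCopy" mentioned in the feature-extraction step is HTK's scp list. Its construction lives in the repo's file-handling helper (fh), which this diff does not show; a hypothetical equivalent that writes one 'source.wav target.mfc' pair per line, the format HCopy -S expects:

import os

def make_hcopy_scp(wav_dir, feature_dir, scp_file):
    # one 'source target' pair per line; paths and helper name are assumptions
    with open(scp_file, 'w') as f:
        for wav in sorted(os.listdir(wav_dir)):
            if wav.endswith('.wav'):
                mfc = os.path.splitext(wav)[0] + '.mfc'
                f.write('{0} {1}\n'.format(os.path.join(wav_dir, wav),
                                           os.path.join(feature_dir, mfc)))

# HCopy would then be run along the lines of: HCopy -C config_hcopy -S hcompv.scp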

View File

@@ -1,107 +0,0 @@
-""" definition of the phones to be used. """
-
-phoneset = [
-    # vowels
-    'i̯',
-    'i̯ⁿ',
-    'y',
-    'i',
-    'i.',
-    'iⁿ',
-    'i:',
-    'i:ⁿ',
-    'ɪ',
-    'ɪⁿ',
-    'ɪ.',
-    #'ɪ:', # not included in lex.ipa
-    'ɪ:ⁿ',
-    'e',
-    'e:',
-    'e:ⁿ',
-    'ə',
-    'əⁿ',
-    'ə:',
-    'ɛ',
-    'ɛ.',
-    'ɛⁿ',
-    'ɛ:',
-    'ɛ:ⁿ',
-    'a',
-    'aⁿ',
-    'a.',
-    'a:',
-    'a:ⁿ',
-    'ṷ',
-    'ṷ.',
-    'ṷⁿ',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
-    'u',
-    'uⁿ',
-    'u.',
-    'u:',
-    'u:ⁿ',
-    'ü',
-    'ü.',
-    'üⁿ',
-    'ü:',
-    'ü:ⁿ',
-    'o',
-    'oⁿ',
-    'o.',
-    'o:',
-    'o:ⁿ',
-    'ö',
-    'ö.',
-    'öⁿ',
-    'ö:',
-    'ö:ⁿ',
-    'ɔ',
-    'ɔ.',
-    'ɔⁿ',
-    'ɔ:',
-    'ɔ:ⁿ',
-    #'ɔ̈', # not included in lex.ipa
-    'ɔ̈.',
-    'ɔ̈:',
-
-    # plosives
-    'p',
-    'b',
-    't',
-    'tⁿ',
-    'd',
-    'k',
-    'g',
-    'ɡ', # = 'g'
-
-    # nasals
-    'm',
-    'n',
-    'ŋ',
-
-    # fricatives
-    'f',
-    'v',
-    's',
-    's:',
-    'z',
-    'zⁿ',
-    'x',
-    'h',
-
-    # tap and flip
-    'r',
-    'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
-    'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
-
-    # approximant
-    'j',
-    'j.',
-    'l'
-]
-
-
-## the list of multi character phones.
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
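The IPA set keeps nasalized (ⁿ) and syllable-boundary (.) variants as separate phones; the commented-out conversion code in fame_test.py later in this commit strips both before mapping to the ASR set. A small sketch of that normalization, grounded in those commented replace() calls:

def normalize_ipa(pronunciation):
    # drop nasalization marks and syllable boundaries, so that e.g. 'aⁿ'
    # and 'a.' fall together with plain 'a' before the ipa-to-asr mapping.
    return pronunciation.replace(u'ⁿ', '').replace('.', '')

print(normalize_ipa(u'o:ⁿ p a.'))  # expected: 'o: p a'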

View File

@@ -1,7 +1,7 @@
 import sys
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+from collections import Counter
 import time

 import numpy as np
@@ -11,12 +11,12 @@ import fame_functions
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 from phoneset import fame_ipa, fame_asr
+import convert_phoneset

 lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
 lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
 lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

 ## check if all the phones in lexicon.ipa are in fame_ipa.py.
 #timer_start = time.time()
@@ -64,6 +64,7 @@ else:
 #        if ipa_ in phone_unknown:
 #            translation_key_ipa2asr[ipa_] = asr_
 #            phone_unknown.remove(ipa_)
+
 translation_key_ipa2asr['ə:'] = 'ə'
 translation_key_ipa2asr['r.'] = 'r'
 translation_key_ipa2asr['r:'] = 'r'
@@ -71,23 +72,32 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)

 ## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
+#timer_start = time.time()
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
+#phoneset_lex.remove("")
+#phoneset_asr = list(set(translation_key_ipa2asr.values()))
+#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
+#    set(phoneset_lex) - set(phoneset_asr)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## check if all the phones in lexicon.htk are in fame_asr.py.
 timer_start = time.time()
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
-phoneset_lex.remove("")
-phoneset_asr = list(set(translation_key_ipa2asr.values()))
-print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
-    set(phoneset_lex) - set(phoneset_asr)))
+phoneset_htk = fame_asr.phoneset_htk
+phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+phoneset_lex.remove('')
+print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+    set(phoneset_htk) - set(phoneset_lex)))
 print("elapsed time: {}".format(time.time() - timer_start))

-## make the translation key between asr to htk.
-#multi_character_phones = [i for i in phoneset_asr if len(i) > 1]
-#multi_character_phones.sort(key=len, reverse=True)
-
-#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
-#with open(lex_ipa_, "w", encoding="utf-8") as fout:
-#    for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
-#        # ignore nasalization and '.'
-#        pronunciation_ = pronunciation.replace(u'ⁿ', '')
-#        pronunciation_ = pronunciation_.replace('.', '')
-#        pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
-#        fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
+# statistics over the lexicon
+lex_htk = fame_functions.load_lexicon(lexicon_htk)
+phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+c = Counter(phones_all)
+
+lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+# to_csv does not work with a space separator, therefore all tabs should be replaced manually afterwards.
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
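Incidentally, fame_ipa2asr.npy above holds a plain dict, so np.save pickles it; reading it back requires unwrapping the 0-d object array, and on NumPy >= 1.16.3 also an explicit allow_pickle=True. A short sketch:

import numpy as np

np.save('fame_ipa2asr.npy', {'ə:': 'ə', 'r.': 'r'})  # np.save pickles non-array objects

# np.load returns a 0-d object array; .item() unwraps the dict.
translation_key = np.load('fame_ipa2asr.npy', allow_pickle=True).item()
print(translation_key['r.'])  # 'r'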

View File

@@ -1,74 +1,40 @@
 """ definition of the phones to be used. """
+
+# phones in {FAME}/lexicon/lex.asr
 phoneset = [
     # vowels
-    'i̯',
-    'i̯ⁿ',
-    'y',
-    'i',
-    'i.',
-    'iⁿ',
-    'i:',
-    'i:ⁿ',
-    'ɪ',
-    'ɪⁿ',
-    'ɪ.',
-    #'ɪ:', # not included in lex.ipa
-    'ɪ:ⁿ',
+    'a',
+    'a:',
     'e',
     'e:',
-    'e:ⁿ',
-    'ə',
-    'əⁿ',
-    'ə:',
-    'ɛ',
-    'ɛ.',
-    'ɛⁿ',
-    'ɛ:',
-    'ɛ:ⁿ',
-    'a',
-    'aⁿ',
-    'a.',
-    'a:',
-    'a:ⁿ',
-    'ṷ',
-    'ṷ.',
-    'ṷⁿ',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
-    'u',
-    'uⁿ',
-    'u.',
-    'u:',
-    'u:ⁿ',
-    'ü',
-    'ü.',
-    'üⁿ',
-    'ü:',
-    'ü:ⁿ',
+    'i',
+    'i:',
+    'i̯',
     'o',
-    'oⁿ',
-    'o.',
     'o:',
-    'o:ⁿ',
     'ö',
-    'ö.',
-    'öⁿ',
     'ö:',
-    'ö:ⁿ',
+    'u',
+    'u:',
+    'ü',
+    'ü:',
+    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
+    'ṷ',
+    'y',
     'ɔ',
-    'ɔ.',
-    'ɔⁿ',
     'ɔ:',
-    'ɔ:ⁿ',
-    #'ɔ̈', # not included in lex.ipa
-    'ɔ̈.',
+    'ɔ̈',
     'ɔ̈:',
+    'ə',
+    'ɛ',
+    'ɛ:',
+    'ɪ',
+    'ɪ:',

     # plosives
     'p',
     'b',
     't',
-    'tⁿ',
     'd',
     'k',
     'g',
@@ -85,22 +51,77 @@ phoneset = [
     's',
     's:',
     'z',
-    'zⁿ',
     'x',
     'h',

     # tap and flip
     'r',
-    'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
-    'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
+    'r:',

     # approximant
     'j',
-    'j.',
     'l'
 ]
+
+
+## reduce the number of phones.
+# the phones which seldom occur are replaced with other, more frequent phones.
+# replacements are based on the advice from Martijn Wieling.
+reduction_key = {
+    'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
+}
+# already removed beforehand in phoneset. Just to be sure.
+phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
+
+phoneset_short = [reduction_key.get(i, i) for i in phoneset
+                  if i not in phones_to_be_removed]
+phoneset_short = list(set(phoneset_short))
+phoneset_short.sort()
+
+
+## translation_key to htk format (ascii).
+# phones which give UnicodeEncodeError on phone.encode("ascii")
+# are replaced with other characters.
+translation_key_asr2htk = {
+    'i̯': 'i_',
+    'ṷ': 'u_',
+
+    # on the analogy of German umlaut, 'e' is used.
+    'ö': 'oe', 'ö:': 'oe:',
+    'ü': 'ue', 'ü:': 'ue:',
+
+    # on the analogy of Chinese...
+    'ŋ': 'ng',
+
+    # refer to Xsampa.
+    'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
+    'ɛ': 'E', 'ɛ:': 'E:',
+    'ɪ': 'I', 'ɪ:': 'I:',
+
+    # it is @ in Xsampa, but that is not handy on HTK.
+    'ə': 'A'
+}
+phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
+
+## check
+#for i in phoneset_short:
+#    try:
+#        print("{0} --> {1}".format(i, i.encode("ascii")))
+#    except UnicodeEncodeError:
+#        print(">>> {}".format(i))
+

 ## the list of multi character phones.
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
+# for example, the length of 'a:' is 2, but in the code it is treated as one letter.
+# original.
 multi_character_phones = [i for i in phoneset if len(i) > 1]
 multi_character_phones.sort(key=len, reverse=True)
+
+# phoneset reduced.
+multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
+multi_character_phones_short.sort(key=len, reverse=True)
+
+# htk compatible.
+multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
+multi_character_phones_htk.sort(key=len, reverse=True)
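Since the whole point of translation_key_asr2htk is that HTK only accepts ascii, the commented-out check above can be tightened into a loop over the final list. A small sketch (importing the module from its new phoneset package location):

from phoneset import fame_asr

# every phone in the htk phoneset must survive an ascii round-trip;
# encode() raises UnicodeEncodeError if the translation key misses one.
for phone in fame_asr.phoneset_htk:
    phone.encode("ascii")

print('phoneset_htk is ascii-safe: {} phones'.format(len(fame_asr.phoneset_htk)))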

View File

@@ -34,7 +34,7 @@ phoneset = [
     'ṷ',
     'ṷ.',
     'ṷⁿ',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
+    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
     'u',
     'uⁿ',
     'u.',
@@ -100,6 +100,7 @@ phoneset = [
     'l'
 ]
+

 ## the list of multi character phones.
 # for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
 multi_character_phones = [i for i in phoneset if len(i) > 1]
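The "treated as one letter" remark is about code points versus phones: 'i̯ⁿ' is one phone but three Unicode code points, which is exactly why these multi-character lists exist and why they are sorted longest-first. A quick demonstration:

phone = 'i̯ⁿ'                          # 'i' + combining inverted breve + superscript n
print(len(phone))                     # 3
print([hex(ord(c)) for c in phone])   # ['0x69', '0x32f', '0x207f']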