lexicon is made.

2019-01-29 21:52:11 +01:00
parent 8cda93de75
commit dc6b7b84b6
11 changed files with 241 additions and 424 deletions
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -23,7 +23,7 @@
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="check_novoapi.py" />
-    <Compile Include="convert_phone_set.py">
+    <Compile Include="convert_phoneset.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="convert_xsampa2ipa.py">
@@ -32,8 +32,6 @@
    <Compile Include="defaultfiles.py">
      <SubType>Code</SubType>
    </Compile>
-    <Compile Include="fame_asr.py" />
-    <Compile Include="fame_ipa.py" />
    <Compile Include="fame_test.py">
      <SubType>Code</SubType>
    </Compile>
@@ -52,9 +50,20 @@
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="fame_hmm.py" />
+    <Compile Include="phoneset\fame_asr.py" />
+    <Compile Include="phoneset\fame_ipa.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
+    <Content Include="phoneset\fame_ipa2asr.npy" />
+    <Content Include="phoneset\output_get_translation_key_phone_unknown.npy" />
+    <Content Include="phoneset\output_get_translation_key_translation_key.npy" />
+    <Content Include="phoneset\__pycache__\fame_asr.cpython-36.pyc" />
+    <Content Include="phoneset\__pycache__\fame_ipa.cpython-36.pyc" />
+  </ItemGroup>
+  <ItemGroup>
+    <Folder Include="phoneset\" />
+    <Folder Include="phoneset\__pycache__\" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
--- a/acoustic_model/convert_phone_set.py
+++ b/acoustic_model/convert_phone_set.py
@@ -26,4 +26,15 @@ def split_word(word, multi_character_phones):
 		(word_seperated) (list): the word splitted in given phoneset. 

 	"""
-	return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]
+	return [phone 
+		 for phone in multi_character_tokenize(word.strip(), multi_character_phones)
+		 ]
+
+
+def convert_phoneset(word_list, translation_key):
+	""" Translate each phone in a list using a translation key.
+
+	Args:
+		word_list (list): a list of phones written in given phoneset.
+		translation_key (dict): maps a phone to the phone which replaces it. 
+			phones which are not a key of translation_key are kept as they are.
+
+	Returns:
+		(list): the translated phones, in the same order as in word_list.
+	"""
+	return [translation_key.get(phone, phone) for phone in word_list]
--- a/acoustic_model/fame_asr.py
+++ b/acoustic_model/fame_asr.py
@@ -1,127 +0,0 @@
-""" definition of the phones to be used. """
-
-# phonese in {FAME}/lexicon/lex.asr
-phoneset = [
-	# vowels
-	'a',
-	'a:',
-	'e',
-	'e:',
-	'i',
-	'i:',
-	'i̯',
-	'o',
-	'o:',
-	'ö',
-	'ö:',
-	'u',
-	'u:',
-	'ü',
-	'ü:',
-	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone. 
-	'ṷ',
-	'y',
-	'ɔ',
-	'ɔ:',
-	'ɔ̈', 
-	'ɔ̈:',
-	'ə',
-	'ɛ',
-	'ɛ:',
-	'ɪ',
-	'ɪ:',
-
-	# plosives
-	'p', 
-	'b', 
-	't',
-	'd', 
-	'k',
-	'g',
-	'ɡ', # = 'g'
-
-	# nasals
-	'm',
-	'n',
-	'ŋ',
-	
-	# fricatives
-	'f',
-	'v',
-	's',
-	's:',
-	'z',
-	'x',
-	'h',
-	
-	# tap and flip
-	'r',
-	'r:',
-
-	# approximant
-	'j',
-	'l'
-	]
-
-
-## reduce the number of phones.
-# the phones which seldom occur are replaced with another more popular phones.
-# replacements are based on the advice from Martijn Wieling.
-reduction_key = {
-	'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
-	}
-# already removed beforehand in phoneset. Just to be sure.
-phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
-
-phoneset_short = [reduction_key.get(i, i) for i in phoneset
-				  if not i in phones_to_be_removed]
-phoneset_short = list(set(phoneset_short))
-phoneset_short.sort()
-
-
-## translation_key to htk format (ascii).
-# phones which gives UnicodeEncodeError when phone.encode("ascii")
-# are replaced with other characters.
-translation_key_asr2htk = {
-	'i̯': 'i_',
-	'ṷ': 'u_',
-
-	# on the analogy of German umlaut, 'e' is used.
-	'ö': 'oe', 'ö:': 'oe:',
-	'ü': 'ue', 'ü:': 'ue:',
-
-	# on the analogy of Chinese...
-	'ŋ': 'ng',
-				
-	# refer to Xsampa. 
-	'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
-	'ɛ': 'E', 'ɛ:': 'E:',
-	'ɪ': 'I', 'ɪ:': 'I:', 
-
-	# it is @ in Xsampa, but that is not handy on HTK.
-	'ə': 'A'
-	}
-phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
-
-## check
-#for i in phoneset_short:
-#	try:
-#		print("{0} --> {1}".format(i, i.encode("ascii")))
-#	except UnicodeEncodeError:
-#		print(">>> {}".format(i))
-
-
-## the list of multi character phones. 
-# for example, the length of 'a:' is 3, but in the codes it is treated as one letter.
-
-# original.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
-
-# phonset reduced.
-multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
-multi_character_phones_short.sort(key=len, reverse=True)
-
-# htk compatible.
-multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
-multi_character_phones_htk.sort(key=len, reverse=True)
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -1,4 +1,5 @@
 import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

 import sys
 from collections import Counter
@@ -8,38 +9,8 @@ import numpy as np
 import pandas as pd

 import defaultfiles as default
-from phoneset import fame_ipa
-import convert_phone_set
-
-
-#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
-#	""" Convert a lexicon file from IPA to HTK format for FAME! corpus. """
-
-#	lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
-#	with open(lexicon_file_out, "w", encoding="utf-8") as fout:
-#		for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
-#			pronunciation_no_space = pronunciation.replace(' ', '')
-#			pronunciation_famehtk  = convert_phone_set.ipa2famehtk(pronunciation_no_space)
-#			if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
-#				fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
-
-
-#def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
-#	""" Combine two lexicon files and sort by words. """
-
-#	with open(lexicon_file1, "rt", encoding="utf-8") as fin:
-#		lines1 = fin.read()
-#		lines1 = lines1.split('\n')
-#	with open(lexicon_file2, "rt", encoding="utf-8") as fin:
-#		lines2 = fin.read()
-#		lines2 = lines2.split('\n')
-	
-#	lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
-#	lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
-#	lex  = pd.concat([lex1, lex2])
-#	lex  = lex.sort_values(by='word', ascending=True)
-#	lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
-
+import convert_phoneset
+from phoneset import fame_ipa, fame_asr

 #def read_fileFA(fileFA):
 #    """
@@ -291,4 +262,74 @@ def find_phone(lexicon_file, phone, phoneset_name='ipa'):
 		if phone in pronunciation:
 			extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
 			extracted  = extracted.append(extracted_, ignore_index=True)
-	return extracted
+	return extracted
+
+
+def asr2htk_space_delimited(pronunciation):
+	"""convert a pronunciation from the asr phoneset to the htk phoneset.
+
+	The conversion is done in three steps:
+	(1) phones listed in fame_asr.phones_to_be_removed are dropped;
+	(2) the remaining phones are replaced according to fame_asr.reduction_key;
+	(3) the result is translated with fame_asr.translation_key_asr2htk.
+	Phones which are not a key of the reduction / translation key are kept as they are.
+	
+	Args:
+		pronunciation (str): space delimited asr phones. 
+
+	Returns:
+		(pronunciation) (str): space delimited phones in htk format (ascii).
+
+	"""
+	# the removal is checked on the original phone, i.e. before the reduction is applied.
+	pronunciation_short = [fame_asr.reduction_key.get(i, i) for i in pronunciation.split(' ') 
+			   if not i in fame_asr.phones_to_be_removed]
+	return ' '.join(convert_phoneset.convert_phoneset(
+		pronunciation_short, fame_asr.translation_key_asr2htk))
+
+
+def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
+	""" Convert a lexicon file from asr to htk format (ascii). 
+
+	Args:
+		lexicon_file_asr (path): a lexicon file written in asr format e.g. fame/lex.asr.
+		lexicon_file_htk (path): the output lexicon file, tab-separated, without header and index.
+
+	"""
+	lex_asr = load_lexicon(lexicon_file_asr)
+
+	# convert the pronunciation column element by element.
+	# unlike a row-wise apply, this also gives a Series when the lexicon is empty.
+	pronunciation_htk = lex_asr['pronunciation'].apply(asr2htk_space_delimited)
+
+	# the column order is given explicitly, so that it does not depend on how pandas orders dict keys.
+	# (`.ix` was used for the reordering before, but it is deprecated and removed in pandas 1.0.)
+	lex_htk = pd.DataFrame({'word': lex_asr['word'], 'pronunciation': pronunciation_htk}, columns=['word', 'pronunciation'])
+	lex_htk.to_csv(lexicon_file_htk, header=False, index=False, sep='\t')
+	return
+
+
+def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
+	""" Combine two lexicon files and sort by words. 
+
+	Nothing is returned; the result is written to lexicon_out.
+
+	Args:
+		lexicon_file1, lexicon_file2 (path): input lexicon files.
+		lexicon_out (path): output lexicon file, in which lexicon_file1 and 2 are combined and sorted by word.
+			it is written tab-separated in utf-8, without header and index.
+
+	"""
+	lex1 = load_lexicon(lexicon_file1)
+	lex2 = load_lexicon(lexicon_file2)
+	lex  = pd.concat([lex1, lex2])
+	lex  = lex.sort_values(by='word', ascending=True)
+	lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+
+def fix_single_quote(lexicon_file):
+	""" put a backslash before the single quotes of the words which begin with a single quote.
+
+	Every single quote in such a word is escaped, not only the leading one.
+	The lexicon is written back once, also when no word had to be changed.
+
+	Args:
+		lexicon_file (path): lexicon file, which will be overwritten.
+
+	"""
+	lex = load_lexicon(lexicon_file)
+	starts_with_quote = lex['word'].str.startswith('\'')
+
+	# the first column is rewritten as a whole, so that the result does not depend on 
+	# the index labels of the lexicon (they were used as row positions before).
+	lex.iloc[:, 0] = [
+		word.replace('\'', '\\\'') if flag else word
+		for word, flag in zip(lex.iloc[:, 0], starts_with_quote)]
+
+	# to_csv does not work with space separator. therefore all tabs should manually be replaced.
+	lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+	return
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -5,7 +5,6 @@ os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import tempfile
 #import configparser
 #import subprocess
-#from collections import Counter
 import time

 import numpy as np
@@ -29,44 +28,21 @@ dataset_list = ['devel', 'test', 'train']

 # procedure
 extract_features  = 0
-conv_lexicon	  = 1
-#check_lexicon	  = 0
-#make_mlf		  = 0
-#combine_files	  = 0
-#flat_start		  = 0
-#train_model		  = 1
-
-
-#sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
-#sys.path.append(forced_alignment_module)
-#from forced_alignment import convert_phone_set
-
+make_lexicon	  = 0
+make_mlf		  = 0
+combine_files	  = 0
+flat_start		  = 0
+train_model		  = 0


 ## ======================= load variables =======================

-#config = configparser.ConfigParser()
-#config.sections()
-#config.read(config_ini)
-
-#config_hcopy = config['Settings']['config_hcopy']
-#config_train = config['Settings']['config_train']
-#mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
-#FAME_dir	 = config['Settings']['FAME_dir']
-
-#lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
-#lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
-#lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
-#lex_asr		= FAME_dir + '\\lexicon\\lex.asr'
-#lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
-#lex_oov		= FAME_dir + '\\lexicon\\lex.oov'
-#lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
-##lex_ipa		= FAME_dir + '\\lexicon\\lex.ipa'
-##lex_ipa_	= FAME_dir + '\\lexicon\\lex.ipa_'
-##lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
-#lex_htk		= FAME_dir + '\\lexicon\\lex_original.htk'
-#lex_htk_	= FAME_dir + '\\lexicon\\lex.htk'
+lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
+lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_oov = os.path.join(lexicon_dir, 'lex.oov')
+lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
+lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
+lexicon_htk     = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -88,8 +64,10 @@ if not os.path.exists(tmp_dir):

 ## ======================= extract features =======================
 if extract_features:
+	print('==== extract features ====\n')
+
 	for dataset in dataset_list:
-		print('==== {} ===='.format(dataset))
+		print('==== dataset: {} ===='.format(dataset))

 		# a script file for HCopy 
 		print(">>> making a script file for HCopy... \n")
@@ -112,48 +90,28 @@ if extract_features:
 		fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')


-## ======================= convert lexicon from ipa to fame_htk =======================
-if conv_lexicon:
-	print('==== convert lexicon from ipa 2 fame ====\n')
-	# convert each lexicon from ipa description to fame_htk phoneset.
-	#am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-	#am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+## ======================= make lexicon for HTK =======================
+if make_lexicon:
+	print('==== make lexicon for HTK ====\n')
+
+	# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
+	print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
+	fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
+	fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

 	# combine lexicon
+	print('>>> combining lexicon files into one lexicon... \n')
 	# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
 	# therefore there is no overlap between lex_asr and lex_oov.   
-	#am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+	fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

-
-## ======================= check if all the phones are successfully converted =======================
-if check_lexicon:
-	print("==== check if all the phones are successfully converted. ====\n")
-
-	# the phones used in the lexicon.
-	phonelist_asr = am_func.get_phonelist(lex_asr)
-	phonelist_oov = am_func.get_phonelist(lex_oov)
-	phonelist_htk = am_func.get_phonelist(lex_htk)
-
-	phonelist = phonelist_asr.union(phonelist_oov)
-
-	# the lines which include a specific phone.
-	lines = am_func.find_phone(lex_asr, 'g')
-
-	# statistics over the lexicon
-	lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-	pronunciation = lexicon_htk['pronunciation']
-	phones_all = []
-	for word in pronunciation:
-		phones_all = phones_all + word.split()
-	c = Counter(phones_all)
-
-
-## ======================= 
-## manually make changes to the pronunciation dictionary and save it as lex.htk 
-## =======================
-# (1) Replace all tabs with single space;
-# (2) Put a '\' before any dictionary entry beginning with single quote 
-#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+	## ======================= 
+	## manually make changes to the pronunciation dictionary and save it as lex.htk 
+	## =======================
+	# (1) Replace all tabs with single space;
+	# (2) Put a '\' before any dictionary entry beginning with single quote 
+	#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+	fame_functions.fix_single_quote(lexicon_htk)


 ## ======================= make label file =======================
--- a/acoustic_model/fame_ipa.py
+++ b/acoustic_model/fame_ipa.py
@@ -1,107 +0,0 @@
-""" definition of the phones to be used. """
-
-phoneset = [
-	# vowels
-	'i̯',
-	'i̯ⁿ',
-	'y',
-	'i',
-	'i.',
-	'iⁿ',
-	'i:',
-	'i:ⁿ',
-	'ɪ',
-	'ɪⁿ',
-	'ɪ.',
-	#'ɪ:', # not included in lex.ipa
-	'ɪ:ⁿ',
-	'e',
-	'e:',
-	'e:ⁿ',
-	'ə',
-	'əⁿ',
-	'ə:',
-	'ɛ',
-	'ɛ.',
-	'ɛⁿ',
-	'ɛ:',
-	'ɛ:ⁿ',
-	'a',
-	'aⁿ',
-	'a.',
-	'a:',
-	'a:ⁿ',
-	'ṷ',
-	'ṷ.',
-	'ṷⁿ',
-	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone. 
-	'u',
-	'uⁿ',
-	'u.',
-	'u:',
-	'u:ⁿ',
-	'ü',
-	'ü.',
-	'üⁿ',
-	'ü:',
-	'ü:ⁿ',
-	'o',
-	'oⁿ',
-	'o.',
-	'o:',
-	'o:ⁿ',
-	'ö',
-	'ö.',
-	'öⁿ',
-	'ö:',
-	'ö:ⁿ',
-	'ɔ',
-	'ɔ.',
-	'ɔⁿ',
-	'ɔ:',
-	'ɔ:ⁿ',
-	#'ɔ̈', # not included in lex.ipa 
-	'ɔ̈.',
-	'ɔ̈:',
-
-	# plosives
-	'p', 
-	'b', 
-	't',
-	'tⁿ',
-	'd', 
-	'k',
-	'g',
-	'ɡ', # = 'g'
-
-	# nasals
-	'm',
-	'n',
-	'ŋ',
-	
-	# fricatives
-	'f',
-	'v',
-	's',
-	's:',
-	'z',
-	'zⁿ',
-	'x',
-	'h',
-
-	# tap and flip
-	'r',
-	'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.   
-	'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
-
-	# approximant
-	'j',
-	'j.',
-	'l'
-	]
-
-
-## the list of multi character phones. 
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
--- a/acoustic_model/fame_test.py
+++ b/acoustic_model/fame_test.py
@@ -1,7 +1,7 @@
 import sys
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
-
+from collections import Counter
 import time

 import numpy as np
@@ -11,12 +11,12 @@ import fame_functions
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 from phoneset import fame_ipa, fame_asr
-
+import convert_phoneset

 lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
 lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
 lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

 ## check if all the phones in lexicon.ipa are in fame_ipa.py.
 #timer_start = time.time()
@@ -64,6 +64,7 @@ else:
 #				if ipa_ in phone_unknown:
 #					translation_key_ipa2asr[ipa_] = asr_
 #					phone_unknown.remove(ipa_)
+
 translation_key_ipa2asr['ə:'] = 'ə'
 translation_key_ipa2asr['r.'] = 'r'
 translation_key_ipa2asr['r:'] = 'r'
@@ -71,23 +72,32 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)


 ## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
+#timer_start = time.time()
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
+#phoneset_lex.remove("")
+#phoneset_asr = list(set(translation_key_ipa2asr.values()))
+#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
+#	set(phoneset_lex) - set(phoneset_asr)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## check if all the phones in lexicon.htk are in fame_asr.py.
 timer_start = time.time()
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
-phoneset_lex.remove("")
-phoneset_asr = list(set(translation_key_ipa2asr.values()))
-print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
-	set(phoneset_lex) - set(phoneset_asr)))
+phoneset_htk = fame_asr.phoneset_htk
+phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+phoneset_lex.remove('')
+# report the phones used in the lexicon which are missing in fame_asr.py:
+# the lexicon phones come first in the difference.
+print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+	set(phoneset_lex) - set(phoneset_htk)))
 print("elapsed time: {}".format(time.time() - timer_start))

-## make the translation key between asr to htk.
-#multi_character_phones = [i for i in phoneset_asr if len(i) > 1]
-#multi_character_phones.sort(key=len, reverse=True)
+# statistics over the lexicon
+lex_htk = fame_functions.load_lexicon(lexicon_htk)
+phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+c = Counter(phones_all)

-#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
-#with open(lex_ipa_, "w", encoding="utf-8") as fout:
-#	for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
-#		# ignore nasalization and '.'
-#		pronunciation_ = pronunciation.replace(u'ⁿ', '')
-#		pronunciation_ = pronunciation_.replace('.', '')
-#		pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
-#		fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
+lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+	lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+# to_csv does not work with space separator. therefore all tabs should manually be replaced.
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
--- a/acoustic_model/phoneset/fame_asr.py
+++ b/acoustic_model/phoneset/fame_asr.py
@@ -1,74 +1,40 @@
 """ definition of the phones to be used. """

+# phones in {FAME}/lexicon/lex.asr
 phoneset = [
 	# vowels
-	'i̯',
-	'i̯ⁿ',
-	'y',
-	'i',
-	'i.',
-	'iⁿ',
-	'i:',
-	'i:ⁿ',
-	'ɪ',
-	'ɪⁿ',
-	'ɪ.',
-	#'ɪ:', # not included in lex.ipa
-	'ɪ:ⁿ',
+	'a',
+	'a:',
 	'e',
 	'e:',
-	'e:ⁿ',
-	'ə',
-	'əⁿ',
-	'ə:',
-	'ɛ',
-	'ɛ.',
-	'ɛⁿ',
-	'ɛ:',
-	'ɛ:ⁿ',
-	'a',
-	'aⁿ',
-	'a.',
-	'a:',
-	'a:ⁿ',
-	'ṷ',
-	'ṷ.',
-	'ṷⁿ',
-	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. 
-	'u',
-	'uⁿ',
-	'u.',
-	'u:',
-	'u:ⁿ',
-	'ü',
-	'ü.',
-	'üⁿ',
-	'ü:',
-	'ü:ⁿ',
+	'i',
+	'i:',
+	'i̯',
 	'o',
-	'oⁿ',
-	'o.',
 	'o:',
-	'o:ⁿ',
 	'ö',
-	'ö.',
-	'öⁿ',
 	'ö:',
-	'ö:ⁿ',
+	'u',
+	'u:',
+	'ü',
+	'ü:',
+	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone. 
+	'ṷ',
+	'y',
 	'ɔ',
-	'ɔ.',
-	'ɔⁿ',
 	'ɔ:',
-	'ɔ:ⁿ',
-	#'ɔ̈', # not included in lex.ipa 
-	'ɔ̈.',
+	'ɔ̈', 
 	'ɔ̈:',
+	'ə',
+	'ɛ',
+	'ɛ:',
+	'ɪ',
+	'ɪ:',

 	# plosives
 	'p', 
 	'b', 
 	't',
-	'tⁿ',
 	'd', 
 	'k',
 	'g',
@@ -85,22 +51,77 @@ phoneset = [
 	's',
 	's:',
 	'z',
-	'zⁿ',
 	'x',
 	'h',
-
+	
 	# tap and flip
 	'r',
-	'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.   
-	'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
+	'r:',

 	# approximant
 	'j',
-	'j.',
 	'l'
 	]

+
+## reduce the number of phones.
+# the phones which seldom occur are replaced with other, more frequent phones.
+# replacements are based on the advice from Martijn Wieling.
+reduction_key = {
+	'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
+	}
+# already removed beforehand in phoneset. Just to be sure.
+phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
+
+phoneset_short = [reduction_key.get(i, i) for i in phoneset
+				  if not i in phones_to_be_removed]
+phoneset_short = list(set(phoneset_short))
+phoneset_short.sort()
+
+
+## translation_key to htk format (ascii).
+# phones which give UnicodeEncodeError when phone.encode("ascii")
+# are replaced with other characters.
+translation_key_asr2htk = {
+	'i̯': 'i_',
+	'ṷ': 'u_',
+
+	# on the analogy of German umlaut, 'e' is used.
+	'ö': 'oe', 'ö:': 'oe:',
+	'ü': 'ue', 'ü:': 'ue:',
+
+	# on the analogy of Chinese...
+	'ŋ': 'ng',
+				
+	# refer to Xsampa. 
+	'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
+	'ɛ': 'E', 'ɛ:': 'E:',
+	'ɪ': 'I', 'ɪ:': 'I:', 
+
+	# it is @ in Xsampa, but that is not handy on HTK.
+	'ə': 'A'
+	}
+phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
+
+## check
+#for i in phoneset_short:
+#	try:
+#		print("{0} --> {1}".format(i, i.encode("ascii")))
+#	except UnicodeEncodeError:
+#		print(">>> {}".format(i))
+
+
 ## the list of multi character phones. 
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
+# for example, the length of 'a:' is 2, but in the codes it is treated as one letter.
+
+# original.
 multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
+multi_character_phones.sort(key=len, reverse=True)
+
+# phoneset reduced.
+multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
+multi_character_phones_short.sort(key=len, reverse=True)
+
+# htk compatible.
+multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
+multi_character_phones_htk.sort(key=len, reverse=True)
--- a/acoustic_model/phoneset/fame_ipa.py
+++ b/acoustic_model/phoneset/fame_ipa.py
@@ -34,7 +34,7 @@ phoneset = [
 	'ṷ',
 	'ṷ.',
 	'ṷⁿ',
-	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. 
+	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone. 
 	'u',
 	'uⁿ',
 	'u.',
@@ -100,6 +100,7 @@ phoneset = [
 	'l'
 	]

+
 ## the list of multi character phones. 
 # for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
 multi_character_phones = [i for i in phoneset if len(i) > 1]