Fixed a bug related to the encoding of label files.

label files are extracted. hcompv_scp is made.
Fix the bug caused by characters in the lexicon which cannot be represented in ASCII.
2019-02-04 13:46:27 +01:00 · 2019-02-03 13:54:37 +01:00 · 2019-02-03 00:34:35 +01:00 · 2019-01-29 21:52:11 +01:00 · 2019-01-28 12:34:20 +01:00
13 changed files with 614 additions and 398 deletions
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -4,8 +4,7 @@
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
    <ProjectHome>.</ProjectHome>
-    <StartupFile>
-    </StartupFile>
+    <StartupFile>fame_hmm.py</StartupFile>
    <SearchPath>
    </SearchPath>
    <WorkingDirectory>.</WorkingDirectory>
@@ -23,7 +22,7 @@
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="check_novoapi.py" />
-    <Compile Include="convert_phone_set.py">
+    <Compile Include="convert_phoneset.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="convert_xsampa2ipa.py">
@@ -32,7 +31,7 @@
    <Compile Include="defaultfiles.py">
      <SubType>Code</SubType>
    </Compile>
-    <Compile Include="fame_phoneset.py">
+    <Compile Include="fame_test.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="fa_test.py">
@@ -50,9 +49,20 @@
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="fame_hmm.py" />
+    <Compile Include="phoneset\fame_asr.py" />
+    <Compile Include="phoneset\fame_ipa.py" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
+    <Content Include="phoneset\fame_ipa2asr.npy" />
+    <Content Include="phoneset\output_get_translation_key_phone_unknown.npy" />
+    <Content Include="phoneset\output_get_translation_key_translation_key.npy" />
+    <Content Include="phoneset\__pycache__\fame_asr.cpython-36.pyc" />
+    <Content Include="phoneset\__pycache__\fame_ipa.cpython-36.pyc" />
+  </ItemGroup>
+  <ItemGroup>
+    <Folder Include="phoneset\" />
+    <Folder Include="phoneset\__pycache__\" />
  </ItemGroup>
  <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
  <!-- Uncomment the CoreCompile target to enable the Build command in
--- a/acoustic_model/convert_phone_set.py
+++ b/acoustic_model/convert_phone_set.py
@@ -20,10 +20,21 @@ def split_word(word, multi_character_phones):
 	
 	Args:
 		word (str): a word written in given phoneset.
-		multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_phoneset.py. 
+		multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_ipa.py. 

 	Returns:
 		(word_seperated) (list): the word splitted in given phoneset. 

 	"""
-	return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]
+	return [phone 
+		 for phone in multi_character_tokenize(word.strip(), multi_character_phones)
+		 ]
+
+
+def convert_phoneset(word_list, translation_key):
+	""" convert each phone in word_list using translation_key and return them as a list.
+	Args:
+		word_list (list): a list of phones written in given phoneset.
+		translation_key (dict): phone to phone mapping. phones which are not in its keys are kept as they are.
+	"""
+	return [translation_key.get(phone, phone) for phone in word_list]
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -1,14 +1,13 @@
 import os
-
-#default_hvite_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'htk', 'config.HVite')
+# add path of the parent directory
+#os.path.dirname(os.path.realpath(__file__))

 #cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

 #htk_dir = r'C:\Aki\htk_fame'
 htk_dir = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk'

-config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
-#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
+
 #config_hvite = os.path.join(cygwin_dir, 'config', 'config.HVite')
 #mkhmmdefs_pl = os.path.join(cygwin_dir, 'src', 'acoustic_model', 'mkhmmdefs.pl')

@@ -39,11 +38,11 @@ toolbox_dir					= os.path.join(repo_dir, 'toolbox')
 #config_hvite = os.path.join(htk_config_dir, 'config.HVite')
 #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
 #acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')

 WSL_dir   = r'C:\OneDrive\WSL'
 #fame_dir        = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'd:\_corpus\fame'
+fame_dir = r'c:\OneDrive\Research\rug\_data\FAME'

 fame_s5_dir     = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -9,38 +9,8 @@ import numpy as np
 import pandas as pd

 import defaultfiles as default
-import fame_phoneset
-import convert_phone_set
-
-
-#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
-#	""" Convert a lexicon file from IPA to HTK format for FAME! corpus. """
-
-#	lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
-#	with open(lexicon_file_out, "w", encoding="utf-8") as fout:
-#		for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
-#			pronunciation_no_space = pronunciation.replace(' ', '')
-#			pronunciation_famehtk  = convert_phone_set.ipa2famehtk(pronunciation_no_space)
-#			if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
-#				fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
-
-
-#def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
-#	""" Combine two lexicon files and sort by words. """
-
-#	with open(lexicon_file1, "rt", encoding="utf-8") as fin:
-#		lines1 = fin.read()
-#		lines1 = lines1.split('\n')
-#	with open(lexicon_file2, "rt", encoding="utf-8") as fin:
-#		lines2 = fin.read()
-#		lines2 = lines2.split('\n')
-	
-#	lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
-#	lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
-#	lex  = pd.concat([lex1, lex2])
-#	lex  = lex.sort_values(by='word', ascending=True)
-#	lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
-
+import convert_phoneset
+from phoneset import fame_ipa, fame_asr

 #def read_fileFA(fileFA):
 #    """
@@ -110,14 +80,6 @@ import convert_phone_set

 #    return ipa

-#def make_filelist(input_dir, output_txt):
-#	""" Make a list of files in the input_dir. """
-#	filenames = os.listdir(input_dir)
-
-#	with open(output_txt, 'w') as fout:
-#		for filename in filenames:
-#			fout.write(input_dir + '\\' + filename + '\n')
-

 #def make_htk_dict(word, pronvar_, fileDic, output_type):
 #    """
@@ -179,10 +141,11 @@ def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_s

 				fout.write(wav_file + '\t' + mfc_file + '\n')

+	return


 def load_lexicon(lexicon_file):
-	""" load lexicon file as Data Frame.
+	""" load lexicon file as data frame.

 	Args:
 		lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
@@ -196,25 +159,27 @@ def load_lexicon(lexicon_file):
 	return lex


-def get_phoneset_from_lexicon(lexicon_file, phoneset='asr'):
+def get_phoneset_from_lexicon(lexicon_file, phoneset_name='asr'):
 	""" Make a list of phones which appears in the lexicon. 
 	
 	Args:
 		lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
-		phoneset (str): the phoneset with which lexicon_file is written. 'asr'(default) or 'ipa'.
+		phoneset_name (str): the name of phoneset with which lexicon_file is written. 'asr'(default) or 'ipa'.

 	Returns:
 		(list_of_phones) (set): the set of phones included in the lexicon_file.

 	"""
-	assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
+	assert phoneset_name in ['asr', 'ipa'], 'phoneset_name should be \'asr\' or \'ipa\''

 	lex = load_lexicon(lexicon_file)
-	if phoneset == 'asr':
+	if phoneset_name == 'asr':
 		return set(' '.join(lex['pronunciation']).split(' '))
-	elif phoneset == 'ipa':
+	elif phoneset_name == 'ipa':
 		join_pronunciations = ''.join(lex['pronunciation'])
-		return set(convert_phone_set.split_word(join_pronunciations, fame_phoneset.multi_character_phones_ipa))
+		return set(convert_phoneset.split_word(join_pronunciations, fame_ipa.multi_character_phones))
+
+	return


 def extract_unknown_phones(ipa, known_phones):
@@ -228,7 +193,7 @@ def extract_unknown_phones(ipa, known_phones):
 		(list_of_phones) (list): unknown phones not included in 'known_phones'.

 	"""
-	ipa_split = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+	ipa_split = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
 	return [i for i in ipa_split if not i in known_phones]


@@ -247,14 +212,14 @@ def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
 	"""
 	lex_ipa = load_lexicon(lexicon_file_ipa)
 	lex_asr = load_lexicon(lexicon_file_asr)
-	phone_unknown = fame_phoneset.phoneset_ipa[:]
+	phone_unknown = fame_ipa.phoneset[:]
 	translation_key = dict()
 	for word in lex_ipa['word']:
 		if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
 			ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
 			asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
 	
-			ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+			ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
 			asr_list = asr.split(' ')

 			# if there are phones which is not in phone_unknown
@@ -268,13 +233,13 @@ def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
 	return translation_key, list(phone_unknown)


-def find_phone(lexicon_file, phone, phoneset='ipa'):
+def find_phone(lexicon_file, phone, phoneset_name='ipa'):
 	""" extract rows where the phone is used in the lexicon_file. 

 	Args:
 		lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
 		phone (str): the phone to be searched.
-		phoneset (str): the phoneset with which lexicon_file is written. 'asr' or 'ipa'(default).
+		phoneset_name (str): the name of phoneset_name with which lexicon_file is written. 'asr' or 'ipa'(default).

 	Returns:
 		extracted (df): rows where the phone is used.
@@ -283,7 +248,7 @@ def find_phone(lexicon_file, phone, phoneset='ipa'):
 		* develop when the phonset == 'asr'.

 	"""
-	assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
+	assert phoneset_name in ['asr', 'ipa'], 'phoneset_name should be \'asr\' or \'ipa\''
 	
 	lex = load_lexicon(lexicon_file)
 	
@@ -292,9 +257,87 @@ def find_phone(lexicon_file, phone, phoneset='ipa'):

 	extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
 	for index, row in lex_.iterrows():
-		if phoneset == 'ipa':
-			pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_phoneset.multi_character_phones_ipa)
+		if phoneset_name == 'ipa':
+			pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_ipa.multi_character_phones)
 		if phone in pronunciation:
 			extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
 			extracted  = extracted.append(extracted_, ignore_index=True)
 	return extracted
+
+
+def asr2htk_space_delimited(pronunciation):
+	"""convert phoneset from asr to htk.
+	phones in fame_asr.phones_to_be_removed are dropped, the others are mapped
+	with fame_asr.reduction_key and then with fame_asr.translation_key_asr2htk.
+
+	Args:
+		pronunciation (str): space delimited asr phones. 
+	Returns:
+		(pronunciation) (str): space delimited phones in htk format (ascii).
+	"""
+	pronunciation_short = [fame_asr.reduction_key.get(i, i) for i in pronunciation.split(' ') 
+			   if not i in fame_asr.phones_to_be_removed]
+	return ' '.join(convert_phoneset.convert_phoneset(
+		pronunciation_short, fame_asr.translation_key_asr2htk))
+
+
+def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
+	""" Convert a lexicon file from asr to htk format (ascii). 
+
+	words are converted with word2htk() and upper-cased, 
+	pronunciations are converted with asr2htk_space_delimited().
+	the result is written tab-delimited in utf-8, without header and index.
+
+	Args:
+		lexicon_file_asr (path): a lexicon file written in asr format e.g. fame/lex.asr.
+		lexicon_file_htk (path): the output lexicon file in htk format (ascii).
+
+	"""
+	lex_asr = load_lexicon(lexicon_file_asr)
+	lex_htk = pd.DataFrame({
+		'word': lex_asr['word'].apply(word2htk).str.upper(),
+		'pronunciation': lex_asr['pronunciation'].apply(asr2htk_space_delimited)
+		})
+	# .ix is deprecated (removed in pandas 1.0); .loc selects the same columns by label.
+	lex_htk = lex_htk.loc[:, ['word', 'pronunciation']]
+	lex_htk.to_csv(lexicon_file_htk, header=False, index=False, sep='\t', encoding='utf-8')
+	return
+
+
+def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
+	""" Combine two lexicon files and sort by words. 
+
+	Args:
+		lexicon_file1, lexicon_file2 (path): input lexicon files.
+		lexicon_out (path): output lexicon file, in which lexicon_file1 and 2 are 
+			combined and sorted by word (written tab-delimited in utf-8).
+
+	nothing is returned.
+	"""
+	lex1 = load_lexicon(lexicon_file1)
+	lex2 = load_lexicon(lexicon_file2)
+	lex  = pd.concat([lex1, lex2])
+	lex  = lex.sort_values(by='word', ascending=True)
+	lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')
+
+
+def fix_single_quote(lexicon_file):
+	""" add '\\' before the single quotes in words which begin with a single quote.
+	rows which include N/A are removed. the lexicon is written back tab-delimited in utf-8.
+
+	Args:
+		lexicon_file (path): lexicon file, which will be overwritten.
+
+	"""
+	lex = load_lexicon(lexicon_file)
+	lex = lex.dropna() # remove N/A.
+	# select rows with a boolean mask: after dropna() index labels are not positions any more.
+	quoted = lex['word'].str.startswith('\'')
+	lex.loc[quoted, 'word'] = lex.loc[quoted, 'word'].str.replace('\'', '\\\'', regex=False)
+	# to_csv does not work with space separator. therefore all tabs should manually be replaced.
+	lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
+	return
+
+
+def word2htk(word):  # map each character with fame_asr.translation_key_word2htk; characters without an entry are kept.
+	return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -3,15 +3,15 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

 import tempfile
-#import configparser
-#import subprocess
-#from collections import Counter
+import shutil
+import glob
 import time

-#import numpy as np
-#import pandas as pd
+import numpy as np
+import pandas as pd

 import fame_functions
+from phoneset import fame_ipa, fame_asr
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 import file_handling as fh
@@ -19,60 +19,42 @@ from htk import pyhtk


 ## ======================= user define =======================
-#repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
-#curr_dir = repo_dir + '\\acoustic_model'
-#config_ini = curr_dir + '\\config.ini'
-#output_dir = 'C:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
-#forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
+# procedure
+make_lexicon	  = 0
+make_label		  = 0 # it takes roughly 4800 sec on Surface pro 2.
+make_htk_files    = 0
+extract_features  = 0
+flat_start		  = 0
+train_model_without_sp = 1
+
+
+# pre-defined values.

 dataset_list = ['devel', 'test', 'train']
+hmmdefs_name = 'hmmdefs'

-# procedure
-extract_features  = 0
-conv_lexicon	  = 1
-#check_lexicon	  = 0
-#make_mlf		  = 0
-#combine_files	  = 0
-#flat_start		  = 0
-#train_model		  = 1
+lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
+lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
+
+config_dir = os.path.join(default.htk_dir, 'config')
+config_hcopy = os.path.join(config_dir, 'config.HCopy')
+config_train = os.path.join(config_dir, 'config.train')
+global_ded   = os.path.join(config_dir, 'global.ded')
+mkphones_led = os.path.join(config_dir, 'mkphones.led')
+prototype    = os.path.join(config_dir, 'proto39')
+
+model_dir    = os.path.join(default.htk_dir, 'model')


-#sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
-#sys.path.append(forced_alignment_module)
-#from forced_alignment import convert_phone_set
+# directories / files to be made.

+lexicon_dir = os.path.join(default.htk_dir, 'lexicon') 
+lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
+lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
+lexicon_htk     = os.path.join(lexicon_dir, 'lex.htk')

-
-## ======================= load variables =======================
-
-#config = configparser.ConfigParser()
-#config.sections()
-#config.read(config_ini)
-
-#config_hcopy = config['Settings']['config_hcopy']
-#config_train = config['Settings']['config_train']
-#mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
-#FAME_dir	 = config['Settings']['FAME_dir']
-
-#lex_asr		= FAME_dir + '\\lexicon\\lex.asr'
-#lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
-#lex_oov		= FAME_dir + '\\lexicon\\lex.oov'
-#lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
-##lex_ipa		= FAME_dir + '\\lexicon\\lex.ipa'
-##lex_ipa_	= FAME_dir + '\\lexicon\\lex.ipa_'
-##lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
-#lex_htk		= FAME_dir + '\\lexicon\\lex_original.htk'
-#lex_htk_	= FAME_dir + '\\lexicon\\lex.htk'
-
-#hcompv_scp = output_dir + '\\scp\\combined.scp'
-#combined_mlf = output_dir + '\\label\\combined.mlf'
-
-#model_dir  = output_dir + '\\model'
-#model0_dir = model_dir + '\\hmm0'
-#proto_init = model_dir + '\\proto38'
-#proto_name = 'proto'
-#phonelist  = output_dir + '\\config\\phonelist_friesian.txt'
-#hmmdefs_name = 'hmmdefs'
+phonelist_txt = os.path.join(config_dir, 'phonelist.txt')
+model0_dir	  = os.path.join(model_dir, 'hmm0')

 feature_dir = os.path.join(default.htk_dir, 'mfc')
 if not os.path.exists(feature_dir):
@@ -80,134 +62,26 @@ if not os.path.exists(feature_dir):
 tmp_dir = os.path.join(default.htk_dir, 'tmp')
 if not os.path.exists(tmp_dir):
 	os.makedirs(tmp_dir)
+label_dir = os.path.join(default.htk_dir, 'label')
+if not os.path.exists(label_dir):
+	os.makedirs(label_dir)


-## ======================= extract features =======================
-if extract_features:
-	for dataset in dataset_list:
-		print('==== {} ===='.format(dataset))
-
-		# a script file for HCopy 
-		print(">>> making a script file for HCopy... \n")
-		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
-		hcopy_scp.close()
-
-		# get a list of features (hcopy.scp) from the filelist in FAME! corpus
-		feature_dir_ = os.path.join(feature_dir, dataset)
-		if not os.path.exists(feature_dir_):
-			os.makedirs(feature_dir_)
-
-		# extract features
-		print(">>> extracting features... \n")
-		fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
-		pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
-
-		# a script file for HCompV
-		print(">>> making a script file for HCompV... \n")
-		hcompv_scp  = os.path.join(tmp_dir, dataset + '.scp')
-		fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
-
-
-## ======================= convert lexicon from ipa to fame_htk =======================
-if conv_lexicon:
-	print('==== convert lexicon from ipa 2 fame ====\n')
-	
-	#dir_out = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
-	lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
-	lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
-	lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
-	# get the correspondence between lex_ipa and lex_asr.
-	lex_asr  = fame_functions.load_lexicon(lexicon_asr)
-	lex_ipa  = fame_functions.load_lexicon(lexicon_ipa)		
-	if 1:
+## ======================= make lexicon for HTK =======================
+if make_lexicon:
 	timer_start = time.time()
-		translation_key, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
-		print("elapsed time: {}".format(time.time() - timer_start))
+	print('==== making lexicon for HTK ====')

-		np.save('translation_key_ipa2asr.npy', translation_key)
-		np.save('phone_unknown.npy', phone_unknown)
-	else:
-		translation_key = np.load('translation_key_ipa2asr.npy').item()
-		phone_unknown   = np.load('phone_unknown.npy')
-		phone_unknown   = list(phone_unknown)
-
-
-	## manually check the correspondence for the phone in phone_unknown.
-	#p = phone_unknown[0]
-	#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
-
-	#for word in lex_ipa_['word']:
-	#	ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
-	#	if np.sum(lex_asr['word'] == word) > 0:
-	#		asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
-	
-	#		ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
-	#		asr_list = asr.split(' ')
-	#		if p in ipa_list and (len(ipa_list) == len(asr_list)):
-	#			print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
-	#			for ipa_, asr_ in zip(ipa_list, asr_list):
-	#				if ipa_ in phone_unknown:
-	#					translation_key[ipa_] = asr_
-	#					phone_unknown.remove(ipa_)
-
-
-	## check if all the phones in lexicon_ipa are in fame_phoneset.py.
-	#timer_start = time.time()
-	#phoneset_lex = get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
-	#print("elapsed time: {}".format(time.time() - timer_start))
-	
-	#phoneset_py = fame_phoneset.phoneset_ipa
-	#set(phoneset_lex) - set(phoneset_py)
-
-	##timer_start = time.time()
-	##extracted = find_phone(lexicon_ipa, 'ⁿ')
-	##print("elapsed time: {}".format(time.time() - timer_start))
-
-
-	# lex.asr is Kaldi compatible version of lex.ipa.
-	# to check... 
-	#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
-	#with open(lex_ipa_, "w", encoding="utf-8") as fout:
-	#	for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
-	#		# ignore nasalization and '.'
-	#		pronunciation_ = pronunciation.replace(u'ⁿ', '')
-	#		pronunciation_ = pronunciation_.replace('.', '')
-	#		pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
-	#		fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
-
-	# convert each lexicon from ipa description to fame_htk phoneset.
-	#am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-	#am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+	# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
+	print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
+	fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
+	fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

 	# combine lexicon
+	print('>>> combining lexicon files into one lexicon...')
 	# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
 	# therefore there is no overlap between lex_asr and lex_oov.   
-	#am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
-
-
-## ======================= check if all the phones are successfully converted =======================
-if check_lexicon:
-	print("==== check if all the phones are successfully converted. ====\n")
-
-	# the phones used in the lexicon.
-	phonelist_asr = am_func.get_phonelist(lex_asr)
-	phonelist_oov = am_func.get_phonelist(lex_oov)
-	phonelist_htk = am_func.get_phonelist(lex_htk)
-
-	phonelist = phonelist_asr.union(phonelist_oov)
-
-	# the lines which include a specific phone.
-	lines = am_func.find_phone(lex_asr, 'g')
-
-	# statistics over the lexicon
-	lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-	pronunciation = lexicon_htk['pronunciation']
-	phones_all = []
-	for word in pronunciation:
-		phones_all = phones_all + word.split()
-	c = Counter(phones_all)
-
+	fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

 	## ======================= 
 	## manually make changes to the pronunciation dictionary and save it as lex.htk 
@@ -215,164 +89,172 @@ if check_lexicon:
 	# (1) Replace all tabs with single space;
 	# (2) Put a '\' before any dictionary entry beginning with single quote 
 	#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+	fame_functions.fix_single_quote(lexicon_htk)
+	print("elapsed time: {}".format(time.time() - timer_start))


-## ======================= make label file =======================
-if make_mlf:
-	print("==== make mlf ====\n")
-
-	print("generating word level transcription...\n")
+## ======================= make label files =======================
+if make_label:
 	for dataset in dataset_list:
-		hcompv_scp  = output_dir + '\\scp\\' + dataset + '.scp'
-		hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
-		script_list = FAME_dir + '\\data\\' + dataset + '\\text'
-		mlf_word	= output_dir + '\\label\\' + dataset + '_word.mlf'
-		mlf_phone   = output_dir + '\\label\\' + dataset + '_phone.mlf'
+		timer_start = time.time()
+		print("==== making label files on dataset {}".format(dataset))

-		# lexicon
-		lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-
-		# list of features
-		with open(hcompv_scp) as fin:
-			features = fin.read()
-			features = features.split('\n')
+		script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+		wav_dir_	= os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		label_dir_		= os.path.join(label_dir, dataset)
+		dictionary_file = os.path.join(label_dir_, 'temp.dic')
+		fh.make_new_directory(label_dir_)

 		# list of scripts 
 		with open(script_list, "rt", encoding="utf-8") as fin:
-			scripts = fin.read()
-			scripts = pd.Series(scripts.split('\n'))
+			scripts = fin.read().split('\n')

-		i = 0
-		missing_words = []
-		fscp = open(hcompv_scp2, 'wt')
-		fmlf = open(mlf_word, "wt", encoding="utf-8")
-		fmlf.write("#!MLF!#\n")
-		feature_nr = 1
-		for feature in features:
-			sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
-			sys.stdout.flush()
-			feature_nr += 1
-			file_basename = os.path.basename(feature).replace('.mfc', '')
+		for line in scripts:
+			# sample line:
+			# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
+			filename_ = line.split(' ')[0]
+			filename  = '_'.join(filename_.split('_')[1:])
+			sentence  = ' '.join(line.split(' ')[1:])
+			sentence_htk = fame_functions.word2htk(sentence)

-			# get words from scripts.
-			try:
-				script = scripts[scripts.str.contains(file_basename)]
-			except IndexError:
-				script = []
+			wav_file = os.path.join(wav_dir_, filename + '.wav')
+			if os.path.exists(wav_file) and pyhtk.can_be_ascii(sentence_htk) == 0:
+				if pyhtk.create_dictionary_without_log(
+					sentence_htk, global_ded, dictionary_file, lexicon_htk) == 0:
+					# when the file name is too long, HDMan command does not work.
+					# therefore first temporary dictionary_file is made, then renamed. 
+					shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))

-			if len(script) != 0:
-				script_id  = script.index[0]
-				script_txt = script.get(script_id)
-				script_words = script_txt.split(' ')
-				del script_words[0]
-
-				# check if all words can be found in the lexicon.
-				SCRIPT_WORDS = []
-				script_prons = []
-				is_in_lexicon = 1
-				for word in script_words:
-					WORD = word.upper()
-					SCRIPT_WORDS.append(WORD)
-					extracted = lexicon_htk[lexicon_htk['word']==WORD]
-					if len(extracted) == 0:
-						missing_words.append(word)
-					script_prons.append(extracted)
-					is_in_lexicon *= len(extracted)
-
-				# if all pronunciations are found in the lexicon, update scp and mlf files.
-				if is_in_lexicon:
-					# add the feature filename into the .scp file.
-					fscp.write("{}\n".format(feature))
-					i += 1
-
-					# add the words to the mlf file.
-					fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
-					#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
-					for word_ in SCRIPT_WORDS:
-						if word_[0] == '\'':
-							word_ = '\\' + word_
-						fmlf.write('{}\n'.format(word_))
-					fmlf.write('.\n')
-		print("\n{0} has {1} samples.\n".format(dataset, i))
-		np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
-
-		fscp.close()
-		fmlf.close()
+					label_file = os.path.join(label_dir_, filename + '.lab')
+					pyhtk.create_label_file(sentence_htk, label_file)
+				else:
+					os.remove(dictionary_file)
+		print("elapsed time: {}".format(time.time() - timer_start))


-		## generate phone level transcription 
-		print("generating phone level transcription...\n")
-		mkphones = output_dir + '\\label\\mkphones0.txt'
-		subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
-		subprocess.call(subprocessStr, shell=True)
+## ======================= make other required files =======================
+if make_htk_files:
+	timer_start = time.time()
+	print("==== making files required for HTK ====")
 	
-
-## ======================= combined scps and mlfs =======================
-if combine_files:
-	print("==== combine scps and mlfs ====\n")
-
-	fscp = open(hcompv_scp, 'wt')
-	fmlf = open(combined_mlf, 'wt')
+	print(">>> making a phonelist...")
+	pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)

 	for dataset in dataset_list:
-		fmlf.write("#!MLF!#\n")
+		wav_dir_	 = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		feature_dir_ = os.path.join(feature_dir, dataset)
+		label_dir_   = os.path.join(label_dir, dataset)
+		mlf_word  = os.path.join(label_dir, dataset + '_word.mlf')
+		mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
+
+		#print(">>> making a script file for {}...".format(dataset))
+		#listdir    = glob.glob(os.path.join(wav_dir_, '*.dic'))
+		#mfc_list   = [filename.replace(wav_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
+		#hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
+		#with open(hcompv_scp, 'wb') as f:
+		#	f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
+
+		print(">>> making a mlf file for {}...".format(dataset))
+		lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
+		with open(mlf_word, 'wb') as fmlf:
+			fmlf.write(bytes('#!MLF!#\n', 'ascii'))
+			for label_file in lab_list:
+				filename = os.path.basename(label_file)
+				fmlf.write(bytes('\"*/{}\"\n'.format(filename), 'ascii'))
+				with open(label_file) as flab:
+					lines = flab.read()
+				fmlf.write(bytes(lines + '.\n', 'ascii'))
+
+		print(">>> generating phone level transcription for {}...".format(dataset))
+		pyhtk.mlf_word2phone(lexicon_htk, mlf_phone, mlf_word, mkphones_led)
+		print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## ======================= extract features =======================
+if extract_features:
 	for dataset in dataset_list:
-			each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
-			each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+		timer_start = time.time()
+		print('==== extract features on dataset {} ===='.format(dataset))

-		with open(each_mlf, 'r') as fin:
-			lines = fin.read()
-			lines = lines.split('\n')
-		fmlf.write('\n'.join(lines[1:]))
+		wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+		label_dir_   = os.path.join(label_dir, dataset)
+		feature_dir_ = os.path.join(feature_dir, dataset)
+		fh.make_new_directory(feature_dir_)

-		with open(each_scp, 'r') as fin:
-			lines = fin.read()
-		fscp.write(lines)
+		# a script file for HCopy 
+		print(">>> making a script file for HCopy...")
+		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+		hcopy_scp.close()

-	fscp.close()
-	fmlf.close()
+		# get a list of features (hcopy.scp) 
+		# from the filelist in FAME! corpus.
+		#fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
+		# from the list of label files.
+		lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
+		feature_list = [
+			os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
+			+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
+				  for lab_file in lab_list]
+		with open(hcopy_scp.name, 'wb') as f:
+			f.write(bytes('\n'.join(feature_list), 'ascii'))
+		
+		# extract features.
+		print(">>> extracting features on {}...".format(dataset))
+		pyhtk.wav2mfc(config_hcopy, hcopy_scp.name)
+		os.remove(hcopy_scp.name)
+
+		# make hcompv.scp.
+		print(">>> making a script file for {}...".format(dataset))
+		listdir    = glob.glob(os.path.join(label_dir_, '*.dic'))
+		mfc_list   = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
+		hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
+		with open(hcompv_scp, 'wb') as f:
+			f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
+
+		print("elapsed time: {}".format(time.time() - timer_start))


 ## ======================= flat start monophones =======================
 if flat_start:
-	subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
-	subprocess.call(subprocessStr, shell=True)
+	hcompv_scp = os.path.join(tmp_dir, 'test.scp')
+
+	timer_start = time.time()
+	print('==== flat start ====')
+	pyhtk.flat_start(config_train, hcompv_scp, model0_dir, prototype)

 	# allocate mean & variance to all phones in the phone list
-	subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name 
-	subprocess.call(subprocessStr, shell=True)
+	pyhtk.create_hmmdefs(
+		os.path.join(model0_dir, 'proto39'),
+	    os.path.join(model0_dir, 'hmmdefs'), 
+		phonelist_txt)
+	print("elapsed time: {}".format(time.time() - timer_start))


 ## ======================= estimate monophones =======================
-if train_model:
-	iter_num_max = 3
-	for mix_num in [128, 256, 512, 1024]:
-		for iter_num in range(1, iter_num_max+1):
-			print("===== mix{}, iter{} =====".format(mix_num, iter_num))
-			iter_num_pre = iter_num - 1
-			modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
-			if not os.path.exists(modelN_dir):
-				os.makedirs(modelN_dir)
+if train_model_without_sp:
+	hcompv_scp = os.path.join(tmp_dir, 'test.scp')
+	mlf_file = os.path.join(label_dir, 'test_phone.mlf')
+	output_dir = os.path.join(model_dir, 'hmm1')
+	fh.make_new_directory(output_dir)

-			if iter_num == 1 and mix_num == 1:
-				modelN_dir_pre = model0_dir
-			else:
-				modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)
-		
-			## re-estimation
-			subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
-			subprocess.call(subprocessStr, shell=True)
-
-		mix_num_next = mix_num * 2
-		modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
-		if not os.path.exists(modelN_dir_next):
-			os.makedirs(modelN_dir_next)
-	
-		header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
-		with open(header_file, 'w') as fout:
-			fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
-
-		subprocessStr =	'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
-		
-		subprocess.call(subprocessStr, shell=True)
+	print('==== train model without sp ====')
+	if not os.path.exists(os.path.join(output_dir, 'iter0')):
+		shutil.copytree(model0_dir, os.path.join(output_dir, 'iter0'))
+	niter = 1
+	for niter in range(1, 5):
+		timer_start = time.time()
+		hmm_n = 'iter' + str(niter)
+		hmm_n_pre = 'iter' + str(niter-1)
+		modeln_dir	   = os.path.join(output_dir, hmm_n)
+		modeln_dir_pre = os.path.join(output_dir, hmm_n_pre) 
 		
+		# re-estimation
+		fh.make_new_directory(modeln_dir)
+		pyhtk.re_estimation(
+			config_train,
+			os.path.join(modeln_dir_pre, 'proto39'),
+			os.path.join(modeln_dir_pre, hmmdefs_name), 
+			modeln_dir,
+			hcompv_scp, phonelist_txt,
+			mlf_file=mlf_file)
+		print("elapsed time: {}".format(time.time() - timer_start))
--- a/acoustic_model/fame_test.py
+++ b/acoustic_model/fame_test.py
@@ -0,0 +1,134 @@
+import sys
+import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+from collections import Counter
+import time
+import re
+
+import numpy as np
+import pandas as pd
+
+import fame_functions
+import defaultfiles as default
+sys.path.append(default.toolbox_dir)
+from phoneset import fame_ipa, fame_asr
+import convert_phoneset
+
+lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
+lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
+lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
+
+## check if all the phones in lexicon.ipa are in fame_ipa.py.
+#timer_start = time.time()
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')	
+#phoneset_py = fame_ipa.phoneset
+#print("phones which is in lexicon.ipa but not in fame_ipa.py:\n{}".format(
+#	set(phoneset_lex) - set(phoneset_py)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+# check which word has the phone. 
+#timer_start = time.time()
+#extracted = find_phone(lexicon_ipa, 'ⁿ')
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## get the correspondence between lex_ipa and lex_asr.	
+lex_asr  = fame_functions.load_lexicon(lexicon_asr)
+lex_ipa  = fame_functions.load_lexicon(lexicon_ipa)		
+if 0:
+	timer_start = time.time()
+	translation_key_ipa2asr, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
+	print("elapsed time: {}".format(time.time() - timer_start))
+
+	np.save(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'), translation_key_ipa2asr)
+	np.save(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'), phone_unknown)
+else:
+	translation_key_ipa2asr = np.load(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy')).item()
+	phone_unknown   = np.load(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'))
+	phone_unknown   = list(phone_unknown)
+
+# manually check the correspondence for the phone in phone_unknown.
+#p = phone_unknown[0]
+#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
+
+#for word in lex_ipa_['word']:
+#	ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+#	if np.sum(lex_asr['word'] == word) > 0:
+#		asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+	
+#		ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
+#		asr_list = asr.split(' ')
+#		if p in ipa_list and (len(ipa_list) == len(asr_list)):
+#			print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+#			for ipa_, asr_ in zip(ipa_list, asr_list):
+#				if ipa_ in phone_unknown:
+#					translation_key_ipa2asr[ipa_] = asr_
+#					phone_unknown.remove(ipa_)
+
+translation_key_ipa2asr['ə:'] = 'ə'
+translation_key_ipa2asr['r.'] = 'r'
+translation_key_ipa2asr['r:'] = 'r'
+np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
+
+
+## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
+#timer_start = time.time()
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
+#phoneset_lex.remove("")
+#phoneset_asr = list(set(translation_key_ipa2asr.values()))
+#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
+#	set(phoneset_lex) - set(phoneset_asr)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## check if all the phones in lexicon.htk are in fame_asr.py.
+#timer_start = time.time()
+#phoneset_htk = fame_asr.phoneset_htk
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+#phoneset_lex.remove('')
+#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+#	set(phoneset_htk) - set(phoneset_lex)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+## statistics over the lexicon
+#lex_htk = fame_functions.load_lexicon(lexicon_htk)
+#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+#c = Counter(phones_all)
+
+#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+#	lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+## to_csv does not work with a space separator; therefore all tabs must be replaced manually.
+##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+
+## check which letters are not coded in ascii. 
+print('asr phones which cannot be coded in ascii:\n')
+for i in fame_asr.phoneset_short:
+	try:
+		i_encoded = i.encode("ascii")
+		#print("{0} --> {1}".format(i, i.encode("ascii")))
+	except UnicodeEncodeError:
+		print(">>> {}".format(i))
+
+print("letters in the scripts which is not coded in ascii:\n")
+for dataset in ['train', 'devel', 'test']:
+	timer_start = time.time()
+
+	script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+	with open(script_list, "rt", encoding="utf-8") as fin:
+		scripts = fin.read().split('\n')
+
+	for line in scripts:
+		sentence  = ' '.join(line.split(' ')[1:])
+		sentence_htk = fame_functions.word2htk(sentence)
+
+		#if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
+		try:
+			sentence_htk = bytes(sentence_htk, 'ascii')
+		except UnicodeEncodeError:
+			print(sentence)
+			print(sentence_htk)
+
--- a/acoustic_model/phoneset/fame_asr.py
+++ b/acoustic_model/phoneset/fame_asr.py
@@ -0,0 +1,137 @@
+""" definition of the phones to be used. """
+
+# phones in {FAME}/lexicon/lex.asr
+phoneset = [
+	# vowels
+	'a',
+	'a:',
+	'e',
+	'e:',
+	'i',
+	'i:',
+	'i̯',
+	'o',
+	'o:',
+	'ö',
+	'ö:',
+	'u',
+	'u:',
+	'ü',
+	'ü:',
+	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone. 
+	'ṷ',
+	'y',
+	'ɔ',
+	'ɔ:',
+	'ɔ̈', 
+	'ɔ̈:',
+	'ə',
+	'ɛ',
+	'ɛ:',
+	'ɪ',
+	'ɪ:',
+
+	# plosives
+	'p', 
+	'b', 
+	't',
+	'd', 
+	'k',
+	'g',
+	'ɡ', # = 'g'
+
+	# nasals
+	'm',
+	'n',
+	'ŋ',
+	
+	# fricatives
+	'f',
+	'v',
+	's',
+	's:',
+	'z',
+	'x',
+	'h',
+	
+	# tap and flap
+	'r',
+	'r:',
+
+	# approximant
+	'j',
+	'l'
+	]
+
+
+## reduce the number of phones.
+# phones which seldom occur are replaced with other, more frequent phones.
+# replacements are based on the advice from Martijn Wieling.
+reduction_key = {
+	'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
+	}
+# phones to drop from the reduced set ('ú' is already commented out in phoneset; it is listed here just to be sure).
+phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
+
+phoneset_short = [reduction_key.get(i, i) for i in phoneset
+				  if not i in phones_to_be_removed]
+phoneset_short = list(set(phoneset_short))
+phoneset_short.sort()
+
+
+## translation_key to htk format (ascii).
+# phones which give UnicodeEncodeError when phone.encode("ascii") is called
+# are replaced with other characters.
+translation_key_asr2htk = {
+	'i̯': 'i_',
+	'ṷ': 'u_',
+
+	# on the analogy of German umlaut, 'e' is used.
+	'ö': 'oe', 'ö:': 'oe:',
+	'ü': 'ue', 'ü:': 'ue:',
+
+	# on the analogy of Chinese...
+	'ŋ': 'ng',
+				
+	# refer to Xsampa. 
+	'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
+	'ɛ': 'E', 'ɛ:': 'E:',
+	'ɪ': 'I', 'ɪ:': 'I:', 
+
+	# it is @ in Xsampa, but that is not handy on HTK.
+	'ə': 'A'
+	}
+phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
+
+#not_in_ascii = [
+#	'\'', 
+#	'â', 'ê', 'ô', 'û', 'č', 
+#	'à', 'í', 'é', 'è', 'ú', 'ć', 
+#	'ä', 'ë', 'ï', 'ö', 'ü'
+#]
+translation_key_word2htk = {
+	#'\'': '\\\'',
+	'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
+	'à':'a2', 'è':'e2', 	
+	'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3', 
+	'č':'c4',
+	'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
+}
+#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
+
+
+
+## the list of multi character phones. 
+# for example, the length of 'a:' is 2, but in the code it is treated as one phone.
+
+# original.
+multi_character_phones = [i for i in phoneset if len(i) > 1]
+multi_character_phones.sort(key=len, reverse=True)
+
+# phoneset reduced.
+multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
+multi_character_phones_short.sort(key=len, reverse=True)
+
+# htk compatible.
+multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
+multi_character_phones_htk.sort(key=len, reverse=True)
--- a/acoustic_model/phoneset/fame_ipa.py
+++ b/acoustic_model/phoneset/fame_ipa.py
@@ -1,7 +1,6 @@
 """ definition of the phones to be used. """

-## phones in IPA.
-phoneset_ipa = [
+phoneset = [
 	# vowels
 	'i̯',
 	'i̯ⁿ',
@@ -35,7 +34,7 @@ phoneset_ipa = [
 	'ṷ',
 	'ṷ.',
 	'ṷⁿ',
-	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. 
+	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone. 
 	'u',
 	'uⁿ',
 	'u.',
@@ -101,7 +100,8 @@ phoneset_ipa = [
 	'l'
 	]

+
 ## the list of multi character phones. 
 # for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
-multi_character_phones_ipa = [i for i in phoneset_ipa if len(i) > 1]
-multi_character_phones_ipa.sort(key=len, reverse=True)
+multi_character_phones = [i for i in phoneset if len(i) > 1]
+multi_character_phones.sort(key=len, reverse=True)
--- a/acoustic_model/phoneset/fame_ipa2asr.npy
+++ b/acoustic_model/phoneset/fame_ipa2asr.npy
--- a/acoustic_model/phoneset/output_get_translation_key_phone_unknown.npy
+++ b/acoustic_model/phoneset/output_get_translation_key_phone_unknown.npy
--- a/acoustic_model/phoneset/output_get_translation_key_translation_key.npy
+++ b/acoustic_model/phoneset/output_get_translation_key_translation_key.npy
Author	SHA1	Message	Date
yemaozi88	f6e7c8eefa	bug related encoding on label file is fixed.	2019-02-04 13:46:27 +01:00
yemaozi88	322a8a0079	label files are extracted. hcompv_scp is made.	2019-02-03 13:54:37 +01:00
yemaozi88	22cccfb61d	fix the bug there are characters in the lexicon which cannot be described in ascii.	2019-02-03 00:34:35 +01:00
yemaozi88	dc6b7b84b6	lexicon is made.	2019-01-29 21:52:11 +01:00
yemaozi88	8cda93de75	fame_asr phoneset is added including reduced version and htk compatible version.	2019-01-28 12:34:20 +01:00