The correspondence between lex.asr and lex.ipa is now obtained automatically. Docstring headers are added to the functions in fame_functions.py.

The phoneset is defined in fame_phoneset.py. The translation key is obtained based on that definition.
2019-01-27 23:52:33 +01:00 · 2019-01-27 01:34:04 +01:00
11 changed files with 351 additions and 125 deletions
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/_tmp/phone_to_be_searched.npy
+++ b/_tmp/phone_to_be_searched.npy
--- a/_tmp/translation_key.npy
+++ b/_tmp/translation_key.npy
--- a/acoustic_model.sln
+++ b/acoustic_model.sln
@@ -10,7 +10,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
 		..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
 		..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
 		..\toolbox\evaluation.py = ..\toolbox\evaluation.py
-		..\toolbox\toolbox\file_handling.py = ..\toolbox\toolbox\file_handling.py
 		..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
 		..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
 		..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
@@ -23,7 +22,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
 		..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py
 	EndProjectSection
 EndProject
-Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "pyhtk", "..\pyhtk\pyhtk\pyhtk.pyproj", "{75FCEFAF-9397-43FC-8189-DE97ADB77AA5}"
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "toolbox", "..\toolbox\toolbox.pyproj", "{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -33,8 +32,8 @@ Global
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{75FCEFAF-9397-43FC-8189-DE97ADB77AA5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{75FCEFAF-9397-43FC-8189-DE97ADB77AA5}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Release|Any CPU.ActiveCfg = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -23,12 +23,18 @@
  </PropertyGroup>
  <ItemGroup>
    <Compile Include="check_novoapi.py" />
+    <Compile Include="convert_phone_set.py">
+      <SubType>Code</SubType>
+    </Compile>
    <Compile Include="convert_xsampa2ipa.py">
      <SubType>Code</SubType>
    </Compile>
    <Compile Include="defaultfiles.py">
      <SubType>Code</SubType>
    </Compile>
+    <Compile Include="fame_phoneset.py">
+      <SubType>Code</SubType>
+    </Compile>
    <Compile Include="fa_test.py">
      <SubType>Code</SubType>
    </Compile>
--- a/acoustic_model/convert_phone_set.py
+++ b/acoustic_model/convert_phone_set.py
@@ -0,0 +1,29 @@
+"""Module to convert phonemes."""
+
+def multi_character_tokenize(line, multi_character_tokens):
+	"""Tries to match one of the tokens in multi_character_tokens at each position of line, starting at position 0,
+	if so tokenizes and eats that token. Otherwise tokenizes a single character"""
+	while line != '':
+		for token in multi_character_tokens:
+			if line.startswith(token) and len(token) > 0:
+				yield token
+				line = line[len(token):]
+				break
+		else:
+			yield line[:1]
+			line = line[1:]
+
+
+def split_word(word, multi_character_phones):
+	"""
+	split a line by given phoneset.
+	
+	Args:
+		word (str): a word written in given phoneset.
+		multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_phoneset.py. 
+
+	Returns:
+		(word_separated) (list): the word split into the phones of the given phoneset.
+
+	"""
+	return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -4,7 +4,8 @@ import os

 #cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

-htk_dir = r'C:\Aki\htk_fame'
+#htk_dir = r'C:\Aki\htk_fame'
+htk_dir = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk'

 config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
 #config_train = os.path.join(cygwin_dir, 'config', 'config.train')
@@ -28,22 +29,21 @@ config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
 #filePhoneList = config['pyHTK']['filePhoneList']
 #AcousticModel = config['pyHTK']['AcousticModel']

-repo_dir = r'C:\Users\A.Kunikoshi\source\repos'
+repo_dir = r'C:\Users\Aki\source\repos'
 ipa_xsampa_converter_dir    = os.path.join(repo_dir, 'ipa-xsama-converter')
 forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment')
 accent_classification_dir   = os.path.join(repo_dir, 'accent_classification', 'accent_classification')
-pyhtk_dir                   = os.path.join(repo_dir, 'pyhtk', 'pyhtk')
-toolbox_dir					= os.path.join(repo_dir, 'toolbox', 'toolbox')
+toolbox_dir					= os.path.join(repo_dir, 'toolbox')

-htk_config_dir = r'c:\Users\A.Kunikoshi\source\repos\forced_alignment\forced_alignment\data\htk\preset_models\aki_dutch_2017'
-config_hvite = os.path.join(htk_config_dir, 'config.HVite')
+#htk_config_dir = r'c:\Users\A.Kunikoshi\source\repos\forced_alignment\forced_alignment\data\htk\preset_models\aki_dutch_2017'
+#config_hvite = os.path.join(htk_config_dir, 'config.HVite')
 #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
-acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+#acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
+#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')

 WSL_dir   = r'C:\OneDrive\WSL'
 #fame_dir        = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'f:\_corpus\fame'
+fame_dir = r'd:\_corpus\fame'

 fame_s5_dir     = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -1,5 +1,5 @@
 import os
-os.chdir(r'C:\Users\A.Kunikoshi\source\repos\acoustic_model\acoustic_model')
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

 import sys
 from collections import Counter
@@ -9,24 +9,8 @@ import numpy as np
 import pandas as pd

 import defaultfiles as default
-
-#sys.path.append(default.forced_alignment_module_dir)
-#from forced_alignment import convert_phone_set
-
-#def find_phone(lexicon_file, phone):
-#	""" Search where the phone is used in the lexicon. """
-#	with open(lexicon_file, "rt", encoding="utf-8") as fin:
-#		lines = fin.read()
-#		lines = lines.split('\n')
-	
-#	extracted = []
-#	for line in lines:
-#		line = line.split('\t')
-#		if len(line) > 1:
-#			pronunciation = line[1]
-#			if phone in pronunciation:
-#				extracted.append(line)
-#	return extracted
+import fame_phoneset
+import convert_phone_set


 #def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
@@ -126,25 +110,6 @@ import defaultfiles as default

 #    return ipa

-def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
-	""" Make a script file for HCopy using the filelist in FAME! corpus. """
-	
-	filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
-	with open(filelist_txt) as fin:
-		filelist = fin.read()
-		filelist = filelist.split('\n')
-	
-	with open(hcopy_scp, 'w') as fout:
-		for filename_ in filelist:
-			filename = filename_.replace('.TextGrid', '')
-
-			if len(filename) > 3: # remove '.', '..' and ''
-				wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
-				mfc_file = os.path.join(feature_dir, filename + '.mfc')
-
-				fout.write(wav_file + '\t' + mfc_file + '\n')
-
-
 #def make_filelist(input_dir, output_txt):
 #	""" Make a list of files in the input_dir. """
 #	filenames = os.listdir(input_dir)
@@ -189,64 +154,147 @@ def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_s
 #                            f.write('{0}\t{1}\n'.format(WORD, key))


+def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
+	""" Make a script file for HCopy using the filelist in FAME! corpus. 
+	
+	Args:
+		fame_dir (path): the directory of FAME corpus.
+		dataset (str): 'devel', 'test' or 'train'.
+		feature_dir (path): the directory where feature will be stored.
+		hcopy_scp (path): a script file for HCopy to be made.
+
+	"""
+	filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
+	with open(filelist_txt) as fin:
+		filelist = fin.read()
+		filelist = filelist.split('\n')
+	
+	with open(hcopy_scp, 'w') as fout:
+		for filename_ in filelist:
+			filename = filename_.replace('.TextGrid', '')
+
+			if len(filename) > 3: # remove '.', '..' and ''
+				wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
+				mfc_file = os.path.join(feature_dir, filename + '.mfc')
+
+				fout.write(wav_file + '\t' + mfc_file + '\n')



 def load_lexicon(lexicon_file):
+	""" load lexicon file as Data Frame.
+
+	Args:
+		lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
+	
+	Returns:
+		lex (df): lexicon as Data Frame, which has columns 'word' and 'pronunciation'.
+
+	"""
 	lex = pd.read_csv(lexicon_file, delimiter='\t', header=None, encoding="utf-8")
 	lex.rename(columns={0: 'word', 1: 'pronunciation'}, inplace=True)
 	return lex


-def get_phonelist(lexicon_asr):
-	""" Make a list of phones which appears in the lexicon. """
-
-	#with open(lexicon_file, "rt", encoding="utf-8") as fin:
-	#	lines = fin.read()
-	#	lines = lines.split('\n')
-	#	phonelist = set([])
-	#	for line in lines:
-	#		line = line.split('\t')
-	#		if len(line) > 1:
-	#			pronunciation = set(line[1].split())
-	#			phonelist = phonelist | pronunciation
-	lex = load_lexicon(lexicon_asr)
-	return set(' '.join(lex['pronunciation']).split(' '))
-
-import time
-
-timer_start = time.time()
-
-#def get_translation_key():
-dir_tmp = r'c:\Users\A.Kunikoshi\source\repos\acoustic_model\_tmp'
-lexicon_ipa = r'f:\_corpus\FAME\lexicon\lex.ipa'
-lexicon_asr = r'f:\_corpus\FAME\lexicon\lex.asr'
-
-lex_ipa = load_lexicon(lexicon_ipa)
-lex_asr = load_lexicon(lexicon_asr)
-if 0:
-	phone_to_be_searched = get_phonelist(lexicon_asr)
-	translation_key = dict()
-	for word in lex_asr['word']:
-		if np.sum(lex_asr['word'] == word) == 1 and np.sum(lex_ipa['word'] == word) == 1:
-			asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
-			ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+def get_phoneset_from_lexicon(lexicon_file, phoneset='asr'):
+	""" Make a list of phones which appears in the lexicon. 
 	
+	Args:
+		lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
+		phoneset (str): the phoneset with which lexicon_file is written. 'asr'(default) or 'ipa'.
+
+	Returns:
+		(list_of_phones) (set): the set of phones included in the lexicon_file.
+
+	"""
+	assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
+
+	lex = load_lexicon(lexicon_file)
+	if phoneset == 'asr':
+		return set(' '.join(lex['pronunciation']).split(' '))
+	elif phoneset == 'ipa':
+		join_pronunciations = ''.join(lex['pronunciation'])
+		return set(convert_phone_set.split_word(join_pronunciations, fame_phoneset.multi_character_phones_ipa))
+
+
+def extract_unknown_phones(ipa, known_phones):
+	"""extract unknown phones in the pronunciation written in IPA.
+
+	Args:
+		ipa (str): a pronunciation written in IPA. 
+		known_phones (list): list of phones already known.
+
+	Returns:
+		(list_of_phones) (list): unknown phones not included in 'known_phones'.
+
+	"""
+	ipa_split = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+	return [i for i in ipa_split if not i in known_phones]
+
+
+def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
+	""" get correspondence between lexicon_file_ipa and lexicon_file_asr.
+
+	Args:
+		lexicon_file_ipa (path): lexicon in the format of 'word' /t 'pronunciation (IPA)'.
+		lexicon_file_asr (path): lexicon in the format of 'word' /t 'pronunciation (asr)'.
+			the each character of 'pronunciation' should be delimited by ' '.
+
+	Returns:
+		translation_key (dict): translation key from ipa to asr. 
+		(phone_unknown) (list): the list of IPA phones for which no corresponding asr phone was found.
+
+	"""
+	lex_ipa = load_lexicon(lexicon_file_ipa)
+	lex_asr = load_lexicon(lexicon_file_asr)
+	phone_unknown = fame_phoneset.phoneset_ipa[:]
+	translation_key = dict()
+	for word in lex_ipa['word']:
+		if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
+			ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+			asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+	
+			ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
 			asr_list = asr.split(' ')
-			# if there are phones which is not in phone_to_be_searched
-			if len([True for i in asr_list if i in phone_to_be_searched]) > 0:
-				if(len(ipa) == len(asr_list)):
-					print("{0}: {1} --> {2}".format(word, ipa, asr))
-					for ipa_, asr_ in zip(ipa, asr_list):
-						if asr_ in phone_to_be_searched:
-							#if not translation_key[ipa_] == asr_:
-							translation_key[ipa_] = asr_
-							phone_to_be_searched.remove(asr_)

-	print("elapsed time: {}".format(time.time() - timer_start))
+			# if there are phones which is not in phone_unknown
+			#if len([True for i in asr_list if i in phone_unknown]) > 0:
+			if(len(ipa_list) == len(asr_list)):
+				print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+				for ipa_, asr_ in zip(ipa_list, asr_list):
+					if ipa_ in phone_unknown:
+						translation_key[ipa_] = asr_
+						phone_unknown.remove(ipa_)
+	return translation_key, list(phone_unknown)

-	np.save(os.path.join(dir_tmp, 'translation_key.npy'), translation_key)
-	np.save(os.path.join(dir_tmp, 'phone_to_be_searched.npy'), phone_to_be_searched)
-else:
-	translation_key		 = np.load(os.path.join(dir_tmp, 'translation_key.npy')).item()
-	phone_to_be_searched = np.load(os.path.join(dir_tmp, 'phone_to_be_searched.npy')).item()
+
+def find_phone(lexicon_file, phone, phoneset='ipa'):
+	""" extract rows where the phone is used in the lexicon_file. 
+
+	Args:
+		lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
+		phone (str): the phone to be searched.
+		phoneset (str): the phoneset with which lexicon_file is written. 'asr' or 'ipa'(default).
+
+	Returns:
+		extracted (df): rows where the phone is used.
+
+	ToDo:
+		* develop when the phoneset == 'asr'.
+
+	"""
+	assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
+	
+	lex = load_lexicon(lexicon_file)
+	
+	# to reduce the calculation time, only target rows which include 'phone' at least once. 
+	lex_ = lex[lex['pronunciation'].str.count(phone)>0]
+
+	extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
+	for index, row in lex_.iterrows():
+		if phoneset == 'ipa':
+			pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_phoneset.multi_character_phones_ipa)
+		if phone in pronunciation:
+			extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
+			extracted  = extracted.append(extracted_, ignore_index=True)
+	return extracted
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -1,21 +1,21 @@
 import sys
 import os
-os.chdir(r'C:\Users\A.Kunikoshi\source\repos\acoustic_model\acoustic_model')
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

 import tempfile
 #import configparser
 #import subprocess
 #from collections import Counter
+import time

 #import numpy as np
 #import pandas as pd

 import fame_functions
 import defaultfiles as default
-sys.path.append(default.pyhtk_dir)
-import pyhtk
 sys.path.append(default.toolbox_dir)
-import file_handling
+import file_handling as fh
+from htk import pyhtk


 ## ======================= user define =======================
@@ -28,8 +28,8 @@ import file_handling
 dataset_list = ['devel', 'test', 'train']

 # procedure
-extract_features  = 1
-#conv_lexicon	  = 0
+extract_features  = 0
+conv_lexicon	  = 1
 #check_lexicon	  = 0
 #make_mlf		  = 0
 #combine_files	  = 0
@@ -85,14 +85,12 @@ if not os.path.exists(tmp_dir):
 ## ======================= extract features =======================
 if extract_features:
 	for dataset in dataset_list:
-	#for dataset in ['test']:
 		print('==== {} ===='.format(dataset))

 		# a script file for HCopy 
 		print(">>> making a script file for HCopy... \n")
 		hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 		hcopy_scp.close()
-		#hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'HCopy.scp')

 		# get a list of features (hcopy.scp) from the filelist in FAME! corpus
 		feature_dir_ = os.path.join(feature_dir, dataset)
@@ -102,31 +100,70 @@ if extract_features:
 		# extract features
 		print(">>> extracting features... \n")
 		fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
-
-		#subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
-		#subprocess.call(subprocessStr, shell=True)
 		pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)

 		# a script file for HCompV
 		print(">>> making a script file for HCompV... \n")
-
-## ======================= make a list of features =======================
-#if make_feature_list:
-#	print("==== make a list of features ====\n")
-
-#	for dataset in dataset_list:
-#		print(dataset)
-
-		#feature_dir = output_dir + '\\mfc\\' + dataset
 		hcompv_scp  = os.path.join(tmp_dir, dataset + '.scp')
-
-		#am_func.make_filelist(feature_dir, hcompv_scp)
-		file_handling.make_filelist(feature_dir_, hcompv_scp, '.mfc')
+		fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')


 ## ======================= convert lexicon from ipa to fame_htk =======================
 if conv_lexicon:
 	print('==== convert lexicon from ipa 2 fame ====\n')
+	
+	#dir_out = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
+	lexicon_dir = os.path.join(default.fame_dir, 'lexicon') 
+	lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
+	lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+
+	# get the correspondence between lex_ipa and lex_asr.
+	lex_asr  = fame_functions.load_lexicon(lexicon_asr)
+	lex_ipa  = fame_functions.load_lexicon(lexicon_ipa)		
+	if 1:
+		timer_start = time.time()
+		translation_key, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
+		print("elapsed time: {}".format(time.time() - timer_start))
+
+		np.save('translation_key_ipa2asr.npy', translation_key)
+		np.save('phone_unknown.npy', phone_unknown)
+	else:
+		translation_key = np.load('translation_key_ipa2asr.npy').item()
+		phone_unknown   = np.load('phone_unknown.npy')
+		phone_unknown   = list(phone_unknown)
+
+
+	## manually check the correspondence for the phone in phone_unknown.
+	#p = phone_unknown[0]
+	#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
+
+	#for word in lex_ipa_['word']:
+	#	ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
+	#	if np.sum(lex_asr['word'] == word) > 0:
+	#		asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
+	
+	#		ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
+	#		asr_list = asr.split(' ')
+	#		if p in ipa_list and (len(ipa_list) == len(asr_list)):
+	#			print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
+	#			for ipa_, asr_ in zip(ipa_list, asr_list):
+	#				if ipa_ in phone_unknown:
+	#					translation_key[ipa_] = asr_
+	#					phone_unknown.remove(ipa_)
+
+
+	## check if all the phones in lexicon_ipa are in fame_phoneset.py.
+	#timer_start = time.time()
+	#phoneset_lex = get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
+	#print("elapsed time: {}".format(time.time() - timer_start))
+	
+	#phoneset_py = fame_phoneset.phoneset_ipa
+	#set(phoneset_lex) - set(phoneset_py)
+
+	##timer_start = time.time()
+	##extracted = find_phone(lexicon_ipa, 'ⁿ')
+	##print("elapsed time: {}".format(time.time() - timer_start))
+

 	# lex.asr is Kaldi compatible version of lex.ipa.
 	# to check... 
@@ -140,13 +177,13 @@ if conv_lexicon:
 	#		fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))

 	# convert each lexicon from ipa description to fame_htk phoneset.
-	am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-	am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+	#am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
+	#am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)

 	# combine lexicon
 	# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
 	# therefore there is no overlap between lex_asr and lex_oov.   
-	am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+	#am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)


 ## ======================= check if all the phones are successfully converted =======================
--- a/acoustic_model/fame_phoneset.py
+++ b/acoustic_model/fame_phoneset.py
@@ -0,0 +1,107 @@
+""" definition of the phones to be used. """
+
+## phones in IPA.
+phoneset_ipa = [
+	# vowels
+	'i̯',
+	'i̯ⁿ',
+	'y',
+	'i',
+	'i.',
+	'iⁿ',
+	'i:',
+	'i:ⁿ',
+	'ɪ',
+	'ɪⁿ',
+	'ɪ.',
+	#'ɪ:', # not included in lex.ipa
+	'ɪ:ⁿ',
+	'e',
+	'e:',
+	'e:ⁿ',
+	'ə',
+	'əⁿ',
+	'ə:',
+	'ɛ',
+	'ɛ.',
+	'ɛⁿ',
+	'ɛ:',
+	'ɛ:ⁿ',
+	'a',
+	'aⁿ',
+	'a.',
+	'a:',
+	'a:ⁿ',
+	'ṷ',
+	'ṷ.',
+	'ṷⁿ',
+	#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. 
+	'u',
+	'uⁿ',
+	'u.',
+	'u:',
+	'u:ⁿ',
+	'ü',
+	'ü.',
+	'üⁿ',
+	'ü:',
+	'ü:ⁿ',
+	'o',
+	'oⁿ',
+	'o.',
+	'o:',
+	'o:ⁿ',
+	'ö',
+	'ö.',
+	'öⁿ',
+	'ö:',
+	'ö:ⁿ',
+	'ɔ',
+	'ɔ.',
+	'ɔⁿ',
+	'ɔ:',
+	'ɔ:ⁿ',
+	#'ɔ̈', # not included in lex.ipa 
+	'ɔ̈.',
+	'ɔ̈:',
+
+	# plosives
+	'p', 
+	'b', 
+	't',
+	'tⁿ',
+	'd', 
+	'k',
+	'g',
+	'ɡ', # = 'g'
+
+	# nasals
+	'm',
+	'n',
+	'ŋ',
+	
+	# fricatives
+	'f',
+	'v',
+	's',
+	's:',
+	'z',
+	'zⁿ',
+	'x',
+	'h',
+
+	# tap and flap
+	'r',
+	'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.   
+	'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
+
+	# approximant
+	'j',
+	'j.',
+	'l'
+	]
+
+## the list of multi character phones. 
+# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
+multi_character_phones_ipa = [i for i in phoneset_ipa if len(i) > 1]
+multi_character_phones_ipa.sort(key=len, reverse=True)
Author	SHA1	Message	Date
yemaozi88	87abbbb95a	The correspondence between lex.asr and lex.ipa is now obtained automatically. Docstring headers are added to the functions in fame_functions.py.	2019-01-27 23:52:33 +01:00
yemaozi88	813f013d7a	The phoneset is defined in fame_phoneset.py. The translation key is obtained based on that definition.	2019-01-27 01:34:04 +01:00