diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 92f0791..3ce8f85 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc
index 8bb2ce1..869323d 100644
Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ
diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj
index f2c3827..8faedc8 100644
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -23,7 +23,7 @@
-
+
Code
@@ -32,8 +32,6 @@
Code
-
-
Code
@@ -52,9 +50,20 @@
Code
+
+
+
+
+
+
+
+
+
+
+
{1}".format(i, i.encode("ascii")))
-# except UnicodeEncodeError:
-# print(">>> {}".format(i))
-
-
-## the list of multi character phones.
-# for example, the length of 'a:' is 3, but in the codes it is treated as one letter.
-
-# original.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
-
-# phonset reduced.
-multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
-multi_character_phones_short.sort(key=len, reverse=True)
-
-# htk compatible.
-multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
-multi_character_phones_htk.sort(key=len, reverse=True)
diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py
index 380f602..5fe60e5 100644
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -1,4 +1,5 @@
import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
from collections import Counter
@@ -8,38 +9,8 @@ import numpy as np
import pandas as pd
import defaultfiles as default
-from phoneset import fame_ipa
-import convert_phone_set
-
-
-#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
-# """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """
-
-# lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
-# with open(lexicon_file_out, "w", encoding="utf-8") as fout:
-# for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
-# pronunciation_no_space = pronunciation.replace(' ', '')
-# pronunciation_famehtk = convert_phone_set.ipa2famehtk(pronunciation_no_space)
-# if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
-# fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
-
-
-#def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
-# """ Combine two lexicon files and sort by words. """
-
-# with open(lexicon_file1, "rt", encoding="utf-8") as fin:
-# lines1 = fin.read()
-# lines1 = lines1.split('\n')
-# with open(lexicon_file2, "rt", encoding="utf-8") as fin:
-# lines2 = fin.read()
-# lines2 = lines2.split('\n')
-
-# lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
-# lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
-# lex = pd.concat([lex1, lex2])
-# lex = lex.sort_values(by='word', ascending=True)
-# lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
-
+import convert_phoneset
+from phoneset import fame_ipa, fame_asr
#def read_fileFA(fileFA):
# """
@@ -291,4 +262,74 @@ def find_phone(lexicon_file, phone, phoneset_name='ipa'):
if phone in pronunciation:
extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
extracted = extracted.append(extracted_, ignore_index=True)
- return extracted
\ No newline at end of file
+ return extracted
+
+
+def asr2htk_space_delimited(pronunciation):
+    """Translate a space delimited asr pronunciation into the htk phoneset.
+
+    Args:
+        pronunciation (str): space delimited asr phones.
+
+    Returns:
+        str: the phones in htk format (ascii), joined with single spaces.
+    """
+    kept = [phone for phone in pronunciation.split(' ')
+            if phone not in fame_asr.phones_to_be_removed]
+    reduced = [fame_asr.reduction_key.get(phone, phone) for phone in kept]
+    htk_phones = convert_phoneset.convert_phoneset(reduced, fame_asr.translation_key_asr2htk)
+    return ' '.join(htk_phones)
+
+
+def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
+    """ Convert a lexicon file from asr to htk format (ascii).
+
+    Args:
+        lexicon_file_asr (path): a lexicon file written in asr format e.g. fame/lex.asr.
+        lexicon_file_htk (path): output lexicon file in htk format (ascii); written as
+            tab-separated 'word', 'pronunciation' columns without header or index.
+
+    """
+    lex_asr = load_lexicon(lexicon_file_asr)
+
+    # `DataFrame.ix` was deprecated in pandas 0.20 and removed in 1.0; the column
+    # order is fixed by plain column selection so it is always word, pronunciation.
+    lex_htk = pd.DataFrame({
+        'word': lex_asr['word'],
+        'pronunciation': lex_asr['pronunciation'].apply(asr2htk_space_delimited)
+    })
+    lex_htk = lex_htk[['word', 'pronunciation']]
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+
+
+def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
+    """ Merge two lexicon files into one file sorted by word.
+
+    Args:
+        lexicon_file1, lexicon_file2 (path): input lexicon files.
+        lexicon_out (path): output file, in which lexicon_file1 and 2 are
+            combined and sorted; written tab-separated without header or index.
+
+    """
+    lexicons = [load_lexicon(path) for path in (lexicon_file1, lexicon_file2)]
+    combined = pd.concat(lexicons)
+    combined = combined.sort_values(by='word', ascending=True)
+    combined.to_csv(
+        lexicon_out, sep='\t', encoding="utf-8",
+        header=False, index=False)
+
+
+def fix_single_quote(lexicon_file):
+    """ Put a backslash before every single quote in words starting with one.
+
+    Args:
+        lexicon_file (path): lexicon file, which will be overwritten.
+
+    """
+    lex = load_lexicon(lexicon_file)
+    # rows are addressed by index label and column name; the positional `iat`
+    # is only equivalent while the lexicon carries a default 0..n-1 index.
+    for i in lex[lex['word'].str.startswith('\'')].index:
+        lex.at[i, 'word'] = lex.at[i, 'word'].replace('\'', '\\\'')
+    # to_csv does not work with space separator. therefore all tabs should manually be replaced.
+    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py
index 058deaa..ba2732c 100644
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -5,7 +5,6 @@ os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import tempfile
#import configparser
#import subprocess
-#from collections import Counter
import time
import numpy as np
@@ -29,44 +28,21 @@ dataset_list = ['devel', 'test', 'train']
# procedure
extract_features = 0
-conv_lexicon = 1
-#check_lexicon = 0
-#make_mlf = 0
-#combine_files = 0
-#flat_start = 0
-#train_model = 1
-
-
-#sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
-#sys.path.append(forced_alignment_module)
-#from forced_alignment import convert_phone_set
-
+make_lexicon = 0
+make_mlf = 0
+combine_files = 0
+flat_start = 0
+train_model = 0
## ======================= load variables =======================
-#config = configparser.ConfigParser()
-#config.sections()
-#config.read(config_ini)
-
-#config_hcopy = config['Settings']['config_hcopy']
-#config_train = config['Settings']['config_train']
-#mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
-#FAME_dir = config['Settings']['FAME_dir']
-
-#lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
-#lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
-#lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
-#lex_asr = FAME_dir + '\\lexicon\\lex.asr'
-#lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
-#lex_oov = FAME_dir + '\\lexicon\\lex.oov'
-#lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
-##lex_ipa = FAME_dir + '\\lexicon\\lex.ipa'
-##lex_ipa_ = FAME_dir + '\\lexicon\\lex.ipa_'
-##lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
-#lex_htk = FAME_dir + '\\lexicon\\lex_original.htk'
-#lex_htk_ = FAME_dir + '\\lexicon\\lex.htk'
+lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
+lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_oov = os.path.join(lexicon_dir, 'lex.oov')
+lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
+lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
#hcompv_scp = output_dir + '\\scp\\combined.scp'
#combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -88,8 +64,10 @@ if not os.path.exists(tmp_dir):
## ======================= extract features =======================
if extract_features:
+ print('==== extract features ====\n')
+
for dataset in dataset_list:
- print('==== {} ===='.format(dataset))
+ print('==== dataset: {} ===='.format(dataset))
# a script file for HCopy
print(">>> making a script file for HCopy... \n")
@@ -112,48 +90,28 @@ if extract_features:
fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
-## ======================= convert lexicon from ipa to fame_htk =======================
-if conv_lexicon:
- print('==== convert lexicon from ipa 2 fame ====\n')
- # convert each lexicon from ipa description to fame_htk phoneset.
- #am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
- #am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+## ======================= make lexicon for HTK =======================
+if make_lexicon:
+ print('==== make lexicon for HTK ====\n')
+
+ # convert each lexicon from fame_asr phoneset to fame_htk phoneset.
+ print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
+ fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
+ fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)
# combine lexicon
+ print('>>> combining lexicon files into one lexicon... \n')
# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
# therefore there is no overlap between lex_asr and lex_oov.
- #am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+ fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
-
-## ======================= check if all the phones are successfully converted =======================
-if check_lexicon:
- print("==== check if all the phones are successfully converted. ====\n")
-
- # the phones used in the lexicon.
- phonelist_asr = am_func.get_phonelist(lex_asr)
- phonelist_oov = am_func.get_phonelist(lex_oov)
- phonelist_htk = am_func.get_phonelist(lex_htk)
-
- phonelist = phonelist_asr.union(phonelist_oov)
-
- # the lines which include a specific phone.
- lines = am_func.find_phone(lex_asr, 'g')
-
- # statistics over the lexicon
- lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
- pronunciation = lexicon_htk['pronunciation']
- phones_all = []
- for word in pronunciation:
- phones_all = phones_all + word.split()
- c = Counter(phones_all)
-
-
-## =======================
-## manually make changes to the pronunciation dictionary and save it as lex.htk
-## =======================
-# (1) Replace all tabs with single space;
-# (2) Put a '\' before any dictionary entry beginning with single quote
-#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+ ## =======================
+ ## manually make changes to the pronunciation dictionary and save it as lex.htk
+ ## =======================
+ # (1) Replace all tabs with single space;
+ # (2) Put a '\' before any dictionary entry beginning with single quote
+ #http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+ fame_functions.fix_single_quote(lexicon_htk)
## ======================= make label file =======================
diff --git a/acoustic_model/fame_ipa.py b/acoustic_model/fame_ipa.py
deleted file mode 100644
index 4d44f0a..0000000
--- a/acoustic_model/fame_ipa.py
+++ /dev/null
@@ -1,107 +0,0 @@
-""" definition of the phones to be used. """
-
-phoneset = [
- # vowels
- 'i̯',
- 'i̯ⁿ',
- 'y',
- 'i',
- 'i.',
- 'iⁿ',
- 'i:',
- 'i:ⁿ',
- 'ɪ',
- 'ɪⁿ',
- 'ɪ.',
- #'ɪ:', # not included in lex.ipa
- 'ɪ:ⁿ',
- 'e',
- 'e:',
- 'e:ⁿ',
- 'ə',
- 'əⁿ',
- 'ə:',
- 'ɛ',
- 'ɛ.',
- 'ɛⁿ',
- 'ɛ:',
- 'ɛ:ⁿ',
- 'a',
- 'aⁿ',
- 'a.',
- 'a:',
- 'a:ⁿ',
- 'ṷ',
- 'ṷ.',
- 'ṷⁿ',
- #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
- 'u',
- 'uⁿ',
- 'u.',
- 'u:',
- 'u:ⁿ',
- 'ü',
- 'ü.',
- 'üⁿ',
- 'ü:',
- 'ü:ⁿ',
- 'o',
- 'oⁿ',
- 'o.',
- 'o:',
- 'o:ⁿ',
- 'ö',
- 'ö.',
- 'öⁿ',
- 'ö:',
- 'ö:ⁿ',
- 'ɔ',
- 'ɔ.',
- 'ɔⁿ',
- 'ɔ:',
- 'ɔ:ⁿ',
- #'ɔ̈', # not included in lex.ipa
- 'ɔ̈.',
- 'ɔ̈:',
-
- # plosives
- 'p',
- 'b',
- 't',
- 'tⁿ',
- 'd',
- 'k',
- 'g',
- 'ɡ', # = 'g'
-
- # nasals
- 'm',
- 'n',
- 'ŋ',
-
- # fricatives
- 'f',
- 'v',
- 's',
- 's:',
- 'z',
- 'zⁿ',
- 'x',
- 'h',
-
- # tap and flip
- 'r',
- 'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
- 'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
-
- # approximant
- 'j',
- 'j.',
- 'l'
- ]
-
-
-## the list of multi character phones.
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
\ No newline at end of file
diff --git a/acoustic_model/fame_test.py b/acoustic_model/fame_test.py
index 121f4e5..d330e7f 100644
--- a/acoustic_model/fame_test.py
+++ b/acoustic_model/fame_test.py
@@ -1,7 +1,7 @@
import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
-
+from collections import Counter
import time
import numpy as np
@@ -11,12 +11,12 @@ import fame_functions
import defaultfiles as default
sys.path.append(default.toolbox_dir)
from phoneset import fame_ipa, fame_asr
-
+import convert_phoneset
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
## check if all the phones in lexicon.ipa are in fame_ipa.py.
#timer_start = time.time()
@@ -64,6 +64,7 @@ else:
# if ipa_ in phone_unknown:
# translation_key_ipa2asr[ipa_] = asr_
# phone_unknown.remove(ipa_)
+
translation_key_ipa2asr['ə:'] = 'ə'
translation_key_ipa2asr['r.'] = 'r'
translation_key_ipa2asr['r:'] = 'r'
@@ -71,23 +72,32 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
+#timer_start = time.time()
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
+#phoneset_lex.remove("")
+#phoneset_asr = list(set(translation_key_ipa2asr.values()))
+#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
+# set(phoneset_lex) - set(phoneset_asr)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## check if all the phones in lexicon.htk are in fame_asr.py.
timer_start = time.time()
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
-phoneset_lex.remove("")
-phoneset_asr = list(set(translation_key_ipa2asr.values()))
-print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
- set(phoneset_lex) - set(phoneset_asr)))
+phoneset_htk = fame_asr.phoneset_htk
+phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+phoneset_lex.remove('')
+print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+    set(phoneset_lex) - set(phoneset_htk)))  # lexicon phones minus defined phones, as the message says.
print("elapsed time: {}".format(time.time() - timer_start))
-## make the translation key between asr to htk.
-#multi_character_phones = [i for i in phoneset_asr if len(i) > 1]
-#multi_character_phones.sort(key=len, reverse=True)
+# statistics over the lexicon
+lex_htk = fame_functions.load_lexicon(lexicon_htk)
+phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+c = Counter(phones_all)
-#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
-#with open(lex_ipa_, "w", encoding="utf-8") as fout:
-# for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
-# # ignore nasalization and '.'
-# pronunciation_ = pronunciation.replace(u'ⁿ', '')
-# pronunciation_ = pronunciation_.replace('.', '')
-# pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
-# fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
\ No newline at end of file
+lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+for i in lex_htk[lex_htk['word'].str.startswith('\'')].index:
+    lex_htk.at[i, 'word'] = lex_htk.at[i, 'word'].replace('\'', '\\\'')  # label-based; `iat` is positional.
+# to_csv does not work with space separator. therefore all tabs should manually be replaced.
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py
index 01b3ab4..8408646 100644
--- a/acoustic_model/phoneset/fame_asr.py
+++ b/acoustic_model/phoneset/fame_asr.py
@@ -1,74 +1,40 @@
""" definition of the phones to be used. """
+# phones in {FAME}/lexicon/lex.asr
phoneset = [
# vowels
- 'i̯',
- 'i̯ⁿ',
- 'y',
- 'i',
- 'i.',
- 'iⁿ',
- 'i:',
- 'i:ⁿ',
- 'ɪ',
- 'ɪⁿ',
- 'ɪ.',
- #'ɪ:', # not included in lex.ipa
- 'ɪ:ⁿ',
+ 'a',
+ 'a:',
'e',
'e:',
- 'e:ⁿ',
- 'ə',
- 'əⁿ',
- 'ə:',
- 'ɛ',
- 'ɛ.',
- 'ɛⁿ',
- 'ɛ:',
- 'ɛ:ⁿ',
- 'a',
- 'aⁿ',
- 'a.',
- 'a:',
- 'a:ⁿ',
- 'ṷ',
- 'ṷ.',
- 'ṷⁿ',
- #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
- 'u',
- 'uⁿ',
- 'u.',
- 'u:',
- 'u:ⁿ',
- 'ü',
- 'ü.',
- 'üⁿ',
- 'ü:',
- 'ü:ⁿ',
+ 'i',
+ 'i:',
+ 'i̯',
'o',
- 'oⁿ',
- 'o.',
'o:',
- 'o:ⁿ',
'ö',
- 'ö.',
- 'öⁿ',
'ö:',
- 'ö:ⁿ',
+ 'u',
+ 'u:',
+ 'ü',
+ 'ü:',
+ #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
+ 'ṷ',
+ 'y',
'ɔ',
- 'ɔ.',
- 'ɔⁿ',
'ɔ:',
- 'ɔ:ⁿ',
- #'ɔ̈', # not included in lex.ipa
- 'ɔ̈.',
+ 'ɔ̈',
'ɔ̈:',
+ 'ə',
+ 'ɛ',
+ 'ɛ:',
+ 'ɪ',
+ 'ɪ:',
# plosives
'p',
'b',
't',
- 'tⁿ',
'd',
'k',
'g',
@@ -85,22 +51,77 @@ phoneset = [
's',
's:',
'z',
- 'zⁿ',
'x',
'h',
-
+
# tap and flip
'r',
- 'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
- 'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
+ 'r:',
# approximant
'j',
- 'j.',
'l'
]
+
+## reduce the number of phones.
+# the phones which seldom occur are replaced with another more popular phones.
+# replacements are based on the advice from Martijn Wieling.
+reduction_key = {
+ 'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
+ }
+# phones to be dropped. 'ú' is already commented out in phoneset above; it is listed here just to be sure.
+phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
+
+phoneset_short = sorted({
+    reduction_key.get(phone, phone) for phone in phoneset
+    if phone not in phones_to_be_removed
+    })
+
+
+## translation_key to htk format (ascii).
+# phones which gives UnicodeEncodeError when phone.encode("ascii")
+# are replaced with other characters.
+translation_key_asr2htk = {
+ 'i̯': 'i_',
+ 'ṷ': 'u_',
+
+ # on the analogy of German umlaut, 'e' is used.
+ 'ö': 'oe', 'ö:': 'oe:',
+ 'ü': 'ue', 'ü:': 'ue:',
+
+ # on the analogy of Chinese...
+ 'ŋ': 'ng',
+
+ # refer to Xsampa.
+ 'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
+ 'ɛ': 'E', 'ɛ:': 'E:',
+ 'ɪ': 'I', 'ɪ:': 'I:',
+
+ # it is @ in Xsampa, but that is not handy on HTK.
+ 'ə': 'A'
+ }
+phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
+
+## check
+#for i in phoneset_short:
+# try:
+# print("{0} --> {1}".format(i, i.encode("ascii")))
+# except UnicodeEncodeError:
+# print(">>> {}".format(i))
+
+
## the list of multi character phones.
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
+# for example, the length of 'a:' is 3, but in the codes it is treated as one letter.
+
+# original.
multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
\ No newline at end of file
+multi_character_phones.sort(key=len, reverse=True)
+
+# phoneset reduced.
+multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
+multi_character_phones_short.sort(key=len, reverse=True)
+
+# htk compatible.
+multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
+multi_character_phones_htk.sort(key=len, reverse=True)
diff --git a/acoustic_model/phoneset/fame_ipa.py b/acoustic_model/phoneset/fame_ipa.py
index 01b3ab4..4d44f0a 100644
--- a/acoustic_model/phoneset/fame_ipa.py
+++ b/acoustic_model/phoneset/fame_ipa.py
@@ -34,7 +34,7 @@ phoneset = [
'ṷ',
'ṷ.',
'ṷⁿ',
- #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
+ #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
'u',
'uⁿ',
'u.',
@@ -100,6 +100,7 @@ phoneset = [
'l'
]
+
## the list of multi character phones.
# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
multi_character_phones = [i for i in phoneset if len(i) > 1]