lexicon is made.

yemaozi88 2019-01-29 21:52:11 +01:00
parent 8cda93de75
commit dc6b7b84b6
11 changed files with 241 additions and 424 deletions

Binary file not shown.

View File

@@ -23,7 +23,7 @@
   </PropertyGroup>
   <ItemGroup>
     <Compile Include="check_novoapi.py" />
-    <Compile Include="convert_phone_set.py">
+    <Compile Include="convert_phoneset.py">
       <SubType>Code</SubType>
     </Compile>
     <Compile Include="convert_xsampa2ipa.py">
@@ -32,8 +32,6 @@
     <Compile Include="defaultfiles.py">
       <SubType>Code</SubType>
     </Compile>
-    <Compile Include="fame_asr.py" />
-    <Compile Include="fame_ipa.py" />
     <Compile Include="fame_test.py">
       <SubType>Code</SubType>
     </Compile>
@@ -52,9 +50,20 @@
       <SubType>Code</SubType>
     </Compile>
     <Compile Include="fame_hmm.py" />
+    <Compile Include="phoneset\fame_asr.py" />
+    <Compile Include="phoneset\fame_ipa.py" />
   </ItemGroup>
   <ItemGroup>
     <Content Include="config.ini" />
+    <Content Include="phoneset\fame_ipa2asr.npy" />
+    <Content Include="phoneset\output_get_translation_key_phone_unknown.npy" />
+    <Content Include="phoneset\output_get_translation_key_translation_key.npy" />
+    <Content Include="phoneset\__pycache__\fame_asr.cpython-36.pyc" />
+    <Content Include="phoneset\__pycache__\fame_ipa.cpython-36.pyc" />
+  </ItemGroup>
+  <ItemGroup>
+    <Folder Include="phoneset\" />
+    <Folder Include="phoneset\__pycache__\" />
   </ItemGroup>
   <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
   <!-- Uncomment the CoreCompile target to enable the Build command in

View File

@@ -26,4 +26,15 @@ def split_word(word, multi_character_phones):
         (word_separated) (list): the word split into the given phoneset.
     """
-    return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]
+    return [phone
+            for phone in multi_character_tokenize(word.strip(), multi_character_phones)
+            ]
+
+
+def convert_phoneset(word_list, translation_key):
+    """ convert a list of phones from one phoneset to another.
+
+    Args:
+        word_list (list): a list of phones written in the given phoneset.
+        translation_key (dict): a mapping from source phones to target phones;
+            phones not in the key are kept as they are.
+    """
+    return [translation_key.get(phone, phone) for phone in word_list]
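Taken together, the two helpers split a written form into phones and re-express them in another phoneset. A minimal usage sketch (the inventory and mapping below are illustrative, not the FAME! sets):

from convert_phoneset import split_word, convert_phoneset

# illustrative inventory: 'a:' is a multi-character phone and must be
# matched before plain 'a'.
multi_character_phones = ['a:']
translation_key = {'a:': 'A:'}  # hypothetical source-to-target mapping

phones = split_word('ba:t', multi_character_phones)
print(phones)                                     # expected: ['b', 'a:', 't']
print(convert_phoneset(phones, translation_key))  # expected: ['b', 'A:', 't']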

View File

@@ -1,127 +0,0 @@
-""" definition of the phones to be used. """
-
-# phonese in {FAME}/lexicon/lex.asr
-phoneset = [
-    # vowels
-    'a',
-    'a:',
-    'e',
-    'e:',
-    'i',
-    'i:',
-    'i̯',
-    'o',
-    'o:',
-    'ö',
-    'ö:',
-    'u',
-    'u:',
-    'ü',
-    'ü:',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
-    'ṷ',
-    'y',
-    'ɔ',
-    'ɔ:',
-    'ɔ̈',
-    'ɔ̈:',
-    'ə',
-    'ɛ',
-    'ɛ:',
-    'ɪ',
-    'ɪ:',
-
-    # plosives
-    'p',
-    'b',
-    't',
-    'd',
-    'k',
-    'g',
-    'ɡ', # = 'g'
-
-    # nasals
-    'm',
-    'n',
-    'ŋ',
-
-    # fricatives
-    'f',
-    'v',
-    's',
-    's:',
-    'z',
-    'x',
-    'h',
-
-    # tap and flip
-    'r',
-    'r:',
-
-    # approximant
-    'j',
-    'l'
-]
-
-
-## reduce the number of phones.
-# the phones which seldom occur are replaced with another more popular phones.
-# replacements are based on the advice from Martijn Wieling.
-reduction_key = {
-    'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
-}
-# already removed beforehand in phoneset. Just to be sure.
-phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
-
-phoneset_short = [reduction_key.get(i, i) for i in phoneset
-                  if not i in phones_to_be_removed]
-phoneset_short = list(set(phoneset_short))
-phoneset_short.sort()
-
-
-## translation_key to htk format (ascii).
-# phones which gives UnicodeEncodeError when phone.encode("ascii")
-# are replaced with other characters.
-translation_key_asr2htk = {
-    'i̯': 'i_',
-    'ṷ': 'u_',
-
-    # on the analogy of German umlaut, 'e' is used.
-    'ö': 'oe', 'ö:': 'oe:',
-    'ü': 'ue', 'ü:': 'ue:',
-
-    # on the analogy of Chinese...
-    'ŋ': 'ng',
-
-    # refer to Xsampa.
-    'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
-    'ɛ': 'E', 'ɛ:': 'E:',
-    'ɪ': 'I', 'ɪ:': 'I:',
-
-    # it is @ in Xsampa, but that is not handy on HTK.
-    'ə': 'A'
-}
-phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
-
-## check
-#for i in phoneset_short:
-#    try:
-#        print("{0} --> {1}".format(i, i.encode("ascii")))
-#    except UnicodeEncodeError:
-#        print(">>> {}".format(i))
-
-
-## the list of multi character phones.
-# for example, the length of 'a:' is 3, but in the codes it is treated as one letter.
-# original.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
-
-# phonset reduced.
-multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
-multi_character_phones_short.sort(key=len, reverse=True)
-
-# htk compatible.
-multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
-multi_character_phones_htk.sort(key=len, reverse=True)
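The multi_character_phones lists are sorted longest-first because the tokenizer has to try 'a:' before plain 'a' when it scans a word greedily. The multi_character_tokenize used by split_word is not shown in this commit; a plausible sketch of such a greedy matcher, assuming a longest-first phone list, would be:

def multi_character_tokenize(word, multi_character_phones):
    """Greedy left-to-right tokenizer: at each position, emit the longest
    matching multi-character phone, otherwise a single character.
    NOTE: hypothetical reimplementation; the repo's own version may differ."""
    tokens = []
    i = 0
    while i < len(word):
        for phone in multi_character_phones:  # sorted longest-first
            if word.startswith(phone, i):
                tokens.append(phone)
                i += len(phone)
                break
        else:
            tokens.append(word[i])
            i += 1
    return tokens

# e.g. multi_character_tokenize('ba:t', ['a:']) -> ['b', 'a:', 't']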

View File

@@ -1,4 +1,5 @@
 import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import sys
 from collections import Counter
@@ -8,38 +9,8 @@ import numpy as np
 import pandas as pd

 import defaultfiles as default
-from phoneset import fame_ipa
-import convert_phone_set
+import convert_phoneset
+from phoneset import fame_ipa, fame_asr

-#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
-#    """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """
-#    lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
-#    with open(lexicon_file_out, "w", encoding="utf-8") as fout:
-#        for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
-#            pronunciation_no_space = pronunciation.replace(' ', '')
-#            pronunciation_famehtk = convert_phone_set.ipa2famehtk(pronunciation_no_space)
-#            if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
-#                fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
-
-
-#def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
-#    """ Combine two lexicon files and sort by words. """
-#    with open(lexicon_file1, "rt", encoding="utf-8") as fin:
-#        lines1 = fin.read()
-#        lines1 = lines1.split('\n')
-#    with open(lexicon_file2, "rt", encoding="utf-8") as fin:
-#        lines2 = fin.read()
-#        lines2 = lines2.split('\n')
-
-#    lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
-#    lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
-#    lex = pd.concat([lex1, lex2])
-#    lex = lex.sort_values(by='word', ascending=True)
-#    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')


 #def read_fileFA(fileFA):
 #    """
@@ -291,4 +262,74 @@ def find_phone(lexicon_file, phone, phoneset_name='ipa'):
             if phone in pronunciation:
                 extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
                 extracted = extracted.append(extracted_, ignore_index=True)
     return extracted
+
+
+def asr2htk_space_delimited(pronunciation):
+    """ convert a pronunciation from the asr phoneset to the htk phoneset.
+
+    Args:
+        pronunciation (str): space-delimited phones in the asr phoneset.
+
+    Returns:
+        (str): space-delimited phones in the htk phoneset (ascii).
+
+    """
+    pronunciation_short = [fame_asr.reduction_key.get(i, i) for i in pronunciation.split(' ')
+                           if i not in fame_asr.phones_to_be_removed]
+    return ' '.join(convert_phoneset.convert_phoneset(
+        pronunciation_short, fame_asr.translation_key_asr2htk))
+
+
+def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
+    """ Convert a lexicon file from the asr phoneset to the htk phoneset (ascii).
+
+    Args:
+        lexicon_file_asr (path): a lexicon file written in the asr phoneset, e.g. fame/lexicon/lex.asr.
+        lexicon_file_htk (path): the output lexicon file, written in the htk phoneset (ascii).
+
+    """
+    lex_asr = load_lexicon(lexicon_file_asr)
+
+    def asr2htk_space_delimited_(row):
+        return asr2htk_space_delimited(row['pronunciation'])
+
+    lex_htk = pd.DataFrame({
+        'word': lex_asr['word'],
+        'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
+    })
+    lex_htk = lex_htk.loc[:, ['word', 'pronunciation']]  # ensure the column order: word, pronunciation
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+    return
+
+
+def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
+    """ Combine two lexicon files and sort by words.
+
+    Args:
+        lexicon_file1, lexicon_file2 (path): input lexicon files.
+        lexicon_out (path): the output file, in which lexicon_file1 and lexicon_file2 are combined and sorted by word.
+
+    """
+    lex1 = load_lexicon(lexicon_file1)
+    lex2 = load_lexicon(lexicon_file2)
+    lex = pd.concat([lex1, lex2])
+    lex = lex.sort_values(by='word', ascending=True)
+    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+
+def fix_single_quote(lexicon_file):
+    """ add '\' before each single quote at the beginning of a word.
+
+    Args:
+        lexicon_file (path): lexicon file, which will be overwritten.
+
+    """
+    lex = load_lexicon(lexicon_file)
+    for i in lex[lex['word'].str.startswith('\'')].index.values:
+        lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
+    # to_csv does not work with a space separator, therefore all tabs should be replaced manually afterwards.
+    #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+    return
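The new conversion chain is easiest to see on a single pronunciation: reduction_key first folds rare phones into frequent ones, then translation_key_asr2htk maps anything non-ascii to an HTK-safe spelling. A minimal check (the input string is made up, but uses phones from the asr set):

import fame_functions

# 'ö:' becomes ascii 'oe:' via translation_key_asr2htk, and 'r:' is first
# reduced to 'r' via reduction_key.
print(fame_functions.asr2htk_space_delimited('ö: r: t'))  # expected: 'oe: r t'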

View File

@@ -5,7 +5,6 @@ os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import tempfile
 #import configparser
 #import subprocess
-#from collections import Counter
 import time

 import numpy as np
@@ -29,44 +28,21 @@ dataset_list = ['devel', 'test', 'train']

 # procedure
 extract_features = 0
-conv_lexicon = 1
-#check_lexicon = 0
-#make_mlf = 0
-#combine_files = 0
-#flat_start = 0
-#train_model = 1
-
-#sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
-#sys.path.append(forced_alignment_module)
-#from forced_alignment import convert_phone_set
+make_lexicon = 0
+make_mlf = 0
+combine_files = 0
+flat_start = 0
+train_model = 0


 ## ======================= load variables =======================
-#config = configparser.ConfigParser()
-#config.sections()
-#config.read(config_ini)
-
-#config_hcopy = config['Settings']['config_hcopy']
-#config_train = config['Settings']['config_train']
-#mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
-#FAME_dir = config['Settings']['FAME_dir']
-
-#lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
-#lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
-#lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
-
-#lex_asr = FAME_dir + '\\lexicon\\lex.asr'
-#lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
-#lex_oov = FAME_dir + '\\lexicon\\lex.oov'
-#lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
-##lex_ipa = FAME_dir + '\\lexicon\\lex.ipa'
-##lex_ipa_ = FAME_dir + '\\lexicon\\lex.ipa_'
-##lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
-#lex_htk = FAME_dir + '\\lexicon\\lex_original.htk'
-#lex_htk_ = FAME_dir + '\\lexicon\\lex.htk'
+lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
+lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_oov = os.path.join(lexicon_dir, 'lex.oov')
+lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
+lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -88,8 +64,10 @@ if not os.path.exists(tmp_dir):

 ## ======================= extract features =======================
 if extract_features:
+    print('==== extract features ====\n')
+
     for dataset in dataset_list:
-        print('==== {} ===='.format(dataset))
+        print('==== dataset: {} ===='.format(dataset))

         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
@@ -112,48 +90,28 @@ if extract_features:
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')


-## ======================= convert lexicon from ipa to fame_htk =======================
-if conv_lexicon:
-    print('==== convert lexicon from ipa 2 fame ====\n')
-
-    # convert each lexicon from ipa description to fame_htk phoneset.
-    #am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
-    #am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
+## ======================= make lexicon for HTK =======================
+if make_lexicon:
+    print('==== make lexicon for HTK ====\n')
+
+    # convert each lexicon from fame_asr phoneset to fame_htk phoneset.
+    print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
+    fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
+    fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

     # combine lexicon
+    print('>>> combining lexicon files into one lexicon... \n')
     # pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
     # therefore there is no overlap between lex_asr and lex_oov.
-    #am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
+    fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

-
-## ======================= check if all the phones are successfully converted =======================
-if check_lexicon:
-    print("==== check if all the phones are successfully converted. ====\n")
-
-    # the phones used in the lexicon.
-    phonelist_asr = am_func.get_phonelist(lex_asr)
-    phonelist_oov = am_func.get_phonelist(lex_oov)
-    phonelist_htk = am_func.get_phonelist(lex_htk)
-    phonelist = phonelist_asr.union(phonelist_oov)
-
-    # the lines which include a specific phone.
-    lines = am_func.find_phone(lex_asr, 'g')
-
-    # statistics over the lexicon
-    lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-    pronunciation = lexicon_htk['pronunciation']
-    phones_all = []
-    for word in pronunciation:
-        phones_all = phones_all + word.split()
-    c = Counter(phones_all)
-
-## =======================
-## manually make changes to the pronunciation dictionary and save it as lex.htk
-## =======================
-# (1) Replace all tabs with single space;
-# (2) Put a '\' before any dictionary entry beginning with single quote
-#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+    ## =======================
+    ## manually make changes to the pronunciation dictionary and save it as lex.htk
+    ## =======================
+    # (1) Replace all tabs with single space;
+    # (2) Put a '\' before any dictionary entry beginning with single quote
+    #http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
+    fame_functions.fix_single_quote(lexicon_htk)


 ## ======================= make label file =======================
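For reference, the "script file for HCopy" mentioned in the feature-extraction step is HTK's scp list. Its construction lives in the repo's file-handling helper (fh), which this diff does not show; a hypothetical equivalent that writes one 'source.wav target.mfc' pair per line, the format HCopy -S expects:

import os

def make_hcopy_scp(wav_dir, feature_dir, scp_file):
    # one 'source target' pair per line; paths and helper name are assumptions
    with open(scp_file, 'w') as f:
        for wav in sorted(os.listdir(wav_dir)):
            if wav.endswith('.wav'):
                mfc = os.path.splitext(wav)[0] + '.mfc'
                f.write('{0} {1}\n'.format(os.path.join(wav_dir, wav),
                                           os.path.join(feature_dir, mfc)))

# HCopy would then be run along the lines of: HCopy -C config_hcopy -S hcompv.scp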

View File

@@ -1,107 +0,0 @@
-""" definition of the phones to be used. """
-
-phoneset = [
-    # vowels
-    'i̯',
-    'i̯ⁿ',
-    'y',
-    'i',
-    'i.',
-    'iⁿ',
-    'i:',
-    'i:ⁿ',
-    'ɪ',
-    'ɪⁿ',
-    'ɪ.',
-    #'ɪ:', # not included in lex.ipa
-    'ɪ:ⁿ',
-    'e',
-    'e:',
-    'e:ⁿ',
-    'ə',
-    'əⁿ',
-    'ə:',
-    'ɛ',
-    'ɛ.',
-    'ɛⁿ',
-    'ɛ:',
-    'ɛ:ⁿ',
-    'a',
-    'aⁿ',
-    'a.',
-    'a:',
-    'a:ⁿ',
-    'ṷ',
-    'ṷ.',
-    'ṷⁿ',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
-    'u',
-    'uⁿ',
-    'u.',
-    'u:',
-    'u:ⁿ',
-    'ü',
-    'ü.',
-    'üⁿ',
-    'ü:',
-    'ü:ⁿ',
-    'o',
-    'oⁿ',
-    'o.',
-    'o:',
-    'o:ⁿ',
-    'ö',
-    'ö.',
-    'öⁿ',
-    'ö:',
-    'ö:ⁿ',
-    'ɔ',
-    'ɔ.',
-    'ɔⁿ',
-    'ɔ:',
-    'ɔ:ⁿ',
-    #'ɔ̈', # not included in lex.ipa
-    'ɔ̈.',
-    'ɔ̈:',
-
-    # plosives
-    'p',
-    'b',
-    't',
-    'tⁿ',
-    'd',
-    'k',
-    'g',
-    'ɡ', # = 'g'
-
-    # nasals
-    'm',
-    'n',
-    'ŋ',
-
-    # fricatives
-    'f',
-    'v',
-    's',
-    's:',
-    'z',
-    'zⁿ',
-    'x',
-    'h',
-
-    # tap and flip
-    'r',
-    'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
-    'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
-
-    # approximant
-    'j',
-    'j.',
-    'l'
-]
-
-
-## the list of multi character phones.
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
-multi_character_phones = [i for i in phoneset if len(i) > 1]
-multi_character_phones.sort(key=len, reverse=True)
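The IPA set keeps nasalized (ⁿ) and syllable-boundary (.) variants as separate phones; the commented-out conversion code in fame_test.py later in this commit strips both before mapping to the ASR set. A small sketch of that normalization, grounded in those commented replace() calls:

def normalize_ipa(pronunciation):
    # drop nasalization marks and syllable boundaries, so that e.g. 'aⁿ'
    # and 'a.' fall together with plain 'a' before the ipa-to-asr mapping.
    return pronunciation.replace(u'ⁿ', '').replace('.', '')

print(normalize_ipa(u'o:ⁿ p a.'))  # expected: 'o: p a'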

View File

@@ -1,7 +1,7 @@
 import sys
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+from collections import Counter
 import time

 import numpy as np
@@ -11,12 +11,12 @@ import fame_functions
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 from phoneset import fame_ipa, fame_asr
+import convert_phoneset

 lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
 lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
 lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
+lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

 ## check if all the phones in lexicon.ipa are in fame_ipa.py.
 #timer_start = time.time()
@@ -64,6 +64,7 @@ else:
 #        if ipa_ in phone_unknown:
 #            translation_key_ipa2asr[ipa_] = asr_
 #            phone_unknown.remove(ipa_)
+
 translation_key_ipa2asr['ə:'] = 'ə'
 translation_key_ipa2asr['r.'] = 'r'
 translation_key_ipa2asr['r:'] = 'r'
@@ -71,23 +72,32 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)

 ## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
+#timer_start = time.time()
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
+#phoneset_lex.remove("")
+#phoneset_asr = list(set(translation_key_ipa2asr.values()))
+#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
+#    set(phoneset_lex) - set(phoneset_asr)))
+#print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## check if all the phones in lexicon.htk are in fame_asr.py.
 timer_start = time.time()
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
-phoneset_lex.remove("")
-phoneset_asr = list(set(translation_key_ipa2asr.values()))
-print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
-    set(phoneset_lex) - set(phoneset_asr)))
+phoneset_htk = fame_asr.phoneset_htk
+phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+phoneset_lex.remove('')
+print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+    set(phoneset_htk) - set(phoneset_lex)))
 print("elapsed time: {}".format(time.time() - timer_start))

-## make the translation key between asr to htk.
-#multi_character_phones = [i for i in phoneset_asr if len(i) > 1]
-#multi_character_phones.sort(key=len, reverse=True)
-
-#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
-#with open(lex_ipa_, "w", encoding="utf-8") as fout:
-#    for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
-#        # ignore nasalization and '.'
-#        pronunciation_ = pronunciation.replace(u'ⁿ', '')
-#        pronunciation_ = pronunciation_.replace('.', '')
-#        pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
-#        fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
+# statistics over the lexicon
+lex_htk = fame_functions.load_lexicon(lexicon_htk)
+phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+c = Counter(phones_all)
+
+lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+# to_csv does not work with a space separator, therefore all tabs should be replaced manually afterwards.
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
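Incidentally, fame_ipa2asr.npy above holds a plain dict, so np.save pickles it; reading it back requires unwrapping the 0-d object array, and on NumPy >= 1.16.3 also an explicit allow_pickle=True. A short sketch:

import numpy as np

np.save('fame_ipa2asr.npy', {'ə:': 'ə', 'r.': 'r'})  # np.save pickles non-array objects

# np.load returns a 0-d object array; .item() unwraps the dict.
translation_key = np.load('fame_ipa2asr.npy', allow_pickle=True).item()
print(translation_key['r.'])  # 'r'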

View File

@@ -1,74 +1,40 @@
 """ definition of the phones to be used. """
+
+# phones in {FAME}/lexicon/lex.asr
 phoneset = [
     # vowels
-    'i̯',
-    'i̯ⁿ',
-    'y',
-    'i',
-    'i.',
-    'iⁿ',
-    'i:',
-    'i:ⁿ',
-    'ɪ',
-    'ɪⁿ',
-    'ɪ.',
-    #'ɪ:', # not included in lex.ipa
-    'ɪ:ⁿ',
+    'a',
+    'a:',
     'e',
     'e:',
-    'e:ⁿ',
-    'ə',
-    'əⁿ',
-    'ə:',
-    'ɛ',
-    'ɛ.',
-    'ɛⁿ',
-    'ɛ:',
-    'ɛ:ⁿ',
-    'a',
-    'aⁿ',
-    'a.',
-    'a:',
-    'a:ⁿ',
-    'ṷ',
-    'ṷ.',
-    'ṷⁿ',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
-    'u',
-    'uⁿ',
-    'u.',
-    'u:',
-    'u:ⁿ',
-    'ü',
-    'ü.',
-    'üⁿ',
-    'ü:',
-    'ü:ⁿ',
+    'i',
+    'i:',
+    'i̯',
     'o',
-    'oⁿ',
-    'o.',
     'o:',
-    'o:ⁿ',
     'ö',
-    'ö.',
-    'öⁿ',
     'ö:',
-    'ö:ⁿ',
+    'u',
+    'u:',
+    'ü',
+    'ü:',
+    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
+    'ṷ',
+    'y',
     'ɔ',
-    'ɔ.',
-    'ɔⁿ',
     'ɔ:',
-    'ɔ:ⁿ',
-    #'ɔ̈', # not included in lex.ipa
-    'ɔ̈.',
+    'ɔ̈',
     'ɔ̈:',
+    'ə',
+    'ɛ',
+    'ɛ:',
+    'ɪ',
+    'ɪ:',

     # plosives
     'p',
     'b',
     't',
-    'tⁿ',
     'd',
     'k',
     'g',
@@ -85,22 +51,77 @@ phoneset = [
     's',
     's:',
     'z',
-    'zⁿ',
     'x',
     'h',

     # tap and flip
     'r',
-    'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
-    'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
+    'r:',

     # approximant
     'j',
-    'j.',
     'l'
 ]
+
+
+## reduce the number of phones.
+# the phones which seldom occur are replaced with other, more frequent phones.
+# replacements are based on the advice from Martijn Wieling.
+reduction_key = {
+    'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g'
+}
+# already removed beforehand in phoneset. Just to be sure.
+phones_to_be_removed = ['ú', 's:', 'ɔ̈:']
+
+phoneset_short = [reduction_key.get(i, i) for i in phoneset
+                  if i not in phones_to_be_removed]
+phoneset_short = list(set(phoneset_short))
+phoneset_short.sort()
+
+
+## translation_key to htk format (ascii).
+# phones which give UnicodeEncodeError on phone.encode("ascii")
+# are replaced with other characters.
+translation_key_asr2htk = {
+    'i̯': 'i_',
+    'ṷ': 'u_',
+
+    # on the analogy of German umlaut, 'e' is used.
+    'ö': 'oe', 'ö:': 'oe:',
+    'ü': 'ue', 'ü:': 'ue:',
+
+    # on the analogy of Chinese...
+    'ŋ': 'ng',
+
+    # refer to Xsampa.
+    'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
+    'ɛ': 'E', 'ɛ:': 'E:',
+    'ɪ': 'I', 'ɪ:': 'I:',
+
+    # it is @ in Xsampa, but that is not handy on HTK.
+    'ə': 'A'
+}
+phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
+
+## check
+#for i in phoneset_short:
+#    try:
+#        print("{0} --> {1}".format(i, i.encode("ascii")))
+#    except UnicodeEncodeError:
+#        print(">>> {}".format(i))
+

 ## the list of multi character phones.
-# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
+# for example, the length of 'a:' is 2, but in the code it is treated as one letter.
+# original.
 multi_character_phones = [i for i in phoneset if len(i) > 1]
 multi_character_phones.sort(key=len, reverse=True)
+
+# phoneset reduced.
+multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
+multi_character_phones_short.sort(key=len, reverse=True)
+
+# htk compatible.
+multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
+multi_character_phones_htk.sort(key=len, reverse=True)
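Since the whole point of translation_key_asr2htk is that HTK only accepts ascii, the commented-out check above can be tightened into a loop over the final list. A small sketch (importing the module from its new phoneset package location):

from phoneset import fame_asr

# every phone in the htk phoneset must survive an ascii round-trip;
# encode() raises UnicodeEncodeError if the translation key misses one.
for phone in fame_asr.phoneset_htk:
    phone.encode("ascii")

print('phoneset_htk is ascii-safe: {} phones'.format(len(fame_asr.phoneset_htk)))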

View File

@@ -34,7 +34,7 @@ phoneset = [
     'ṷ',
     'ṷ.',
     'ṷⁿ',
-    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
+    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
     'u',
     'uⁿ',
     'u.',
@@ -100,6 +100,7 @@ phoneset = [
     'l'
 ]
+

 ## the list of multi character phones.
 # for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
 multi_character_phones = [i for i in phoneset if len(i) > 1]
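The "treated as one letter" remark is about code points versus phones: 'i̯ⁿ' is one phone but three Unicode code points, which is exactly why these multi-character lists exist and why they are sorted longest-first. A quick demonstration:

phone = 'i̯ⁿ'                          # 'i' + combining inverted breve + superscript n
print(len(phone))                     # 3
print([hex(ord(c)) for c in phone])   # ['0x69', '0x32f', '0x207f']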