"""Definition of the phones to be used.

Module-level names:
    phoneset: the phones in {FAME}/lexicon/lex.asr.
    reduction_key: maps seldom-occurring phones to a more common phone.
    phones_to_be_removed: phones dropped when building the reduced set.
    phoneset_short: the reduced phone set, de-duplicated and sorted.
    translation_key_asr2htk: maps non-ASCII phones to HTK (ASCII) symbols.
    phoneset_htk: phoneset_short translated with translation_key_asr2htk
        (same order as phoneset_short).
    multi_character_phones, multi_character_phones_short,
    multi_character_phones_htk: the phones of each set that are longer
        than one character, longest first.
"""


def _multi_character(phones):
    """Return the phones longer than one character, longest first.

    The sort is stable, so phones of equal length keep the relative order
    they have in *phones*. Length is counted in code points, so a phone
    written with a combining mark counts as multi-character.
    """
    return sorted((p for p in phones if len(p) > 1), key=len, reverse=True)


# phones in {FAME}/lexicon/lex.asr
phoneset = [
    # vowels
    'a',
    'a:',
    'e',
    'e:',
    'i',
    'i:',
    'i̯',
    'o',
    'o:',
    'ö',
    'ö:',
    'u',
    'u:',
    'ü',
    'ü:',
    # 'ú' is left out: it only appears in the words 'feeste'(út) and
    # 'gaste'(út), which are 'f e: s t ə' and 'yn' in lex_asr. The
    # pronunciation in Fries may be a mistake, so this phone was removed.
    #'ú',
    'ṷ',
    'y',
    'ɔ',
    'ɔ:',
    'ɔ̈',
    'ɔ̈:',
    'ə',
    'ɛ',
    'ɛ:',
    'ɪ',
    'ɪ:',

    # plosives
    'p',
    'b',
    't',
    'd',
    'k',
    'g',
    'ɡ',  # = 'g'

    # nasals
    'm',
    'n',
    'ŋ',

    # fricatives
    'f',
    'v',
    's',
    's:',
    'z',
    'x',
    'h',

    # tap or flap
    'r',
    'r:',

    # approximant
    'j',
    'l',
]


## reduce the number of phones.
# phones which seldom occur are replaced with another, more common phone.
# replacements are based on the advice from Martijn Wieling.
# NOTE: 'ə:' does not occur in phoneset above, so that entry has no effect
# on phoneset_short.
reduction_key = {
    'y': 'i:', 'e': 'e:', 'ə:': 'ɛ:', 'r:': 'r', 'ɡ': 'g',
}

# 'ú' is already commented out of phoneset and is listed just to be sure;
# 's:' and 'ɔ̈:' are still in phoneset and are dropped here.
phones_to_be_removed = ['ú', 's:', 'ɔ̈:']

# drop the unwanted phones, map the rest through reduction_key, then
# de-duplicate (several phones map onto one that is already present).
phoneset_short = sorted({
    reduction_key.get(phone, phone)
    for phone in phoneset
    if phone not in phones_to_be_removed
})


## translation_key to htk format (ascii).
# phones which give UnicodeEncodeError on phone.encode("ascii")
# are replaced with other characters.
translation_key_asr2htk = {
    'i̯': 'i_',
    'ṷ': 'u_',

    # on the analogy of German umlaut, 'e' is used.
    'ö': 'oe', 'ö:': 'oe:',
    'ü': 'ue', 'ü:': 'ue:',

    # on the analogy of Chinese...
    'ŋ': 'ng',

    # refer to Xsampa.
    'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
    'ɛ': 'E', 'ɛ:': 'E:',
    'ɪ': 'I', 'ɪ:': 'I:',

    # it is @ in Xsampa, but that is not handy on HTK.
    'ə': 'A',
}
# phones without an entry in the key are passed through unchanged.
phoneset_htk = [translation_key_asr2htk.get(phone, phone)
                for phone in phoneset_short]

## check
# for phone in phoneset_short:
#     try:
#         print("{0} --> {1}".format(phone, phone.encode("ascii")))
#     except UnicodeEncodeError:
#         print(">>> {}".format(phone))


## the list of multi character phones.
# for example, the length of 'a:' is 2, but in the codes it is treated as
# one letter.

# original.
multi_character_phones = _multi_character(phoneset)

# phoneset reduced.
multi_character_phones_short = _multi_character(phoneset_short)

# htk compatible.
multi_character_phones_htk = _multi_character(phoneset_htk)