128 lines
2.5 KiB
Python
128 lines
2.5 KiB
Python
|
""" definition of the phones to be used. """
|
|||
|
|
|||
|
# phones in {FAME}/lexicon/lex.asr
|
|||
|
# Full inventory of phone symbols, grouped by manner of articulation.
# Long phones are written with a trailing ':'.
phoneset = [
    # vowels
    'a',
    'a:',
    'e',
    'e:',
    'i',
    'i:',
    'i̯',
    'o',
    'o:',
    'ö',
    'ö:',
    'u',
    'u:',
    'ü',
    'ü:',
    # 'ú' is deliberately left out: it only appears in the words
    # 'feeste' (út) and 'gaste' (út), which are 'f e: s t ə' and 'yn' in
    # lex_asr. The pronunciation in Fries may be a mistake, so this phone
    # was removed.
    'ṷ',
    'y',
    'ɔ',
    'ɔ:',
    'ɔ̈',
    'ɔ̈:',
    'ə',
    'ɛ',
    'ɛ:',
    'ɪ',
    'ɪ:',

    # plosives
    'p',
    'b',
    't',
    'd',
    'k',
    'g',
    'ɡ',  # = 'g' (same phone, written with the IPA script g)

    # nasals
    'm',
    'n',
    'ŋ',

    # fricatives
    'f',
    'v',
    's',
    's:',
    'z',
    'x',
    'h',

    # tap and flap
    'r',
    'r:',

    # approximant
    'j',
    'l',
]
|
|||
|
|
|||
|
|
|||
|
## reduce the number of phones.
# Phones which seldom occur are replaced with other, more frequent phones.
# The replacements are based on the advice from Martijn Wieling.
# NOTE(review): 'ə:' is not listed in phoneset, so that entry never fires
# when phoneset_short is built below; presumably it is kept for other users
# of reduction_key -- confirm.
reduction_key = {
    'y': 'i:', 'e': 'e:', 'ə:': 'ɛ:', 'r:': 'r', 'ɡ': 'g',
}

# Phones dropped entirely from the reduced set. 'ú' is already commented out
# of phoneset and is listed here just to be sure; 's:' and 'ɔ̈:' are still in
# phoneset and are removed by the filter below.
phones_to_be_removed = ['ú', 's:', 'ɔ̈:']

# Drop the removed phones, map the rest through reduction_key, then
# de-duplicate (several phones can reduce to the same target) and sort.
phoneset_short = sorted({
    reduction_key.get(i, i)
    for i in phoneset
    if i not in phones_to_be_removed
})
|
|||
|
|
|||
|
|
|||
|
## translation_key to htk format (ascii).
# Phones which raise UnicodeEncodeError on phone.encode("ascii")
# are replaced with other (ASCII-only) characters.
translation_key_asr2htk = {
    'i̯': 'i_',
    'ṷ': 'u_',

    # by analogy with the German umlaut, 'e' is used.
    'ö': 'oe', 'ö:': 'oe:',
    'ü': 'ue', 'ü:': 'ue:',

    # by analogy with Chinese...
    'ŋ': 'ng',

    # refer to Xsampa.
    'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
    'ɛ': 'E', 'ɛ:': 'E:',
    'ɪ': 'I', 'ɪ:': 'I:',

    # it is @ in Xsampa, but that is not handy on HTK.
    'ə': 'A',
}

# Reduced phone set in HTK notation; phones without an entry in the
# translation key are kept unchanged. Order follows phoneset_short.
phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
|
|||
|
|
|||
|
## check
|
|||
|
#for i in phoneset_short:
|
|||
|
# try:
|
|||
|
# print("{0} --> {1}".format(i, i.encode("ascii")))
|
|||
|
# except UnicodeEncodeError:
|
|||
|
# print(">>> {}".format(i))
|
|||
|
|
|||
|
|
|||
|
## the list of multi character phones.
# for example, 'a:' consists of two characters (len('a:') == 2), but in the
# code it is treated as one phone.
# len() counts Unicode code points, so a phone written as a base letter plus
# a combining mark also counts as multi character.


def _multi_character(phones):
    """Return the phones longer than one character, longest first.

    Phones of equal length keep their relative order from *phones*
    (the sort is stable, also with ``reverse=True``).
    """
    return sorted((p for p in phones if len(p) > 1), key=len, reverse=True)


# original.
multi_character_phones = _multi_character(phoneset)

# phoneset reduced.
multi_character_phones_short = _multi_character(phoneset_short)

# htk compatible.
multi_character_phones_htk = _multi_character(phoneset_htk)
|