"""Definition of the phones to be used.

This module defines the phone inventory found in the FAME lexicon, a reduced
inventory in which rare phones are merged into more frequent ones, and the
translation tables that map phones and non-ASCII word characters to an
ASCII-only (HTK compatible) notation.
"""

# phones in {FAME}/lexicon/lex.asr
phoneset = [
    # vowels
    'a',
    'a:',
    'e',
    'e:',
    'i',
    'i:',
    'i̯',
    'o',
    'o:',
    'ö',
    'ö:',
    'u',
    'u:',
    'ü',
    'ü:',
    #'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are
    #       'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may
    #       be mistakes so I removed this phone.
    'ṷ',
    'y',
    'ɔ',
    'ɔ:',
    'ɔ̈',
    'ɔ̈:',
    'ə',
    'ɛ',
    'ɛ:',
    'ɪ',
    'ɪ:',

    # plosives
    'p',
    'b',
    't',
    'd',
    'k',
    'g',
    'ɡ',  # = 'g' (a different character that looks the same)

    # nasals
    'm',
    'n',
    'ŋ',

    # fricatives
    'f',
    'v',
    's',
    's:',
    'z',
    'x',
    'h',

    # tap and flap
    'r',
    'r:',

    # approximant
    'j',
    'l',
]


## reduce the number of phones.
# the phones which seldom occur are replaced with other, more frequent phones.
# replacements are based on the advice from Martijn Wieling.
reduction_key = {
    'y': 'i:',
    'e': 'e:',
    'ə:': 'ɛ:',
    'r:': 'r',
    'ɡ': 'g',

    # aki added because this is used in stimmen_project.
    'ɔ̈:': 'ɔ:',
}

# phones that are dropped entirely by phone_reduction().
# 'ú' is already commented out in phoneset and is listed just to be sure;
# 's:' is still listed in phoneset, so it is dropped when phoneset_short
# is built below (which also prints the notice in phone_reduction()).
phones_to_be_removed = ['ú', 's:']


def phone_reduction(phones):
    """Map a phone sequence onto the reduced phone set.

    Phones listed in ``phones_to_be_removed`` are dropped (a notice is
    printed when at least one of them occurs in the input); every remaining
    phone is replaced according to ``reduction_key`` or kept unchanged when
    it has no entry there.

    Args:
        phones (list): list of phones.

    Returns:
        list: the reduced phones, in the same order as the input.
    """
    if any(phone in phones for phone in phones_to_be_removed):
        print('input includes phone(s) which is not defined in fame_asr.')
        print('those phone(s) are removed.')
    return [
        reduction_key.get(i, i)
        for i in phones
        if i not in phones_to_be_removed
    ]


# unique phones that remain after the reduction, in sorted order.
phoneset_short = sorted(set(phone_reduction(phoneset)))


## translation_key to htk format (ascii).
# phones which gives UnicodeEncodeError when phone.encode("ascii")
# are replaced with other characters.
translation_key_asr2htk = {
    'i̯': 'i_',
    'ṷ': 'u_',

    # on the analogy of German umlaut, 'e' is used.
    'ö': 'oe',
    'ö:': 'oe:',
    'ü': 'ue',
    'ü:': 'ue:',

    # on the analogy of Chinese...
    'ŋ': 'ng',

    # refer to Xsampa.
    'ɔ': 'O',
    'ɔ:': 'O:',
    'ɔ̈': 'Oe',
    #'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen.
    'ɛ': 'E',
    'ɛ:': 'E:',
    'ɪ': 'I',
    'ɪ:': 'I:',

    # it is @ in Xsampa, but that is not handy on HTK.
    'ə': 'A',
}

# the reduced phone set written in the htk notation, in the same order as
# phoneset_short.
phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]

#not_in_ascii = [
#    '\'',
#    'â', 'ê', 'ô', 'û', 'č',
#    'à', 'í', 'é', 'è', 'ú', 'ć',
#    'ä', 'ë', 'ï', 'ö', 'ü'
#]

# replacements for the non-ascii characters which appear in words.
translation_key_word2htk = {
    #'\'': '\\\'',
    'í': 'i1', 'é': 'e1', 'ú': 'u1', 'ć': 'c1',
    'à': 'a2', 'è': 'e2',
    'â': 'a3', 'ê': 'e3', 'ô': 'o3', 'û': 'u3',
    'č': 'c4',
    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
}
#[translation_key_word2htk.get(i, i) for i in not_in_ascii]

#Stop: p, b, t, d, k, g
#Nasal: m, n, ng(ŋ)
#Fricative: s, z, f, v, h, x
#Liquid: l, r
#Vowel: a, a:, e:, i, i:, i_(i̯), o, o:, u, u:, u_(ṷ), oe(ö), oe:(ö:), ue(ü), ue:(ü:), O(ɔ), O:(ɔ:), Oe(ɔ̈), A(ə), E(ɛ), E:(ɛ:), I(ɪ), I:(ɪ:)


## the list of multi character phones.
# for example, the length of 'a:' is 2, but in the codes it is treated as
# one letter. The lists are ordered from the longest phone to the shortest;
# phones of equal length keep the order of the list they are taken from.

# original.
multi_character_phones = sorted(
    (i for i in phoneset if len(i) > 1), key=len, reverse=True)

# phoneset reduced.
multi_character_phones_short = sorted(
    (i for i in phoneset_short if len(i) > 1), key=len, reverse=True)

# htk compatible.
multi_character_phones_htk = sorted(
    (i for i in phoneset_htk if len(i) > 1), key=len, reverse=True)