2019-01-27 23:52:33 +01:00
""" definition of the phones to be used. """
2019-01-29 21:52:11 +01:00
# phones in {FAME}/lexicon/lex.asr
2019-01-28 12:34:20 +01:00
# full phone inventory, grouped by manner of articulation.
phoneset = [
2019-01-27 01:34:04 +01:00
# vowels
' a ' ,
' a: ' ,
2019-01-29 21:52:11 +01:00
' e ' ,
' e: ' ,
' i ' ,
' i: ' ,
' i̯ ' ,
2019-01-27 01:34:04 +01:00
' o ' ,
' o: ' ,
' ö ' ,
' ö: ' ,
2019-01-29 21:52:11 +01:00
' u ' ,
' u: ' ,
' ü ' ,
' ü: ' ,
#'ú', # only appears in the words 'feeste'(út) and 'gaste'(út), which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be a mistake, so this phone was removed.
' ṷ ' ,
' y ' ,
2019-01-27 01:34:04 +01:00
' ɔ ' ,
' ɔ: ' ,
2019-01-29 21:52:11 +01:00
' ɔ̈ ' ,
2019-01-27 01:34:04 +01:00
' ɔ̈: ' ,
2019-01-29 21:52:11 +01:00
' ə ' ,
' ɛ ' ,
' ɛ: ' ,
' ɪ ' ,
' ɪ :' ,
2019-01-27 01:34:04 +01:00
# plosives
' p ' ,
' b ' ,
2019-01-27 23:52:33 +01:00
' t ' ,
2019-01-27 01:34:04 +01:00
' d ' ,
' k ' ,
' g ' ,
2019-01-27 23:52:33 +01:00
' ɡ ' , # IPA script g, a different character from the ascii 'g' above; reduction_key maps it to 'g'.
2019-01-27 01:34:04 +01:00
# nasals
' m ' ,
' n ' ,
' ŋ ' ,
# fricatives
' f ' ,
' v ' ,
' s ' ,
' s: ' ,
' z ' ,
' x ' ,
' h ' ,
2019-01-29 21:52:11 +01:00
2019-01-27 23:52:33 +01:00
# tap and flap
' r ' ,
2019-01-29 21:52:11 +01:00
' r: ' ,
2019-01-27 23:52:33 +01:00
# approximants
' j ' ,
' l '
2019-01-27 01:34:04 +01:00
]
2019-01-29 21:52:11 +01:00
## reduce the number of phones.
# phones which seldom occur are replaced with other, more common phones.
# replacements are based on the advice from Martijn Wieling.
# maps a phone (key) to the phone that replaces it (value).
# phone_reduction looks phones up here and keeps any phone that has no entry.
reduction_key = {
2019-02-14 00:21:28 +01:00
' y ' : ' i: ' , ' e ' : ' e: ' , ' ə: ' : ' ɛ: ' , ' r: ' : ' r ' , ' ɡ ' : ' g ' ,
# added by aki because this phone is used in stimmen_project.
' ɔ̈: ' : ' ɔ: '
2019-01-29 21:52:11 +01:00
}
# phones which phone_reduction drops entirely.
# 'ú' is already commented out in phoneset and is listed here just to be sure;
# 's:' is still listed in phoneset, so this list is what removes it.
2019-02-14 00:21:28 +01:00
phones_to_be_removed = [ ' ú ' , ' s: ' ]
2019-01-29 21:52:11 +01:00
2019-02-06 00:00:14 +01:00
def phone_reduction(phones):
    """Reduce a list of phones to the smaller phone set.

    Phones listed in phones_to_be_removed are dropped. Every other phone
    is replaced by its entry in reduction_key, or kept unchanged when it
    has no entry. The order of the remaining phones is preserved.

    Args:
        phones (list): list of phones.

    Returns:
        list: the reduced phones.
    """
    reduced = []
    for phone in phones:
        if phone in phones_to_be_removed:
            continue
        reduced.append(reduction_key.get(phone, phone))
    return reduced
2019-02-14 00:21:28 +01:00
2019-02-06 00:00:14 +01:00
# the reduced phone set: unique phones after reduction, in sorted order.
phoneset_short = sorted(set(phone_reduction(phoneset)))
2019-02-06 00:00:14 +01:00
2019-01-29 21:52:11 +01:00
## translation_key to htk format (ascii).
# phones which raise UnicodeEncodeError on phone.encode("ascii")
# are replaced with other characters.
# maps a non-ascii phone (key) to its ascii replacement (value).
# phones without an entry are used as they are when phoneset_htk is built.
translation_key_asr2htk = {
' i̯ ' : ' i_ ' ,
' ṷ ' : ' u_ ' ,
# on the analogy of the German umlaut, 'e' is appended.
' ö ' : ' oe ' , ' ö: ' : ' oe: ' ,
' ü ' : ' ue ' , ' ü: ' : ' ue: ' ,
# on the analogy of Chinese...
' ŋ ' : ' ng ' ,
# following X-SAMPA.
2019-02-14 00:21:28 +01:00
' ɔ ' : ' O ' , ' ɔ: ' : ' O: ' , ' ɔ̈ ' : ' Oe ' ,
#'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen.
2019-01-29 21:52:11 +01:00
' ɛ ' : ' E ' , ' ɛ: ' : ' E: ' ,
' ɪ ' : ' I ' , ' ɪ :' : ' I: ' ,
# it is '@' in X-SAMPA, but that is not handy in HTK.
' ə ' : ' A '
}
# the reduced phone set in HTK (ascii) notation; phones without a
# translation entry are kept as they are.
phoneset_htk = [
    translation_key_asr2htk.get(phone, phone)
    for phone in phoneset_short
]
2019-02-03 00:34:35 +01:00
#not_in_ascii = [
# '\'',
# 'â', 'ê', 'ô', 'û', 'č',
# 'à', 'í', 'é', 'è', 'ú', 'ć',
# 'ä', 'ë', 'ï', 'ö', 'ü'
#]
# maps an accented letter occurring in words (key) to an ascii replacement (value).
translation_key_word2htk = {
2019-02-03 13:54:37 +01:00
#'\'': '\\\'',
2019-02-03 00:34:35 +01:00
# acute accent: suffix 1.
' í ' : ' i1 ' , ' é ' : ' e1 ' , ' ú ' : ' u1 ' , ' ć ' : ' c1 ' ,
# grave accent: suffix 2.
' à ' : ' a2 ' , ' è ' : ' e2 ' ,
# circumflex: suffix 3.
' â ' : ' a3 ' , ' ê ' : ' e3 ' , ' ô ' : ' o3 ' , ' û ' : ' u3 ' ,
# caron: suffix 4.
' č ' : ' c4 ' ,
# diaeresis: a second letter is appended.
' ä ' : ' ao ' , ' ë ' : ' ee ' , ' ï ' : ' ie ' , ' ö ' : ' oe ' , ' ü ' : ' ue ' ,
}
#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
2019-01-29 21:52:11 +01:00
2019-01-27 23:52:33 +01:00
## the list of multi character phones.
2019-01-29 21:52:11 +01:00
# for example, 'a:' consists of more than one character, but in the code it is treated as one phone.
# original.
2019-01-28 12:34:20 +01:00
# phones of the original set that are longer than one character,
# longest first.
multi_character_phones = sorted(
    (phone for phone in phoneset if len(phone) > 1),
    key=len,
    reverse=True,
)
# phoneset reduced.
multi_character_phones_short = sorted(
    (phone for phone in phoneset_short if len(phone) > 1),
    key=len,
    reverse=True,
)
# htk compatible.
multi_character_phones_htk = sorted(
    (phone for phone in phoneset_htk if len(phone) > 1),
    key=len,
    reverse=True,
)