41 lines
1.2 KiB
Python
41 lines
1.2 KiB
Python
"""Module to convert phonemes."""
|
|
|
|
def multi_character_tokenize(line, multi_character_tokens):
    """Split ``line`` into tokens, keeping the given multi-character tokens whole.

    At each position of ``line`` (starting at position 0) the tokens in
    ``multi_character_tokens`` are tried in the order given. The first one
    that matches at that position is yielded and consumed; if none matches,
    a single character is yielded instead.

    Args:
        line (str): the string to tokenize.
        multi_character_tokens (iterable of str): tokens that must be kept as
            one unit. Empty tokens are ignored. When one token is a prefix of
            another, list the longer one first, because the first match wins.

    Yields:
        str: the next token of ``line``, in order of appearance.
    """
    # Materialize once: this drops empty tokens (they can never consume
    # input) and lets a one-shot iterable be reused at every position.
    tokens = [token for token in multi_character_tokens if len(token) > 0]
    position = 0
    end = len(line)
    # Advance an index instead of re-slicing the remainder of ``line`` after
    # every token; re-slicing copies the tail each time, which is quadratic
    # on long input.
    while position < end:
        for token in tokens:
            if line.startswith(token, position):
                yield token
                position += len(token)
                break
        else:
            # No multi-character token matches here: emit a single character.
            yield line[position:position + 1]
            position += 1
|
|
|
|
|
|
def split_word(word, multi_character_phones):
    """
    Split a word into its phones according to the given phoneset.

    Args:
        word (str): a word written in the given phoneset. Leading and trailing whitespace is stripped before splitting.
        multi_character_phones (list): the multi-character phones that are each considered as one phone. This can be obtained from a phoneset definition such as fame_ipa.py.

    Returns:
        list: the word separated into phones, in order of appearance.

    """
    stripped_word = word.strip()
    phones = multi_character_tokenize(stripped_word, multi_character_phones)
    # The tokenizer is a generator; materialize it so callers get a list.
    return list(phones)
|
|
|
|
|
|
def convert_phoneset(word_list, translation_key):
    """
    Replace each phone of a word using a translation table.

    Args:
        word_list (list): the phones of a word written in a given phoneset.
        translation_key (dict): maps a phone to its replacement. Phones that are not keys of this dict are kept unchanged.

    Returns:
        list: the converted phones, in the same order as word_list.
    """
    converted = []
    for phone in word_list:
        # Fall back to the phone itself when no translation is defined.
        replacement = translation_key.get(phone, phone)
        converted.append(replacement)
    return converted
|