acoustic_model/acoustic_model/convert_phoneset.py

41 lines
1.2 KiB
Python

"""Module to convert phonemes."""
def multi_character_tokenize(line, multi_character_tokens):
    """Yield the tokens of ``line``, preferring multi-character tokens.

    Starting at position 0, each candidate in ``multi_character_tokens`` is
    tried in the order given; the first one found at the current position is
    yielded and consumed. If no candidate matches, the single character at
    that position is yielded instead.

    Because the first match wins, a token that is a prefix of another
    (e.g. ``"a"`` and ``"ab"``) shadows the longer one when it is listed
    first. List longer tokens first if longest-match behaviour is wanted.

    Args:
        line (str): the text to tokenize.
        multi_character_tokens (iterable of str): tokens to be treated as a
            single unit. Empty tokens are ignored. Any iterable is accepted,
            including one-shot iterators.

    Yields:
        str: the next token, either one of ``multi_character_tokens`` or a
        single character of ``line``.
    """
    # Materialise the candidates once: this drops empty tokens up front (they
    # would match everywhere without consuming anything) and keeps one-shot
    # iterables from being exhausted after the first position.
    candidates = tuple(token for token in multi_character_tokens if token)

    # Walk an index instead of re-slicing ``line`` after every token, so the
    # remaining text is not copied on each step.
    position = 0
    end = len(line)
    while position < end:
        for token in candidates:
            if line.startswith(token, position):
                yield token
                position += len(token)
                break
        else:
            # No multi-character token starts here: emit one character.
            yield line[position:position + 1]
            position += 1
def split_word(word, multi_character_phones):
    """
    Split a word into its phones according to a given phoneset.

    Leading and trailing whitespace is stripped from ``word`` before it is
    tokenized.

    Args:
        word (str): a word written in the given phoneset.
        multi_character_phones (list): the multi-character phones that are
            each to be treated as one phone. This can be obtained from a
            phoneset definition such as fame_ipa.py.

    Returns:
        list: the phones of ``word``, in order.
    """
    stripped_word = word.strip()
    phones = multi_character_tokenize(stripped_word, multi_character_phones)
    return list(phones)
def convert_phoneset(word_list, translation_key):
    """
    Translate each phone of a word through a mapping.

    Args:
        word_list (iterable): the phones of a word, written in the source
            phoneset.
        translation_key (dict): maps a source phone to its replacement.

    Returns:
        list: the translated phones, in the same order as ``word_list``.
        A phone that is not a key of ``translation_key`` is kept unchanged.
    """
    def translate(phone):
        # Fall back to the phone itself when the mapping has no entry for it.
        return translation_key.get(phone, phone)

    return list(map(translate, word_list))