acoustic_model/acoustic_model/convert_phone_set.py

29 lines
985 B
Python

"""Module to convert phonemes."""
def multi_character_tokenize(line, multi_character_tokens):
"""Tries to match one of the tokens in multi_character_tokens at each position of line, starting at position 0,
if so tokenizes and eats that token. Otherwise tokenizes a single character"""
while line != '':
for token in multi_character_tokens:
if line.startswith(token) and len(token) > 0:
yield token
line = line[len(token):]
break
else:
yield line[:1]
line = line[1:]
def split_word(word, multi_character_phones):
"""
split a line by given phoneset.
Args:
word (str): a word written in given phoneset.
multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_phoneset.py.
Returns:
(word_seperated) (list): the word splitted in given phoneset.
"""
return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]