29 lines
985 B
Python
29 lines
985 B
Python
"""Module to convert phonemes."""
|
|
|
|
def multi_character_tokenize(line, multi_character_tokens):
    """Split ``line`` into tokens, preferring multi-character tokens.

    At each position of ``line``, starting at position 0, the entries of
    ``multi_character_tokens`` are tried in the order given. The first
    non-empty entry that ``line`` starts with at that position is emitted
    and consumed; if none matches, the single character at that position
    is emitted instead.

    Note that the first match in iteration order wins, not the longest
    one, so callers wanting longest-match behaviour must order the tokens
    accordingly.

    Args:
        line (str): the text to tokenize.
        multi_character_tokens (iterable of str): candidate tokens in
            priority order. Empty strings are ignored.

    Yields:
        str: the next token of ``line``.
    """
    # Materialise the candidates once: this drops empty tokens up front and
    # keeps one-shot iterables (e.g. generators) usable at every position.
    tokens = tuple(
        token for token in multi_character_tokens if len(token) > 0
    )
    position = 0
    end = len(line)
    while position < end:
        for token in tokens:
            # Compare in place rather than re-slicing the remaining text,
            # which would copy the tail of the string for every token.
            if line.startswith(token, position):
                yield token
                position += len(token)
                break
        else:
            # No multi-character token matches here: emit one character.
            yield line[position]
            position += 1
|
|
|
|
|
|
def split_word(word, multi_character_phones):
    """
    Break a word into the phones of a given phoneset.

    Leading and trailing whitespace is stripped from the word before it is
    tokenized.

    Args:
        word (str): a word written in the given phoneset.
        multi_character_phones (list): the phones that are written with more
            than one character but count as a single phone. This can be
            obtained from a phoneset definition such as fame_phoneset.py.

    Returns:
        (word_separated) (list): the phones of the word, in order.

    """
    stripped = word.strip()
    phones = multi_character_tokenize(stripped, multi_character_phones)
    return list(phones)