29 lines
836 B
Python
29 lines
836 B
Python
"""Module to convert phonemes."""
|
|
|
|
def multi_character_tokenize(line, multi_character_tokens):
    """Lazily split ``line`` into tokens, preferring the given multi-character tokens.

    Scanning starts at position 0. At each position the candidate tokens are
    tried in the order they were supplied and the first one that matches is
    emitted and consumed. If none matches, a single character is emitted
    instead. Because the first match wins (not the longest), callers whose
    tokens share a prefix should list the longer token first.

    Args:
        line (str): the text to tokenize.
        multi_character_tokens: iterable of candidate token strings. Empty
            strings are ignored. Any iterable works, including a generator.

    Yields:
        str: the next token, either one of ``multi_character_tokens`` or a
        single character of ``line``.
    """
    # Materialise the candidates once: the inner loop walks them again for
    # every position, which would silently exhaust a one-shot iterable.
    # Dropping empty tokens here also keeps the scan from stalling on a
    # zero-length match.
    candidates = tuple(token for token in multi_character_tokens if token)

    # Track an offset instead of repeatedly re-slicing ``line``; this avoids
    # copying the remaining text after every emitted token.
    position = 0
    end = len(line)
    while position < end:
        for token in candidates:
            if line.startswith(token, position):
                yield token
                position += len(token)
                break
        else:
            # No candidate matched here: fall back to a single character.
            yield line[position]
            position += 1
|
|
|
|
|
|
def split_word(word, multi_character_phones):
    """Split a word into its phones according to the given phoneset.

    Args:
        word (str): one word written in the given phoneset. Leading and
            trailing whitespace is stripped before splitting.
        multi_character_phones: iterable of the phones that are written with
            more than one character; it is passed to
            ``multi_character_tokenize`` as the candidate tokens. Any part of
            the word that matches none of them is split into single
            characters.

    Returns:
        list: the phones of ``word``, in order.
    """
    return list(multi_character_tokenize(word.strip(), multi_character_phones))