acoustic_model/acoustic_model/convert_phone_set.py

"""Module to convert phonemes."""

def multi_character_tokenize(line, multi_character_tokens):
	"""Lazily split ``line`` into tokens, preferring the given multi-character tokens.

	At each position, starting from 0, the entries of ``multi_character_tokens``
	are tried in the order given. The first non-empty entry that the text starts
	with at that position is yielded and consumed. If no entry matches, a single
	character is yielded and consumed instead.

	Because the first match wins, a token listed after one of its own non-empty
	prefixes is never produced.

	Args:
		line (str): the text to tokenize.
		multi_character_tokens (list): candidate tokens, scanned again at every position.

	Yields:
		str: the next token of ``line``.
	"""
	cursor = 0
	total = len(line)
	while cursor < total:
		# Fall back to a single character unless a listed token matches here.
		piece = line[cursor]
		for candidate in multi_character_tokens:
			# Empty candidates always "match", so they are explicitly rejected.
			if line.startswith(candidate, cursor) and len(candidate) > 0:
				piece = candidate
				break
		yield piece
		cursor += len(piece)


def split_word(word, multi_character_phones):
	"""
	Split a word into its phones according to the given phoneset.

	Leading and trailing whitespace is stripped from the word before it is tokenized.

	Args:
		word (str): a word written in the given phoneset.
		multi_character_phones (list): the multi-character phones that are each treated as a single phone. This can be obtained from a phoneset definition such as fame_phoneset.py.

	Returns:
		list: the phones of the word, in order.

	"""
	trimmed = word.strip()
	phones = multi_character_tokenize(trimmed, multi_character_phones)
	return list(phones)