acoustic_model/acoustic_model/convert_phoneset.py

"""Module to convert phonemes."""

def multi_character_tokenize(line, multi_character_tokens):
	"""Tries to match one of the tokens in multi_character_tokens at each position of line, starting at position 0,
	if so tokenizes and eats that token. Otherwise tokenizes a single character"""
	while line != '':
		for token in multi_character_tokens:
			if line.startswith(token) and len(token) > 0:
				yield token
				line = line[len(token):]
				break
		else:
			yield line[:1]
			line = line[1:]


def split_word(word, multi_character_phones):
	"""
	split a line by given phoneset.

	Args:
		word (str): a word written in given phoneset.
		multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_ipa.py.

	Returns:
		(word_seperated) (list): the word splitted in given phoneset.

	"""
	return [phone
		 for phone in multi_character_tokenize(word.strip(), multi_character_phones)
		 ]


def convert_phoneset(word_list, translation_key):
	"""
	Args:
		word_list (str): a list of phones written in given phoneset.
		translation_key (dict):
	"""
	return [translation_key.get(phone, phone) for phone in word_list]