acoustic_model/acoustic_model/convert_xsampa2ipa.py

161 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

""" Conversion between IPA and Xsampa.
Note: this code is based on ipa-xsama-converter/converter.py.
https://github.com/lingz/ipa-xsama-converter/
"""
import json
import sys
import os
import defaultfiles as default
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import convert_phone_set
def load_converter(source, sink, ipa_xsampa_converter_dir):
    """Build the token mapping used to convert from `source` to `sink`.

    Args:
        source: notation of the input; one of "ipa", "xsampa" or "sassc".
        sink: notation of the output; one of "ipa", "xsampa" or "sassc".
        ipa_xsampa_converter_dir: directory that contains
            "ipa_xsampa_map.json", a JSON list of [ipa, xsampa] pairs.

    Returns:
        A dict whose keys are tokens in `source` notation and whose values
        are the corresponding strings in `sink` notation. When neither side
        is IPA the mapping is composed through IPA, one IPA character at a
        time; a source token whose IPA form cannot be fully expressed in
        `sink` is mapped to None.

    Raises:
        SystemExit: with status 1 when `source` or `sink` is not one of the
            supported notations.
    """
    choices = ["ipa", "xsampa", "sassc"]

    # Validate params.
    if source not in choices or sink not in choices:
        print("source and destination should be one of [ipa xsampa sassc].")
        sys.exit(1)
    if source == sink:
        # Only a notice: the (mostly useless) mapping is still built below.
        print("source and destination format are the same.")

    # Mappings from disk.
    with open(os.path.join(ipa_xsampa_converter_dir, "ipa_xsampa_map.json"),
              encoding="utf-8") as f:
        ipa_xsampa = json.load(f)

    # The SASSC table, a JSON list of [sassc, ipa] pairs, is looked up next
    # to this script and is only read when one side actually is SASSC.
    sassc_ipa = []
    if "sassc" in (source, sink):
        script_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(script_dir, "./sassc_ipa.json"),
                  encoding="utf-8") as f:
            sassc_ipa = json.load(f)

    # Some of these stay empty when source or sink is already IPA.
    source_to_ipa = {}
    if source == "xsampa":
        source_to_ipa = {pair[1]: pair[0] for pair in ipa_xsampa}
    elif source == "sassc":
        source_to_ipa = {pair[0]: pair[1] for pair in sassc_ipa}

    ipa_to_sink = {}
    if sink == "xsampa":
        ipa_to_sink = {pair[0]: pair[1] for pair in ipa_xsampa}
    elif sink == "sassc":
        ipa_to_sink = {pair[1]: pair[0] for pair in sassc_ipa}

    # Combine them into a single mapping.
    if source == "ipa":
        return ipa_to_sink
    if sink == "ipa":
        return source_to_ipa

    mapping = {}
    # dict.items() replaces the Python 2 only dict.iteritems().
    for token, ipas in source_to_ipa.items():
        pieces = [ipa_to_sink.get(ipa) for ipa in ipas]
        # A missing (or empty) piece invalidates the whole token.
        mapping[token] = "".join(pieces) if all(pieces) else None
    return mapping
def conversion(source, sink, mapping, line):
    """Convert `line` character by character using `mapping`.

    Args:
        source: notation of the input. Not used by this function; kept so
            that existing callers keep working.
        sink: notation of the output. Only the value "sassc" changes the
            behaviour (see below).
        mapping: can be obtained with load_converter().
        line: the text to convert. Leading and trailing whitespace is
            stripped and every remaining character is treated as one token,
            so only single-character keys of `mapping` can match.

    Returns:
        The converted string. Whitespace characters are copied through
        unchanged. A token without a (non-empty) mapping is dropped and a
        warning is printed to stderr.
    """
    line = line.strip()

    # When converting to SASSC, characters that occur in no key of the
    # mapping are blanked before the lookup. The set is derived from
    # `mapping` because the table built inside load_converter() is not
    # visible here (referencing it raised NameError).
    if sink == "sassc":
        known_chars = {char for key in mapping for char in key}
    else:
        known_chars = None

    output = []
    for token in line:
        if token.isspace():
            output.append(token)
            continue

        # Remove extraneous chars that the mapping does not know.
        if known_chars is not None and token not in known_chars:
            token = u""

        mapped = mapping.get(token)
        if not mapped:
            print("WARNING: could not map token ", token, file=sys.stderr)
        else:
            output.append(mapped)

    return "".join(output)
def xsampa2ipa(mapping, xsampa):
    """
    Convert a line written in xsampa to ipa.

    Args:
        mapping: can be obtained with load_converter().
        xsampa: a line written in xsampa.

    Returns:
        The line in ipa. Phones that have no entry in `mapping` are kept
        as they are.

    Notes:
        According to the original author, the function conversion() does
        not work when:
        - the input is a word.
        - the line includes a backslash.
        - 'ɡ' and 'g' are considered to be different.
    """
    # Keys longer than one character must be known to the tokenizer so that
    # it can split 'xsampa' into phones.
    multi_character_list = [key for key in mapping if len(key) > 1]

    # conversion
    phones = convert_phone_set.multi_character_tokenize(xsampa, multi_character_list)
    converted = ''.join(mapping.get(phone, phone) for phone in phones)

    # strange conversion: the script g (U+0261) is replaced by a plain 'g'.
    return converted.replace('ɡ', 'g')