161 lines
4.4 KiB
Python
161 lines
4.4 KiB
Python
""" Conversion between IPA and Xsampa.
|
||
|
||
Note: this code is based on converter.py from ipa-xsama-converter:
https://github.com/lingz/ipa-xsama-converter/
|
||
"""
|
||
import json
|
||
import sys
|
||
import os
|
||
|
||
import defaultfiles as default
|
||
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
|
||
from forced_alignment import convert_phone_set
|
||
|
||
|
||
def load_converter(source, sink, ipa_xsampa_converter_dir):
    """Load the symbol mapping used to convert ``source`` into ``sink``.

    Args:
        source: notation to convert from; one of "ipa", "xsampa" or "sassc".
        sink: notation to convert to; one of "ipa", "xsampa" or "sassc".
        ipa_xsampa_converter_dir: directory that contains
            "ipa_xsampa_map.json", a list of entries whose first item is
            the IPA symbol and whose second item is the X-SAMPA symbol.

    Returns:
        A dict mapping each source symbol to its sink symbol.  When neither
        side is IPA, the mapping is composed through IPA one IPA character
        at a time; a source symbol whose IPA form cannot be fully expressed
        in the sink notation maps to None.

    Notes:
        - An unknown notation prints a message and terminates the process
          with exit status 1 (SystemExit).
        - Identical source and sink only prints a notice; a mapping is
          still built and returned.
        - "sassc_ipa.json" (entries of [sassc, ipa]) is read from the
          directory of this file, and only when one side is "sassc".
    """
    choices = ("ipa", "xsampa", "sassc")

    # Validate params
    if source not in choices or sink not in choices:
        print("source and destination should be one of [ipa xsampa sassc].")
        sys.exit(1)
    if source == sink:
        print("source and destination format are the same.")

    # Mappings from disk.  The IPA/X-SAMPA table is always read, even when
    # it turns out not to be needed, so a missing file is reported early.
    map_path = os.path.join(ipa_xsampa_converter_dir, "ipa_xsampa_map.json")
    with open(map_path, encoding="utf-8") as f:
        ipa_xsampa = json.load(f)

    sassc_ipa = []
    if "sassc" in (source, sink):
        script_dir = os.path.dirname(os.path.realpath(__file__))
        # The table holds IPA characters: read it as UTF-8 regardless of
        # the platform's default encoding.
        sassc_path = os.path.join(script_dir, "sassc_ipa.json")
        with open(sassc_path, encoding="utf-8") as f:
            sassc_ipa = json.load(f)

    # One direction may stay empty when source or sink is already IPA.
    source_to_ipa = {}
    if source == "xsampa":
        source_to_ipa = {pair[1]: pair[0] for pair in ipa_xsampa}
    elif source == "sassc":
        source_to_ipa = {pair[0]: pair[1] for pair in sassc_ipa}

    ipa_to_sink = {}
    if sink == "xsampa":
        ipa_to_sink = {pair[0]: pair[1] for pair in ipa_xsampa}
    elif sink == "sassc":
        ipa_to_sink = {pair[1]: pair[0] for pair in sassc_ipa}

    # Combine them into a single mapping
    if source == "ipa":
        return ipa_to_sink
    if sink == "ipa":
        return source_to_ipa

    mapping = {}
    for symbol, ipas in source_to_ipa.items():
        # Translate the IPA form character by character; a single
        # untranslatable character invalidates the whole symbol.
        converted = [ipa_to_sink.get(ipa) for ipa in ipas]
        mapping[symbol] = "".join(converted) if all(converted) else None
    return mapping
|
||
|
||
|
||
def conversion(source, sink, mapping, line, sassc_active_ipa=None):
    """Convert ``line`` character by character using ``mapping``.

    The line is stripped of surrounding whitespace and then processed one
    character at a time, so only single-character keys of ``mapping`` can
    ever match.  Whitespace characters are copied through unchanged.  A
    character with no (truthy) entry in ``mapping`` is reported on stderr
    and dropped from the output.

    Args:
        source: name of the source notation; not consulted, kept so the
            call signature stays stable.
        sink: name of the target notation.  When it is "sassc", characters
            that are not in ``sassc_active_ipa`` are removed before lookup.
        mapping: can be obtained with load_converter().
        line: the text to convert.
        sassc_active_ipa: optional container of the characters accepted
            when ``sink`` is "sassc".  Defaults to every character that
            occurs in the keys of ``mapping``.

    Returns:
        The converted line as a single string.
    """
    if sink == "sassc" and sassc_active_ipa is None:
        # No explicit whitelist was supplied: a character that appears in
        # no mapping key can never be mapped, so derive the set from there.
        sassc_active_ipa = {char for key in mapping for char in key}

    output = []
    for token in line.strip():
        if token.isspace():
            output.append(token)
            continue
        # Remove extraneous chars that the SASSC table does not know
        if sink == "sassc":
            token = "".join(c for c in token if c in sassc_active_ipa)
        mapped = mapping.get(token)
        if not mapped:
            print("WARNING: could not map token ", token, file=sys.stderr)
        else:
            output.append(mapped)

    return "".join(output)
|
||
|
||
|
||
def xsampa2ipa(mapping, xsampa):
    """
    Convert a line written in xsampa into ipa.

    Args:
        mapping: can be obtained with load_converter().
        xsampa: a line written in xsampa.

    Returns:
        The ipa string.  Phones that have no entry in ``mapping`` are kept
        as they are, and every 'ɡ' in the result is replaced by 'g'.

    Notes:
        According to the original author, the function conversion() does
        not work when:
        - the input is a word.
        - the line includes a backslash.
        - 'ɡ' and 'g' are considered to be different.
    """
    # Symbols longer than one character, needed to split 'xsampa' into phones.
    multi_char_symbols = [symbol for symbol in mapping if len(symbol) > 1]

    # conversion
    phones = convert_phone_set.multi_character_tokenize(
        xsampa, multi_char_symbols)
    converted = ''.join(mapping.get(phone, phone) for phone in phones)

    # strange conversion.
    return converted.replace('ɡ', 'g')