"""
Conversion between IPA and Xsampa.

Note: this code is based on ipa-xsama-converter/converter.py.
https://github.com/lingz/ipa-xsama-converter/
"""
import json
import sys
import os

import defaultfiles as default
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import convert_phone_set


def load_converter(source, sink, ipa_xsampa_converter_dir):
    """Load the symbol mapping used to convert from `source` to `sink`.

    Args:
        source: input format, one of "ipa", "xsampa" or "sassc".
        sink: output format, one of "ipa", "xsampa" or "sassc".
        ipa_xsampa_converter_dir: directory that contains
            "ipa_xsampa_map.json".

    Returns:
        A dict whose keys are symbols in the `source` format and whose
        values are the corresponding `sink` strings. When neither side is
        IPA the mapping is composed through IPA, and a value is None if
        any IPA character of the intermediate form has no `sink`
        equivalent.

    Raises:
        SystemExit: (status 1) if `source` or `sink` is not a known format.

    Notes:
        - "ipa_xsampa_map.json" is always read; "sassc_ipa.json" (located
          next to this file) is read only when either side is "sassc".
        - If `source` and `sink` are equal, a message is printed and
          loading continues.
    """
    choices = ("ipa", "xsampa", "sassc")

    # Validate params
    if source not in choices or sink not in choices:
        print("source and destination should be one of [ipa xsampa sassc].")
        sys.exit(1)
    if source == sink:
        print("source and destination format are the same.")

    # Mappings from disk; both files are lists of pairs.
    map_path = os.path.join(ipa_xsampa_converter_dir, "ipa_xsampa_map.json")
    with open(map_path, encoding="utf-8") as f:
        ipa_xsampa = json.load(f)

    sassc_ipa = []
    if "sassc" in (source, sink):
        script_dir = os.path.dirname(os.path.realpath(__file__))
        # The table holds IPA characters, so read it as UTF-8 explicitly
        # instead of relying on the platform default encoding.
        sassc_path = os.path.join(script_dir, "./sassc_ipa.json")
        with open(sassc_path, encoding="utf-8") as f:
            sassc_ipa = json.load(f)

    # One of these stays empty when the corresponding side is already IPA.
    source_to_ipa = {}
    if source == "xsampa":
        source_to_ipa = {pair[1]: pair[0] for pair in ipa_xsampa}
    elif source == "sassc":
        source_to_ipa = {pair[0]: pair[1] for pair in sassc_ipa}

    ipa_to_sink = {}
    if sink == "xsampa":
        ipa_to_sink = {pair[0]: pair[1] for pair in ipa_xsampa}
    elif sink == "sassc":
        ipa_to_sink = {pair[1]: pair[0] for pair in sassc_ipa}

    # Combine them into a single mapping
    if source == "ipa":
        return ipa_to_sink
    if sink == "ipa":
        return source_to_ipa

    mapping = {}
    # dict.items() is the Python 3 spelling; iteritems() no longer exists.
    for key, ipas in source_to_ipa.items():
        # Translate the intermediate IPA string one character at a time.
        sink_parts = [ipa_to_sink.get(ipa) for ipa in ipas]
        mapping[key] = "".join(sink_parts) if all(sink_parts) else None
    return mapping


def conversion(source, sink, mapping, line):
    """Convert `line` with `mapping`, one character at a time.

    Args:
        source: source format name. Not used by this function; kept so
            the signature stays unchanged.
        sink: sink format name. When it is "sassc", characters that do not
            occur in any key of `mapping` are dropped before the lookup.
        mapping: can be obtained with load_converter().
        line: the text to convert. Leading and trailing whitespace is
            stripped; whitespace characters inside are copied through
            unchanged.

    Returns:
        The converted string. Characters without a (non-empty) mapping are
        skipped and reported with a warning on stderr.
    """
    line = line.strip()

    # The character filter for SASSC is derived from the mapping itself,
    # because load_converter() only returns the mapping. For an
    # IPA -> SASSC mapping the keys are the IPA strings of the SASSC table.
    allowed_chars = None
    if sink == "sassc":
        allowed_chars = {char for key in mapping for char in key}

    output = []
    for token in line:
        if token.isspace():
            output.append(token)
            continue

        # Remove extraneous chars that the sink table does not know
        if allowed_chars is not None:
            token = "".join(char for char in token if char in allowed_chars)

        mapped = mapping.get(token)
        if not mapped:
            print("WARNING: could not map token ", token, file=sys.stderr)
        else:
            output.append(mapped)

    return "".join(output)


def xsampa2ipa(mapping, xsampa):
    """Convert a line written in xsampa to ipa.

    Args:
        mapping: can be obtained with load_converter().
        xsampa: a line written in xsampa.

    Returns:
        The IPA string. Phones that are not in `mapping` are kept as they
        are.

    Notes:
        This function exists because conversion() does not work when:
        - the input is a word.
        - the line includes a backslash.
        - 'ɡ' and 'g' are considered to be different.
    """
    # Multi-character symbols are needed to split 'xsampa' into phones.
    multi_character_list = [key for key in mapping if len(key) > 1]

    # conversion
    phones = convert_phone_set.multi_character_tokenize(
        xsampa, multi_character_list)
    ipa = ''.join(mapping.get(phone, phone) for phone in phones)

    # strange conversion: replace the IPA script 'ɡ' with the plain
    # keyboard 'g'.
    ipa = ipa.replace('ɡ', 'g')

    return ipa