acoustic_model/acoustic_model/convert_xsampa2ipa.py

""" Conversion between IPA and Xsampa.

Note: this code is based on ipa-xsama-converter/converter.py.
https://github.com/lingz/ipa-xsama-converter/
"""
import json
import sys
import os

import defaultfiles as default
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import convert_phone_set


def load_converter(source, sink, ipa_xsampa_converter_dir):
    """Build the symbol mapping used to convert from `source` to `sink`.

    Args:
        source: notation to convert from; one of "ipa", "xsampa" or "sassc".
        sink: notation to convert to; one of "ipa", "xsampa" or "sassc".
        ipa_xsampa_converter_dir: directory containing "ipa_xsampa_map.json",
            a JSON list of [ipa, xsampa] pairs.

    Returns:
        A dict mapping each source symbol to its sink symbol. When neither
        side is IPA the mapping is composed through IPA one character at a
        time, and a source symbol whose IPA form cannot be fully expressed
        in the sink notation maps to None. When both source and sink are
        "ipa" the mapping is empty.

    Raises:
        SystemExit: if source or sink is not one of the supported notations.

    Notes:
        "sassc_ipa.json" (a JSON list of [sassc, ipa] pairs) is read from the
        directory of this script, and only when source or sink is "sassc".
    """
    choices = ("ipa", "xsampa", "sassc")

    # Validate params
    if source not in choices or sink not in choices:
        print("source and destination should be one of [ipa xsampa sassc].")
        sys.exit(1)
    if source == sink:
        # Not fatal: the caller simply gets an identity-free mapping back.
        print("source and destination format are the same.")

    # Mappings from disk; the IPA <-> X-SAMPA table is always loaded.
    map_path = os.path.join(ipa_xsampa_converter_dir, "ipa_xsampa_map.json")
    with open(map_path, encoding="utf-8") as f:
        ipa_xsampa = json.load(f)

    sassc_ipa = []
    if "sassc" in (source, sink):
        script_dir = os.path.dirname(os.path.realpath(__file__))
        # Explicit encoding so that IPA characters decode the same way on
        # every platform, matching the table above.
        with open(os.path.join(script_dir, "./sassc_ipa.json"),
                  encoding="utf-8") as f:
            sassc_ipa = json.load(f)

    # Either table stays empty when its side is already IPA.
    source_to_ipa = {}
    if source == "xsampa":
        source_to_ipa = {pair[1]: pair[0] for pair in ipa_xsampa}
    elif source == "sassc":
        source_to_ipa = {pair[0]: pair[1] for pair in sassc_ipa}

    ipa_to_sink = {}
    if sink == "xsampa":
        ipa_to_sink = {pair[0]: pair[1] for pair in ipa_xsampa}
    elif sink == "sassc":
        ipa_to_sink = {pair[1]: pair[0] for pair in sassc_ipa}

    # Combine them into a single mapping
    if source == "ipa":
        return ipa_to_sink
    if sink == "ipa":
        return source_to_ipa

    mapping = {}
    for symbol, ipas in source_to_ipa.items():
        # Translate the IPA form character by character; a missing or empty
        # translation for any character invalidates the whole symbol.
        parts = [ipa_to_sink.get(ipa) for ipa in ipas]
        mapping[symbol] = "".join(parts) if all(parts) else None
    return mapping


def conversion(source, sink, mapping, line, sassc_active_ipa=None):
    """Convert `line` one character at a time using `mapping`.

    Args:
        source: name of the source notation. Not used by the conversion
            itself; kept so existing callers keep working.
        sink: name of the target notation. When it is "sassc", characters
            that are not in `sassc_active_ipa` are blanked before lookup.
        mapping: can be obtained with load_converter().
        line: text to convert. Leading and trailing whitespace is stripped
            and every remaining character is looked up on its own, so
            multi-character keys of `mapping` are never matched here.
        sassc_active_ipa: optional container of the IPA characters accepted
            when sink is "sassc". Defaults to every character that occurs
            in a key of `mapping`.

    Returns:
        The converted string. Whitespace characters are copied through
        unchanged; characters without a (non-empty) mapping are left out
        and reported on stderr.
    """
    if sink == "sassc" and sassc_active_ipa is None:
        # A character that appears in no key can never be mapped, so the
        # keys themselves define the usable alphabet.
        sassc_active_ipa = {char for key in mapping for char in key}

    output = []
    for token in line.strip():
        if token.isspace():
            output.append(token)
            continue
        # Remove extraneous chars that IPA does not accept
        if sink == "sassc" and token not in sassc_active_ipa:
            token = ""
        mapped = mapping.get(token)
        if not mapped:
            print("WARNING: could not map token ", token, file=sys.stderr)
        else:
            output.append(mapped)

    return "".join(output)


def xsampa2ipa(mapping, xsampa):
    """
    Translate a line of xsampa into ipa.

    Args:
        mapping: can be obtained with load_converter().
        xsampa: a line written in xsampa.

    Returns:
        The ipa string. Phones that have no entry in `mapping` are copied
        through unchanged.

    Notes:
        This exists because the function conversion does not work when:
        - the input is a word.
        - the line includes a backslash.
        - 'ɡ' and 'g' are considered to be different.

    """
    # Keys longer than one character are handed to the tokenizer together
    # with the input line.
    multi_char_symbols = [symbol for symbol in mapping if len(symbol) > 1]

    phones = convert_phone_set.multi_character_tokenize(xsampa, multi_char_symbols)
    converted = ''.join(mapping.get(phone, phone) for phone in phones)

    # The mapping may yield the script letter 'ɡ'; normalise it to plain 'g'.
    return converted.replace('ɡ', 'g')