acoustic_model/novoapi/asr/spraaklab/schema.py

#!/usr/bin/env python
## (c) 2017 NovoLanguage, author: David A. van Leeuwen

## The purpose of this module is to define the grammar structure as a JSON schema, so that it can be validated,
## (de)serialized, and perhaps even automatically converted to a Python class structure.

import json
import jsonschema

# JSON schema for the v1.0 grammar format.
# The document root must be a "group" (see the top-level "$ref"); groups contain
# elements, and an element may itself be a group, so grammars nest recursively.
# None of the object definitions declares "additionalProperties", so keys that
# are not listed under "properties" are not rejected by this schema.
grammar_schema_v10 = {
    "$schema": "http://json-schema.org/schema#",
    "title": "NovoLanguage grammar",
    "description": "A grammar specification for the NovoLanguage Automatic Speech Recognition",
    # the root object is validated as a group
    "$ref": "#/definitions/group",
    "definitions": {
        # a non-empty list of phone symbols, each a string
        "phones": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "minItems": 1
        },
        # one pronunciation variant: "phones" is mandatory, the rest is optional
        "pronunciation": {
            "type": "object",
            "properties": {
                "phones": {
                    "$ref": "#/definitions/phones"
                },
                # when present, must hold at least one syllable
                "syllables": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/syllable"
                    },
                    "minItems": 1
                },
                "id": {
                    "type": "integer",
                    "description": "ID to distinguish this pronunciation from other variants"
                },
                # free-form object; its contents are not constrained here
                "meta": {
                    "type": "object"
                }
            },
            "required": ["phones"]
        },
        # a syllable: "begin" and "end" are mandatory non-negative integers
        # (presumably positions in the phone list -- TODO confirm);
        # "stress" and "tone" are optional non-negative integers
        "syllable": {
            "type": "object",
            "properties": {
                "begin": {
                    "type": "integer",
                    "minimum": 0
                },
                "end": {
                    "type": "integer",
                    "minimum": 0
                },
                "stress": {
                    "type": "integer",
                    "minimum": 0
                },
                "tone": {
                    "type": "integer",
                    "minimum": 0
                }
            },
            "required": ["begin", "end"]
        },
        # a single word; only "label" is mandatory
        "word": {
            "type": "object",
            "properties": {
                # if given, "kind" can only be the literal "word"
                "kind": {
                    "type": "string",
                    "enum": ["word"]
                },
                "label": {
                    "type": "string"
                },
                # accepted shapes: one pronunciation object, a non-empty list
                # whose items are pronunciation objects or phone lists, or a
                # bare phone list
                "pronunciation": {
                    "anyOf": [
                        {
                            "$ref": "#/definitions/pronunciation"
                        },
                        {
                            "type": "array",
                            "items": {
                                "anyOf": [
                                    {
                                        "$ref": "#/definitions/pronunciation"
                                    },
                                    {
                                        "$ref": "#/definitions/phones"
                                    }
                                ]
                            },
                            "minItems": 1
                        },
                        {
                            "$ref": "#/definitions/phones"
                        }

                    ]
                },
                # unlike pronunciation.syllables, no minimum length is imposed here
                "syllables": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/syllable"
                    }
                },
                "graphemes": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "id": {
                    "type": "integer",
                    "description": "ID to distinguish this word from other words (with possibly the same label)"
                },
                # free-form object; its contents are not constrained here
                "meta": {
                    "type": "object"
                }
            },
            "required": ["label"]
        },
        # a member of a group: must match exactly one of word, group,
        # or a plain string / null
        "element": {
            "title": "element",
            "oneOf": [
                {
                    "$ref": "#/definitions/word"
                },
                {
                    "$ref": "#/definitions/group"
                },
                {
                    "type": ["string", "null"]
                }
            ]
        },
        # a group of elements; "kind" and a non-empty "elements" list are mandatory
        "group": {
            "title": "element group",
            "type": "object",
            "properties": {
                "kind": {
                    "type": "string",
                    "enum": ["sequence", "alternatives", "order"]
                },
                # each item is an element, which may again be a group (recursion)
                "elements": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/element"
                    },
                    "minItems": 1,
                },
                # free-form object; its contents are not constrained here
                "meta": {
                    "type": "object"
                }
            },
            "required": ["kind", "elements"]
        }
    }
}

# JSON schema for the older v0.1 grammar format: a flat object with an optional
# "type" and an optional "parts" list (the schema declares no "required" keys).
grammar_schema_v01 = {
    "$schema": "http://json-schema.org/schema#",
    "title": "NovoLanguage grammar v0.1",
    "description": "A grammar specification for the NovoLanguage Automatic Speech Recognition",
    "type": "object",
    "properties": {
        # grammar type; restricted to these two values
        "type": {"type": "string", "enum": ["multiple_choice", "word_order"]},
        # between one and five parts, each a string or a list of strings
        "parts": {
            "type": "array",
            "minItems": 1,
            "maxItems": 5,
            "items": {
                "type": ["string", "array"],
                "items": {"type": ["string"]},
            },
        },
    },
}

# JSON schema for the RPC envelope that carries a grammar. Only "type" and
# "data" are mandatory; the grammar itself travels in "data" and is only
# required to be an object at this level.
grammar_rpc_schema = {
    "$schema": "http://json-schema.org/schema#",
    "title": "NovoLanguage RPC grammar",
    "type": "object",
    "properties": {
        # the only message type accepted
        "type": {"type": "string", "enum": ["confusion_network"]},
        # grammar format version of "data"
        "version": {"type": "string", "default": "v0.1"},
        # the grammar payload
        "data": {"type": "object"},
        "return_dict": {"type": "boolean"},
        "return_objects": {
            "type": "array",
            "items": {"type": "string", "enum": ["dict", "grammar"]},
        },
        "phoneset": {"type": "string", "enum": ["cmu69", "novo70", "mdbg115"]},
        "parallel_silence": {"type": "boolean"},
    },
    "required": ["type", "data"],
}

def validate(object, schema=grammar_schema_v10):
    """Check a grammar against a JSON schema.

    Args:
        object: the grammar, either as a dict or as a JSON string that is
            decoded with ``json.loads`` first.
        schema: the JSON schema to validate against; defaults to
            ``grammar_schema_v10``.

    Returns:
        True when the grammar conforms to the schema, False when
        ``jsonschema`` reports a ``ValidationError``.

    Raises:
        TypeError: if the (decoded) input is not a dict.

    Any other exception (invalid JSON text, a malformed schema, ...) is not
    handled here and propagates to the caller.
    """
    candidate = json.loads(object) if isinstance(object, str) else object
    if not isinstance(candidate, dict):
        raise TypeError("Expected dict or json string")
    try:
        jsonschema.validate(candidate, schema)
    except jsonschema.ValidationError:
        # a schema violation is an expected outcome, reported as False
        return False
    return True

def validate_rpc_grammar(message):
    """Validate an RPC grammar message and the grammar it carries.

    The envelope is checked against ``grammar_rpc_schema``; its ``data``
    member is then checked against the grammar schema selected by the
    ``version`` member (``grammar_schema_v01`` when the version is absent).

    Args:
        message: the RPC message, either as a dict or as a JSON string.

    Returns:
        None. The function only signals problems by raising.

    Raises:
        ValueError: if the envelope or the grammar does not validate, if the
            version is not supported, or if a JSON string cannot be decoded.
        TypeError: if the message is neither a dict nor a JSON string
            (raised by ``validate``).
    """
    # validate() accepts JSON text, but the lookups below need the decoded
    # dict; decode up front so a string message does not fail on .get()
    if isinstance(message, str):
        message = json.loads(message)
    if not validate(message, grammar_rpc_schema):
        raise ValueError("Not a valid RPC grammar")
    # the envelope schema guarantees that a present "version" is a string
    version = message.get("version", "0.1")
    # grammar_rpc_schema documents the default as "v0.1"; accept the version
    # with or without the leading "v"
    if version.startswith("v"):
        version = version[1:]
    data = message["data"]
    if version == "0.1":
        if not validate(data, grammar_schema_v01):
            raise ValueError("Not a valid grammar v0.1")
    elif version == "1.0":
        if not validate(data, grammar_schema_v10):
            raise ValueError("Not a valid grammar v1.0")
    else:
        raise ValueError("Unsupported schema version")


## test
def test(data=None, schema=grammar_schema_v10):
    """Validate *data* against *schema* and print the outcome.

    Args:
        data: the grammar dict to validate. When it is None or otherwise
            falsy, a built-in sample v1.0 grammar is used instead.
        schema: the JSON schema to validate against; defaults to
            ``grammar_schema_v10``. (The original body referred to an
            undefined name ``schema`` and raised NameError on every call.)

    Prints a line saying whether validation succeeded; on failure the
    ``jsonschema`` error message is appended.
    """
    if not data:
        # sample grammar: a sequence holding an alternatives group, a word
        # with a pronunciation, and an order group
        data = {"kind": "sequence", "elements": [
            {"kind": "alternatives", "elements": ["a plain string", "an alternative string"]},
            {"kind": "word", "label": "a word", "pronunciation": {"phones": ["ah", "w", "er", "d"]}},
            {"kind": "order", "elements": [{"kind": "word", "label": "another word", "visible": False}, "last word"]}]}
    try:
        jsonschema.validate(data, schema)
    except jsonschema.ValidationError as e:
        print("{0} validated not OK {1}".format(data, e.message))
    else:
        print("{} validated OK".format(data))


# When run as a script, call test() with its default arguments.
if __name__ == "__main__":
    test()