diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 88f58eb..7f32711 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj
index 0ec4c9b..f4800ce 100644
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -35,12 +35,15 @@
Code
-
+
Code
Code
+
+ Code
+
diff --git a/acoustic_model/check_novoapi.py b/acoustic_model/check_novoapi.py
index cf6e7c6..3fd4601 100644
--- a/acoustic_model/check_novoapi.py
+++ b/acoustic_model/check_novoapi.py
@@ -19,35 +19,10 @@ import defaultfiles as default
from forced_alignment import pyhtk, convert_phone_set
import novoapi
-
+import novoapi_functions
## ======================= novo phoneset ======================
-translation_key = dict()
-
-#phonelist_novo70_ = pd.ExcelFile(default.phonelist_novo70_xlsx)
-#df = pd.read_excel(phonelist_novo70_, 'list')
-## *_simple includes columns which has only one phone in.
-#for ipa, novo70 in zip(df['IPA_simple'], df['novo70_simple']):
-# if not pd.isnull(ipa):
-# print('{0}:{1}'.format(ipa, novo70))
-# translation_key[ipa] = novo70
-#phonelist_novo70 = np.unique(list(df['novo70_simple']))
-
-phoneset_ipa = []
-phoneset_novo70 = []
-with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
- lines = fin.read()
- lines = lines.split('\n')
- for line in lines:
- words = line.split('\t')
- if len(words) > 1:
- novo70 = words[0]
- ipa = words[1]
- phoneset_ipa.append(ipa)
- phoneset_novo70.append(novo70)
- translation_key[ipa] = novo70
-phoneset_ipa = np.unique(phoneset_ipa)
-phoneset_novo70 = np.unique(phoneset_novo70)
+# load_phonset() returns four values: the IPA phoneset, the novo70 phoneset,
+# the IPA -> novo70 table and the novo70 -> IPA table. translation_key keeps
+# the IPA -> novo70 direction it had before the refactoring; the reverse table
+# is discarded here.
+phoneset_ipa, phoneset_novo70, translation_key, _ = novoapi_functions.load_phonset()
# As per Nederlandse phoneset_aki.xlsx recieved from David
# [ɔː] oh / ohr
diff --git a/acoustic_model/forced_alignment_novo.py b/acoustic_model/novoapi_forced_alignment.py
similarity index 93%
rename from acoustic_model/forced_alignment_novo.py
rename to acoustic_model/novoapi_forced_alignment.py
index 243f275..93b6a73 100644
--- a/acoustic_model/forced_alignment_novo.py
+++ b/acoustic_model/novoapi_forced_alignment.py
@@ -41,6 +41,7 @@ import argparse
import json
from novoapi.backend import session
+import novoapi_functions
# username / password cannot be passed as artuments...
p = argparse.ArgumentParser()
@@ -110,3 +111,8 @@ res = rec.setgrammar(grammar)
#res = rec.recognize_wav("test/onetwothree.wav")
res = rec.recognize_wav(wav_file)
#print "Recognition result:", json.dumps(res.export(), indent=4)
+
+# Example: build a grammar for one word from its pronunciation variants,
+# which are written in IPA.
+# NOTE(review): `grammar` is rebound here after rec.setgrammar(grammar) and
+# rec.recognize_wav(wav_file) have already run above; the visible code never
+# passes this new grammar to the recognizer - confirm this is intended.
+word = 'pauw'
+pronunciation_ipa = ['pau', 'pɑu']
+grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
\ No newline at end of file
diff --git a/acoustic_model/novoapi_functions.py b/acoustic_model/novoapi_functions.py
new file mode 100644
index 0000000..0bdb324
--- /dev/null
+++ b/acoustic_model/novoapi_functions.py
@@ -0,0 +1,138 @@
+import numpy as np
+
+import defaultfiles as default
+
+def load_phonset():
+    """
+    Load the novo70 <-> IPA phone table from default.novo70_phoneset.
+
+    The file is read as UTF-8 text and split on newlines. Each line is split
+    on tabs; lines with fewer than two fields are skipped. The first field is
+    taken as the novo70 symbol and the second field as the IPA symbol.
+
+    Returns:
+        phoneset_ipa: np.unique() of all IPA symbols found.
+        phoneset_novo70: np.unique() of all novo70 symbols found.
+        translation_key_ipa2novo70: dict mapping IPA symbol -> novo70 symbol.
+        translation_key_novo702ipa: dict mapping novo70 symbol -> IPA symbol.
+        When a symbol occurs on several lines, the last line wins in the dicts.
+    """
+    with open(default.novo70_phoneset, "rt", encoding="utf-8") as phoneset_file:
+        rows = [row.split('\t') for row in phoneset_file.read().split('\n')]
+
+    # Keep only the rows that carry both a novo70 and an IPA column.
+    pairs = [(fields[0], fields[1]) for fields in rows if len(fields) > 1]
+
+    ipa_to_novo70 = {ipa: novo70 for novo70, ipa in pairs}
+    novo70_to_ipa = {novo70: ipa for novo70, ipa in pairs}
+
+    ipa_symbols = np.unique([ipa for _, ipa in pairs])
+    novo70_symbols = np.unique([novo70 for novo70, _ in pairs])
+    return ipa_symbols, novo70_symbols, ipa_to_novo70, novo70_to_ipa
+
+
+def multi_character_tokenize(line, multi_character_tokens):
+    """
+    Lazily split line into tokens, scanning from position 0.
+
+    At each position the first non-empty entry of multi_character_tokens that
+    the remaining text starts with is yielded and consumed. If no entry
+    matches, a single character is yielded and consumed instead.
+
+    Copied from forced_alignment.convert_phone_set.py
+    """
+    remaining = line
+    while remaining != '':
+        # First candidate in list order wins, so the caller controls priority.
+        matched = next(
+            (candidate for candidate in multi_character_tokens
+             if remaining.startswith(candidate) and len(candidate) > 0),
+            None)
+        if matched is None:
+            yield remaining[:1]
+            remaining = remaining[1:]
+        else:
+            yield matched
+            remaining = remaining[len(matched):]
+
+
+def split_ipa(line):
+    """
+    Tokenize a string written in IPA into a list of phones.
+
+    Leading and trailing whitespace is stripped first. The multi-character
+    phones listed below are kept together as one token; every other character
+    becomes a token of its own.
+
+    NOTE(review): the previous docstring warned that nasalized sounds (such
+    as ɛ̃ː) cause an error; nothing in this function raises for them - confirm.
+
+    :param string line: one line written in IPA.
+    :return list: the phones of the line, in order.
+    """
+    # IPAs in CGN.
+    multi_character_phones = [
+        u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
+    ]
+    stripped = line.strip()
+    return list(multi_character_tokenize(stripped, multi_character_phones))
+
+
+def split_novo70(line):
+    """
+    Tokenize a string written in novo70 into a list of phones.
+
+    The novo70 phoneset is loaded with load_phonset() on every call. Leading
+    and trailing whitespace of the line is stripped before tokenizing, and a
+    single-space token is reported as 'sp'.
+
+    :param string line: one line written in novo70.
+    :return list: the novo70 phones of the line, in order.
+    """
+    phoneset_novo70 = load_phonset()[1]
+
+    # Longest symbols first: the tokenizer takes the first candidate that
+    # matches, so a long phone must be tried before any shorter prefix of it.
+    long_phones = [symbol for symbol in phoneset_novo70 if len(symbol) > 1]
+    long_phones.sort(key=len, reverse=True)
+
+    phones = []
+    for token in multi_character_tokenize(line.strip(), long_phones):
+        phones.append('sp' if token == ' ' else token)
+    return phones
+
+
+def novo702ipa(tokens):
+    """
+    Convert a novo70 string into a space-separated string of IPA phones.
+
+    The input is tokenized with split_novo70(); each phone is looked up in the
+    novo70 -> IPA table from load_phonset(), and phones missing from the table
+    are passed through unchanged.
+    """
+    novo70_to_ipa = load_phonset()[3]
+    return ' '.join(
+        novo70_to_ipa.get(phone, phone) for phone in split_novo70(tokens))
+
+
+# numbering of novo70 should be checked.
+def ipa2novo70(tokens):
+    """
+    Convert an IPA string into a space-separated string of novo70 phones.
+
+    The input is tokenized with split_ipa(); each phone is looked up in the
+    IPA -> novo70 table from load_phonset(), and phones missing from the table
+    are passed through unchanged.
+    """
+    ipa_to_novo70 = load_phonset()[2]
+    return ' '.join(
+        ipa_to_novo70.get(phone, phone) for phone in split_ipa(tokens))
+
+
+def make_grammar(word, pronunciation_ipa):
+    """
+    Build a grammar dict for a single word with several pronunciations.
+
+    Args:
+        word: the word; stored as the "label" of the grammar's word element.
+        pronunciation_ipa: list of pronunciation variants, each written in IPA.
+
+    Returns:
+        A dict of type "confusion_network" with phoneset "novo70". Its data
+        is a sequence holding one word element, and that element lists every
+        variant converted to novo70 phones. Each variant's "id" is its index
+        in pronunciation_ipa.
+    """
+    # ipa2novo70 is defined in this module and must be called directly: the
+    # module never imports itself, so the former `novoapi_functions.` prefix
+    # raised NameError on the first call.
+    pronunciations = [
+        {
+            "phones": ipa2novo70(ipa).split(),
+            "id": variant_id
+        }
+        for variant_id, ipa in enumerate(pronunciation_ipa)
+    ]
+
+    grammar_data = {
+        "kind": 'sequence',
+        "elements": [{
+            "kind": "word",
+            "pronunciation": pronunciations,
+            "label": word
+        }]
+    }
+
+    grammar = {
+        "type": "confusion_network",
+        "version": "1.0",
+        "data": grammar_data,
+        "return_objects": ["grammar"],
+        "phoneset": "novo70"
+    }
+
+    return grammar
\ No newline at end of file