diff --git a/.gitignore b/.gitignore
index b1d3894..520e3c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 
 ## important ##
 .acoustic_model/forced_alignment_novo.py
+.acoustic_model/novoapi_functions.py
 
 # User-specific files
 *.suo
diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 7f32711..9fe1f21 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc
index b61bd54..635aaae 100644
Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ
diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py
index f464b9f..e9c4c96 100644
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -36,8 +36,14 @@ fame_s5_dir = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')
 
 experiments_dir = r'c:\OneDrive\Research\rug\experiments'
-stimmen_transcription_xlsx = os.path.join(experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
-stimmen_data_dir = os.path.join(experiments_dir, 'stimmen', 'data')
+stimmen_dir = os.path.join(experiments_dir, 'stimmen')
+stimmen_data_dir = os.path.join(stimmen_dir, 'data')
+# 44.1 kHz
+#stimmen_wav_dir = os.path.join(stimmen_dir, 'wav')
+# 16 kHz
+stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
+
+stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx')
 phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
 
 novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py
index f4dd82a..ca7f6af 100644
--- a/acoustic_model/htk_vs_kaldi.py
+++ b/acoustic_model/htk_vs_kaldi.py
@@ -256,7 +256,7 @@ if make_kaldi_lexicon_txt:
 #            f.write("{0},{1}\n".format(key,c[key]))
 
         for key, value in c.most_common(option_num):
-            # make possible pronounciation variant list.
+            # make possible pronunciation variant list.
             pronvar_list = am_func.fame_pronunciation_variant(key)
 
             for pronvar_ in pronvar_list:
diff --git a/acoustic_model/novoapi_forced_alignment.py b/acoustic_model/novoapi_forced_alignment.py
index 93b6a73..932d7c1 100644
--- a/acoustic_model/novoapi_forced_alignment.py
+++ b/acoustic_model/novoapi_forced_alignment.py
@@ -37,11 +37,15 @@
 # Aki Kunikoshi
 # 428968@gmail.com
 #
+import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+
 import argparse
 import json
 
 from novoapi.backend import session
 import novoapi_functions
+import defaultfiles as default
 
 # username / password cannot be passed as artuments...
 p = argparse.ArgumentParser()
@@ -51,68 +55,11 @@
 p.add_argument("--user", default='martijn.wieling')
 p.add_argument("--password", default='fa0Thaic')
 args = p.parse_args()
-wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
-
-rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
-grammar = {
-    "type": "confusion_network",
-    "version": "1.0",
-    "data": {
-        "kind": "sequence",
-        "elements": [{
-            "kind": "word",
-            "pronunciation": [{
-                "phones": ["wv",
-                    "a1",
-                    "n"],
-                "id": 0
-            },
-            {
-                "phones": ["wv",
-                    "uh1",
-                    "n"],
-                "id": 1
-            }],
-            "label": "one"
-        },
-        {
-            "kind": "word",
-            "pronunciation": [{
-                "phones": ["t",
-                    "uw1"],
-                "id": 0
-            }],
-            "label": "two"
-        },
-        {
-            "kind": "word",
-            "pronunciation": [{
-                "phones": ["t",
-                    "r",
-                    "iy1"],
-                "id": 0
-            },
-            {
-                "phones": ["s",
-                    "r",
-                    "iy1"],
-                "id": 1
-            }],
-            "label": "three"
-        }]
-    },
-    "return_objects": ["grammar"],
-    "phoneset": "novo70"
-}
-
-res = rec.setgrammar(grammar)
-#print "Set grammar result", res
-
-#res = rec.recognize_wav("test/onetwothree.wav")
-res = rec.recognize_wav(wav_file)
-#print "Recognition result:", json.dumps(res.export(), indent=4)
-
+#wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
+wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
 # list of the pronunciation for each words
 word = 'pauw'
 pronunciation_ipa = ['pau', 'pɑu']
-grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
\ No newline at end of file
+
+result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
+pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)
\ No newline at end of file
diff --git a/acoustic_model/novoapi_functions.py b/acoustic_model/novoapi_functions.py
index 0bdb324..0ab6aa8 100644
--- a/acoustic_model/novoapi_functions.py
+++ b/acoustic_model/novoapi_functions.py
@@ -1,7 +1,14 @@
+## this script should be used only by Aki Kunikoshi.
+
 import numpy as np
+import argparse
+import json
+
+from novoapi.backend import session
 
 import defaultfiles as default
 
+
 def load_phonset():
     translation_key_ipa2novo70 = dict()
     translation_key_novo702ipa = dict()
@@ -112,7 +119,7 @@ def make_grammar(word, pronunciation_ipa):
 
     grammer_data_elements0_pronunciation = []
     for id, ipa in enumerate(pronunciation_ipa):
-        novo70 = novoapi_functions.ipa2novo70(ipa)
+        novo70 = ipa2novo70(ipa)
         grammer_data_elements0_pronunciation.append({
             "phones": novo70.split(),
             "id": id
@@ -135,4 +142,32 @@ def make_grammar(word, pronunciation_ipa):
         "phoneset": "novo70"
     }
 
-    return grammar
\ No newline at end of file
+    return grammar
+
+
+def forced_alignment(wav_file, word, pronunciation_ipa):
+    ### IMPORTANT ###
+    # because of this function, this script should not be uploaded / shared.
+
+    # username / password cannot be passed as artuments...
+    p = argparse.ArgumentParser()
+    p.add_argument("--user", default='martijn.wieling')
+    p.add_argument("--password", default='fa0Thaic')
+    args = p.parse_args()
+
+    rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
+
+    grammar = make_grammar(word, pronunciation_ipa)
+    result = rec.setgrammar(grammar)
+    #print "Set grammar result", res
+    result = rec.recognize_wav(wav_file)
+    return result.export()
+
+
+def result2pronunciation(result, word):
+    result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
+    llh = result_[0]['llh']
+    phones = result_[0]['phones']
+    pronunciation_novo70 = [phone['label'] for phone in phones]
+    pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
+    return pronunciation_ipa, pronunciation_novo70, llh
\ No newline at end of file