functions are added to perform forced_alignment using novoapi. results can be written in novo70 or IPA.

This commit is contained in:
yemaozi88 2019-01-10 23:39:02 +01:00
parent d6d5543d03
commit 1622655542
7 changed files with 56 additions and 67 deletions

1
.gitignore vendored
View File

@ -3,6 +3,7 @@
## important ## ## important ##
.acoustic_model/forced_alignment_novo.py .acoustic_model/forced_alignment_novo.py
.acoustic_model/novoapi_functions.py
# User-specific files # User-specific files
*.suo *.suo

Binary file not shown.

View File

@ -36,8 +36,14 @@ fame_s5_dir = os.path.join(fame_dir, 's5')
fame_corpus_dir = os.path.join(fame_dir, 'corpus') fame_corpus_dir = os.path.join(fame_dir, 'corpus')
experiments_dir = r'c:\OneDrive\Research\rug\experiments' experiments_dir = r'c:\OneDrive\Research\rug\experiments'
stimmen_transcription_xlsx = os.path.join(experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx') stimmen_dir = os.path.join(experiments_dir, 'stimmen')
stimmen_data_dir = os.path.join(experiments_dir, 'stimmen', 'data') stimmen_data_dir = os.path.join(stimmen_dir, 'data')
# 44.1 kHz
#stimmen_wav_dir = os.path.join(stimmen_dir, 'wav')
# 16 kHz
stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx')
phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt') phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi') novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')

View File

@ -256,7 +256,7 @@ if make_kaldi_lexicon_txt:
# f.write("{0},{1}\n".format(key,c[key])) # f.write("{0},{1}\n".format(key,c[key]))
for key, value in c.most_common(option_num): for key, value in c.most_common(option_num):
# make possible pronounciation variant list. # make possible pronunciation variant list.
pronvar_list = am_func.fame_pronunciation_variant(key) pronvar_list = am_func.fame_pronunciation_variant(key)
for pronvar_ in pronvar_list: for pronvar_ in pronvar_list:

View File

@ -37,11 +37,15 @@
# Aki Kunikoshi # Aki Kunikoshi
# 428968@gmail.com # 428968@gmail.com
# #
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import argparse import argparse
import json import json
from novoapi.backend import session from novoapi.backend import session
import novoapi_functions import novoapi_functions
import defaultfiles as default
# username / password cannot be passed as arguments... # username / password cannot be passed as arguments...
p = argparse.ArgumentParser() p = argparse.ArgumentParser()
@ -51,68 +55,11 @@ p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='fa0Thaic') p.add_argument("--password", default='fa0Thaic')
args = p.parse_args() args = p.parse_args()
wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav' #wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
grammar = {
"type": "confusion_network",
"version": "1.0",
"data": {
"kind": "sequence",
"elements": [{
"kind": "word",
"pronunciation": [{
"phones": ["wv",
"a1",
"n"],
"id": 0
},
{
"phones": ["wv",
"uh1",
"n"],
"id": 1
}],
"label": "one"
},
{
"kind": "word",
"pronunciation": [{
"phones": ["t",
"uw1"],
"id": 0
}],
"label": "two"
},
{
"kind": "word",
"pronunciation": [{
"phones": ["t",
"r",
"iy1"],
"id": 0
},
{
"phones": ["s",
"r",
"iy1"],
"id": 1
}],
"label": "three"
}]
},
"return_objects": ["grammar"],
"phoneset": "novo70"
}
res = rec.setgrammar(grammar)
#print "Set grammar result", res
#res = rec.recognize_wav("test/onetwothree.wav")
res = rec.recognize_wav(wav_file)
#print "Recognition result:", json.dumps(res.export(), indent=4)
# list of the pronunciation for each words # list of the pronunciation for each words
word = 'pauw' word = 'pauw'
pronunciation_ipa = ['pau', 'pɑu'] pronunciation_ipa = ['pau', 'pɑu']
grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)

View File

@ -1,7 +1,14 @@
## this script should be used only by Aki Kunikoshi.
import numpy as np import numpy as np
import argparse
import json
from novoapi.backend import session
import defaultfiles as default import defaultfiles as default
def load_phonset(): def load_phonset():
translation_key_ipa2novo70 = dict() translation_key_ipa2novo70 = dict()
translation_key_novo702ipa = dict() translation_key_novo702ipa = dict()
@ -112,7 +119,7 @@ def make_grammar(word, pronunciation_ipa):
grammer_data_elements0_pronunciation = [] grammer_data_elements0_pronunciation = []
for id, ipa in enumerate(pronunciation_ipa): for id, ipa in enumerate(pronunciation_ipa):
novo70 = novoapi_functions.ipa2novo70(ipa) novo70 = ipa2novo70(ipa)
grammer_data_elements0_pronunciation.append({ grammer_data_elements0_pronunciation.append({
"phones": novo70.split(), "phones": novo70.split(),
"id": id "id": id
@ -136,3 +143,31 @@ def make_grammar(word, pronunciation_ipa):
} }
return grammar return grammar
def forced_alignment(wav_file, word, pronunciation_ipa, user=None, password=None):
    """Recognize `wav_file` with novoapi, constrained to the given pronunciations.

    A grammar is built with make_grammar(word, pronunciation_ipa), set on a
    newly created session.Recognizer, and the wav file is recognized with it.

    Args:
        wav_file: path to the wav file handed to Recognizer.recognize_wav().
        word: the word label handed to make_grammar().
        pronunciation_ipa: the pronunciation variants (IPA) handed to
            make_grammar().
        user: user name for the recognizer. When omitted, the value of the
            --user command-line option (or its default) is used.
        password: password for the recognizer. When omitted, the value of the
            --password command-line option (or its default) is used.

    Returns:
        The value returned by export() on the recognition result.
    """
    ### IMPORTANT ###
    # because of this function, this script should not be uploaded / shared.
    # SECURITY(review): the default credentials below are hard-coded and kept
    # only for backward compatibility; they should be rotated and supplied
    # through the `user` / `password` arguments or the command line instead.
    if user is None or password is None:
        p = argparse.ArgumentParser()
        p.add_argument("--user", default='martijn.wieling')
        p.add_argument("--password", default='fa0Thaic')
        # parse_known_args() instead of parse_args(): options that belong to
        # the calling script must not make this library function exit.
        args, _ = p.parse_known_args()
        if user is None:
            user = args.user
        if password is None:
            password = args.password

    rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=user, password=password, keepopen=True)  # , modeldir=modeldir)

    grammar = make_grammar(word, pronunciation_ipa)
    # the grammar has to be set before recognition; its return value is not needed.
    rec.setgrammar(grammar)

    result = rec.recognize_wav(wav_file)
    return result.export()
def result2pronunciation(result, word):
    """Extract the recognized pronunciation of `word` from a recognition result.

    Args:
        result: sequence of dicts, each having a 'label' key. The entry for
            `word` must also have 'llh' and 'phones', where 'phones' is a
            list of dicts with a 'label' key (presumably the output of
            forced_alignment() -- TODO confirm).
        word: the label of the entry to look for. When several entries carry
            this label, the first one is used.

    Returns:
        A tuple (pronunciation_ipa, pronunciation_novo70, llh): the phone
        labels converted with novo702ipa(), the phone labels as found in the
        result, and the 'llh' value of the matched entry.

    Raises:
        IndexError: if no entry in `result` is labelled `word`.
    """
    # only the first match is used, so stop scanning as soon as it is found.
    matched = next((entry for entry in result if entry['label'] == word), None)
    if matched is None:
        # IndexError is kept (the original failed on result_[0]) but now says why.
        raise IndexError("word '{}' is not found in the result.".format(word))

    llh = matched['llh']
    pronunciation_novo70 = [phone['label'] for phone in matched['phones']]
    pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
    return pronunciation_ipa, pronunciation_novo70, llh