Functions are added to perform forced alignment using novoapi. Results can be written in novo70 or IPA.

2019-01-10 23:39:02 +01:00
parent d6d5543d03
commit 1622655542
7 changed files with 56 additions and 67 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 ## important ##
 .acoustic_model/forced_alignment_novo.py
 .acoustic_model/novoapi_functions.py
 # User-specific files
 *.suo
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -36,8 +36,14 @@ fame_s5_dir     = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')
 experiments_dir = r'c:\OneDrive\Research\rug\experiments'
-stimmen_transcription_xlsx = os.path.join(experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
+stimmen_dir = os.path.join(experiments_dir, 'stimmen')
-stimmen_data_dir           = os.path.join(experiments_dir, 'stimmen', 'data')
+stimmen_data_dir = os.path.join(stimmen_dir, 'data')
 # 44.1 kHz
 #stimmen_wav_dir  = os.path.join(stimmen_dir, 'wav')
 # 16 kHz
 stimmen_wav_dir  = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
 stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx')
 phonelist_friesian_txt     = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
 novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
--- a/acoustic_model/htk_vs_kaldi.py
+++ b/acoustic_model/htk_vs_kaldi.py
@@ -256,7 +256,7 @@ if make_kaldi_lexicon_txt:
        #        f.write("{0},{1}\n".format(key,c[key]))
        for key, value in c.most_common(option_num):
-            # make possible pronounciation variant list.
+            # make possible pronunciation variant list.
            pronvar_list = am_func.fame_pronunciation_variant(key)
            for pronvar_ in pronvar_list:
--- a/acoustic_model/novoapi_forced_alignment.py
+++ b/acoustic_model/novoapi_forced_alignment.py
@@ -37,11 +37,15 @@
 # Aki Kunikoshi
 # 428968@gmail.com
 #
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import argparse
 import json
 from novoapi.backend import session
 import novoapi_functions
 import defaultfiles as default
 # username / password cannot be passed as arguments...
 p = argparse.ArgumentParser()
@@ -51,68 +55,11 @@ p.add_argument("--user", default='martijn.wieling')
 p.add_argument("--password", default='fa0Thaic')
 args = p.parse_args()
-wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
+#wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
-
+wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
 rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
 grammar = {
  "type": "confusion_network",
  "version": "1.0",
  "data": {
 	"kind": "sequence",
 	"elements": [{
 		"kind": "word",
 		"pronunciation": [{
 			"phones": ["wv",
 			  "a1",
 			  "n"],
 			"id": 0
 		  },
 		  {
 			"phones": ["wv",
 			  "uh1",
 			  "n"],
 			"id": 1
 		  }],
 		"label": "one"
 	  },
 	  {
 		"kind": "word",
 		"pronunciation": [{
 			"phones": ["t",
 			  "uw1"],
 			"id": 0
 		  }],
 		"label": "two"
 	  },
 	  {
 		"kind": "word",
 		"pronunciation": [{
 			"phones": ["t",
 			  "r",
 			  "iy1"],
 			"id": 0
 		  },
 		  {
 			"phones": ["s",
 			  "r",
 			  "iy1"],
 			"id": 1
 		  }],
 		"label": "three"
 	  }]
  },
  "return_objects": ["grammar"],
  "phoneset": "novo70"
 }
 res = rec.setgrammar(grammar)
 #print "Set grammar result", res
 #res = rec.recognize_wav("test/onetwothree.wav")
 res = rec.recognize_wav(wav_file)
 #print "Recognition result:", json.dumps(res.export(), indent=4)
 # list of the pronunciations for each word
 word = 'pauw'
 pronunciation_ipa = ['pau', 'pɑu']
-grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
+
 result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
 pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)
--- a/acoustic_model/novoapi_functions.py
+++ b/acoustic_model/novoapi_functions.py
@@ -1,7 +1,14 @@
 ## this script should be used only by Aki Kunikoshi.
 import numpy as np
 import argparse
 import json
 from novoapi.backend import session
 import defaultfiles as default
 def load_phonset():
 	translation_key_ipa2novo70 = dict()
 	translation_key_novo702ipa = dict()
@@ -112,7 +119,7 @@ def make_grammar(word, pronunciation_ipa):
 	grammer_data_elements0_pronunciation = []
 	for id, ipa in enumerate(pronunciation_ipa):
-		novo70 = novoapi_functions.ipa2novo70(ipa)
+		novo70 = ipa2novo70(ipa)
 		grammer_data_elements0_pronunciation.append({
 			"phones": novo70.split(),
 			"id": id
@@ -136,3 +143,31 @@ def make_grammar(word, pronunciation_ipa):
 		}
 	return grammar
 def forced_alignment(wav_file, word, pronunciation_ipa):
 	"""Recognize wav_file against a grammar built for word and return the exported result.
 	Args:
 		wav_file: wav file handed to ``rec.recognize_wav``.
 		word: word handed to ``make_grammar``.
 		pronunciation_ipa: pronunciation variants of the word, handed to ``make_grammar``
 			(presumably a list of IPA strings -- confirm against make_grammar).
 	Returns:
 		Whatever ``export()`` of the recognition result returns.
 	"""
 	### IMPORTANT ###
 	# because of this function, this script should not be uploaded / shared.
 	# username / password cannot be passed as arguments...
 	# NOTE(review): the credentials are hard-coded as argparse defaults; consider
 	# moving them out of the source (environment variable / untracked config file).
 	p = argparse.ArgumentParser()
 	p.add_argument("--user", default='martijn.wieling')
 	p.add_argument("--password", default='fa0Thaic')
 	# parse_known_args() instead of parse_args(): this function runs inside other
 	# programs, so options on sys.argv that belong to the host program must be
 	# ignored rather than abort the process with SystemExit.
 	args, _ = p.parse_known_args()
 	rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
 	grammar = make_grammar(word, pronunciation_ipa)
 	# the return value of setgrammar is not used.
 	rec.setgrammar(grammar)
 	result = rec.recognize_wav(wav_file)
 	return result.export()
 def result2pronunciation(result, word):
 	"""Extract the recognized pronunciation of word from a recognition result.
 	Args:
 		result: sequence of entries, each with the keys 'label', 'llh' and 'phones'
 			(presumably the value returned by forced_alignment -- confirm).
 		word: label of the entry to look up; the first matching entry is used.
 	Returns:
 		A tuple (pronunciation_ipa, pronunciation_novo70, llh): the 'label' of every
 		phone of the entry converted with novo702ipa, the same labels unconverted,
 		and the entry's 'llh' value.
 	Raises:
 		IndexError: if no entry in result is labelled word.
 	"""
 	for entry in result:
 		if entry['label'] == word:
 			break
 	else:
 		# same exception type as the former result_[0] lookup, but with a message
 		# that says which label was missing.
 		raise IndexError("no entry labelled {!r} in the recognition result".format(word))
 	llh = entry['llh']
 	pronunciation_novo70 = [phone['label'] for phone in entry['phones']]
 	pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
 	return pronunciation_ipa, pronunciation_novo70, llh