Functions are added to perform forced alignment using novoapi. Results can be written in novo70 or IPA.

2019-01-10 23:39:02 +01:00
parent d6d5543d03
commit 1622655542
7 changed files with 56 additions and 67 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@

 ## important ##
 .acoustic_model/forced_alignment_novo.py
+.acoustic_model/novoapi_functions.py

 # User-specific files
 *.suo
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -36,8 +36,14 @@ fame_s5_dir     = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')

 experiments_dir = r'c:\OneDrive\Research\rug\experiments'
-stimmen_transcription_xlsx = os.path.join(experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
-stimmen_data_dir           = os.path.join(experiments_dir, 'stimmen', 'data')
+stimmen_dir = os.path.join(experiments_dir, 'stimmen')
+stimmen_data_dir = os.path.join(stimmen_dir, 'data')
+# 44.1 kHz
+#stimmen_wav_dir  = os.path.join(stimmen_dir, 'wav')
+# 16 kHz
+stimmen_wav_dir  = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
+
+stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx')
 phonelist_friesian_txt     = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')

 novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
--- a/acoustic_model/htk_vs_kaldi.py
+++ b/acoustic_model/htk_vs_kaldi.py
@@ -256,7 +256,7 @@ if make_kaldi_lexicon_txt:
        #        f.write("{0},{1}\n".format(key,c[key]))

        for key, value in c.most_common(option_num):
-            # make possible pronounciation variant list.
+            # make possible pronunciation variant list.
            pronvar_list = am_func.fame_pronunciation_variant(key)

            for pronvar_ in pronvar_list:
--- a/acoustic_model/novoapi_forced_alignment.py
+++ b/acoustic_model/novoapi_forced_alignment.py
@@ -37,11 +37,15 @@
 # Aki Kunikoshi
 # 428968@gmail.com
 #
+import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+
 import argparse
 import json

 from novoapi.backend import session
 import novoapi_functions
+import defaultfiles as default

 # username / password cannot be passed as artuments...
 p = argparse.ArgumentParser()
@@ -51,68 +55,11 @@ p.add_argument("--user", default='martijn.wieling')
 p.add_argument("--password", default='fa0Thaic')
 args = p.parse_args()

-wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
-
-rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
-grammar = {
-  "type": "confusion_network",
-  "version": "1.0",
-  "data": {
-	"kind": "sequence",
-	"elements": [{
-		"kind": "word",
-		"pronunciation": [{
-			"phones": ["wv",
-			  "a1",
-			  "n"],
-			"id": 0
-		  },
-		  {
-			"phones": ["wv",
-			  "uh1",
-			  "n"],
-			"id": 1
-		  }],
-		"label": "one"
-	  },
-	  {
-		"kind": "word",
-		"pronunciation": [{
-			"phones": ["t",
-			  "uw1"],
-			"id": 0
-		  }],
-		"label": "two"
-	  },
-	  {
-		"kind": "word",
-		"pronunciation": [{
-			"phones": ["t",
-			  "r",
-			  "iy1"],
-			"id": 0
-		  },
-		  {
-			"phones": ["s",
-			  "r",
-			  "iy1"],
-			"id": 1
-		  }],
-		"label": "three"
-	  }]
-  },
-  "return_objects": ["grammar"],
-  "phoneset": "novo70"
-}
-
-res = rec.setgrammar(grammar)
-#print "Set grammar result", res
-
-#res = rec.recognize_wav("test/onetwothree.wav")
-res = rec.recognize_wav(wav_file)
-#print "Recognition result:", json.dumps(res.export(), indent=4)
-
+#wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
+wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
 # list of the pronunciation for each words
 word = 'pauw'
 pronunciation_ipa = ['pau', 'pɑu']
-grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
+
+result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
+pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)
--- a/acoustic_model/novoapi_functions.py
+++ b/acoustic_model/novoapi_functions.py
@@ -1,7 +1,14 @@
+## this script should be used only by Aki Kunikoshi.
+
 import numpy as np
+import argparse
+import json
+
+from novoapi.backend import session

 import defaultfiles as default

+
 def load_phonset():
 	translation_key_ipa2novo70 = dict()
 	translation_key_novo702ipa = dict()
@@ -112,7 +119,7 @@ def make_grammar(word, pronunciation_ipa):

 	grammer_data_elements0_pronunciation = []
 	for id, ipa in enumerate(pronunciation_ipa):
-		novo70 = novoapi_functions.ipa2novo70(ipa)
+		novo70 = ipa2novo70(ipa)
 		grammer_data_elements0_pronunciation.append({
 			"phones": novo70.split(),
 			"id": id
@@ -135,4 +142,32 @@ def make_grammar(word, pronunciation_ipa):
 		"phoneset": "novo70"
 		}

-	return grammar
+	return grammar
+
+
+def forced_alignment(wav_file, word, pronunciation_ipa):
+	### IMPORTANT ###
+	# this function hard-codes the novoapi username / password as argparse defaults, so this script should not be uploaded / shared.
+
+	# username / password cannot be passed as arguments...
+	p = argparse.ArgumentParser()
+	p.add_argument("--user", default='martijn.wieling')
+	p.add_argument("--password", default='fa0Thaic')
+	args = p.parse_args()
+	
+	rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
+
+	grammar = make_grammar(word, pronunciation_ipa)
+	result = rec.setgrammar(grammar)
+	#print "Set grammar result", res
+	result = rec.recognize_wav(wav_file)
+	return result.export()
+
+
+def result2pronunciation(result, word):
+	result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word] 
+	llh = result_[0]['llh']
+	phones = result_[0]['phones']
+	pronunciation_novo70 = [phone['label'] for phone in phones]
+	pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
+	return pronunciation_ipa, pronunciation_novo70, llh