rozen-test is added.

This commit is contained in:
yemaozi88 2019-01-21 10:35:50 +01:00
parent de5c9cecb9
commit 82a8e2302f
17 changed files with 164 additions and 33 deletions

Binary file not shown.

View File

@ -36,6 +36,9 @@
<Compile Include="fa_test.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="forced_aligner_comparison.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="novoapi_forced_alignment.py">
<SubType>Code</SubType>
</Compile>

View File

@ -10,14 +10,13 @@ import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import novoapi
import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import pyhtk, convert_phone_set
from forced_alignment import convert_phone_set
#import acoustic_model_functions as am_func
import convert_xsampa2ipa
import novoapi_functions
@ -47,10 +46,6 @@ david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
## read pronunciation variants.
stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription_, 'frequency')
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
transcription_ipa = list(df['IPA'])
# transcription mistake?
@ -63,6 +58,7 @@ for ipa in transcription_ipa:
ipa = ipa.replace(':', 'ː')
ipa = convert_phone_set.split_ipa(ipa)
# list of phones not in novo70 phoneset.
not_in_novo70_ = [phone for phone in ipa
if not phone in phoneset_ipa and not phone in david_suggestion]
not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
@ -106,6 +102,10 @@ df = pd.read_excel(stimmen_transcription_, 'original')
# mapping from ipa to xsampa
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
ipas = []
famehtks = []
@ -153,12 +153,12 @@ for word in word_list:
## ===== forced alignment =====
reus_dir = r'C:\OneDrive\Desktop\Reus'
rozen_dir = r'c:\Users\Aki\source\repos\acoustic_model\rozen-test'
if forced_alignment_novo70:
Results = pd.DataFrame(index=[],
columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])
#for word in word_list:
for word in ['Reus']:
for word in ['Rozen']:
# pronunciation variants top 3
df_per_word_ = df_per_word[df_per_word['word']==word]
df_per_word_ = df_per_word_.sort_values('frequency', ascending=False)
@ -208,37 +208,35 @@ if forced_alignment_novo70:
wav_file = os.path.join(default.stimmen_wav_dir, filename)
if os.path.exists(wav_file):
# for Martijn
#shutil.copy(wav_file, os.path.join(reus_dir, filename))
shutil.copy(wav_file, os.path.join(rozen_dir, filename))
pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]
result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
result_ = pd.Series([
sample['filename'],
sample['word'],
sample['xsampa'],
sample['ipa'],
' '.join(result_ipa),
' '.join(result_novo70),
llh
], index=results.columns)
results = results.append(result_, ignore_index = True)
print('{0}/{1}: answer {2} - prediction {3}'.format(
i+1, len(samples), result_['ipa'], result_['result_ipa']))
results.to_excel(os.path.join(reus_dir, 'results.xlsx'), encoding="utf-8")
if len(results) > 0:
Results = Results.append(results, ignore_index = True)
Results.to_excel(os.path.join(default.stimmen_dir, 'Results.xlsx'), encoding="utf-8")
# pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]
# result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
# result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
# result_ = pd.Series([
# sample['filename'],
# sample['word'],
# sample['xsampa'],
# sample['ipa'],
# ' '.join(result_ipa),
# ' '.join(result_novo70),
# llh
# ], index=results.columns)
# results = results.append(result_, ignore_index = True)
# print('{0}/{1}: answer {2} - prediction {3}'.format(
# i+1, len(samples), result_['ipa'], result_['result_ipa']))
# #results.to_excel(os.path.join(default.stimmen_dir, 'results.xlsx'), encoding="utf-8")
#if len(results) > 0:
# Results = Results.append(results, ignore_index = True)
#Results.to_excel(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
else:
Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_dir, 'Results.xlsx'), encoding="utf-8")
Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
Results = pd.read_excel(Results_xlsx, 'Sheet1')
## ===== analysis =====
#result_novoapi_dir = os.path.join(default.stimmen_dir, 'result', 'novoapi')
#for word in word_list:
# if not word == 'Oog':
# Results_ = Results[Results['word'] == word]
# y_true = list(Results_['ipa'])
# y_pred_ = [ipa.replace(' ', '') for ipa in list(Results_['result_ipa'])]
@ -249,4 +247,4 @@ else:
# plt.figure()
# output_confusion_matrix.plot_confusion_matrix(cm, pronunciation_variants, normalize=False)
# #plt.show()
# plt.savefig(os.path.join(result_novoapi_dir, word + '.png'))
# plt.savefig(os.path.join(default.stimmen_result_novoapi_dir, word + '.png'))

View File

@ -43,6 +43,7 @@ stimmen_data_dir = os.path.join(stimmen_dir, 'data')
#stimmen_wav_dir = os.path.join(stimmen_dir, 'wav')
# 16 kHz
stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
stimmen_result_novoapi_dir = os.path.join(stimmen_dir, 'result', 'novoapi')
stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx')
phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')

View File

@ -0,0 +1,10 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import defaultfiles as default
wav_dir = r'c:\Users\Aki\source\repos\acoustic_model\reus-test'
wav_files = ['reus1008-reus.wav',
'reus1167-man.wav',
'reus3768-mantsje.wav']

View File

@ -7,7 +7,7 @@ import json
from novoapi.backend import session
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
#os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import defaultfiles as default

BIN
reus-test/check_novoapi.zip Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

119
rozen-test/rozen-test.py Normal file
View File

@ -0,0 +1,119 @@
#!/usr/bin/env python
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import argparse
import json
from novoapi.backend import session
p = argparse.ArgumentParser()
p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxx')
args = p.parse_args()
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True)
grammar = {
"type": "confusion_network",
"version": "1.0",
"data": {
"kind": "sequence",
"elements": [
{
"kind": "word",
"pronunciation": [
{
"phones": [
"r",
"eu0",
"s"
],
"id": 0
}
,
{
"phones": [
"m",
"a0",
"n"
],
"id": 1
}
,
{
"phones": [
"m",
"a0",
"n",
"t",
"s",
"y",
"ax"
],
"id": 2
}
],
"label": "reus"
}
]
},
"return_objects": [
"grammar"
],
"phoneset": "novo70"
}
res = rec.setgrammar(grammar)
#print "Set grammar result", res
## === novoapi/backend/session.py ===
#import wave
#import time
#from novoapi.backend.session import rpcid, segmentation
#wavf = "reus1008-reus.wav"
#w = wave.open(wavf, 'r')
#nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
#buf = w.readframes(nframes)
#w.close()
#buffer_size = 4096
#nbytes_sent = 0
#start = time.time()
#for j in range(0, len(buf), buffer_size):
# audio_packet = buf[j:j + buffer_size]
# nbytes_sent += len(audio_packet)
# rec.conn.send_binary(audio_packet)
#rec.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
#print(rpcid.next())
#rec.last_message = rec.conn.recv()
#message = json.loads(rec.last_message)
#result = session.segmentation(message["result"]["words"])
#result.export()
## ====================================
def result2pronunciation(result, word):
#result_ = res.export()[1]
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
llh = result_[0]['llh']
phones = result_[0]['phones']
pronunciation = [phone['label'] for phone in phones]
return pronunciation, llh
res = rec.recognize_wav("reus1008-reus.wav")
#print "\n\n\nThe pronounced word in reus1008-reus.wav is: REUS\n\n"
#print "Recognition result:", json.dumps(res.export(), indent=4)
result2pronunciation(res.export(), 'reus')
#print "\n\n\nThe pronounced word in reus1167-man.wav is: MAN\n\n"
res2 = rec.recognize_wav("reus1167-man.wav")
#print "Recognition result:", json.dumps(res2.export(), indent=4)
result2pronunciation(res2.export(), 'reus')
#print "\n\n\nThe pronounced word in reus3768-mantsje.wav is: MANTSJE\n\n"
res3 = rec.recognize_wav("reus3768-mantsje.wav")
#print "Recognition result:", json.dumps(res3.export(), indent=4)
result2pronunciation(res3.export(), 'reus')