diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 36c4ef1..a64dccd 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model.sln b/acoustic_model.sln index 7d8fcbe..406d9e5 100644 --- a/acoustic_model.sln +++ b/acoustic_model.sln @@ -16,6 +16,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution ..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py ..\toolbox\pyHTK.py = ..\toolbox\pyHTK.py ..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py + reus-test\reus-test.py = reus-test\reus-test.py ..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py ..\..\..\..\..\Python36-32\Lib\site-packages\novoapi\backend\session.py = ..\..\..\..\..\Python36-32\Lib\site-packages\novoapi\backend\session.py ..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc index 8898cb1..ef367cd 100644 Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj index c24e77d..17163f2 100644 --- a/acoustic_model/acoustic_model.pyproj +++ b/acoustic_model/acoustic_model.pyproj @@ -4,8 +4,7 @@ 2.0 4d8c8573-32f0-4a62-9e62-3ce5cc680390 . - - + forced_aligner_comparison.py . @@ -36,6 +35,9 @@ Code + + Code + Code diff --git a/acoustic_model/acoustic_model_functions.py b/acoustic_model/acoustic_model_function.py similarity index 100% rename from acoustic_model/acoustic_model_functions.py rename to acoustic_model/acoustic_model_function.py diff --git a/acoustic_model/check_novoapi.py b/acoustic_model/check_novoapi.py index e70d754..96d8e8f 100644 --- a/acoustic_model/check_novoapi.py +++ b/acoustic_model/check_novoapi.py @@ -10,14 +10,13 @@ import shutil import numpy as np import pandas as pd import matplotlib.pyplot as plt - from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score import novoapi import defaultfiles as default sys.path.append(default.forced_alignment_module_dir) -from forced_alignment import pyhtk, convert_phone_set +from forced_alignment import convert_phone_set #import acoustic_model_functions as am_func import convert_xsampa2ipa import novoapi_functions @@ -47,10 +46,6 @@ david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w'] ## read pronunciation variants. stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx) df = pd.read_excel(stimmen_transcription_, 'frequency') -#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']): -# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa) -# if not ipa_converted == ipa: -# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa)) transcription_ipa = list(df['IPA']) # transcription mistake? @@ -63,6 +58,7 @@ for ipa in transcription_ipa: ipa = ipa.replace(':', 'ː') ipa = convert_phone_set.split_ipa(ipa) + # list of phones not in novo70 phoneset. not_in_novo70_ = [phone for phone in ipa if not phone in phoneset_ipa and not phone in david_suggestion] not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_] @@ -106,6 +102,10 @@ df = pd.read_excel(stimmen_transcription_, 'original') # mapping from ipa to xsampa mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir) +#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']): +# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa) +# if not ipa_converted == ipa: +# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa)) ipas = [] famehtks = [] @@ -153,12 +153,12 @@ for word in word_list: ## ===== forced alignment ===== -reus_dir = r'C:\OneDrive\Desktop\Reus' +rozen_dir = r'c:\Users\Aki\source\repos\acoustic_model\rozen-test' if forced_alignment_novo70: Results = pd.DataFrame(index=[], columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh']) #for word in word_list: - for word in ['Reus']: + for word in ['Rozen']: # pronunciation variants top 3 df_per_word_ = df_per_word[df_per_word['word']==word] df_per_word_ = df_per_word_.sort_values('frequency', ascending=False) @@ -208,37 +208,35 @@ if forced_alignment_novo70: wav_file = os.path.join(default.stimmen_wav_dir, filename) if os.path.exists(wav_file): # for Martijn - #shutil.copy(wav_file, os.path.join(reus_dir, filename)) + shutil.copy(wav_file, os.path.join(rozen_dir, filename)) - pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa] - result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_) - result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word) - result_ = pd.Series([ - sample['filename'], - sample['word'], - sample['xsampa'], - sample['ipa'], - ' '.join(result_ipa), - ' '.join(result_novo70), - llh - ], index=results.columns) - results = results.append(result_, ignore_index = True) - print('{0}/{1}: answer {2} - prediction {3}'.format( - i+1, len(samples), result_['ipa'], result_['result_ipa'])) - results.to_excel(os.path.join(reus_dir, 'results.xlsx'), encoding="utf-8") - if len(results) > 0: - Results = Results.append(results, ignore_index = True) - Results.to_excel(os.path.join(default.stimmen_dir, 'Results.xlsx'), encoding="utf-8") + # pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa] + # result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_) + # result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word) + # result_ = pd.Series([ + # sample['filename'], + # sample['word'], + # sample['xsampa'], + # sample['ipa'], + # ' '.join(result_ipa), + # ' '.join(result_novo70), + # llh + # ], index=results.columns) + # results = results.append(result_, ignore_index = True) + # print('{0}/{1}: answer {2} - prediction {3}'.format( + # i+1, len(samples), result_['ipa'], result_['result_ipa'])) + # #results.to_excel(os.path.join(default.stimmen_dir, 'results.xlsx'), encoding="utf-8") + #if len(results) > 0: + # Results = Results.append(results, ignore_index = True) + #Results.to_excel(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8") else: - Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_dir, 'Results.xlsx'), encoding="utf-8") + Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8") Results = pd.read_excel(Results_xlsx, 'Sheet1') ## ===== analysis ===== -#result_novoapi_dir = os.path.join(default.stimmen_dir, 'result', 'novoapi') #for word in word_list: # if not word == 'Oog': - # Results_ = Results[Results['word'] == word] # y_true = list(Results_['ipa']) # y_pred_ = [ipa.replace(' ', '') for ipa in list(Results_['result_ipa'])] @@ -249,4 +247,4 @@ else: # plt.figure() # output_confusion_matrix.plot_confusion_matrix(cm, pronunciation_variants, normalize=False) # #plt.show() -# plt.savefig(os.path.join(result_novoapi_dir, word + '.png')) \ No newline at end of file +# plt.savefig(os.path.join(default.stimmen_result_novoapi_dir, word + '.png')) \ No newline at end of file diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py index b9ab0ab..f53100f 100644 --- a/acoustic_model/defaultfiles.py +++ b/acoustic_model/defaultfiles.py @@ -31,6 +31,12 @@ ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter') forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment') accent_classification_dir = os.path.join(repo_dir, 'accent_classification', 'accent_classification') +htk_config_dir = r'c:\Users\Aki\source\repos\forced_alignment\forced_alignment\data\htk\preset_models\aki_dutch_2017' +config_hvite = os.path.join(htk_config_dir, 'config.HVite') +#acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo') +acoustic_model = r'c:\cygwin64\home\Aki\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo' +phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt') + WSL_dir = r'C:\OneDrive\WSL' fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame') fame_s5_dir = os.path.join(fame_dir, 's5') @@ -43,6 +49,7 @@ stimmen_data_dir = os.path.join(stimmen_dir, 'data') #stimmen_wav_dir = os.path.join(stimmen_dir, 'wav') # 16 kHz stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen' +stimmen_result_novoapi_dir = os.path.join(stimmen_dir, 'result', 'novoapi') stimmen_transcription_xlsx = os.path.join(stimmen_data_dir, 'Frisian Variants Picture Task Stimmen.xlsx') phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt') diff --git a/acoustic_model/forced_aligner_comparison.py b/acoustic_model/forced_aligner_comparison.py new file mode 100644 index 0000000..d9d34a4 --- /dev/null +++ b/acoustic_model/forced_aligner_comparison.py @@ -0,0 +1,42 @@ +import os +os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') +import sys + +import defaultfiles as default +sys.path.append(default.forced_alignment_module_dir) +from forced_alignment import pyhtk, convert_phone_set, scripts + +reus_dir = r'c:\Users\Aki\source\repos\acoustic_model\reus-test' +wav_dir = reus_dir +wav_files = ['reus1008-reus.wav', + 'reus1167-man.wav', + 'reus3768-mantsje.wav'] + +word = 'reus' +pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə'] + +for wav_file in wav_files: + file_lab = os.path.join(reus_dir, wav_file.replace('.wav', '.lab')) + file_dic = os.path.join(reus_dir, wav_file.replace('.wav', '.dic')) + file_txt = os.path.join(reus_dir, wav_file.replace('.wav', '.txt')) + + # output htk dict file + with open(file_dic, 'w', encoding="utf-8") as f: + for ipa in pronunciation_ipa: + cgn = convert_phone_set.ipa2cgn([ipa.replace(':', 'ː')]) + barbara = convert_phone_set.cgn2barbara(cgn) + f.write(word.upper() + '\t' + barbara + '\n') + + # output htk label file. + pyhtk._create_label_file(word, file_lab) + + scripts.run_command([ + 'HVite','-T', '1', + '-a', + '-C', default.config_hvite, + '-H', default.acoustic_model, + '-m', + '-i', file_txt, + #'-S', script_file, + file_dic, default.phonelist_txt, os.path.join(wav_dir, wav_file) + ]) \ No newline at end of file diff --git a/acoustic_model/novoapi_functions.py b/acoustic_model/novoapi_functions.py index 5c76f6a..0c72b45 100644 --- a/acoustic_model/novoapi_functions.py +++ b/acoustic_model/novoapi_functions.py @@ -7,7 +7,7 @@ import json from novoapi.backend import session import os -os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') +#os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') import defaultfiles as default diff --git a/acoustic_model/acoustic_model.py b/acoustic_model/train_hmm_fame.py similarity index 100% rename from acoustic_model/acoustic_model.py rename to acoustic_model/train_hmm_fame.py diff --git a/reus-test/check_novoapi.zip b/reus-test/check_novoapi.zip new file mode 100644 index 0000000..4cc1a68 Binary files /dev/null and b/reus-test/check_novoapi.zip differ diff --git a/reus-test/reus1008-reus.dic b/reus-test/reus1008-reus.dic new file mode 100644 index 0000000..4d22a33 --- /dev/null +++ b/reus-test/reus1008-reus.dic @@ -0,0 +1,3 @@ +REUS r eu s +REUS m ac n +REUS m ac n t s j @ diff --git a/reus-test/reus1008-reus.lab b/reus-test/reus1008-reus.lab new file mode 100644 index 0000000..0475f18 --- /dev/null +++ b/reus-test/reus1008-reus.lab @@ -0,0 +1 @@ +REUS diff --git a/reus-test/reus1008-reus.txt b/reus-test/reus1008-reus.txt new file mode 100644 index 0000000..9726c94 --- /dev/null +++ b/reus-test/reus1008-reus.txt @@ -0,0 +1,6 @@ +#!MLF!# +"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus1008-reus.rec" +0 9700000 r -12463.852539 REUS +9700000 12800000 eu -3622.108887 +12800000 26250001 s -17303.216797 +. diff --git a/reus-test/reus1167-man.dic b/reus-test/reus1167-man.dic new file mode 100644 index 0000000..4d22a33 --- /dev/null +++ b/reus-test/reus1167-man.dic @@ -0,0 +1,3 @@ +REUS r eu s +REUS m ac n +REUS m ac n t s j @ diff --git a/reus-test/reus1167-man.lab b/reus-test/reus1167-man.lab new file mode 100644 index 0000000..0475f18 --- /dev/null +++ b/reus-test/reus1167-man.lab @@ -0,0 +1 @@ +REUS diff --git a/reus-test/reus1167-man.txt b/reus-test/reus1167-man.txt new file mode 100644 index 0000000..06ad7b8 --- /dev/null +++ b/reus-test/reus1167-man.txt @@ -0,0 +1,10 @@ +#!MLF!# +"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus1167-man.rec" +0 150000 m -230.057571 REUS +150000 300000 ac -250.994858 +300000 450000 n -202.377716 +450000 4600000 t -5128.984375 +4600000 5050000 s -711.338501 +5050000 5450000 j -564.730591 +5450000 16049999 @ -13249.787109 +. diff --git a/reus-test/reus3768-mantsje.dic b/reus-test/reus3768-mantsje.dic new file mode 100644 index 0000000..4d22a33 --- /dev/null +++ b/reus-test/reus3768-mantsje.dic @@ -0,0 +1,3 @@ +REUS r eu s +REUS m ac n +REUS m ac n t s j @ diff --git a/reus-test/reus3768-mantsje.lab b/reus-test/reus3768-mantsje.lab new file mode 100644 index 0000000..0475f18 --- /dev/null +++ b/reus-test/reus3768-mantsje.lab @@ -0,0 +1 @@ +REUS diff --git a/reus-test/reus3768-mantsje.txt b/reus-test/reus3768-mantsje.txt new file mode 100644 index 0000000..8e2bc08 --- /dev/null +++ b/reus-test/reus3768-mantsje.txt @@ -0,0 +1,10 @@ +#!MLF!# +"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus3768-mantsje.rec" +0 150000 m -217.347229 REUS +150000 1150000 ac -1266.293579 +1150000 1650000 n -583.382568 +1650000 11100000 t -11259.270508 +11100000 11250000 s -247.939255 +11250000 11550000 j -445.511444 +11550000 24150000 @ -16769.048828 +. diff --git a/rozen-test/pg_rozen_100_jko5r.wav b/rozen-test/pg_rozen_100_jko5r.wav new file mode 100644 index 0000000..02027e9 Binary files /dev/null and b/rozen-test/pg_rozen_100_jko5r.wav differ diff --git a/rozen-test/pg_rozen_113_o9kzs.wav b/rozen-test/pg_rozen_113_o9kzs.wav new file mode 100644 index 0000000..7127250 Binary files /dev/null and b/rozen-test/pg_rozen_113_o9kzs.wav differ diff --git a/rozen-test/pg_rozen_1296_zbve2.wav b/rozen-test/pg_rozen_1296_zbve2.wav new file mode 100644 index 0000000..a6bcbb3 Binary files /dev/null and b/rozen-test/pg_rozen_1296_zbve2.wav differ diff --git a/rozen-test/pg_rozen_1709_kq9xr.wav b/rozen-test/pg_rozen_1709_kq9xr.wav new file mode 100644 index 0000000..c457bdd Binary files /dev/null and b/rozen-test/pg_rozen_1709_kq9xr.wav differ diff --git a/rozen-test/pg_rozen_241_bahqi.wav b/rozen-test/pg_rozen_241_bahqi.wav new file mode 100644 index 0000000..0a3ec97 Binary files /dev/null and b/rozen-test/pg_rozen_241_bahqi.wav differ diff --git a/rozen-test/pg_rozen_5502_q79fd.wav b/rozen-test/pg_rozen_5502_q79fd.wav new file mode 100644 index 0000000..26e050c Binary files /dev/null and b/rozen-test/pg_rozen_5502_q79fd.wav differ diff --git a/rozen-test/pg_rozen_632_2m04y.wav b/rozen-test/pg_rozen_632_2m04y.wav new file mode 100644 index 0000000..e4497e0 Binary files /dev/null and b/rozen-test/pg_rozen_632_2m04y.wav differ diff --git a/rozen-test/pg_rozen_911_1zvda.wav b/rozen-test/pg_rozen_911_1zvda.wav new file mode 100644 index 0000000..a739fac Binary files /dev/null and b/rozen-test/pg_rozen_911_1zvda.wav differ diff --git a/rozen-test/rozen-test.py b/rozen-test/rozen-test.py new file mode 100644 index 0000000..379acd4 --- /dev/null +++ b/rozen-test/rozen-test.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +import os +os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') + +import argparse +import json + +from novoapi.backend import session + +p = argparse.ArgumentParser() +p.add_argument("--user", default='martijn.wieling') +p.add_argument("--password", default='xxxxx') +args = p.parse_args() + +rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) + +grammar = { + "type": "confusion_network", + "version": "1.0", + "data": { + "kind": "sequence", + "elements": [ + { + "kind": "word", + "pronunciation": [ + { + "phones": [ + "r", + "eu0", + "s" + ], + "id": 0 + } + , + { + "phones": [ + "m", + "a0", + "n" + ], + "id": 1 + } + , + { + "phones": [ + "m", + "a0", + "n", + "t", + "s", + "y", + "ax" + ], + "id": 2 + } + ], + "label": "reus" + } + ] + }, + "return_objects": [ + "grammar" + ], + "phoneset": "novo70" +} + +res = rec.setgrammar(grammar) +#print "Set grammar result", res + + +## === novoapi/backend/session.py === +#import wave +#import time +#from novoapi.backend.session import rpcid, segmentation + +#wavf = "reus1008-reus.wav" +#w = wave.open(wavf, 'r') +#nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams() +#buf = w.readframes(nframes) +#w.close() + +#buffer_size = 4096 +#nbytes_sent = 0 +#start = time.time() +#for j in range(0, len(buf), buffer_size): +# audio_packet = buf[j:j + buffer_size] +# nbytes_sent += len(audio_packet) +# rec.conn.send_binary(audio_packet) +#rec.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()})) +#print(rpcid.next()) +#rec.last_message = rec.conn.recv() +#message = json.loads(rec.last_message) +#result = session.segmentation(message["result"]["words"]) +#result.export() +## ==================================== + +def result2pronunciation(result, word): + #result_ = res.export()[1] + result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word] + llh = result_[0]['llh'] + phones = result_[0]['phones'] + pronunciation = [phone['label'] for phone in phones] + return pronunciation, llh + + +res = rec.recognize_wav("reus1008-reus.wav") +#print "\n\n\nThe pronounced word in reus1008-reus.wav is: REUS\n\n" +#print "Recognition result:", json.dumps(res.export(), indent=4) +result2pronunciation(res.export(), 'reus') + +#print "\n\n\nThe pronounced word in reus1167-man.wav is: MAN\n\n" +res2 = rec.recognize_wav("reus1167-man.wav") +#print "Recognition result:", json.dumps(res2.export(), indent=4) +result2pronunciation(res2.export(), 'reus') + +#print "\n\n\nThe pronounced word in reus3768-mantsje.wav is: MANTSJE\n\n" +res3 = rec.recognize_wav("reus3768-mantsje.wav") +#print "Recognition result:", json.dumps(res3.export(), indent=4) +result2pronunciation(res3.export(), 'reus')