From 22b9ae966b7b1b7ae432e3616d36ee2fb34673a1 Mon Sep 17 00:00:00 2001
From: yemaozi88 <428968@gmail.com>
Date: Mon, 20 Aug 2018 22:50:53 +0200
Subject: [PATCH] Forced alignment by Kaldi is added.

---
 .vs/acoustic_model/v15/.suo          | Bin 53248 -> 34304 bytes
 acoustic_model.sln                   |   1 +
 acoustic_model/acoustic_model.py     |   8 +-
 acoustic_model/acoustic_model.pyproj |   3 +
 acoustic_model/config.ini            |   2 +-
 acoustic_model/performance_check.py  | 572 +++++++++++++++++++--------
 acoustic_model/pyKaldi.py            |  26 ++
 7 files changed, 451 insertions(+), 161 deletions(-)
 create mode 100644 acoustic_model/pyKaldi.py

diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index a3fe250a36274108985a39fafa5a92ccaae5cb9e..fea19a8bbf4d409361bcc61ea5e2f0c8856e8da6 100644
GIT binary patch
[base85 deltas (delta 2513 / delta 2623) omitted: the payload was garbled in transit, and .suo is opaque Visual Studio state with no reviewable content]
[the hunks for acoustic_model.sln, acoustic_model/acoustic_model.py, and acoustic_model/acoustic_model.pyproj were corrupted together with the binary delta; only stray fragments ("Code +") survived, so those hunks are not reproduced here]

diff --git a/acoustic_model/config.ini b/acoustic_model/config.ini
index 88805f6..9232c5b 100644
--- a/acoustic_model/config.ini
+++ b/acoustic_model/config.ini
@@ -2,4 +2,4 @@
 config_hcopy = c:\cygwin64\home\Aki\acoustic_model\config\config.HCopy
 config_train = c:\cygwin64\home\Aki\acoustic_model\config\config.train
 mkhmmdefs_pl = c:\cygwin64\home\Aki\acoustic_model\src\acoustic_model\mkhmmdefs.pl
-FAME_dir = c:\OneDrive\Research\rug\experiments\friesian\corpus
\ No newline at end of file
+FAME_dir = C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus
\ No newline at end of file
diff --git a/acoustic_model/performance_check.py b/acoustic_model/performance_check.py
index 411d93e..0738ab9 100644
--- a/acoustic_model/performance_check.py
+++ b/acoustic_model/performance_check.py
@@ -4,52 +4,92 @@ import csv
 import subprocess
 import configparser
 from collections import Counter
+import re
 
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
 
 
 ## ======================= functions =======================
 def read_fileFA(fileFA):
-    """
-    read the result file of HTK forced alignment.
-    this function only works when input is one word.
-    """
-    with open(fileFA, 'r') as f:
-        lines = f.read()
-    lines = lines.split('\n')
+    """
+    read the result file of HTK forced alignment.
+    this function only works when the input is one word.
+    """
+    with open(fileFA, 'r') as f:
+        lines = f.read()
+    lines = lines.split('\n')
 
-    phones = []
-    for line in lines:
-        line_split = line.split()
-        if len(line_split) > 1:
-            phones.append(line_split[2])
+    phones = []
+    for line in lines:
+        # a label line is "start end phone ..."; shorter lines are headers.
+        line_split = line.split()
+        if len(line_split) > 2:
+            phones.append(line_split[2])
 
-    return ' '.join(phones)
+    return ' '.join(phones)
 
 
-#####################
-##   USER DEFINE   ##
-#####################
+def make_dic(word, pronvar_, fileDic, output_type):
+    """
+    make dic files which can be used by HTK.
+    param word: target word.
+    param pronvar_: ndarray of pronunciation variants of the target word.
+    param fileDic: output dic file.
+    param output_type: 0: full, 1: statistics, 2: entries with frequency < 2% are removed, 3: top 3.
+    """
+    #assert 0 <= output_type <= 3, 'output_type should be an integer between 0 and 3.'
+    WORD = word.upper()
+
+    if output_type == 0:  # full
+        pronvar = np.unique(pronvar_)
+
+        with open(fileDic, 'w') as f:
+            for pvar in pronvar:
+                f.write('{0}\t{1}\n'.format(WORD, pvar))
+    else:
+        c = Counter(pronvar_)
+        total_num = sum(c.values())
+        with open(fileDic, 'w') as f:
+            if output_type == 3:
+                for key, value in c.most_common(3):
+                    f.write('{0}\t{1}\n'.format(WORD, key))
+            else:
+                for key, value in c.items():
+                    percentage = value/total_num*100
+
+                    if output_type == 1:  # statistics
+                        f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
+                    elif output_type == 2:  # drop entries below 2 percent
+                        if percentage >= 2:
+                            f.write('{0}\t{1}\n'.format(WORD, key))
+
+
+## ======================= user define =======================
 curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
 config_ini = curr_dir + '\\config.ini'
 forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
 forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
-ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
+ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
+accent_classification_dir = r'C:\Users\Aki\source\repos\accent_classification\accent_classification'
+
 
-csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"
 experiments_dir = r'C:\OneDrive\Research\rug\experiments'
-data_dir = experiments_dir + '\\stimmen\\data'
-cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
+data_dir = experiments_dir + '\\stimmen\\data'
+csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
+
+cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
 
 # procedure
 convert_phones = 0
 make_dic_files = 0
 make_dic_files_short = 0
-do_forced_alignment = 0
-eval_forced_alignment = 1
+do_forced_alignment_htk = 0
+make_kaldi_data_files = 0
+make_kaldi_lexicon_txt = 0
+load_forced_alignment_kaldi = 1
+eval_forced_alignment = 0
 
@@ -67,6 +107,10 @@ import acoustic_model_functions as am_func
 sys.path.append(forced_alignment_module_old)
 import pyHTK
 
+# to output the confusion matrix.
+sys.path.append(accent_classification_dir)
+from output_confusion_matrix import plot_confusion_matrix
+
 
 ## ======================= load variables =======================
 config = configparser.ConfigParser()
@@ -81,177 +125,393 @@ lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
 
 ## ======================= convert phones ======================
 if convert_phones:
-    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
+    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
 
-    ## check phones included in FAME!
-    # the phones used in the lexicon.
-    #phonelist = am_func.get_phonelist(lex_htk)
+    ## check phones included in FAME!
+    # the phones used in the lexicon.
+    #phonelist = am_func.get_phonelist(lex_htk)
 
-    # the lines which include a specific phone.
-    #lines = am_func.find_phone(lex_asr, 'x')
+    # the lines which include a specific phone.
+    #lines = am_func.find_phone(lex_asr, 'x')
 
-    with open(csvfile, encoding="utf-8") as fin:
-        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
-        next(lines, None)  # skip the headers
+    with open(csvfile, encoding="utf-8") as fin:
+        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
+        next(lines, None)  # skip the headers
 
-        filenames = []
-        words = []
-        pronunciations = []
-        for line in lines:
-            if line[1] is not '' and len(line) > 5:
-                filenames.append(line[0])
-                words.append(line[1])
-                pron_xsampa = line[3]
-                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
-                pron_ipa = pron_ipa.replace('ː', ':')
-                pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)
-
-                # adjust to phones used in the acoustic model.
-                pron_famehtk = pron_famehtk.replace('sp', 'sil')
-                pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
-                pron_famehtk = pron_famehtk.replace('w :', 'wh')
-                pron_famehtk = pron_famehtk.replace('e :', 'eh')
-                pron_famehtk = pron_famehtk.replace('eh :', 'eh')
-                pron_famehtk = pron_famehtk.replace('ih :', 'ih')
+        filenames = []
+        words = []
+        pronunciations = []
+        for line in lines:
+            # check the row length first; `!= ''` replaces the unreliable `is not ''`.
+            if len(line) > 5 and line[1] != '':
+                filenames.append(line[0])
+                words.append(line[1])
+                pron_xsampa = line[3]
+                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
+                pron_ipa = pron_ipa.replace('ː', ':')
+                pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)
+
+                # adjust to the phones used in the acoustic model.
+                pron_famehtk = pron_famehtk.replace('sp', 'sil')
+                pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
+                pron_famehtk = pron_famehtk.replace('w :', 'wh')
+                pron_famehtk = pron_famehtk.replace('e :', 'eh')
+                pron_famehtk = pron_famehtk.replace('eh :', 'eh')
+                pron_famehtk = pron_famehtk.replace('ih :', 'ih')
 
-                #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
-                #pron = []
-                #for phoneme in pron_famehtk.split(' '):
-                #    pron.append(translation_key.get(phoneme, phoneme))
-                #pronunciations.append(' '.join(pron))
-                pronunciations.append(pron_famehtk)
+                #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
+                #pron = []
+                #for phoneme in pron_famehtk.split(' '):
+                #    pron.append(translation_key.get(phoneme, phoneme))
+                #pronunciations.append(' '.join(pron))
+                pronunciations.append(pron_famehtk)
 
-    # check if all phones are in the phonelist of the acoustic model.
-    #phonelist = ' '.join(pronunciations)
-    #np.unique(phonelist.split(' '))
-    #phonelist.find(':')
+    # check if all phones are in the phonelist of the acoustic model.
+    #phonelist = ' '.join(pronunciations)
+    #np.unique(phonelist.split(' '))
+    #phonelist.find(':')
 
-    filenames = np.array(filenames)
-    words = np.array(words)
-    pronunciations = np.array(pronunciations)
+    filenames = np.array(filenames)
+    words = np.array(words)
+    pronunciations = np.array(pronunciations)
 
-    del line, lines
-    del pron_xsampa, pron_ipa, pron_famehtk
+    del line, lines
+    del pron_xsampa, pron_ipa, pron_famehtk
 
-    np.save(data_dir + '\\filenames.npy', filenames)
-    np.save(data_dir + '\\words.npy', words)
-    np.save(data_dir + '\\pronunciations.npy', pronunciations)
+    np.save(data_dir + '\\filenames.npy', filenames)
+    np.save(data_dir + '\\words.npy', words)
+    np.save(data_dir + '\\pronunciations.npy', pronunciations)
 else:
-    filenames = np.load(data_dir + '\\filenames.npy')
-    words = np.load(data_dir + '\\words.npy')
-
-    pronunciations = np.load(data_dir + '\\pronunciations.npy')
+    filenames = np.load(data_dir + '\\filenames.npy')
+    words = np.load(data_dir + '\\words.npy')
+    pronunciations = np.load(data_dir + '\\pronunciations.npy')
 
 word_list = np.unique(words)
 
 
 ## ======================= make dict files used for HTK. ======================
 if make_dic_files:
-    output_dir = experiments_dir + r'\stimmen\dic'
+    output_type = 2
+    output_dir = experiments_dir + r'\stimmen\dic_short'
 
-    for word in word_list:
-        WORD = word.upper()
-        fileDic = output_dir + '\\' + word + '.dic'
+    for word in word_list:
+        WORD = word.upper()
+        fileDic = output_dir + '\\' + word + '.dic'
 
-        # make dic file.
-        pronvar_ = pronunciations[words == word]
-        pronvar = np.unique(pronvar_)
+        # pronunciation variants of the target word.
+        pronvar_ = pronunciations[words == word]
+        # remove ''.
+        pronvar_ = np.delete(pronvar_, np.where(pronvar_ == ''))
 
-        with open(fileDic, 'w') as f:
-            for pvar in pronvar:
-                f.write('{0}\t{1}\n'.format(WORD, pvar))
+        # make the dic file.
+        make_dic(word, pronvar_, fileDic, output_type)
 
 
+## ======================= forced alignment using HTK =======================
+if do_forced_alignment_htk:
+    configHVite = cygwin_dir + r'\config\config.HVite'
+    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
+    wav_dir = experiments_dir + r'\stimmen\wav'
+
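+    # forced alignment here doubles as pronunciation selection: each .dic file
+    # lists the candidate variants of one word, and HVite reports which variant
+    # aligned best, so the prediction can be compared with the transcription.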
+    #hmm_num = 128
+    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
+        hmm_num_str = str(hmm_num)
+        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-2\hmmdefs'
+
+        predictions = []
+        file_num_max = len(filenames)
+        for i in range(0, file_num_max):
+        #for i in range(500, 502):
+            print('=== {0}/{1} ==='.format(i, file_num_max))
+            filename = filenames[i]
+            fileWav = wav_dir + '\\' + filename
+
+            if os.path.exists(fileWav):
+                word = words[i]
+                WORD = word.upper()
+
+                # make the label file.
+                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
+                with open(fileLab, 'w') as f:
+                    f.write(WORD)
+
+                fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
+                fileFA = experiments_dir + r'\stimmen\FA' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
+
+                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
+                prediction = read_fileFA(fileFA)
+                predictions.append(prediction)
+
+                os.remove(fileLab)
+                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
+            else:
+                predictions.append('')
+                print('!!!!! file not found.')
+
+        predictions = np.array(predictions)
+        match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
+        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
 
 
-## ======================= make dict files for most popular words. ======================
-if make_dic_files_short:
-    output_dir = experiments_dir + r'\stimmen\dic'
+## ======================= make the files used for forced alignment by Kaldi =======================
+if make_kaldi_data_files:
+    wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
+    kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
+    kaldi_data_dir = os.path.join(kaldi_work_dir, 'data', 'alignme')
+    kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
+    htk_dict_dir = os.path.join(experiments_dir, 'stimmen', 'dic_top3')
 
-    #word = word_list[3]
-    for word in word_list:
-        WORD = word.upper()
-        fileStat = output_dir + '\\' + word + '_stat.csv'
-
-        pronvar = pronunciations[words == word]
-        c = Counter(pronvar)
-        total_num = sum(c.values())
+    wav_scp = os.path.join(kaldi_data_dir, 'wav.scp')
+    text_file = os.path.join(kaldi_data_dir, 'text')
+    utt2spk = os.path.join(kaldi_data_dir, 'utt2spk')
 
-        with open(fileStat, 'w') as f:
-            for key, value in c.items():
-                f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key))
+    lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
+
+    predictions = []
+    file_num_max = len(filenames)
+
+    # remove the previous files.
+    if os.path.exists(wav_scp):
+        os.remove(wav_scp)
+    if os.path.exists(text_file):
+        os.remove(text_file)
+    if os.path.exists(utt2spk):
+        os.remove(utt2spk)
+
+    f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
+    f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
+    f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
+
+    # make the wav.scp, text, and utt2spk files.
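+    # expected formats, one line per utterance (Kaldi's standard data-dir layout):
+    #   wav.scp : "<utterance-id> <path-to-wav>"
+    #   text    : "<utterance-id> <transcript>"
+    #   utt2spk : "<utterance-id> <speaker-id>"
+    # here every utterance is given its own pseudo-speaker id.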
+    for i in range(0, file_num_max):
+    #for i in range(400, 410):
+        print('=== {0}/{1} ==='.format(i+1, file_num_max))
+        filename = filenames[i]
+        wav_file = wav_dir + '\\' + filename
+
+        if os.path.exists(wav_file):
+            speaker_id = 'speaker_' + str(i).zfill(4)
+            utterance_id = filename.replace('.wav', '')
+            utterance_id = utterance_id.replace(' ', '_')
+            utterance_id = speaker_id + '-' + utterance_id
+
+            # wav.scp file.
+            wav_file_unix = wav_file.replace('\\', '/')
+            wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/')
+
+            f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix))
+
+            # text file.
+            word = words[i].lower()
+            f_text_file.write('{0}\t{1}\n'.format(utterance_id, word))
+
+            # utt2spk file.
+            f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
+
+    f_wav_scp.close()
+    f_text_file.close()
+    f_utt2spk.close()
 
 
-## ======================= forced alignment =======================
-if do_forced_alignment:
-    configHVite = cygwin_dir + r'\config\config.HVite'
-    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
-    wav_dir = experiments_dir + r'\stimmen\wav'
+## ======================= make the lexicon txt used by Kaldi =======================
+if make_kaldi_lexicon_txt:
+    kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
+    kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
+    lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
+    option_num = 5
 
-    #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]:
-    for hmm_num in [64]:
-        hmm_num_str = str(hmm_num)
-        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs'
+    # remove the previous file.
+    if os.path.exists(lexicon_txt):
+        os.remove(lexicon_txt)
 
-        predictions = []
-        file_num_max = len(filenames)
-        for i in range(0, file_num_max):
-            print('=== {0}/{1} ==='.format(i, file_num_max))
-            filename = filenames[i]
-            fileWav = wav_dir + '\\' + filename
-
-            if os.path.exists(fileWav):
-                word = words[i]
-                WORD = word.upper()
+    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
+    with open(csvfile, encoding="utf-8") as fin:
+        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
+        next(lines, None)  # skip the headers
 
-                # make label file.
-                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
-                with open(fileLab, 'w') as f:
-                    lines = f.write(WORD)
+        filenames = []
+        words = []
+        pronunciations = []
+        for line in lines:
+            if len(line) > 5 and line[1] != '':
+                filenames.append(line[0])
+                words.append(line[1])
+                pron_xsampa = line[3]
+                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
+                pron_ipa = pron_ipa.replace('ː', ':')
+
+                # adjust to the phones used in the acoustic model.
+                pronunciations.append(pron_ipa)
 
-                fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
-                fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
+    # check if all phones are in the phonelist of the acoustic model.
+    #'y', 'b', 'ɾ', 'u', 'ɔ:', 'ø', 't', 'œ', 'n', 'ɒ', 'ɐ', 'f', 'o', 'k', 'x', 'ɡ', 'v', 's', 'ɛ:', 'ɪ:', 'ɑ', 'ɛ', 'a', 'd', 'z', 'ɪ', 'ɔ', 'l', 'i:', 'm', 'p', 'a:', 'i', 'e', 'j', 'o:', 'ʁ', 'h', ':', 'e:', 'ə', 'æ', 'χ', 'w', 'r', 'ə:', 'sp', 'ʊ', 'u:', 'ŋ'
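+    # lexicon.txt format: "<word>\t<phone> <phone> ...", one pronunciation per
+    # line; a Kaldi dict also needs an OOV entry such as "<unk>\tSPN".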
-                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
-                prediction = read_fileFA(fileFA)
-                predictions.append(prediction)
+    filenames = np.array(filenames)
+    words = np.array(words)
+    word_list = np.unique(words)
+    pronunciations = np.array(pronunciations)
+
+    # output lexicon.txt.
+    #f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
+    pronvar_list_all = []
+    for word in word_list:
 
-                os.remove(fileLab)
-                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
-            else:
-                predictions.append('')
-                print('!!!!! file not found.')
+        # pronunciation variants of the target word.
+        pronvar_ = pronunciations[words == word]
+        # remove ''.
+        pronvar_ = np.delete(pronvar_, np.where(pronvar_ == ''))
 
-        predictions = np.array(predictions)
-        match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
-        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
+        c = Counter(pronvar_)
+        total_num = sum(c.values())
+
+        for key, value in c.most_common(option_num):
+            #print('{0}\t{1}\t{2}\t{3}'.format(word, key, value, total_num))
+            key = key.replace('æ', 'ɛ')
+            key = key.replace('ɐ', 'a')
+            key = key.replace('ɑ', 'a')
+            key = key.replace('ɾ', 'r')
+            key = key.replace('ʁ', 'r')
+            key = key.replace('ʊ', 'u')
+            key = key.replace('χ', 'x')
+            #print('-->{0}\t{1}\t{2}\t{3}\n'.format(word, key, value, total_num))
+
+            # make the list of possible pronunciation variants.
+            pronvar_list = [key]
+            while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
+                pronvar_list_ = []
+                for p in pronvar_list:
+                    if not ('ø:' in p or 'œ' in p or 'ɒ' in p):
+                        # keep variants that need no further replacement.
+                        pronvar_list_.append(p)
+                    if 'ø:' in p:
+                        pronvar_list_.append(p.replace('ø:', 'ö'))
+                        pronvar_list_.append(p.replace('ø:', 'ö:'))
+                    if 'œ' in p:
+                        pronvar_list_.append(p.replace('œ', 'ɔ̈'))
+                        pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
+                    if 'ɒ' in p:
+                        pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
+                        pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
+                pronvar_list = np.unique(pronvar_list_)
+
+            for pronvar_ in pronvar_list:
+                split_ipa = convert_phone_set.split_ipa_fame(pronvar_)
+                pronvar_out = ' '.join(split_ipa)
+                pronvar_list_all.append([word, pronvar_out])
+
+    # output.
+    pronvar_list_all = np.array(pronvar_list_all)
+    pronvar_list_all = np.unique(pronvar_list_all, axis=0)
+    #f_lexicon_txt.write('<unk>\tSPN\n')
+    #for line in pronvar_list_all:
+    #    f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
+
+    #f_lexicon_txt.close()
+
+
+## ======================= load the Kaldi forced alignment result =======================
+if load_forced_alignment_kaldi:
+    kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
+    phones_txt = kaldi_work_dir + '\\data\\lang\\phones.txt'
+    merged_alignment_txt = kaldi_work_dir + '\\exp\\tri1_alignme\\merged_alignment.txt'
+
+    filenames = np.load(data_dir + '\\filenames.npy')
+    words = np.load(data_dir + '\\words.npy')
+    pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
+    pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
+    word_list = np.unique(words)
+
+    # load the mapping between phones and ids.
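+    # phones.txt maps every phone symbol to an integer id, one pair per line,
+    # e.g. "<eps> 0" or "a_B 5" (ids here are illustrative); the _B/_E/_I/_S
+    # suffixes mark the phone position within a word and are stripped below.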
+    with open(phones_txt, 'r', encoding="utf-8") as f:
+        mappings = f.read().split('\n')
+
+    phones = []
+    phone_ids = []
+    for m in mappings:
+        m = m.split(' ')
+        if len(m) > 1:
+            phones.append(m[0])
+            phone_ids.append(int(m[1]))
+
+    with open(merged_alignment_txt, 'r') as f:
+        lines = f.read()
+    lines = lines.split('\n')
+
+    # each line of merged_alignment.txt is assumed to be CTM-style:
+    # "<utterance-id> <channel> <start> <duration> <phone-id>".
+    fa_filenames = []
+    fa_pronunciations = []
+    filename_ = ''
+    pron = []
+    for line in lines:
+        line = line.split(' ')
+        if len(line) == 5:
+            filename = line[0]
+            if filename != filename_ and filename_ != '':
+                # a new utterance starts: flush the previous one.
+                fa_filenames.append(re.sub(r'speaker_[0-9]{4}-', '', filename_))
+                fa_pronunciations.append(' '.join(pron))
+                pron = []
+
+            phone_id = int(line[4])
+            phone = phones[phone_ids.index(phone_id)]
+            pron_ = re.sub(r'_[A-Z]$', '', phone)
+            if not pron_ == 'SIL':
+                pron.append(pron_)
+
+            filename_ = filename
+
+    # flush the last utterance.
+    if filename_ != '':
+        fa_filenames.append(re.sub(r'speaker_[0-9]{4}-', '', filename_))
+        fa_pronunciations.append(' '.join(pron))
+
+    # correct or not.
+    for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):
+        pass  # the comparison with the transcribed variants is not implemented yet.
+
 
 ## ======================= evaluate the result of forced alignment =======================
 if eval_forced_alignment:
-    #for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
-    hmm_num = 64
-    hmm_num_str = str(hmm_num)
-    match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
-
-    # use dic_short?
-    if 1:
-        pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
-        for word in word_list:
-            fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
-            pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
+    match_num = []
+    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
+        #hmm_num = 256
+        hmm_num_str = str(hmm_num)
+        match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
+
+        # use dic_top3?
+        if 1:
+            pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
+            for word in word_list:
+                fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
+                pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
 
-        match_short = []
-        for line in match:
-            word = line[0]
-            WORD = word.upper()
-            pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
-
-            if line[1] in pronvar:
-                match_short.append(line)
+            # keep only the words whose transcription appears in the top 3.
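+            # `match` rows are [word, transcription, prediction]; an utterance
+            # counts as correct when the last two columns are identical.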
+            match_short = []
+            for line in match:
+                word = line[0]
+                WORD = word.upper()
+                pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
+
+                if line[1] in pronvar:
+                    match_short.append(line)
 
-        match_short = np.array(match_short)
-        match = np.copy(match_short)
+            match_short = np.array(match_short)
+            match = np.copy(match_short)
 
-    # number of match
-    total_match = sum(match[:, 1] == match[:, 2])
-    print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
+        # number of matches.
+        total_match = sum(match[:, 1] == match[:, 2])
+        print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
+        match_num.append([hmm_num, total_match, match.shape[0]])
+
+    # number of mixtures vs accuracy.
+    match_num = np.array(match_num)
+    plt.xscale("log")
+    # each model is normalized by its own number of utterances.
+    plt.plot(match_num[:, 0], match_num[:, 1]/match_num[:, 2], 'o-')
+    plt.xlabel('number of mixtures', fontsize=14, fontweight='bold')
+    plt.ylabel('accuracy', fontsize=14, fontweight='bold')
+    plt.show()
+
+    # confusion matrix.
+    #dir_out = r'C:\OneDrive\Research\rug\experiments\stimmen\result'
+    #word_list = np.unique(match[:, 0])
+
+    #for word in word_list:
+    #    match_ = match[match[:, 0] == word, :]
+    #    cm = confusion_matrix(match_[:, 1], match_[:, 2])
+    #    pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
+
+    #    plt.figure()
+    #    plot_confusion_matrix(cm, classes=pronvar, normalize=True)
+    #    plt.savefig(dir_out + '\\cm_' + word + '.png')
\ No newline at end of file
diff --git a/acoustic_model/pyKaldi.py b/acoustic_model/pyKaldi.py
new file mode 100644
index 0000000..c65a99b
--- /dev/null
+++ b/acoustic_model/pyKaldi.py
@@ -0,0 +1,26 @@
+import os
+import sys
+
+forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
+
+## ======================= add paths =======================
+sys.path.append(forced_alignment_module)
+from forced_alignment import convert_phone_set
+
+
+htk_dict_file = r'C:\OneDrive\Research\rug\experiments\stimmen\dic_top3\Reus.dic'
+#kaldi_lexicon = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\data\lang\phones'
+alignment_txt = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\exp\tri1_alignme\merged_alignment.txt'
+phones_txt = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\exp\tri1_alignme\phones.txt'
+phone_map_txt = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\data\local\lang\phone_map.txt'
+
+with open(phone_map_txt, 'r', encoding="utf-8") as f:
+    lines = f.read()
+lines = lines.split('\n')
+
+with open(alignment_txt, 'r', encoding="utf-8") as f:
+    lines = f.read().split('\n')
+
+#phone_in = [line for line in lines if 'SIL' in line]
+#if len(phone_in) == 1:
\ No newline at end of file