From d56ef7f0759e5f1c143d98dbd5329926503c2574 Mon Sep 17 00:00:00 2001
From: yemaozi88 <428968@gmail.com>
Date: Thu, 21 Jun 2018 16:27:00 +0200
Subject: [PATCH] FA result evaluation and xsampa to ipa conversion are updated.

---
 .vs/acoustic_model/v15/.suo         | Bin 49152 -> 53248 bytes
 acoustic_model/acoustic_model.py    |  10 +-
 acoustic_model/performance_check.py | 271 +++++++++++++++++++++-------
 3 files changed, 209 insertions(+), 72 deletions(-)

diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 559d56bf81cb754771a6d69d72b94f2e0f5d5e2f..a3fe250a36274108985a39fafa5a92ccaae5cb9e 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/acoustic_model.py b/acoustic_model/acoustic_model.py
index d3d9eb0..1ef57e7 100644
--- a/acoustic_model/acoustic_model.py
+++ b/acoustic_model/acoustic_model.py
@@ -22,11 +22,11 @@ dataset_list = ['devel', 'test', 'train']
 extract_features = 0
 make_feature_list = 0
 conv_lexicon = 0
-check_lexicon = 0
+check_lexicon = 1
 make_mlf = 0
 combine_files = 0
 flat_start = 0
-train_model = 1
+train_model = 0
 
 forced_alignment = 0
 
@@ -133,7 +133,11 @@ if check_lexicon:
     print("==== check if all the phones are successfully converted. ====\n")
 
     # the phones used in the lexicon.
-    phonelist = am_func.get_phonelist(lex_htk)
+    phonelist_asr = am_func.get_phonelist(lex_asr)
+    phonelist_oov = am_func.get_phonelist(lex_oov)
+    phonelist_htk = am_func.get_phonelist(lex_htk)
+
+    phonelist = phonelist_asr.union(phonelist_oov)
 
     # the lines which include a specific phone.
     lines = am_func.find_phone(lex_asr, 'g')
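
Note: the check_lexicon block above now builds the phone inventory from lex.asr and lex.oov instead of lex.htk only. A minimal sketch of how such inventories can be compared, assuming am_func.get_phonelist() returns a set of phone symbols; the set difference at the end is illustrative and not part of this patch:

    # illustrative sketch only -- reuses the variables of acoustic_model.py;
    # assumes get_phonelist() returns a set of phone symbols per lexicon.
    phonelist_asr = am_func.get_phonelist(lex_asr)
    phonelist_oov = am_func.get_phonelist(lex_oov)
    phonelist = phonelist_asr.union(phonelist_oov)

    # phones that occur in the combined lexicon but not in the HTK lexicon
    # point to symbols that were not converted successfully.
    phonelist_htk = am_func.get_phonelist(lex_htk)
    print(sorted(phonelist - phonelist_htk))
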
diff --git a/acoustic_model/performance_check.py b/acoustic_model/performance_check.py
index ec30efb..411d93e 100644
--- a/acoustic_model/performance_check.py
+++ b/acoustic_model/performance_check.py
@@ -3,19 +3,54 @@ import sys
 import csv
 import subprocess
 import configparser
+from collections import Counter
 
 import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
 
-## ======================= user define =======================
+## ======================= functions =======================
+
+def read_fileFA(fileFA):
+    """
+    read the result file of HTK forced alignment.
+    this function only works when input is one word.
+    """
+    with open(fileFA, 'r') as f:
+        lines = f.read()
+    lines = lines.split('\n')
+
+    phones = []
+    for line in lines:
+        line_split = line.split()
+        if len(line_split) > 1:
+            phones.append(line_split[2])
+
+    return ' '.join(phones)
+
+
+#####################
+## USER DEFINE ##
+#####################
 curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
 config_ini = curr_dir + '\\config.ini'
-forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
+forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
+forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
 ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
+
 csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"
+experiments_dir = r'C:\OneDrive\Research\rug\experiments'
+data_dir = experiments_dir + '\\stimmen\\data'
+cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
 
 # procedure
+convert_phones = 0
+make_dic_files = 0
+make_dic_files_short = 0
+do_forced_alignment = 0
+eval_forced_alignment = 1
+
 
 ## ======================= add paths =======================
@@ -28,6 +63,10 @@ sys.path.append(curr_dir)
 import convert_xsampa2ipa
 import acoustic_model_functions as am_func
 
+# for forced-alignment
+sys.path.append(forced_alignment_module_old)
+import pyHTK
+
 
 ## ======================= load variables =======================
 config = configparser.ConfigParser()
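
Note: read_fileFA() above keeps the third whitespace-separated field of every line in the HVite output, i.e. the phone label of each aligned segment. A minimal usage sketch under the assumption that the output follows the usual HTK label format (start time, end time, label, log likelihood); the sample alignment below is invented for illustration:

    # illustrative sketch only -- the times and scores are made up.
    sample_rec = ('0 2400000 sil -512.3\n'
                  '2400000 4800000 p -301.7\n'
                  '4800000 7200000 a -295.2\n'
                  '7200000 9600000 sil -488.9\n')

    with open('sample.rec', 'w') as f:
        f.write(sample_rec)

    print(read_fileFA('sample.rec'))   # -> 'sil p a sil'
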
@@ -40,85 +79,179 @@ lex_asr = FAME_dir + '\\lexicon\\lex.asr'
 lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
 
 
-## ======================= check phones included in FAME! =======================
-# the phones used in the lexicon.
-#phonelist = am_func.get_phonelist(lex_htk)
-
-# the lines which include a specific phone.
-#lines = am_func.find_phone(lex_asr, 'x')
-
-
 ## ======================= convert phones ======================
-mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
+if convert_phones:
+    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
 
-with open(csvfile, encoding="utf-8") as fin:
-    lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
-    next(lines, None)  # skip the headers
+    ## check phones included in FAME!
+    # the phones used in the lexicon.
+    #phonelist = am_func.get_phonelist(lex_htk)
 
-    filenames = []
-    words = []
-    pronunciations = []
-    for line in lines:
-        if line[1] is not '' and len(line) > 5:
-            filenames.append(line[0])
-            words.append(line[1])
-            pron_xsampa = line[3]
-            pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
-            pron_ipa = pron_ipa.replace('ː', ':')
-            pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)
+    # the lines which include a specific phone.
+    #lines = am_func.find_phone(lex_asr, 'x')
+
+    with open(csvfile, encoding="utf-8") as fin:
+        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
+        next(lines, None)  # skip the headers
+
+        filenames = []
+        words = []
+        pronunciations = []
+        for line in lines:
+            if line[1] is not '' and len(line) > 5:
+                filenames.append(line[0])
+                words.append(line[1])
+                pron_xsampa = line[3]
+                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
+                pron_ipa = pron_ipa.replace('ː', ':')
+                pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)
 
-            # adjust to phones used in the acoustic model.
-            pron_famehtk = pron_famehtk.replace('sp', 'sil')
-            pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
-            pron_famehtk = pron_famehtk.replace('w :', 'wh')
-            pron_famehtk = pron_famehtk.replace('e :', 'eh')
-            pron_famehtk = pron_famehtk.replace('eh :', 'eh')
-            pron_famehtk = pron_famehtk.replace('ih :', 'ih')
+                # adjust to phones used in the acoustic model.
+                pron_famehtk = pron_famehtk.replace('sp', 'sil')
+                pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
+                pron_famehtk = pron_famehtk.replace('w :', 'wh')
+                pron_famehtk = pron_famehtk.replace('e :', 'eh')
+                pron_famehtk = pron_famehtk.replace('eh :', 'eh')
+                pron_famehtk = pron_famehtk.replace('ih :', 'ih')
 
-            #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
-            #pron = []
-            #for phoneme in pron_famehtk.split(' '):
-            #    pron.append(translation_key.get(phoneme, phoneme))
-            #pronunciations.append(' '.join(pron_famehtk))
-            pronunciations.append(pron_famehtk)
+                #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
+                #pron = []
+                #for phoneme in pron_famehtk.split(' '):
+                #    pron.append(translation_key.get(phoneme, phoneme))
+                #pronunciations.append(' '.join(pron_famehtk))
+                pronunciations.append(pron_famehtk)
 
-filenames = np.array(filenames)
-words = np.array(words)
-pronunciations = np.array(pronunciations)
+    # check if all phones are in the phonelist of the acoustic model.
+    #phonelist = ' '.join(pronunciations)
+    #np.unique(phonelist.split(' '))
+    #phonelist.find(':')
 
-del line, lines
-del pron_xsampa, pron_ipa, pron_famehtk
+    filenames = np.array(filenames)
+    words = np.array(words)
+    pronunciations = np.array(pronunciations)
 
-# check if all phones are in the phonelist of the acoustic model.
-#phonelist = ' '.join(pronunciations)
-#np.unique(phonelist.split(' '))
-#phonelist.find(':')
+    del line, lines
+    del pron_xsampa, pron_ipa, pron_famehtk
 
-# make dict files.
+    np.save(data_dir + '\\filenames.npy', filenames)
+    np.save(data_dir + '\\words.npy', words)
+    np.save(data_dir + '\\pronunciations.npy', pronunciations)
+else:
+    filenames = np.load(data_dir + '\\filenames.npy')
+    words = np.load(data_dir + '\\words.npy')
+
+    pronunciations = np.load(data_dir + '\\pronunciations.npy')
 
 word_list = np.unique(words)
-word_id = 1
-word = word_list[word_id]
+
+
+## ======================= make dict files used for HTK. ======================
+if make_dic_files:
+    output_dir = experiments_dir + r'\stimmen\dic'
+
+    for word in word_list:
+        WORD = word.upper()
+        fileDic = output_dir + '\\' + word + '.dic'
+
+        # make dic file.
+        pronvar_ = pronunciations[words == word]
+        pronvar = np.unique(pronvar_)
+
+        with open(fileDic, 'w') as f:
+            for pvar in pronvar:
+                f.write('{0}\t{1}\n'.format(WORD, pvar))
+
+
+## ======================= make dict files for most popular words. ======================
+if make_dic_files_short:
+    output_dir = experiments_dir + r'\stimmen\dic'
+
+    #word = word_list[3]
+    for word in word_list:
+        WORD = word.upper()
+        fileStat = output_dir + '\\' + word + '_stat.csv'
+
+        pronvar = pronunciations[words == word]
+        c = Counter(pronvar)
+        total_num = sum(c.values())
+
+        with open(fileStat, 'w') as f:
+            for key, value in c.items():
+                f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key))
 
 
 ## ======================= forced alignment =======================
-#if forced_alignment:
-#    try:
-#        scripts.run_command([
-#            'HVite','-T', '1', '-a', '-C', configHVite,
-#            '-H', AcousticModel, '-m', '-I',
-#            mlf_file, '-i', fa_file, '-S',
-#            script_file, htk_dict_file, filePhoneList
-#        ])
-#    except:
-#        print("\033[91mHVite command failed with these input files:\033[0m")
-#        print(_debug_show_file('HVite config', configHVite))
-#        print(_debug_show_file('Accoustic model', AcousticModel))
-#        print(_debug_show_file('Master Label file', mlf_file))
-#        print(_debug_show_file('Output', fa_file))
-#        print(_debug_show_file('Script file', script_file))
-#        print(_debug_show_file('HTK dictionary', htk_dict_file))
-#        print(_debug_show_file('Phoneme list', filePhoneList))
-#        raise
+if do_forced_alignment:
+    configHVite = cygwin_dir + r'\config\config.HVite'
+    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
+    wav_dir = experiments_dir + r'\stimmen\wav'
+
+    #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]:
+    for hmm_num in [64]:
+        hmm_num_str = str(hmm_num)
+        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs'
+
+        predictions = []
+        file_num_max = len(filenames)
+        for i in range(0, file_num_max):
+            print('=== {0}/{1} ==='.format(i, file_num_max))
+            filename = filenames[i]
+            fileWav = wav_dir + '\\' + filename
+
+            if os.path.exists(fileWav):
+                word = words[i]
+                WORD = word.upper()
+
+                # make label file.
+                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
+                with open(fileLab, 'w') as f:
+                    lines = f.write(WORD)
+
+                fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
+                fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
+
+                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
+                prediction = read_fileFA(fileFA)
+                predictions.append(prediction)
+
+                os.remove(fileLab)
+                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
+            else:
+                predictions.append('')
+                print('!!!!! file not found.')
+
+        predictions = np.array(predictions)
+        match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
+        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
 
-##os.remove(hcopy_scp.name)
+
+## ======================= evaluate the result of forced alignment =======================
+if eval_forced_alignment:
+
+    #for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
+    hmm_num = 64
+    hmm_num_str = str(hmm_num)
+    match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
+
+    # use dic_short?
+    if 1:
+        pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
+        for word in word_list:
+            fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
+            pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
+
+        match_short = []
+        for line in match:
+            word = line[0]
+            WORD = word.upper()
+            pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
+
+            if line[1] in pronvar:
+                match_short.append(line)
+
+        match_short = np.array(match_short)
+        match = np.copy(match_short)
+
+    # number of match
+    total_match = sum(match[:, 1] == match[:, 2])
+    print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
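
Note: the evaluation block above scores a single model size (hmm_num = 64). A minimal sketch of sweeping every saved result, assuming the forced-alignment step was also run for the other mixture counts and saved match_hmm<n>.npy files in the same three-column layout (word, transcribed pronunciation, predicted phones); this loop is illustrative and not part of the patch:

    # illustrative sketch only -- reuses data_dir from performance_check.py.
    import os
    import numpy as np

    for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
        match_file = data_dir + '\\match_hmm' + str(hmm_num) + '.npy'
        if not os.path.exists(match_file):
            continue
        match = np.load(match_file)
        total_match = sum(match[:, 1] == match[:, 2])
        print('{0}: {1}/{2}'.format(hmm_num, total_match, match.shape[0]))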