diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index 559d56b..a3fe250 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model/acoustic_model.py b/acoustic_model/acoustic_model.py index d3d9eb0..1ef57e7 100644 --- a/acoustic_model/acoustic_model.py +++ b/acoustic_model/acoustic_model.py @@ -22,11 +22,11 @@ dataset_list = ['devel', 'test', 'train'] extract_features = 0 make_feature_list = 0 conv_lexicon = 0 -check_lexicon = 0 +check_lexicon = 1 make_mlf = 0 combine_files = 0 flat_start = 0 -train_model = 1 +train_model = 0 forced_alignment = 0 @@ -133,7 +133,11 @@ if check_lexicon: print("==== check if all the phones are successfully converted. ====\n") # the phones used in the lexicon. - phonelist = am_func.get_phonelist(lex_htk) + phonelist_asr = am_func.get_phonelist(lex_asr) + phonelist_oov = am_func.get_phonelist(lex_oov) + phonelist_htk = am_func.get_phonelist(lex_htk) + + phonelist = phonelist_asr.union(phonelist_oov) # the lines which include a specific phone. lines = am_func.find_phone(lex_asr, 'g') diff --git a/acoustic_model/performance_check.py b/acoustic_model/performance_check.py index ec30efb..411d93e 100644 --- a/acoustic_model/performance_check.py +++ b/acoustic_model/performance_check.py @@ -3,19 +3,54 @@ import sys import csv import subprocess import configparser +from collections import Counter import numpy as np import pandas as pd +import matplotlib.pyplot as plt -## ======================= user define ======================= +## ======================= functions ======================= + +def read_fileFA(fileFA): + """ + read the result file of HTK forced alignment. + this function only works when input is one word. + """ + with open(fileFA, 'r') as f: + lines = f.read() + lines = lines.split('\n') + + phones = [] + for line in lines: + line_split = line.split() + if len(line_split) > 1: + phones.append(line_split[2]) + + return ' '.join(phones) + + +##################### +## USER DEFINE ## +##################### curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model' config_ini = curr_dir + '\\config.ini' -forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment' +forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment' +forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment' ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter' + csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv" +experiments_dir = r'C:\OneDrive\Research\rug\experiments' +data_dir = experiments_dir + '\\stimmen\\data' +cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model' # procedure +convert_phones = 0 +make_dic_files = 0 +make_dic_files_short = 0 +do_forced_alignment = 0 +eval_forced_alignment = 1 + ## ======================= add paths ======================= @@ -28,6 +63,10 @@ sys.path.append(curr_dir) import convert_xsampa2ipa import acoustic_model_functions as am_func +# for forced-alignment +sys.path.append(forced_alignment_module_old) +import pyHTK + ## ======================= load variables ======================= config = configparser.ConfigParser() @@ -40,85 +79,179 @@ lex_asr = FAME_dir + '\\lexicon\\lex.asr' lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk' -## ======================= check phones included in FAME! ======================= -# the phones used in the lexicon. -#phonelist = am_func.get_phonelist(lex_htk) - -# the lines which include a specific phone. 
-#lines = am_func.find_phone(lex_asr, 'x') - - ## ======================= convert phones ====================== -mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir) +if convert_phones: + mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir) -with open(csvfile, encoding="utf-8") as fin: - lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True) - next(lines, None) # skip the headers + ## check phones included in FAME! + # the phones used in the lexicon. + #phonelist = am_func.get_phonelist(lex_htk) - filenames = [] - words = [] - pronunciations = [] - for line in lines: - if line[1] is not '' and len(line) > 5: - filenames.append(line[0]) - words.append(line[1]) - pron_xsampa = line[3] - pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa) - pron_ipa = pron_ipa.replace('ː', ':') - pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa) + # the lines which include a specific phone. + #lines = am_func.find_phone(lex_asr, 'x') + + with open(csvfile, encoding="utf-8") as fin: + lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True) + next(lines, None) # skip the headers + + filenames = [] + words = [] + pronunciations = [] + for line in lines: + if line[1] is not '' and len(line) > 5: + filenames.append(line[0]) + words.append(line[1]) + pron_xsampa = line[3] + pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa) + pron_ipa = pron_ipa.replace('ː', ':') + pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa) - # adjust to phones used in the acoustic model. - pron_famehtk = pron_famehtk.replace('sp', 'sil') - pron_famehtk = pron_famehtk.replace('ce :', 'ce') # because ceh is ignored. - pron_famehtk = pron_famehtk.replace('w :', 'wh') - pron_famehtk = pron_famehtk.replace('e :', 'eh') - pron_famehtk = pron_famehtk.replace('eh :', 'eh') - pron_famehtk = pron_famehtk.replace('ih :', 'ih') + # adjust to phones used in the acoustic model. + pron_famehtk = pron_famehtk.replace('sp', 'sil') + pron_famehtk = pron_famehtk.replace('ce :', 'ce') # because ceh is ignored. + pron_famehtk = pron_famehtk.replace('w :', 'wh') + pron_famehtk = pron_famehtk.replace('e :', 'eh') + pron_famehtk = pron_famehtk.replace('eh :', 'eh') + pron_famehtk = pron_famehtk.replace('ih :', 'ih') - #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'} - #pron = [] - #for phoneme in pron_famehtk.split(' '): - # pron.append(translation_key.get(phoneme, phoneme)) - #pronunciations.append(' '.join(pron_famehtk)) - pronunciations.append(pron_famehtk) + #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'} + #pron = [] + #for phoneme in pron_famehtk.split(' '): + # pron.append(translation_key.get(phoneme, phoneme)) + #pronunciations.append(' '.join(pron_famehtk)) + pronunciations.append(pron_famehtk) -filenames = np.array(filenames) -words = np.array(words) -pronunciations = np.array(pronunciations) + # check if all phones are in the phonelist of the acoustic model. + #phonelist = ' '.join(pronunciations) + #np.unique(phonelist.split(' ')) + #phonelist.find(':') -del line, lines -del pron_xsampa, pron_ipa, pron_famehtk + filenames = np.array(filenames) + words = np.array(words) + pronunciations = np.array(pronunciations) -# check if all phones are in the phonelist of the acoustic model. 
-#phonelist = ' '.join(pronunciations) -#np.unique(phonelist.split(' ')) -#phonelist.find(':') + del line, lines + del pron_xsampa, pron_ipa, pron_famehtk -# make dict files. + np.save(data_dir + '\\filenames.npy', filenames) + np.save(data_dir + '\\words.npy', words) + np.save(data_dir + '\\pronunciations.npy', pronunciations) +else: + filenames = np.load(data_dir + '\\filenames.npy') + words = np.load(data_dir + '\\words.npy') + + pronunciations = np.load(data_dir + '\\pronunciations.npy') word_list = np.unique(words) -word_id = 1 -word = word_list[word_id] + + +## ======================= make dict files used for HTK. ====================== +if make_dic_files: + output_dir = experiments_dir + r'\stimmen\dic' + + for word in word_list: + WORD = word.upper() + fileDic = output_dir + '\\' + word + '.dic' + + # make dic file. + pronvar_ = pronunciations[words == word] + pronvar = np.unique(pronvar_) + + with open(fileDic, 'w') as f: + for pvar in pronvar: + f.write('{0}\t{1}\n'.format(WORD, pvar)) + + +## ======================= make dict files for most popular words. ====================== +if make_dic_files_short: + output_dir = experiments_dir + r'\stimmen\dic' + + #word = word_list[3] + for word in word_list: + WORD = word.upper() + fileStat = output_dir + '\\' + word + '_stat.csv' + + pronvar = pronunciations[words == word] + c = Counter(pronvar) + total_num = sum(c.values()) + + with open(fileStat, 'w') as f: + for key, value in c.items(): + f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key)) ## ======================= forced alignment ======================= -#if forced_alignment: -# try: -# scripts.run_command([ -# 'HVite','-T', '1', '-a', '-C', configHVite, -# '-H', AcousticModel, '-m', '-I', -# mlf_file, '-i', fa_file, '-S', -# script_file, htk_dict_file, filePhoneList -# ]) -# except: -# print("\033[91mHVite command failed with these input files:\033[0m") -# print(_debug_show_file('HVite config', configHVite)) -# print(_debug_show_file('Accoustic model', AcousticModel)) -# print(_debug_show_file('Master Label file', mlf_file)) -# print(_debug_show_file('Output', fa_file)) -# print(_debug_show_file('Script file', script_file)) -# print(_debug_show_file('HTK dictionary', htk_dict_file)) -# print(_debug_show_file('Phoneme list', filePhoneList)) -# raise +if do_forced_alignment: + configHVite = cygwin_dir + r'\config\config.HVite' + filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt' + wav_dir = experiments_dir + r'\stimmen\wav' + + #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]: + for hmm_num in [64]: + hmm_num_str = str(hmm_num) + AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs' + + predictions = [] + file_num_max = len(filenames) + for i in range(0, file_num_max): + print('=== {0}/{1} ==='.format(i, file_num_max)) + filename = filenames[i] + fileWav = wav_dir + '\\' + filename + + if os.path.exists(fileWav): + word = words[i] + WORD = word.upper() + + # make label file. 
+ fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab') + with open(fileLab, 'w') as f: + lines = f.write(WORD) + + fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic' + fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str + + pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel) + prediction = read_fileFA(fileFA) + predictions.append(prediction) + + os.remove(fileLab) + print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction)) + else: + predictions.append('') + print('!!!!! file not found.') + + predictions = np.array(predictions) + match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']] + np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match) -##os.remove(hcopy_scp.name) +## ======================= evaluate the result of forced alignment ======================= +if eval_forced_alignment: + + #for hmm_num in [1, 2, 4, 8, 16, 32, 64]: + hmm_num = 64 + hmm_num_str = str(hmm_num) + match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy') + + # use dic_short? + if 1: + pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2) + for word in word_list: + fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic' + pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)] + + match_short = [] + for line in match: + word = line[0] + WORD = word.upper() + pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1] + + if line[1] in pronvar: + match_short.append(line) + + match_short = np.array(match_short) + match = np.copy(match_short) + + # number of match + total_match = sum(match[:, 1] == match[:, 2]) + print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0])) +
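
A note on the check_lexicon change in acoustic_model.py: the block now builds phone sets from lex.asr and lex.oov and unions them, alongside the set from lex.asr_htk, but the hunk stops at the union. The comparison below is an assumption about the intended next step, not code from the repository; it uses toy data in place of the sets returned by am_func.get_phonelist().

# Toy example, not repository data: stand-ins for the phone sets built in the
# check_lexicon hunk (phonelist_asr, phonelist_oov, phonelist_htk).
phonelist_asr = {'a', 'e', 'g', 'x'}
phonelist_oov = {'a', 'oh'}
phonelist_htk = {'a', 'e', 'g'}

# Which phones used in the ASR/OOV lexicons have no counterpart in the HTK lexicon.
phones_used = phonelist_asr.union(phonelist_oov)
phones_missing = sorted(phones_used - phonelist_htk)
print('phones not covered by the HTK lexicon:', phones_missing)   # ['oh', 'x']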
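
The new read_fileFA() helper in performance_check.py assumes HVite writes one aligned phone per line, with the phone label in the third whitespace-separated field; the exact output layout depends on the HVite options used and is an assumption here. A minimal, self-contained sketch of that parsing on made-up alignment text:

# Made-up HVite-style output, one "<start> <end> <phone> <score>" record per line.
example_fa = (
    '0 300000 r -250.1\n'
    '300000 600000 oh -310.7\n'
    '600000 900000 t -280.4\n'
)

phones = []
for line in example_fa.split('\n'):
    fields = line.split()
    if len(fields) > 1:
        phones.append(fields[2])   # third field: the aligned phone label

print(' '.join(phones))            # -> r oh t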
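
pyHTK.doHVite() takes over from the commented-out HVite call that this diff deletes; whether it wraps exactly that command is an assumption. For reference, a subprocess sketch of the deleted invocation, with every path treated as a placeholder and HVite assumed to be on PATH:

import subprocess

def run_hvite(configHVite, AcousticModel, mlf_file, fa_file, script_file,
              htk_dict_file, filePhoneList):
    # Same argument order as the removed commented-out block: forced alignment
    # (-a) with model-level output (-m), reading the word-level MLF (-I) and
    # writing the aligned transcription (-i) for the files listed in -S.
    subprocess.check_call([
        'HVite', '-T', '1', '-a', '-C', configHVite,
        '-H', AcousticModel, '-m', '-I', mlf_file,
        '-i', fa_file, '-S', script_file,
        htk_dict_file, filePhoneList,
    ])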