import os
import sys
import csv
import subprocess
import configparser
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## ======================= functions =======================

def read_fileFA(fileFA):
    """
    Read the result file of HTK forced alignment.

    This function only works when the input is one word.

    Args:
        fileFA: path to the text file to read.

    Returns:
        The third whitespace-separated field of every line that has at
        least three fields, joined with single spaces. Lines with fewer
        fields (including blank lines) are skipped.
    """
    phones = []
    with open(fileFA, 'r') as f:
        for line in f:
            line_split = line.split()
            # The phone label is the third field, so at least three fields
            # are required (the former `> 1` check let two-field lines
            # through and raised IndexError).
            if len(line_split) > 2:
                phones.append(line_split[2])
    return ' '.join(phones)


#####################
##  USER DEFINE    ##
#####################
curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
config_ini = curr_dir + '\\config.ini'
forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'

csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"

experiments_dir = r'C:\OneDrive\Research\rug\experiments'
data_dir = experiments_dir + '\\stimmen\\data'
cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

# procedure: set a flag to 1 to run the corresponding section below.
convert_phones = 0
make_dic_files = 0
make_dic_files_short = 0
do_forced_alignment = 0
eval_forced_alignment = 1


## ======================= add paths =======================
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set

# for interactive window
sys.path.append(curr_dir)
import convert_xsampa2ipa
import acoustic_model_functions as am_func

# for forced-alignment
sys.path.append(forced_alignment_module_old)
import pyHTK


## ======================= load variables =======================
config = configparser.ConfigParser()
config.read(config_ini)

FAME_dir = config['Settings']['FAME_dir']

lex_asr = FAME_dir + '\\lexicon\\lex.asr'
lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'


## ======================= convert phones ======================
if convert_phones:
    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)

    ## check phones included in FAME!
    # the phones used in the lexicon.
    #phonelist = am_func.get_phonelist(lex_htk)

    # the lines which include a specific phone.
    #lines = am_func.find_phone(lex_asr, 'x')

    with open(csvfile, encoding="utf-8") as fin:
        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
        next(lines, None)  # skip the headers

        filenames = []
        words = []
        pronunciations = []
        for line in lines:
            # Check the row length first so that short rows are skipped
            # instead of raising IndexError, and compare the word by value
            # (`!=`) rather than by identity (`is not`).
            if len(line) > 5 and line[1] != '':
                filenames.append(line[0])
                words.append(line[1])
                pron_xsampa = line[3]
                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
                pron_ipa = pron_ipa.replace('ː', ':')
                pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)

                # adjust to phones used in the acoustic model.
                # NOTE: these are plain substring replacements applied in
                # this exact order; reordering them changes the result.
                pron_famehtk = pron_famehtk.replace('sp', 'sil')
                pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
                pron_famehtk = pron_famehtk.replace('w :', 'wh')
                pron_famehtk = pron_famehtk.replace('e :', 'eh')
                pron_famehtk = pron_famehtk.replace('eh :', 'eh')
                pron_famehtk = pron_famehtk.replace('ih :', 'ih')

                #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
                #pron = []
                #for phoneme in pron_famehtk.split(' '):
                #    pron.append(translation_key.get(phoneme, phoneme))
                #pronunciations.append(' '.join(pron_famehtk))
                pronunciations.append(pron_famehtk)

    # check if all phones are in the phonelist of the acoustic model.
    #phonelist = ' '.join(pronunciations)
    #np.unique(phonelist.split(' '))
    #phonelist.find(':')

    filenames = np.array(filenames)
    words = np.array(words)
    pronunciations = np.array(pronunciations)

    # drop the loop temporaries from the (interactive) namespace.
    del line, lines
    del pron_xsampa, pron_ipa, pron_famehtk

    np.save(data_dir + '\\filenames.npy', filenames)
    np.save(data_dir + '\\words.npy', words)
    np.save(data_dir + '\\pronunciations.npy', pronunciations)
else:
    # reuse the arrays saved by a previous run with convert_phones = 1.
    filenames = np.load(data_dir + '\\filenames.npy')
    words = np.load(data_dir + '\\words.npy')
    pronunciations = np.load(data_dir + '\\pronunciations.npy')

word_list = np.unique(words)


## ======================= make dict files used for HTK. ======================
if make_dic_files:
    output_dir = experiments_dir + r'\stimmen\dic'

    for word in word_list:
        WORD = word.upper()
        fileDic = output_dir + '\\' + word + '.dic'

        # make dic file: one line per unique pronunciation of the word.
        pronvar_ = pronunciations[words == word]
        pronvar = np.unique(pronvar_)

        with open(fileDic, 'w') as f:
            for pvar in pronvar:
                f.write('{0}\t{1}\n'.format(WORD, pvar))


## ======================= make dict files for most popular words. ======================
if make_dic_files_short:
    output_dir = experiments_dir + r'\stimmen\dic'

    #word = word_list[3]
    for word in word_list:
        WORD = word.upper()
        fileStat = output_dir + '\\' + word + '_stat.csv'

        # count how often each pronunciation of the word occurs.
        pronvar = pronunciations[words == word]
        c = Counter(pronvar)
        total_num = sum(c.values())

        # columns: count, percentage, WORD, pronunciation (tab separated).
        with open(fileStat, 'w') as f:
            for key, value in c.items():
                f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key))


## ======================= forced alignment =======================
if do_forced_alignment:
    configHVite = cygwin_dir + r'\config\config.HVite'
    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
    wav_dir = experiments_dir + r'\stimmen\wav'

    #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]:
    for hmm_num in [64]:
        hmm_num_str = str(hmm_num)
        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs'

        predictions = []
        file_num_max = len(filenames)
        for i, filename in enumerate(filenames):
            print('=== {0}/{1} ==='.format(i, file_num_max))
            fileWav = wav_dir + '\\' + filename

            if os.path.exists(fileWav):
                word = words[i]
                WORD = word.upper()

                # make label file (removed again after the alignment).
                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
                with open(fileLab, 'w') as f:
                    f.write(WORD)

                fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
                fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str

                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
                prediction = read_fileFA(fileFA)
                predictions.append(prediction)

                os.remove(fileLab)
                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
            else:
                # keep predictions aligned with filenames; '' marks a miss.
                predictions.append('')
                print('!!!!! file not found.')

        predictions = np.array(predictions)

        # keep only the entries for which a wav file was found.
        found = predictions != ''
        match = np.c_[words[found], pronunciations[found], predictions[found]]
        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)


## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment:
    #for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
    hmm_num = 64
    hmm_num_str = str(hmm_num)
    # columns of match: word, reference pronunciation, prediction.
    match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')

    # use dic_short?
    if 1:
        # start with a header row so that np.r_ can stack 2-column rows.
        pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
        for word in word_list:
            fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
            pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]

        # keep only the rows whose reference pronunciation is listed in
        # the short dictionary of that word.
        match_short = []
        for line in match:
            word = line[0]
            WORD = word.upper()
            pronvar = pronunciation_variants[pronunciation_variants[:, 0] == WORD, 1]

            if line[1] in pronvar:
                match_short.append(line)

        # reshape so that an empty selection is still 2-D and the column
        # indexing below does not raise IndexError.
        match_short = np.array(match_short).reshape(-1, match.shape[1])
        match = np.copy(match_short)

    # number of match
    total_match = sum(match[:, 1] == match[:, 2])
    print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))