acoustic_model/acoustic_model/performance_check.py

258 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
import csv
import subprocess
import configparser
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
## ======================= functions =======================
def read_fileFA(fileFA):
    """
    Read the result file of HTK forced alignment.

    This function only works when the input is one word.

    Args:
        fileFA: path of the forced-alignment result file.

    Returns:
        The third whitespace-separated field of every line that has at
        least three fields, joined with single spaces. An empty string
        is returned when no line qualifies.
    """
    phones = []
    with open(fileFA, 'r') as f:
        for line in f:
            fields = line.split()
            # The phone label is the third field, so a line needs at least
            # three fields; shorter lines (blank lines, headers) are skipped
            # instead of raising IndexError.
            if len(fields) > 2:
                phones.append(fields[2])
    return ' '.join(phones)
#####################
## USER DEFINE ##
#####################
# --- locations of this project and of the helper modules it imports ---
curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

# --- input data and experiment directories ---
csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"
experiments_dir = r'C:\OneDrive\Research\rug\experiments'

# --- paths derived from the directories above ---
config_ini = curr_dir + r'\config.ini'
data_dir = experiments_dir + r'\stimmen\data'

# procedure
# each flag switches one stage of the script on (1) or off (0).
convert_phones = 0
make_dic_files = 0
make_dic_files_short = 0
do_forced_alignment = 0
eval_forced_alignment = 1
## ======================= add paths =======================
# Each project-local module is imported right after its directory has been
# appended to sys.path, so the order of the statements below matters.
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set
# for interactive window
# (curr_dir holds convert_xsampa2ipa and acoustic_model_functions; presumably
# it is not on sys.path automatically in an interactive session - confirm.)
sys.path.append(curr_dir)
import convert_xsampa2ipa
import acoustic_model_functions as am_func
# for forced-alignment
# pyHTK is loaded from the older forced_alignment location.
sys.path.append(forced_alignment_module_old)
import pyHTK
## ======================= load variables =======================
# Read the FAME corpus location from config.ini.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so an empty result means config.ini was not loaded.
# Fail here with a clear message instead of a later KeyError on 'Settings'.
config = configparser.ConfigParser()
if not config.read(config_ini):
    raise FileNotFoundError('config file could not be read: ' + config_ini)
FAME_dir = config['Settings']['FAME_dir']

# lexicon files inside the FAME directory.
lex_asr = FAME_dir + '\\lexicon\\lex.asr'
lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
## ======================= convert phones ======================
if convert_phones:
    # Build filenames / words / pronunciations from the CSV and cache them as
    # .npy files in data_dir; the else-branch below reloads that cache.
    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)

    ## check phones included in FAME!
    # the phones used in the lexicon.
    #phonelist = am_func.get_phonelist(lex_htk)

    # the lines which include a specific phone.
    #lines = am_func.find_phone(lex_asr, 'x')

    with open(csvfile, encoding="utf-8") as fin:
        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
        next(lines, None)  # skip the headers

        filenames = []
        words = []
        pronunciations = []
        for line in lines:
            # Test the row length first so that short rows are skipped rather
            # than raising IndexError on line[1], and compare the word by
            # value (!=): identity (is not) against a literal is unreliable.
            if len(line) > 5 and line[1] != '':
                filenames.append(line[0])
                words.append(line[1])

                # column 3 holds the X-SAMPA pronunciation.
                pron_xsampa = line[3]
                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
                pron_ipa = pron_ipa.replace('ː', ':')
                pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)

                # adjust to phones used in the acoustic model.
                # the replacements are order-dependent ('ce :' must be handled
                # before 'e :'), so keep them in this sequence.
                pron_famehtk = pron_famehtk.replace('sp', 'sil')
                pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
                pron_famehtk = pron_famehtk.replace('w :', 'wh')
                pron_famehtk = pron_famehtk.replace('e :', 'eh')
                pron_famehtk = pron_famehtk.replace('eh :', 'eh')
                pron_famehtk = pron_famehtk.replace('ih :', 'ih')

                #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
                #pron = []
                #for phoneme in pron_famehtk.split(' '):
                #    pron.append(translation_key.get(phoneme, phoneme))
                #pronunciations.append(' '.join(pron_famehtk))
                pronunciations.append(pron_famehtk)

    # check if all phones are in the phonelist of the acoustic model.
    #phonelist = ' '.join(pronunciations)
    #np.unique(phonelist.split(' '))
    #phonelist.find(':')

    filenames = np.array(filenames)
    words = np.array(words)
    pronunciations = np.array(pronunciations)

    # drop the loop temporaries from the module namespace.
    # NOTE: these names only exist when at least one CSV row was converted.
    del line, lines
    del pron_xsampa, pron_ipa, pron_famehtk

    np.save(data_dir + '\\filenames.npy', filenames)
    np.save(data_dir + '\\words.npy', words)
    np.save(data_dir + '\\pronunciations.npy', pronunciations)
else:
    # reuse the arrays saved by a previous run with convert_phones enabled.
    filenames = np.load(data_dir + '\\filenames.npy')
    words = np.load(data_dir + '\\words.npy')
    pronunciations = np.load(data_dir + '\\pronunciations.npy')

# the distinct words; later stages handle one word at a time.
word_list = np.unique(words)
## ======================= make dict files used for HTK. ======================
if make_dic_files:
    # Write one HTK dictionary per word: every distinct pronunciation seen for
    # the word becomes one "<WORD>\t<pronunciation>" line.
    output_dir = experiments_dir + r'\stimmen\dic'

    for word in word_list:
        fileDic = output_dir + '\\' + word + '.dic'
        WORD = word.upper()

        # distinct pronunciation variants of this word.
        pronvar = np.unique(pronunciations[words == word])

        # make dic file.
        with open(fileDic, 'w') as f:
            f.writelines('{0}\t{1}\n'.format(WORD, pvar) for pvar in pronvar)
## ======================= make dict files for most popular words. ======================
if make_dic_files_short:
    # For every word, write "<word>_stat.csv" with one tab-separated line per
    # pronunciation variant: count, percentage of all occurrences of the word,
    # the upper-cased word, and the variant itself.
    output_dir = experiments_dir + r'\stimmen\dic'

    #word = word_list[3]
    for word in word_list:
        fileStat = output_dir + '\\' + word + '_stat.csv'
        WORD = word.upper()

        # how often each pronunciation variant occurs for this word.
        c = Counter(pronunciations[words == word])
        total_num = sum(c.values())

        with open(fileStat, 'w') as f:
            f.writelines(
                '{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key)
                for key, value in c.items()
            )
## ======================= forced alignment =======================
if do_forced_alignment:
    # Run HTK forced alignment (via pyHTK.doHVite) on every wav file that
    # exists, and save word / reference pronunciation / predicted pronunciation
    # triples to match_hmm<N>.npy in data_dir.
    configHVite = cygwin_dir + r'\config\config.HVite'
    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
    wav_dir = experiments_dir + r'\stimmen\wav'

    #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]:
    for hmm_num in [64]:
        hmm_num_str = str(hmm_num)
        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs'

        predictions = []
        file_num_max = len(filenames)
        for i, filename in enumerate(filenames):
            print('=== {0}/{1} ==='.format(i, file_num_max))
            fileWav = wav_dir + '\\' + filename

            if not os.path.exists(fileWav):
                # keep predictions index-aligned with filenames; the empty
                # entries are filtered out below.
                predictions.append('')
                print('!!!!! file not found.')
                continue

            word = words[i]
            WORD = word.upper()

            fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
            fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
            fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str

            # make label file.
            with open(fileLab, 'w') as f:
                f.write(WORD)

            try:
                # presumably doHVite writes its alignment result to fileFA,
                # which is read back right after - confirm against pyHTK.
                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
                prediction = read_fileFA(fileFA)
            finally:
                # the label file is temporary; remove it even when the
                # alignment or the parsing of its result fails.
                os.remove(fileLab)

            predictions.append(prediction)
            print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))

        predictions = np.array(predictions)

        # keep only the entries with a non-empty prediction (missing wav files
        # and empty alignment results are dropped).
        has_prediction = predictions != ''
        match = np.c_[words[has_prediction], pronunciations[has_prediction], predictions[has_prediction]]
        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment:
    # Count how many predicted pronunciations equal the reference
    # pronunciation in the saved forced-alignment results.
    #for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
    hmm_num = 64
    hmm_num_str = str(hmm_num)
    # rows of match are (word, reference pronunciation, prediction).
    match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')

    # use dic_short?
    # when enabled, only rows whose reference pronunciation is listed in the
    # word's dic_short dictionary are evaluated.
    use_dic_short = 1
    if use_dic_short:
        # header row followed by the entries of every word's dictionary.
        pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
        for word in word_list:
            fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
            pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]

        match_short = []
        for line in match:
            word = line[0]
            WORD = word.upper()
            pronvar = pronunciation_variants[pronunciation_variants[:, 0] == WORD, 1]

            if line[1] in pronvar:
                match_short.append(line)

        # reshape keeps the result 2-D with three columns even when no row
        # passed the filter; otherwise the column indexing below would raise
        # IndexError on an empty 1-D array.
        match_short = np.array(match_short).reshape(-1, 3)
        match = np.copy(match_short)

    # number of match
    total_match = np.count_nonzero(match[:, 1] == match[:, 2])
    print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))