@@ -1,255 +1,176 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import sys
import csv
import subprocess
import configparser
from collections import Counter
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

## ======================= functions =======================
def read_fileFA(fileFA):
    """
    Read the result file of HTK forced alignment.
    This function only works when the input is a single word.
    """
    with open(fileFA, 'r') as f:
        lines = f.read().split('\n')

    phones = []
    for line in lines:
        line_split = line.split()
        if len(line_split) > 2:  # a phone line has at least start, end and phone.
            phones.append(line_split[2])

    return ' '.join(phones)
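
# A minimal sketch of the input read_fileFA expects (assumed HVite output
# layout, values hypothetical): one "start end phone score" line per phone,
# so the function returns the third column joined by spaces.
#
#   0       300000  p   -291.3
#   300000  600000  a   -501.2
#   600000  900000  m   -310.8
#
# read_fileFA('example.txt') -> 'p a m'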


def make_dic(word, pronvar_, fileDic, output_type):
    """
    Make a pronunciation dictionary (.dic) file which can be used for HTK.
    param word: target word.
    param pronvar_: pronunciation variants of the target word (1-D ndarray of strings).
    param fileDic: output dic file.
    param output_type: 0: full, 1: statistics, 2: only variants with frequency < 2%, 3: top 3.
    """
    #assert 0 <= output_type <= 3, 'output_type should be an integer between 0 and 3.'
    WORD = word.upper()  # dictionary entries are written with the word in upper case.

    if output_type == 0:  # full
        pronvar = np.unique(pronvar_)

        with open(fileDic, 'w') as f:
            for pvar in pronvar:
                f.write('{0}\t{1}\n'.format(WORD, pvar))
    else:
        c = Counter(pronvar_)
        total_num = sum(c.values())

        with open(fileDic, 'w') as f:
            if output_type == 3:  # top 3
                for key, value in c.most_common(3):
                    f.write('{0}\t{1}\n'.format(WORD, key))
            else:
                for key, value in c.items():
                    percentage = value / total_num * 100

                    if output_type == 1:  # statistics
                        f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
                    elif output_type == 2:  # only variants with frequency < 2%
                        if percentage < 2:
                            f.write('{0}\t{1}\n'.format(WORD, key))
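
# A hypothetical usage sketch of make_dic (word and variants are made up):
# with output_type == 3, the three most frequent variants are written as
# "WORD<TAB>pronunciation" lines.
#
#   pronvar_ = np.array(['p a m p @ r s', 'p a m p @ r s', 'p e m p @ r s'])
#   make_dic('pampers', pronvar_, 'pampers.dic', 3)
#   # pampers.dic:
#   # PAMPERS   p a m p @ r s
#   # PAMPERS   p e m p @ r s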


#from sklearn.metrics import confusion_matrix
import acoustic_model_functions as am_func
import convert_xsampa2ipa
import defaultfiles as default


## ======================= user define =======================
curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
config_ini = curr_dir + '\\config.ini'
forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
accent_classification_dir = r'C:\Users\Aki\source\repos\accent_classification\accent_classification'
#curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
#config_ini = 'config.ini'
#repo_dir = r'C:\Users\Aki\source\repos'
#forced_alignment_module = repo_dir + '\\forced_alignment'
#forced_alignment_module_old = repo_dir + '\\aki_tools'
#ipa_xsampa_converter_dir = repo_dir + '\\ipa-xsama-converter'
#accent_classification_dir = repo_dir + '\\accent_classification\accent_classification'
excel_file = os.path.join(default.experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')

experiments_dir = r'C:\OneDrive\Research\rug\experiments'
data_dir = experiments_dir + '\\stimmen\\data'
csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
#experiments_dir = r'C:\OneDrive\Research\rug\experiments'
data_dir = os.path.join(default.experiments_dir, 'stimmen', 'data')
#csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
wav_dir = os.path.join(default.experiments_dir, 'stimmen', 'wav')
acoustic_model_dir = os.path.join(default.experiments_dir, 'friesian', 'acoustic_model', 'model')
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
fa_dir = os.path.join(default.experiments_dir, 'stimmen', 'FA')

#cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')

cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

# procedure
convert_phones = 0
make_dic_files = 0
make_dic_files_short = 0
do_forced_alignment_htk = 0
do_forced_alignment_htk = 1
make_kaldi_data_files = 0
make_kaldi_lexicon_txt = 0
load_forced_alignment_kaldi = 1
load_forced_alignment_kaldi = 0
eval_forced_alignment = 0


## ======================= add paths =======================
sys.path.append(forced_alignment_module)
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import convert_phone_set
from forced_alignment import pyhtk

# for interactive window
sys.path.append(curr_dir)
import convert_xsampa2ipa
import acoustic_model_functions as am_func

# for forced-alignment
sys.path.append(forced_alignment_module_old)
import pyHTK

# to output confusion matrix
sys.path.append(accent_classification_dir)
from output_confusion_matrix import plot_confusion_matrix


## ======================= load variables =======================
config = configparser.ConfigParser()
config.sections()
config.read(config_ini)

FAME_dir = config['Settings']['FAME_dir']

lex_asr = FAME_dir + '\\lexicon\\lex.asr'
lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
#import pyHTK
from evaluation import plot_confusion_matrix
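
# config.ini read above is assumed to contain at least a [Settings] section
# with FAME_dir (the path below is hypothetical):
#
#   [Settings]
#   FAME_dir = C:\OneDrive\Research\rug\_data\FAME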


## ======================= convert phones ======================
if convert_phones:
    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)

    ## check phones included in FAME!
    # the phones used in the lexicon.
    #phonelist = am_func.get_phonelist(lex_htk)
    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)

    # the lines which include a specific phone.
    #lines = am_func.find_phone(lex_asr, 'x')
    xls = pd.ExcelFile(excel_file)

    with open(csvfile, encoding="utf-8") as fin:
        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
        next(lines, None)  # skip the headers

        ## check conversion
        #df = pd.read_excel(xls, 'frequency')
        #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
        #    #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
        #    ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
        #    if not ipa_converted == ipa:
        #        print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))

        filenames = []
        words = []
        pronunciations = []
        for line in lines:
            if len(line) > 5 and line[1] != '':
                filenames.append(line[0])
                words.append(line[1])
                pron_xsampa = line[3]
                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
                pron_ipa = pron_ipa.replace('ː', ':')
                pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)

                # adjust to phones used in the acoustic model.
                pron_famehtk = pron_famehtk.replace('sp', 'sil')
                pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
                pron_famehtk = pron_famehtk.replace('w :', 'wh')
                pron_famehtk = pron_famehtk.replace('e :', 'eh')
                pron_famehtk = pron_famehtk.replace('eh :', 'eh')
                pron_famehtk = pron_famehtk.replace('ih :', 'ih')

                #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
                #pron = []
                #for phoneme in pron_famehtk.split(' '):
                #    pron.append(translation_key.get(phoneme, phoneme))
                #pronunciations.append(' '.join(pron))
                pronunciations.append(pron_famehtk)
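
                # A hypothetical walk-through of the conversion chain above
                # (exact outputs depend on the converter tables):
                #   pron_xsampa  = 'pAmp@rs'
                #   pron_ipa     = 'pɑmpərs'        # convert_xsampa2ipa.conversion
                #   pron_famehtk = 'p a m p @ r s'  # convert_phone_set.ipa2famehtk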

    # check if all phones are in the phonelist of the acoustic model.
    #phonelist = ' '.join(pronunciations)
    #np.unique(phonelist.split(' '))
    #phonelist.find(':')
    ## check phones included in FAME!
    # the phones used in the lexicon.
    #phonelist = am_func.get_phonelist(lex_asr)

    filenames = np.array(filenames)
    words = np.array(words)
    pronunciations = np.array(pronunciations)
    # the lines which include a specific phone.
    #lines = am_func.find_phone(lex_asr, 'x')

    del line, lines
    del pron_xsampa, pron_ipa, pron_famehtk

    np.save(data_dir + '\\filenames.npy', filenames)
    np.save(data_dir + '\\words.npy', words)
    np.save(data_dir + '\\pronunciations.npy', pronunciations)
else:
    filenames = np.load(data_dir + '\\filenames.npy')
    words = np.load(data_dir + '\\words.npy')
    # Filename, Word, Self Xsampa
    df = pd.read_excel(xls, 'original')

    pronunciations = np.load(data_dir + '\\pronunciations.npy')
    word_list = np.unique(words)
    ipas = []
    famehtks = []
    for xsampa in df['Self Xsampa']:
        if not isinstance(xsampa, float):  # 'NaN'
            # typo?
            xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
            xsampa = xsampa.replace(';', ':')

            ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
            ipa = ipa.replace('ː', ':')
            ipa = ipa.replace(' ', '')
            ipas.append(ipa)
            famehtk = convert_phone_set.ipa2famehtk(ipa)
            famehtks.append(famehtk)
        else:
            ipas.append('')
            famehtks.append('')

    # extract interesting cols.
    df = pd.DataFrame({'filename': df['Filename'],
                       'word': df['Word'],
                       'xsampa': df['Self Xsampa'],
                       'ipa': pd.Series(ipas),
                       'famehtk': pd.Series(famehtks)})
    # cleansing.
    df = df[~df['famehtk'].isin(['/', ''])]
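
    # After cleansing, df holds one row per recording, e.g. (hypothetical values):
    #   filename            word     xsampa    ipa       famehtk
    #   speaker01_01.wav    pampers  pAmp@rs   pɑmpərs   p a m p @ r s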


## ======================= make dict files used for HTK. ======================
if make_dic_files:
    output_type = 2
    output_dir = experiments_dir + r'\stimmen\dic_short'
    word_list = np.unique(df['word'])

    output_type = 3

    for word in word_list:
        WORD = word.upper()
        fileDic = output_dir + '\\' + word + '.dic'
        htk_dict_file = htk_dict_dir + '\\' + word + '.dic'

        # pronunciation variants of the target word.
        pronvar_ = pronunciations[words == word]
        # remove ''
        pronvar_ = np.delete(pronvar_, np.where(pronvar_ == ''))
        pronvar_ = df['famehtk'][df['word'].str.match(word)]

        # make dic file.
        make_dic(word, pronvar_, fileDic, output_type)
        am_func.make_dic(word, pronvar_, htk_dict_file, output_type)


## ======================= forced alignment using HTK =======================
if do_forced_alignment_htk:
    configHVite = cygwin_dir + r'\config\config.HVite'
    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
    wav_dir = experiments_dir + r'\stimmen\wav'

    #hmm_num = 128
    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
    #hmm_num = 2
    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:

        hmm_num_str = str(hmm_num)
        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-2\hmmdefs'
        acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')

        predictions = []
        file_num_max = len(filenames)
        for i in range(0, file_num_max):
        #for i in range(500, 502):
            print('=== {0}/{1} ==='.format(i, file_num_max))
            filename = filenames[i]
            fileWav = wav_dir + '\\' + filename
        for i, filename in df['filename'].items():  # iterate by index label; df rows were filtered above.
            print('=== {0}/{1} ==='.format(i, len(df)))
            wav_file = os.path.join(wav_dir, filename)

            if os.path.exists(fileWav):
                word = words[i]
            if os.path.exists(wav_file) and i in df['filename'].keys():
                word = df['word'][i]
                WORD = word.upper()

                # make label file.
                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
                with open(fileLab, 'w') as f:
                label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
                with open(label_file, 'w') as f:
                    f.write(WORD)
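
                # The .lab file is a one-line HTK label file that holds only the
                # orthographic target word in upper case, e.g. "PAMPERS"
                # (hypothetical word) next to its .wav file.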

                fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
                fileFA = experiments_dir + r'\stimmen\FA' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
                htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
                fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
                pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, default.phonelist, acoustic_model)

                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
                prediction = read_fileFA(fileFA)
                prediction = am_func.read_fileFA(fa_file)
                predictions.append(prediction)

                os.remove(fileLab)
                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
                os.remove(label_file)
                print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
            else:
                predictions.append('')
                print('!!!!! file not found.')

        predictions = np.array(predictions)
        match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
        #match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
        np.save(os.path.join(data_dir, 'predictions_hmm' + hmm_num_str + '.npy'), predictions)


## ======================= make files which are used for forced alignment by Kaldi =======================
@@ -392,7 +313,7 @@ if make_kaldi_lexicon_txt:
        pronvar_list = np.unique(pronvar_list_)

        for pronvar_ in pronvar_list:
            split_ipa = convert_phone_set.split_ipa_fame(pronvar_)
            split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
            pronvar_out = ' '.join(split_ipa)
            pronvar_list_all.append([word, pronvar_out])
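
            # Each appended pair becomes one lexicon.txt line of the form
            # "<word> <space-separated phones>", e.g. (hypothetical):
            #   pampers  p a m p @ r s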

@@ -456,13 +377,12 @@ if load_forced_alignment_kaldi:
            filename_ = filename

    # correct or not.
    for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):
    #for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):


## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment:

    match_num = []
    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
        #hmm_num = 256