Forced alignment by Kaldi is added.

yemaozi88 2018-08-20 22:50:53 +02:00
parent d56ef7f075
commit 22b9ae966b
7 changed files with 451 additions and 161 deletions

Binary file not shown.

View File

@@ -15,6 +15,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
 ..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
 ..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
 ..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
+..\accent_classification\accent_classification\output_confusion_matrix.py = ..\accent_classification\accent_classification\output_confusion_matrix.py
 ..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py
 ..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py
 ..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py

View File

@@ -22,12 +22,11 @@ dataset_list = ['devel', 'test', 'train']
 extract_features = 0
 make_feature_list = 0
 conv_lexicon = 0
-check_lexicon = 1
+check_lexicon = 0
 make_mlf = 0
 combine_files = 0
 flat_start = 0
-train_model = 0
-forced_alignment = 0
+train_model = 1

 sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
@@ -288,7 +287,7 @@ if flat_start:
 ## ======================= estimate monophones =======================
 if train_model:
     iter_num_max = 3
-    for mix_num in [16, 32, 64, 128]:
+    for mix_num in [128, 256, 512, 1024]:
         for iter_num in range(1, iter_num_max+1):
             print("===== mix{}, iter{} =====".format(mix_num, iter_num))
             iter_num_pre = iter_num - 1
@@ -315,5 +314,6 @@ if train_model:
         fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
         subprocessStr = 'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
         subprocess.call(subprocessStr, shell=True)
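For context on the hunk above: the generated .hed edit script holds a single HHEd MU ("mix up") command, which raises the number of Gaussian components per emitting state (states 2-4 of the usual 5-state HTK topology) before the next round of re-estimation. A minimal sketch of that step; the file names mix_up.hed, model\hmmdefs, model_next, and phonelist.txt are hypothetical stand-ins for the variables used in the script:

    import subprocess

    mix_num_next = 256  # target number of Gaussians per emitting state
    with open('mix_up.hed', 'w') as fout:
        # MU applies to states 2-4, the emitting states of every monophone HMM.
        fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))

    # HHEd reads hmmdefs, applies the edit script, and writes the enlarged model set.
    subprocess.call('HHEd -T 1 -H model\\hmmdefs -M model_next mix_up.hed phonelist.txt', shell=True)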

View File

@@ -31,6 +31,9 @@
     <Compile Include="performance_check.py">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="pyKaldi.py">
+      <SubType>Code</SubType>
+    </Compile>
   </ItemGroup>
   <ItemGroup>
     <Content Include="config.ini" />

View File

@@ -2,4 +2,4 @@
 config_hcopy = c:\cygwin64\home\Aki\acoustic_model\config\config.HCopy
 config_train = c:\cygwin64\home\Aki\acoustic_model\config\config.train
 mkhmmdefs_pl = c:\cygwin64\home\Aki\acoustic_model\src\acoustic_model\mkhmmdefs.pl
-FAME_dir = c:\OneDrive\Research\rug\experiments\friesian\corpus
+FAME_dir = C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus

View File

@@ -4,52 +4,92 @@ import csv
 import subprocess
 import configparser
 from collections import Counter
+import re

 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix

 ## ======================= functions =======================
 def read_fileFA(fileFA):
     """
     read the result file of HTK forced alignment.
     this function only works when input is one word.
     """
     with open(fileFA, 'r') as f:
         lines = f.read()
         lines = lines.split('\n')

     phones = []
     for line in lines:
         line_split = line.split()
         if len(line_split) > 1:
             phones.append(line_split[2])

     return ' '.join(phones)
-#####################
-## USER DEFINE ##
-#####################
+def make_dic(word, pronvar_, fileDic, output_type):
+    """
+    make dict files which can be used for HTK.
+    param word: target word.
+    param pronvar_: ndarray of pronunciation variants of the word.
+    param fileDic: output dic file.
+    param output_type: 0:full, 1:statistics, 2:frequency <2% entries are removed. 3:top 3.
+    """
+    #assert(output_type < 4 and output_type >= 0, 'output_type should be an integer between 0 and 3.')
+    WORD = word.upper()  # dic entries list the word in upper case.
+    if output_type == 0:  # full
+        pronvar = np.unique(pronvar_)
+
+        with open(fileDic, 'w') as f:
+            for pvar in pronvar:
+                f.write('{0}\t{1}\n'.format(WORD, pvar))
+    else:
+        c = Counter(pronvar_)
+        total_num = sum(c.values())
+
+        with open(fileDic, 'w') as f:
+            if output_type == 3:
+                for key, value in c.most_common(3):
+                    f.write('{0}\t{1}\n'.format(WORD, key))
+            else:
+                for key, value in c.items():
+                    percentage = value/total_num*100
+
+                    if output_type == 1:  # all
+                        f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
+                    elif output_type == 2:  # less than 2 percent
+                        if percentage < 2:
+                            f.write('{0}\t{1}\n'.format(WORD, key))
+
+## ======================= user define =======================
 curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
 config_ini = curr_dir + '\\config.ini'
 forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
 forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
 ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
+accent_classification_dir = r'C:\Users\Aki\source\repos\accent_classification\accent_classification'

-csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"
 experiments_dir = r'C:\OneDrive\Research\rug\experiments'
 data_dir = experiments_dir + '\\stimmen\\data'
+csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
 cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'

 # procedure
 convert_phones = 0
 make_dic_files = 0
 make_dic_files_short = 0
-do_forced_alignment = 0
-eval_forced_alignment = 1
+do_forced_alignment_htk = 0
+make_kaldi_data_files = 0
+make_kaldi_lexicon_txt = 0
+load_forced_alignment_kaldi = 1
+eval_forced_alignment = 0
@@ -67,6 +107,10 @@ import acoustic_model_functions as am_func
 sys.path.append(forced_alignment_module_old)
 import pyHTK

+# to output confusion matrix
+sys.path.append(accent_classification_dir)
+from output_confusion_matrix import plot_confusion_matrix

 ## ======================= load variables =======================
 config = configparser.ConfigParser()
@@ -81,177 +125,393 @@ lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'

 ## ======================= convert phones ======================
 if convert_phones:
     mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)

     ## check phones included in FAME!
     # the phones used in the lexicon.
     #phonelist = am_func.get_phonelist(lex_htk)

     # the lines which include a specific phone.
     #lines = am_func.find_phone(lex_asr, 'x')

     with open(csvfile, encoding="utf-8") as fin:
         lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
         next(lines, None)  # skip the headers

         filenames = []
         words = []
         pronunciations = []
         for line in lines:
             if line[1] is not '' and len(line) > 5:
                 filenames.append(line[0])
                 words.append(line[1])
                 pron_xsampa = line[3]
                 pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
                 pron_ipa = pron_ipa.replace('ː', ':')
                 pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)

                 # adjust to phones used in the acoustic model.
                 pron_famehtk = pron_famehtk.replace('sp', 'sil')
                 pron_famehtk = pron_famehtk.replace('ce :', 'ce')  # because ceh is ignored.
                 pron_famehtk = pron_famehtk.replace('w :', 'wh')
                 pron_famehtk = pron_famehtk.replace('e :', 'eh')
                 pron_famehtk = pron_famehtk.replace('eh :', 'eh')
                 pron_famehtk = pron_famehtk.replace('ih :', 'ih')

                 #translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
                 #pron = []
                 #for phoneme in pron_famehtk.split(' '):
                 #    pron.append(translation_key.get(phoneme, phoneme))
                 #pronunciations.append(' '.join(pron_famehtk))
                 pronunciations.append(pron_famehtk)

     # check if all phones are in the phonelist of the acoustic model.
     #phonelist = ' '.join(pronunciations)
     #np.unique(phonelist.split(' '))
     #phonelist.find(':')

     filenames = np.array(filenames)
     words = np.array(words)
     pronunciations = np.array(pronunciations)

     del line, lines
     del pron_xsampa, pron_ipa, pron_famehtk

     np.save(data_dir + '\\filenames.npy', filenames)
     np.save(data_dir + '\\words.npy', words)
     np.save(data_dir + '\\pronunciations.npy', pronunciations)
 else:
     filenames = np.load(data_dir + '\\filenames.npy')
     words = np.load(data_dir + '\\words.npy')
     pronunciations = np.load(data_dir + '\\pronunciations.npy')
 word_list = np.unique(words)
 ## ======================= make dict files used for HTK. ======================
 if make_dic_files:
-    output_dir = experiments_dir + r'\stimmen\dic'
-
-    for word in word_list:
-        WORD = word.upper()
-        fileDic = output_dir + '\\' + word + '.dic'
-
-        # make dic file.
-        pronvar_ = pronunciations[words == word]
-        pronvar = np.unique(pronvar_)
-
-        with open(fileDic, 'w') as f:
-            for pvar in pronvar:
-                f.write('{0}\t{1}\n'.format(WORD, pvar))
-
-## ======================= make dict files for most popular words. ======================
-if make_dic_files_short:
-    output_dir = experiments_dir + r'\stimmen\dic'
-
-    #word = word_list[3]
-    for word in word_list:
-        WORD = word.upper()
-        fileStat = output_dir + '\\' + word + '_stat.csv'
-
-        pronvar = pronunciations[words == word]
-        c = Counter(pronvar)
-        total_num = sum(c.values())
-
-        with open(fileStat, 'w') as f:
-            for key, value in c.items():
-                f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key))
-
-## ======================= forced alignment =======================
-if do_forced_alignment:
-    configHVite = cygwin_dir + r'\config\config.HVite'
-    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
-    wav_dir = experiments_dir + r'\stimmen\wav'
-
-    #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]:
-    for hmm_num in [64]:
-        hmm_num_str = str(hmm_num)
-        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs'
-
-        predictions = []
-        file_num_max = len(filenames)
-        for i in range(0, file_num_max):
-            print('=== {0}/{1} ==='.format(i, file_num_max))
-            filename = filenames[i]
-            fileWav = wav_dir + '\\' + filename
-
-            if os.path.exists(fileWav):
-                word = words[i]
-                WORD = word.upper()
-
-                # make label file.
-                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
-                with open(fileLab, 'w') as f:
-                    lines = f.write(WORD)
-
-                fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
-                fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
-
-                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
-                prediction = read_fileFA(fileFA)
-                predictions.append(prediction)
-
-                os.remove(fileLab)
-                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
-            else:
-                predictions.append('')
-                print('!!!!! file not found.')
-
-        predictions = np.array(predictions)
-        match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
-        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
+    output_type = 2
+    output_dir = experiments_dir + r'\stimmen\dic_short'
+
+    for word in word_list:
+        WORD = word.upper()
+        fileDic = output_dir + '\\' + word + '.dic'
+
+        # pronunciation variant of the target word.
+        pronvar_ = pronunciations[words == word]
+        # remove ''
+        pronvar_ = np.delete(pronvar_, np.where(pronvar_==''))
+
+        # make dic file.
+        make_dic(word, pronvar_, fileDic, output_type)
+
+## ======================= forced alignment using HTK =======================
+if do_forced_alignment_htk:
+    configHVite = cygwin_dir + r'\config\config.HVite'
+    filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
+    wav_dir = experiments_dir + r'\stimmen\wav'
+
+    #hmm_num = 128
+    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
+        hmm_num_str = str(hmm_num)
+        AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-2\hmmdefs'
+
+        predictions = []
+        file_num_max = len(filenames)
+        for i in range(0, file_num_max):
+        #for i in range(500, 502):
+            print('=== {0}/{1} ==='.format(i, file_num_max))
+            filename = filenames[i]
+            fileWav = wav_dir + '\\' + filename
+
+            if os.path.exists(fileWav):
+                word = words[i]
+                WORD = word.upper()
+
+                # make label file.
+                fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
+                with open(fileLab, 'w') as f:
+                    lines = f.write(WORD)
+
+                fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
+                fileFA = experiments_dir + r'\stimmen\FA' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
+
+                pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
+                prediction = read_fileFA(fileFA)
+                predictions.append(prediction)
+
+                os.remove(fileLab)
+                print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
+            else:
+                predictions.append('')
+                print('!!!!! file not found.')
+
+        predictions = np.array(predictions)
+        match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
+        np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
+
+## ======================= make files which is used for forced alignment by Kaldi =======================
+if make_kaldi_data_files:
+    wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
+    kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
+    kaldi_data_dir = os.path.join(kaldi_work_dir, 'data', 'alignme')
+    kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
+    htk_dict_dir = os.path.join(experiments_dir, 'stimmen', 'dic_top3')

+    wav_scp = os.path.join(kaldi_data_dir, 'wav.scp')
+    text_file = os.path.join(kaldi_data_dir, 'text')
+    utt2spk = os.path.join(kaldi_data_dir, 'utt2spk')
+
+    lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
+
+    predictions = []
+    file_num_max = len(filenames)
+
+    # remove previous files.
+    if os.path.exists(wav_scp):
+        os.remove(wav_scp)
+    if os.path.exists(text_file):
+        os.remove(text_file)
+    if os.path.exists(utt2spk):
+        os.remove(utt2spk)
+
+    f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
+    f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
+    f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
+
+    # make wav.scp, text, and utt2spk files.
+    for i in range(0, file_num_max):
+    #for i in range(400, 410):
+        print('=== {0}/{1} ==='.format(i+1, file_num_max))
+        filename = filenames[i]
+        wav_file = wav_dir + '\\' + filename
+
+        if os.path.exists(wav_file):
+            speaker_id = 'speaker_' + str(i).zfill(4)
+            utterance_id = filename.replace('.wav', '')
+            utterance_id = utterance_id.replace(' ', '_')
+            utterance_id = speaker_id + '-' + utterance_id
+
+            # wav.scp file
+            wav_file_unix = wav_file.replace('\\', '/')
+            wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/')
+
+            f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix))
+
+            # text file
+            word = words[i].lower()
+            f_text_file.write('{0}\t{1}\n'.format(utterance_id, word))
+
+            # utt2spk
+            f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
+
+    f_wav_scp.close()
+    f_text_file.close()
+    f_utt2spk.close()
+
+## ======================= make lexicon txt which is used by Kaldi =======================
+if make_kaldi_lexicon_txt:
+    kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
+    kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
+    lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
+    option_num = 5
+
+    # remove previous file.
+    if os.path.exists(lexicon_txt):
+        os.remove(lexicon_txt)
+
+    mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
+    with open(csvfile, encoding="utf-8") as fin:
+        lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
+        next(lines, None)  # skip the headers
+
+        filenames = []
+        words = []
+        pronunciations = []
+        p = []
+        for line in lines:
+            if line[1] is not '' and len(line) > 5:
+                filenames.append(line[0])
+                words.append(line[1])
+                pron_xsampa = line[3]
+                pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
+                pron_ipa = pron_ipa.replace('ː', ':')
+
+                # adjust to phones used in the acoustic model.
+                pronunciations.append(pron_ipa)
+
+    # check if all phones are in the phonelist of the acoustic model.
+    #'y', 'b', 'ɾ', 'u', 'ɔ:', 'ø', 't', 'œ', 'n', 'ɒ', 'ɐ', 'f', 'o', 'k', 'x', 'ɡ', 'v', 's', 'ɛ:', 'ɪ:', 'ɑ', 'ɛ', 'a', 'd', 'z', 'ɪ', 'ɔ', 'l', 'i:', 'm', 'p', 'a:', 'i', 'e', 'j', 'o:', 'ʁ', 'h', ':', 'e:', 'ə', 'æ', 'χ', 'w', 'r', 'ə:', 'sp', 'ʊ', 'u:', 'ŋ'
+
+    filenames = np.array(filenames)
+    words = np.array(words)
+    wordlist = np.unique(words)
+    pronunciations = np.array(pronunciations)
+
+    # output lexicon.txt
+    #f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
+    pronvar_list_all = []
+    for word in word_list:
+        # pronunciation variant of the target word.
+        pronvar_ = pronunciations[words == word]
+        # remove ''
+        pronvar_ = np.delete(pronvar_, np.where(pronvar_==''))
+
+        c = Counter(pronvar_)
+        total_num = sum(c.values())
+
+        for key, value in c.most_common(option_num):
+            #print('{0}\t{1}\t{2}\t{3}'.format(word, key, value, total_num))
+            key = key.replace('æ', 'ɛ')
+            key = key.replace('ɐ', 'a')
+            key = key.replace('ɑ', 'a')
+            key = key.replace('ɾ', 'r')
+            key = key.replace('ʁ', 'r')
+            key = key.replace('ʊ', 'u')
+            key = key.replace('χ', 'x')
+            #print('-->{0}\t{1}\t{2}\t{3}\n'.format(word, key, value, total_num))
+
+            # make possible pronounciation variant list.
+            pronvar_list = [key]
+            while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
+                pronvar_list_ = []
+                for p in pronvar_list:
+                    if 'ø:' in p:
+                        pronvar_list_.append(p.replace('ø:', 'ö'))
+                        pronvar_list_.append(p.replace('ø:', 'ö:'))
+                    if 'œ' in p:
+                        pronvar_list_.append(p.replace('œ', 'ɔ̈'))
+                        pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
+                    if 'ɒ' in p:
+                        pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
+                        pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
+                pronvar_list = np.unique(pronvar_list_)
+
+            for pronvar_ in pronvar_list:
+                split_ipa = convert_phone_set.split_ipa_fame(pronvar_)
+                pronvar_out = ' '.join(split_ipa)
+                pronvar_list_all.append([word, pronvar_out])
+
+    # output
+    pronvar_list_all = np.array(pronvar_list_all)
+    pronvar_list_all = np.unique(pronvar_list_all, axis=0)
+    #f_lexicon_txt.write('<UNK>\tSPN\n')
+    #for line in pronvar_list_all:
+    #    f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
+    #f_lexicon_txt.close()
+
+## ======================= load kaldi forced alignment result =======================
+if load_forced_alignment_kaldi:
+    kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
+    phones_txt = kaldi_work_dir + '\\data\\lang\\phones.txt'
+    merged_alignment_txt = kaldi_work_dir + '\\exp\\tri1_alignme\\merged_alignment.txt'
+
+    filenames = np.load(data_dir + '\\filenames.npy')
+    words = np.load(data_dir + '\\words.npy')
+    pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
+    pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
+    word_list = np.unique(words)
+
+    # load the mapping between phones and ids.
+    with open(phones_txt, 'r', encoding="utf-8") as f:
+        mappings = f.read().split('\n')
+
+    phones = []
+    phone_ids = []
+    for m in mappings:
+        m = m.split(' ')
+        if len(m) > 1:
+            phones.append(m[0])
+            phone_ids.append(int(m[1]))
+
+    with open(merged_alignment_txt, 'r') as f:
+        lines = f.read()
+        lines = lines.split('\n')
+
+    fa_filenames = []
+    fa_pronunciations = []
+    filename_ = ''
+    pron = []
+    for line in lines:
+        line = line.split(' ')
+        if len(line) == 5:
+            filename = line[0]
+            if filename == filename_:
+                phone_id = int(line[4])
+                #if not phone_id == 1:
+                phone = phones[phone_ids.index(phone_id)]
+                pron_ = re.sub(r'_[A-Z]', '', phone)
+                if not pron_ == 'SIL':
+                    pron.append(pron_)
+            else:
+                fa_filenames.append(re.sub(r'speaker_[0-9]{4}-', '', filename))
+                fa_pronunciations.append(' '.join(pron))
+                pron = []
+
+            filename_ = filename
+
+    # correct or not.
+    for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):
+        pass  # note: the loop body is empty in this commit; the comparison with the reference pronunciations is not yet implemented.
 ## ======================= evaluate the result of forced alignment =======================
 if eval_forced_alignment:
-    #for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
-    hmm_num = 64
-    hmm_num_str = str(hmm_num)
-    match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
-
-    # use dic_short?
-    if 1:
-        pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
-        for word in word_list:
-            fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
-            pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
-
-        match_short = []
-        for line in match:
-            word = line[0]
-            WORD = word.upper()
-            pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
-
-            if line[1] in pronvar:
-                match_short.append(line)
-
-        match_short = np.array(match_short)
-        match = np.copy(match_short)
-
-    # number of match
-    total_match = sum(match[:, 1] == match[:, 2])
-    print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
+    match_num = []
+    for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
+        #hmm_num = 256
+        hmm_num_str = str(hmm_num)
+        match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
+
+        # use dic_short?
+        if 1:
+            pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
+            for word in word_list:
+                fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
+                pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
+
+            # see only words which appears in top 3.
+            match_short = []
+            for line in match:
+                word = line[0]
+                WORD = word.upper()
+                pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
+
+                if line[1] in pronvar:
+                    match_short.append(line)
+
+            match_short = np.array(match_short)
+            match = np.copy(match_short)
+
+        # number of match
+        total_match = sum(match[:, 1] == match[:, 2])
+        print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
+        match_num.append([hmm_num, total_match, match.shape[0]])
+
+    # number of mixtures vs accuracy
+    match_num = np.array(match_num)
+    plt.xscale("log")
+    plt.plot(match_num[:, 0], match_num[:, 1]/match_num[0, 2], 'o-')
+    plt.xlabel('number of mixtures', fontsize=14, fontweight='bold')
+    plt.ylabel('accuracy', fontsize=14, fontweight='bold')
+    plt.show()
+
+    # confusion matrix
+    #dir_out = r'C:\OneDrive\Research\rug\experiments\stimmen\result'
+    #word_list = np.unique(match[:, 0])
+    #for word in word_list:
+    #    match_ = match[match[:, 0] == word, :]
+    #    cm = confusion_matrix(match_[:, 1], match_[:, 2])
+    #    pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
+
+    #    plt.figure()
+    #    plot_confusion_matrix(cm, classes=pronvar, normalize=True)
+    #    plt.savefig(dir_out + '\\cm_' + word + '.png')
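For reference, the three files written in the make_kaldi_data_files section follow Kaldi's standard data-directory layout, one utterance per line; the example values below are hypothetical:

    wav.scp   <utterance-id> <path-to-wav>    e.g. speaker_0004-reus /mnt/c/OneDrive/WSL/kaldi-trunk/egs/fame/s5/corpus/stimmen/reus.wav
    text      <utterance-id> <transcript>     e.g. speaker_0004-reus reus
    utt2spk   <utterance-id> <speaker-id>     e.g. speaker_0004-reus speaker_0004

The merged_alignment.txt parsed in load_forced_alignment_kaldi is CTM-style output (presumably from ali-to-phones --ctm-output), i.e. five fields per line, <utterance-id> <channel> <start> <duration> <phone-id>, which is what the len(line) == 5 check relies on.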

acoustic_model/pyKaldi.py Normal file
View File

@@ -0,0 +1,26 @@
+import os
+import sys
+
+forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
+
+## ======================= add paths =======================
+sys.path.append(forced_alignment_module)
+from forced_alignment import convert_phone_set
+
+htk_dict_file = r'C:\OneDrive\Research\rug\experiments\stimmen\dic_top3\Reus.dic'
+#kaldi_lexicon = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\data\lang\phones\'
+alignment_txt = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\exp\tri1_alignme\merged_alignment.txt'
+phones_txt = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\exp\tri1_alignme\phones.txt'
+phone_map_txt = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\data\local\lang\phone_map.txt'
+
+with open(phone_map_txt, 'r', encoding="utf-8") as f:
+    lines = f.read()
+    lines = lines.split('\n')
+
+with open(alignment_txt, 'r', encoding="utf-8") as f:
+    lines = f.read().split('\n')  # assumed completion: the committed line stops at "lines ="
+
+#phone_in = [line for line in lines if 'SIL' in line]
+#if len(phone_in) == 1:
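pyKaldi.py stops just after loading the alignment. A minimal sketch of the likely next step, mirroring the id-to-phone lookup in the load_forced_alignment_kaldi section of the main script; the field layout is assumed to be CTM (utterance, channel, start, duration, phone id) and phones.txt lines are assumed to be "<symbol> <id>":

    id2phone = {}
    with open(phones_txt, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) == 2:  # "<symbol> <id>"
                id2phone[int(fields[1])] = fields[0]

    # print one aligned phone per CTM line, with its start time and duration.
    for line in lines:
        fields = line.split(' ')
        if len(fields) == 5:
            utterance_id, _, start, duration, phone_id = fields
            print(utterance_id, start, duration, id2phone[int(phone_id)])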