import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import tempfile
import shutil
import glob
import time

import numpy as np
import pandas as pd

import fame_functions
from phoneset import fame_ipa, fame_asr
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk


## ======================= user define =======================
# procedure
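# each flag below switches one stage of the pipeline on (1) or off (0);
# the stages run in the order in which they appear in this script.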
|
make_lexicon = 0
make_label = 0  # it takes roughly 4800 sec on Surface pro 2.
make_mlf = 0
extract_features = 0
flat_start = 0
train_monophone_without_sp = 0
add_sp = 0
train_monophone_with_re_aligned_mlf = 0
train_triphone = 0
train_triphone_tied = 1


# pre-defined values.
dataset_list = ['devel', 'test', 'train']
feature_size = 39
improvement_threshold = 0.3

hmmdefs_name = 'hmmdefs'
proto_name = 'proto'

lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')

config_dir = os.path.join(default.htk_dir, 'config')

model_dir = os.path.join(default.htk_dir, 'model')
model_mono0_dir = os.path.join(model_dir, 'mono0')
model_mono1_dir = os.path.join(model_dir, 'mono1')
model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
model_tri1_dir = os.path.join(model_dir, 'tri1')

# directories / files to be made.
lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
#lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')

feature_dir = os.path.join(default.htk_dir, 'mfc')
fh.make_new_directory(feature_dir, existing_dir='leave')
tmp_dir = os.path.join(default.htk_dir, 'tmp')
fh.make_new_directory(tmp_dir, existing_dir='leave')
label_dir = os.path.join(default.htk_dir, 'label')
fh.make_new_directory(label_dir, existing_dir='leave')


## training
hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')

## testing
htk_stimmen_dir = os.path.join(default.htk_dir, 'stimmen')


## ======================= make lexicon for HTK =======================
if make_lexicon:
    timer_start = time.time()
    print('==== making lexicon for HTK ====')

    # convert each lexicon from the fame_asr phoneset to the fame_htk phoneset.
    print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
    fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
    fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)

    # combine lexicon
    print('>>> combining lexicon files into one lexicon...')
    # pronunciations that are not found in lex.asr are generated using G2P and listed in lex.oov,
    # therefore there is no overlap between lex.asr and lex.oov.
    fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)

    ## fixing the lexicon for HTK.
    # (1) Replace all tabs with a single space;
    # (2) Put a '\' before any dictionary entry beginning with a single quote
    # http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
    print('>>> fixing the lexicon...')
    fame_functions.fix_lexicon(lexicon_htk)
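
    # for illustration only, assuming the two fixes above are simple text substitutions
    # (the actual implementation lives in fame_functions.fix_lexicon and may differ):
    #with open(lexicon_htk) as f:
    #    lines = f.read().split('\n')
    #lines = [line.replace('\t', ' ') for line in lines]
    #lines = ['\\' + line if line.startswith("'") else line for line in lines]
    #with open(lexicon_htk, 'w') as f:
    #    f.write('\n'.join(lines))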
|
|
|
    ## add sp to the end of each line.
    #print('>>> adding sp...')
    #with open(lexicon_htk) as f:
    #    lines = f.read().split('\n')
    #lines = [line + ' sp' for line in lines]
    #with open(lexicon_htk_with_sp, 'wb') as f:
    #    f.write(bytes('\n'.join(lines), 'ascii'))

    print("elapsed time: {}".format(time.time() - timer_start))


## initialize the instance for HTK.
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk, feature_size)


## ======================= make label files =======================
if make_label:
    for dataset in dataset_list:
        timer_start = time.time()
        print("==== making label files on dataset {} ====".format(dataset))

        script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
        wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
        label_dir_ = os.path.join(label_dir, dataset)
        dictionary_file = os.path.join(label_dir_, 'temp.dic')
        fh.make_new_directory(label_dir_, existing_dir='leave')

        # list of scripts
        with open(script_list, "rt", encoding="utf-8") as fin:
            scripts = fin.read().split('\n')

        for line in scripts:
            # sample line:
            # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
            filename_ = line.split(' ')[0]
            filename = '_'.join(filename_.split('_')[1:])
            sentence = ' '.join(line.split(' ')[1:])
            sentence_htk = fame_functions.word2htk(sentence)

            wav_file = os.path.join(wav_dir_, filename + '.wav')
            if os.path.exists(wav_file) and chtk.can_be_ascii(sentence_htk) == 0:
                if chtk.get_number_of_missing_words(
                        sentence_htk, dictionary_file) == 0:
                    # when the file name is too long, the HDMan command does not work;
                    # therefore a temporary dictionary_file is made first and then renamed.
                    shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))

                    label_file = os.path.join(label_dir_, filename + '.lab')
                    chtk.create_label_file(sentence_htk, label_file)
                else:
                    os.remove(dictionary_file)

        print("elapsed time: {}".format(time.time() - timer_start))


## ======================= make master label files =======================
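# an HTK master label file (.mlf) gathers the individual .lab transcriptions into one
# file, which is what HERest and the other HTK tools read during training.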
|
if make_mlf:
    timer_start = time.time()
    print("==== making master label files ====")

    # train_2002_gongfansaken_10347.lab is empty. should be removed.
    empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab')
    empty_dic_file = empty_lab_file.replace('.lab', '.dic')

    if os.path.exists(empty_lab_file):
        os.remove(empty_lab_file)
    if os.path.exists(empty_dic_file):
        os.remove(empty_dic_file)

    for dataset in dataset_list:
        #wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
        feature_dir_ = os.path.join(feature_dir, dataset)
        label_dir_ = os.path.join(label_dir, dataset)
        mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
        mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
        mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')

        print(">>> generating a word level mlf file for {}...".format(dataset))
        chtk.label2mlf(label_dir_, mlf_word)
        print(">>> generating a phone level mlf file for {}...".format(dataset))
        chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
        chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= extract features =======================
if extract_features:
    for dataset in dataset_list:
        timer_start = time.time()
        print('==== extract features on dataset {} ===='.format(dataset))

        wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
        label_dir_ = os.path.join(label_dir, dataset)
        feature_dir_ = os.path.join(feature_dir, dataset)
        fh.make_new_directory(feature_dir_, existing_dir='delete')

        # a script file for HCopy
        print(">>> making a script file for HCopy...")
        hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
        hcopy_scp.close()

        # get a list of features (hcopy.scp)
        # from the filelist in FAME! corpus.
        #fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
        # from the list of label files.
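        # each line of hcopy.scp maps one source wav file to its target feature file,
        # separated by a tab: "<wav_dir_>/<utt>.wav<TAB><feature_dir_>/<utt>.mfc".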
|
        lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
        feature_list = [
            os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
            + os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
            for lab_file in lab_list]

        # remove a possibly stale feature file belonging to the empty utterance handled
        # in the make_mlf step (assumed path).
        empty_mfc_file = os.path.join(feature_dir_, 'train_2002_gongfansaken_10347.mfc')
        if os.path.exists(empty_mfc_file):
            os.remove(empty_mfc_file)

        with open(hcopy_scp.name, 'wb') as f:
            f.write(bytes('\n'.join(feature_list), 'ascii'))

        # extract features.
        print(">>> extracting features on {}...".format(dataset))
        chtk.wav2mfc(hcopy_scp.name)
        os.remove(hcopy_scp.name)

        # make hcompv.scp.
        print(">>> making a script file for {}...".format(dataset))
        listdir = glob.glob(os.path.join(label_dir_, '*.dic'))
        mfc_list = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
        hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
        with open(hcompv_scp, 'wb') as f:
            f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))

        print("elapsed time: {}".format(time.time() - timer_start))


## ======================= flat start monophones =======================
if flat_start:
    timer_start = time.time()
    print('==== flat start ====')
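    # flat_start presumably wraps HTK's HCompV: it computes a global mean and variance
    # over the training features and writes a prototype model ('proto') and a variance
    # floor macro file ('vFloors') into model_mono0_dir.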
|
    fh.make_new_directory(model_mono0_dir, existing_dir='leave')

    chtk.flat_start(hcompv_scp_train, model_mono0_dir)

    # create macros.
    vFloors = os.path.join(model_mono0_dir, 'vFloors')
    if os.path.exists(vFloors):
        chtk.create_macros(vFloors)

    # allocate mean & variance to all phones in the phone list
    print('>>> allocating mean & variance to all phones in the phone list...')
    chtk.create_hmmdefs(
        os.path.join(model_mono0_dir, proto_name),
        os.path.join(model_mono0_dir, 'hmmdefs')
    )

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train model without short pause =======================
if train_monophone_without_sp:
    print('==== train monophone without sp ====')
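    # re_estimation_until_saturated presumably repeats HERest re-estimation and, after each
    # pass, checks recognition on the stimmen material (features, word lattice and lexicon
    # given below); it stops once the improvement falls below improvement_threshold and
    # returns the number of iterations performed.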
|
|
|
    timer_start = time.time()
    niter = chtk.re_estimation_until_saturated(
        model_mono1_dir,
        model_mono0_dir, improvement_threshold, hcompv_scp_train,
        os.path.join(htk_stimmen_dir, 'mfc'),
        'mfc',
        os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
        mlf_file=mlf_file_train,
        lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
    )

    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= adding sp to the model =======================
if add_sp:
    print('==== adding sp to the model ====')
    # reference:
    # http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
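    # in the standard HTK recipe, 'sp' (short pause) is a single-state tee model whose
    # emitting state is tied to the centre state of 'sil'; add_sp below is assumed to
    # follow that recipe.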
|
    timer_start = time.time()

    # make model with sp.
    print('>>> adding sp state to the last model in the previous step...')
    fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
    niter = chtk.get_niter_max(model_mono1_dir)
    modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter))
    modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')

    #hmmdefs_pre = os.path.join(modeln_dir_pre, 'hmmdefs')
    chtk.add_sp(modeln_dir_pre, modeln_dir)
    print("elapsed time: {}".format(time.time() - timer_start))

    niter = chtk.re_estimation_until_saturated(
        model_mono1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train,
        os.path.join(htk_stimmen_dir, 'mfc'),
        'mfc',
        os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
        mlf_file=mlf_file_train_with_sp,
        lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
        model_type='monophone_with_sp'
    )


## ======================= train model with re-aligned mlf =======================
if train_monophone_with_re_aligned_mlf:
    print('==== train a monophone model with re-aligned mlf ====')

    print('>>> re-aligning the training data... ')
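    # make_aligned_label is assumed to run a forced alignment (HVite in alignment mode)
    # with the current monophone models, so that the resulting phone-level mlf contains
    # the pronunciation variants actually chosen for each utterance.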
|
    timer_start = time.time()
    niter = chtk.get_niter_max(model_mono1sp_dir)
    modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter))
    chtk.make_aligned_label(
        os.path.join(modeln_dir, 'macros'),
        os.path.join(modeln_dir, 'hmmdefs'),
        mlf_file_train_aligned,
        os.path.join(label_dir, 'train_word.mlf'),
        hcompv_scp_train)

    print('>>> updating the script file... ')
    chtk.update_script_file(
        mlf_file_train_aligned,
        mlf_file_train_with_sp,
        hcompv_scp_train,
        hcompv_scp_train_updated)
    print("elapsed time: {}".format(time.time() - timer_start))

    print('>>> re-estimation... ')
    timer_start = time.time()
    fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
    niter = chtk.get_niter_max(model_mono1sp_dir)
    niter = chtk.re_estimation_until_saturated(
        model_mono1sp2_dir,
        os.path.join(model_mono1sp_dir, 'iter'+str(niter)),
        improvement_threshold,
        hcompv_scp_train_updated,
        os.path.join(htk_stimmen_dir, 'mfc'),
        'mfc',
        os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
        mlf_file=mlf_file_train_aligned,
        lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
        model_type='monophone_with_sp'
    )
    print("elapsed time: {}".format(time.time() - timer_start))


## ======================= train triphone =======================
if train_triphone:
    print('==== train a triphone model ====')
    #model_out_dir = os.path.join(model_dir, 'hmm1_tri', 'iter1')
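    # outline of this step (assumed from the helper names): make_triphonelist derives a
    # triphone list and a triphone-level mlf from the aligned monophone mlf; make_tri_hed
    # writes an HHEd edit script (mktri.hed) that clones the monophones into triphones;
    # init_triphone applies it to the last monophone model; the cloned models are then
    # re-estimated.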
|
|
|
    triphonelist_txt = os.path.join(config_dir, 'triphonelist.txt')
    triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')

    print('>>> making triphone list... ')
    chtk.make_triphonelist(
        triphonelist_txt,
        triphone_mlf,
        mlf_file_train_aligned)

    print('>>> making triphone header... ')
    chtk.make_tri_hed(
        os.path.join(config_dir, 'mktri.hed')
    )

    print('>>> init triphone model... ')
    niter = chtk.get_niter_max(model_mono1sp2_dir)
    fh.make_new_directory(os.path.join(model_tri1_dir, 'iter0'), existing_dir='leave')
    chtk.init_triphone(
        os.path.join(model_mono1sp2_dir, 'iter'+str(niter)),
        os.path.join(model_tri1_dir, 'iter0')
    )

    print('>>> re-estimation... ')
    # I wanted to train until saturated:
    #niter = chtk.re_estimation_until_saturated(
    #    model_tri1_dir,
    #    os.path.join(model_tri1_dir, 'iter0'),
    #    improvement_threshold,
    #    hcompv_scp_train_updated,
    #    os.path.join(htk_stimmen_dir, 'mfc'),
    #    'mfc',
    #    os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
    #    mlf_file=triphone_mlf,
    #    lexicon_file=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
    #    model_type='triphone'
    #    )
    #
    # but because the data size is limited, some triphones cannot be trained, which leads to the error:
    # ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
    # therefore re-estimation is only run for a fixed number of iterations (see the loop below).
    output_dir = model_tri1_dir

    for niter in range(1, 4):
        hmm_n = 'iter' + str(niter)
        hmm_n_pre = 'iter' + str(niter-1)
        _modeln_dir = os.path.join(output_dir, hmm_n)
        _modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)

        fh.make_new_directory(_modeln_dir, 'leave')
        chtk.re_estimation(
            os.path.join(_modeln_dir_pre, 'hmmdefs'),
            _modeln_dir,
            hcompv_scp_train_updated,
            mlf_file=triphone_mlf,
            macros=os.path.join(_modeln_dir_pre, 'macros'),
            model_type='triphone')


## ======================= train tied-state triphone =======================
if train_triphone_tied:
    print('==== train a tied-state triphone model ====')
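    # tied-state training typically clusters the triphone states with decision trees
    # (HHEd 'TB' commands) so that triphones that are rare or unseen in the training data
    # share parameters with acoustically similar states.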
|
|