Fixed a bug related to the encoding of label files.

This commit is contained in:
yemaozi88 2019-02-04 13:46:27 +01:00
parent 322a8a0079
commit f6e7c8eefa
5 changed files with 151 additions and 219 deletions

Binary file not shown.

View File

@ -1,14 +1,13 @@
import os
#default_hvite_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'htk', 'config.HVite')
# add path of the parent directory
#os.path.dirname(os.path.realpath(__file__))
#cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#htk_dir = r'C:\Aki\htk_fame'
htk_dir = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk'
config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
#config_hvite = os.path.join(cygwin_dir, 'config', 'config.HVite')
#mkhmmdefs_pl = os.path.join(cygwin_dir, 'src', 'acoustic_model', 'mkhmmdefs.pl')

View File

@ -5,8 +5,6 @@ os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import tempfile
import shutil
import glob
#import configparser
#import subprocess
import time
import numpy as np
@ -21,45 +19,42 @@ from htk import pyhtk
## ======================= user define =======================
#repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
#curr_dir = repo_dir + '\\acoustic_model'
#config_ini = curr_dir + '\\config.ini'
#output_dir = 'C:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
#forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
# procedure
make_lexicon = 0
make_label = 0 # it takes roughly 4800 sec on Surface pro 2.
make_htk_files = 0
extract_features = 0
flat_start = 0
train_model_without_sp = 1
# pre-defined values.
dataset_list = ['devel', 'test', 'train']
hmmdefs_name = 'hmmdefs'
# procedure
extract_features = 0
make_lexicon = 0
make_dictionary = 0 # 4800 sec
make_htk_files = 1
combine_files = 0
flat_start = 0
train_model = 0
lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
config_dir = os.path.join(default.htk_dir, 'config')
config_hcopy = os.path.join(config_dir, 'config.HCopy')
config_train = os.path.join(config_dir, 'config.train')
global_ded = os.path.join(config_dir, 'global.ded')
mkphones_led = os.path.join(config_dir, 'mkphones.led')
prototype = os.path.join(config_dir, 'proto39')
model_dir = os.path.join(default.htk_dir, 'model')
## ======================= load variables =======================
# directories / files to be made.
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
lexicon_oov = os.path.join(lexicon_dir, 'lex.oov')
lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
#hcompv_scp = output_dir + '\\scp\\combined.scp'
#combined_mlf = output_dir + '\\label\\combined.mlf'
#model_dir = output_dir + '\\model'
#model0_dir = model_dir + '\\hmm0'
#proto_init = model_dir + '\\proto38'
#proto_name = 'proto'
#phonelist = output_dir + '\\config\\phonelist_friesian.txt'
#hmmdefs_name = 'hmmdefs'
phonelist_txt = os.path.join(config_dir, 'phonelist.txt')
model0_dir = os.path.join(model_dir, 'hmm0')
feature_dir = os.path.join(default.htk_dir, 'mfc')
if not os.path.exists(feature_dir):
@ -72,42 +67,18 @@ if not os.path.exists(label_dir):
os.makedirs(label_dir)
## ======================= extract features =======================
if extract_features:
for dataset in dataset_list:
print('==== extract features on dataset {} ====\n'.format(dataset))
# a script file for HCopy
print(">>> making a script file for HCopy... \n")
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
hcopy_scp.close()
# get a list of features (hcopy.scp) from the filelist in FAME! corpus
feature_dir_ = os.path.join(feature_dir, dataset)
if not os.path.exists(feature_dir_):
os.makedirs(feature_dir_)
# extract features
print(">>> extracting features... \n")
fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
os.remove(hcopy_scp.name)
## ======================= make lexicon for HTK =======================
if make_lexicon:
print('==== make lexicon for HTK ====\n')
timer_start = time.time()
print('==== making lexicon for HTK ====')
# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset... \n')
print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)
# combine lexicon
print('>>> combining lexicon files into one lexicon... \n')
print('>>> combining lexicon files into one lexicon...')
# pronunciations which are not found in lex.asr are generated using G2P and listed in lex.oov.
# therefore there is no overlap between lex_asr and lex_oov.
fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
@ -119,28 +90,26 @@ if make_lexicon:
# (2) Put a '\' before any dictionary entry beginning with a single quote
#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
fame_functions.fix_single_quote(lexicon_htk)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= make dic files =======================
if make_dictionary:
## ======================= make label files =======================
if make_label:
for dataset in dataset_list:
timer_start = time.time()
print("==== generating HTK dictionary files on dataset {}\n".format(dataset))
print("==== making label files on dataset {}".format(dataset))
#hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
#hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
#mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
#mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
dictionary_file = os.path.join(wav_dir, 'temp.dic')
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
label_dir_ = os.path.join(label_dir, dataset)
dictionary_file = os.path.join(label_dir_, 'temp.dic')
fh.make_new_directory(label_dir_)
# list of scripts
with open(script_list, "rt", encoding="utf-8") as fin:
scripts = fin.read().split('\n')
for line in scripts:
#for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
# sample line:
# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
filename_ = line.split(' ')[0]
@ -148,180 +117,144 @@ if make_dictionary:
sentence = ' '.join(line.split(' ')[1:])
sentence_htk = fame_functions.word2htk(sentence)
wav_file = os.path.join(wav_dir, filename + '.wav')
if os.path.exists(wav_file):
#dictionary_file = os.path.join(wav_dir, filename + '.dic')
wav_file = os.path.join(wav_dir_, filename + '.wav')
if os.path.exists(wav_file) and pyhtk.can_be_ascii(sentence_htk) == 0:
if pyhtk.create_dictionary_without_log(
sentence, global_ded, dictionary_file, lexicon_htk) == 0:
sentence_htk, global_ded, dictionary_file, lexicon_htk) == 0:
# when the file name is too long, the HDMan command does not work.
# therefore a temporary dictionary_file is made first, then renamed.
shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
label_file = os.path.join(wav_dir, filename + '.lab')
pyhtk.create_label_file(sentence, label_file)
shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))
label_file = os.path.join(label_dir_, filename + '.lab')
pyhtk.create_label_file(sentence_htk, label_file)
else:
os.remove(dictionary_file)
print("elapsed time: {}".format(time.time() - timer_start))
# lexicon
#lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
# list of features
#with open(hcompv_scp) as fin:
# features = fin.read()
# features = features.split('\n')
#i = 0
#missing_words = []
#fscp = open(hcompv_scp2, 'wt')
#fmlf = open(mlf_word, "wt", encoding="utf-8")
#fmlf.write("#!MLF!#\n")
#feature_nr = 1
#for feature in features:
# sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
# sys.stdout.flush()
# feature_nr += 1
# file_basename = os.path.basename(feature).replace('.mfc', '')
# # get words from scripts.
# try:
# script = scripts[scripts.str.contains(file_basename)]
# except IndexError:
# script = []
# if len(script) != 0:
# script_id = script.index[0]
# script_txt = script.get(script_id)
# script_words = script_txt.split(' ')
# del script_words[0]
# check if all words can be found in the lexicon.
# SCRIPT_WORDS = []
# script_prons = []
# is_in_lexicon = 1
# for word in script_words:
# WORD = word.upper()
# SCRIPT_WORDS.append(WORD)
# extracted = lexicon_htk[lexicon_htk['word']==WORD]
# if len(extracted) == 0:
# missing_words.append(word)
# script_prons.append(extracted)
# is_in_lexicon *= len(extracted)
# if all pronunciations are found in the lexicon, update scp and mlf files.
# if is_in_lexicon:
# add the feature filename into the .scp file.
# fscp.write("{}\n".format(feature))
# i += 1
# add the words to the mlf file.
# fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
# for word_ in SCRIPT_WORDS:
# if word_[0] == '\'':
# word_ = '\\' + word_
# fmlf.write('{}\n'.format(word_))
# fmlf.write('.\n')
# print("\n{0} has {1} samples.\n".format(dataset, i))
# np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
# fscp.close()
# fmlf.close()
## ======================= make other required files =======================
if make_htk_files:
## phonelist
phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
timer_start = time.time()
print("==== making files required for HTK ====")
print(">>> making a phonelist...")
pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)
## hcomp_v.scp
print(">>> making a script file for HCompV... \n")
for dataset in dataset_list:
#timer_start = time.time()
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
feature_dir_ = os.path.join(feature_dir, dataset)
label_dir_ = os.path.join(label_dir, dataset)
mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
#print(">>> making a script file for {}...".format(dataset))
#listdir = glob.glob(os.path.join(wav_dir_, '*.dic'))
#mfc_list = [filename.replace(wav_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
#hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
#with open(hcompv_scp, 'wb') as f:
# f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
listdir = glob.glob(os.path.join(wav_dir, '*.dic'))
filelist = [filename.replace(wav_dir, feature_dir).replace('.dic', '.fea') for filename in listdir]
print(">>> making a mlf file for {}...".format(dataset))
lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
with open(mlf_word, 'wb') as fmlf:
fmlf.write(bytes('#!MLF!#\n', 'ascii'))
for label_file in lab_list:
filename = os.path.basename(label_file)
fmlf.write(bytes('\"*/{}\"\n'.format(filename), 'ascii'))
with open(label_file) as flab:
lines = flab.read()
fmlf.write(bytes(lines + '.\n', 'ascii'))
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
with open(hcompv_scp, 'wt', newline='\r\n') as f:
f.write('\n'.join(filelist))
print(">>> generating phone level transcription for {}...".format(dataset))
pyhtk.mlf_word2phone(lexicon_htk, mlf_phone, mlf_word, mkphones_led)
print("elapsed time: {}".format(time.time() - timer_start))
## hcomp_scp
# a script file for HCompV
## ======================= extract features =======================
if extract_features:
for dataset in dataset_list:
timer_start = time.time()
print('==== extract features on dataset {} ===='.format(dataset))
# print("generating phone level transcription...\n")
# mkphones = output_dir + '\\label\\mkphones0.txt'
# subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
# subprocess.call(subprocessStr, shell=True)
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
label_dir_ = os.path.join(label_dir, dataset)
feature_dir_ = os.path.join(feature_dir, dataset)
fh.make_new_directory(feature_dir_)
## ======================= combined scps and mlfs =======================
#if combine_files:
# print("==== combine scps and mlfs ====\n")
# a script file for HCopy
print(">>> making a script file for HCopy...")
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
hcopy_scp.close()
# fscp = open(hcompv_scp, 'wt')
# fmlf = open(combined_mlf, 'wt')
# for dataset in dataset_list:
# fmlf.write("#!MLF!#\n")
# for dataset in dataset_list:
# each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
# each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
# get a list of features (hcopy.scp)
# from the filelist in FAME! corpus.
#fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
# from the list of label files.
lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
feature_list = [
os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
for lab_file in lab_list]
with open(hcopy_scp.name, 'wb') as f:
f.write(bytes('\n'.join(feature_list), 'ascii'))
# with open(each_mlf, 'r') as fin:
# lines = fin.read()
# lines = lines.split('\n')
# fmlf.write('\n'.join(lines[1:]))
# extract features.
print(">>> extracting features on {}...".format(dataset))
pyhtk.wav2mfc(config_hcopy, hcopy_scp.name)
os.remove(hcopy_scp.name)
# with open(each_scp, 'r') as fin:
# lines = fin.read()
# fscp.write(lines)
# make hcompv.scp.
print(">>> making a script file for {}...".format(dataset))
listdir = glob.glob(os.path.join(label_dir_, '*.dic'))
mfc_list = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
with open(hcompv_scp, 'wb') as f:
f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
# fscp.close()
# fmlf.close()
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= flat start monophones =======================
if flat_start:
subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
subprocess.call(subprocessStr, shell=True)
if flat_start:
hcompv_scp = os.path.join(tmp_dir, 'test.scp')
timer_start = time.time()
print('==== flat start ====')
pyhtk.flat_start(config_train, hcompv_scp, model0_dir, prototype)
# allocate mean & variance to all phones in the phone list
subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name
subprocess.call(subprocessStr, shell=True)
pyhtk.create_hmmdefs(
os.path.join(model0_dir, 'proto39'),
os.path.join(model0_dir, 'hmmdefs'),
phonelist_txt)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= estimate monophones =======================
if train_model:
iter_num_max = 3
for mix_num in [128, 256, 512, 1024]:
for iter_num in range(1, iter_num_max+1):
print("===== mix{}, iter{} =====".format(mix_num, iter_num))
iter_num_pre = iter_num - 1
modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
if not os.path.exists(modelN_dir):
os.makedirs(modelN_dir)
if train_model_without_sp:
hcompv_scp = os.path.join(tmp_dir, 'test.scp')
mlf_file = os.path.join(label_dir, 'test_phone.mlf')
output_dir = os.path.join(model_dir, 'hmm1')
fh.make_new_directory(output_dir)
if iter_num == 1 and mix_num == 1:
modelN_dir_pre = model0_dir
else:
modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)
print('==== train model without sp ====')
if not os.path.exists(os.path.join(output_dir, 'iter0')):
shutil.copytree(model0_dir, os.path.join(output_dir, 'iter0'))
niter = 1
for niter in range(1, 5):
timer_start = time.time()
hmm_n = 'iter' + str(niter)
hmm_n_pre = 'iter' + str(niter-1)
modeln_dir = os.path.join(output_dir, hmm_n)
modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
## re-estimation
subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
subprocess.call(subprocessStr, shell=True)
mix_num_next = mix_num * 2
modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
if not os.path.exists(modelN_dir_next):
os.makedirs(modelN_dir_next)
header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
with open(header_file, 'w') as fout:
fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
subprocessStr = 'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
subprocess.call(subprocessStr, shell=True)
# re-estimation
fh.make_new_directory(modeln_dir)
pyhtk.re_estimation(
config_train,
os.path.join(modeln_dir_pre, 'proto39'),
os.path.join(modeln_dir_pre, hmmdefs_name),
modeln_dir,
hcompv_scp, phonelist_txt,
mlf_file=mlf_file)
print("elapsed time: {}".format(time.time() - timer_start))