Fix the bug: there are characters in the lexicon which cannot be represented in ASCII.

This commit is contained in:
yemaozi88
2019-02-03 00:34:35 +01:00
parent dc6b7b84b6
commit 22cccfb61d
9 changed files with 199 additions and 103 deletions

View File

@ -3,6 +3,7 @@ import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import tempfile
import shutil
#import configparser
#import subprocess
import time
@ -11,6 +12,7 @@ import numpy as np
import pandas as pd
import fame_functions
from phoneset import fame_ipa, fame_asr
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']
# procedure
extract_features = 0
make_lexicon = 0
make_lexicon = 1
make_mlf = 0
combine_files = 0
flat_start = 0
@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
#hcompv_scp = output_dir + '\\scp\\combined.scp'
#combined_mlf = output_dir + '\\label\\combined.mlf'
@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
tmp_dir = os.path.join(default.htk_dir, 'tmp')
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
label_dir = os.path.join(default.htk_dir, 'label')
if not os.path.exists(label_dir):
os.makedirs(label_dir)
## ======================= extract features =======================
if extract_features:
print('==== extract features ====\n')
for dataset in dataset_list:
print('==== dataset: {} ===='.format(dataset))
print('==== extract features on dataset {} ====\n'.format(dataset))
# a script file for HCopy
print(">>> making a script file for HCopy... \n")
@ -89,6 +97,8 @@ if extract_features:
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
os.remove(hcopy_scp.name)
## ======================= make lexicon for HTK =======================
if make_lexicon:
@ -114,94 +124,132 @@ if make_lexicon:
fame_functions.fix_single_quote(lexicon_htk)
## ======================= make phonelist =======================
#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)
#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser'
#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt')
#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic')
#pyhtk.create_dictionary(
# sentence, global_ded, log_txt, dictionary_file, lexicon_htk)
#pyhtk.create_dictionary_without_log(
# sentence, global_ded, dictionary_file, lexicon_htk)
## ======================= make label file =======================
if make_mlf:
print("==== make mlf ====\n")
print("generating word level transcription...\n")
for dataset in dataset_list:
hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
script_list = FAME_dir + '\\data\\' + dataset + '\\text'
mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
timer_start = time.time()
print("==== generating word level transcription on dataset {}\n".format(dataset))
# lexicon
lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
# list of features
with open(hcompv_scp) as fin:
features = fin.read()
features = features.split('\n')
#hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
#hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
#mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
#mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
dictionary_file = os.path.join(wav_dir, 'temp.dic')
# list of scripts
with open(script_list, "rt", encoding="utf-8") as fin:
scripts = fin.read()
scripts = pd.Series(scripts.split('\n'))
scripts = fin.read().split('\n')
i = 0
missing_words = []
fscp = open(hcompv_scp2, 'wt')
fmlf = open(mlf_word, "wt", encoding="utf-8")
fmlf.write("#!MLF!#\n")
feature_nr = 1
for feature in features:
sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
sys.stdout.flush()
feature_nr += 1
file_basename = os.path.basename(feature).replace('.mfc', '')
for line in scripts:
#for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
# sample line:
# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
filename_ = line.split(' ')[0]
filename = '_'.join(filename_.split('_')[1:])
sentence = ' '.join(line.split(' ')[1:])
# get words from scripts.
try:
script = scripts[scripts.str.contains(file_basename)]
except IndexError:
script = []
wav_file = os.path.join(wav_dir, filename + '.wav')
if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0:
try:
sentence_ascii = bytes(sentence, 'ascii')
except UnicodeEncodeError:
print(sentence)
#if os.path.exists(wav_file):
# #dictionary_file = os.path.join(wav_dir, filename + '.dic')
# if pyhtk.create_dictionary_without_log(
# sentence, global_ded, dictionary_file, lexicon_htk) == 0:
# # when the file name is too long, HDMan command does not work.
# # therefore first temporary dictionary_file is made, then renamed.
# shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
# label_file = os.path.join(wav_dir, filename + '.lab')
# pyhtk.create_label_file(sentence, label_file)
# else:
# os.remove(dictionary_file)
print("elapsed time: {}".format(time.time() - timer_start))
# lexicon
#lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
if len(script) != 0:
script_id = script.index[0]
script_txt = script.get(script_id)
script_words = script_txt.split(' ')
del script_words[0]
# list of features
#with open(hcompv_scp) as fin:
# features = fin.read()
# features = features.split('\n')
#i = 0
#missing_words = []
#fscp = open(hcompv_scp2, 'wt')
#fmlf = open(mlf_word, "wt", encoding="utf-8")
#fmlf.write("#!MLF!#\n")
#feature_nr = 1
#for feature in features:
# sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
# sys.stdout.flush()
# feature_nr += 1
# file_basename = os.path.basename(feature).replace('.mfc', '')
# # get words from scripts.
# try:
# script = scripts[scripts.str.contains(file_basename)]
# except IndexError:
# script = []
# if len(script) != 0:
# script_id = script.index[0]
# script_txt = script.get(script_id)
# script_words = script_txt.split(' ')
# del script_words[0]
# check if all words can be found in the lexicon.
SCRIPT_WORDS = []
script_prons = []
is_in_lexicon = 1
for word in script_words:
WORD = word.upper()
SCRIPT_WORDS.append(WORD)
extracted = lexicon_htk[lexicon_htk['word']==WORD]
if len(extracted) == 0:
missing_words.append(word)
script_prons.append(extracted)
is_in_lexicon *= len(extracted)
# SCRIPT_WORDS = []
# script_prons = []
# is_in_lexicon = 1
# for word in script_words:
# WORD = word.upper()
# SCRIPT_WORDS.append(WORD)
# extracted = lexicon_htk[lexicon_htk['word']==WORD]
# if len(extracted) == 0:
# missing_words.append(word)
# script_prons.append(extracted)
# is_in_lexicon *= len(extracted)
# if all pronunciations are found in the lexicon, update scp and mlf files.
if is_in_lexicon:
# if is_in_lexicon:
# add the feature filename into the .scp file.
fscp.write("{}\n".format(feature))
i += 1
# fscp.write("{}\n".format(feature))
# i += 1
# add the words to the mlf file.
fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
# fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
for word_ in SCRIPT_WORDS:
if word_[0] == '\'':
word_ = '\\' + word_
fmlf.write('{}\n'.format(word_))
fmlf.write('.\n')
print("\n{0} has {1} samples.\n".format(dataset, i))
np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
# for word_ in SCRIPT_WORDS:
# if word_[0] == '\'':
# word_ = '\\' + word_
# fmlf.write('{}\n'.format(word_))
# fmlf.write('.\n')
# print("\n{0} has {1} samples.\n".format(dataset, i))
# np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
fscp.close()
fmlf.close()
# fscp.close()
# fmlf.close()
## generate phone level transcription
print("generating phone level transcription...\n")
mkphones = output_dir + '\\label\\mkphones0.txt'
subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
subprocess.call(subprocessStr, shell=True)
# print("generating phone level transcription...\n")
# mkphones = output_dir + '\\label\\mkphones0.txt'
# subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
# subprocess.call(subprocessStr, shell=True)
## ======================= combined scps and mlfs =======================