|
|
|
@@ -3,6 +3,7 @@ import os

os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import sys
import tempfile
import shutil
import re
import subprocess
import time
|
@@ -11,6 +12,7 @@ import numpy as np
import pandas as pd

import fame_functions
from phoneset import fame_ipa, fame_asr
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
|
@@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']

# procedure
extract_features = 0
make_lexicon = 1
make_mlf = 0
combine_files = 0
flat_start = 0
|
@@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')

#hcompv_scp = output_dir + '\\scp\\combined.scp'
#combined_mlf = output_dir + '\\label\\combined.mlf'
|
@@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
tmp_dir = os.path.join(default.htk_dir, 'tmp')
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
label_dir = os.path.join(default.htk_dir, 'label')
if not os.path.exists(label_dir):
    os.makedirs(label_dir)
|
## ======================= extract features =======================
if extract_features:
    print('==== extract features ====\n')

    for dataset in dataset_list:
        print('==== extract features on dataset {} ====\n'.format(dataset))

        # a script file for HCopy
        print(">>> making a script file for HCopy... \n")
|
@@ -89,6 +97,8 @@ if extract_features:
        hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
        fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')

        os.remove(hcopy_scp.name)
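        # hcompv_scp now lists the extracted feature files for this dataset,
        # one .mfc path per line, for the later HTK steps (e.g. flat_start).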
|
## ======================= make lexicon for HTK =======================
if make_lexicon:
@@ -114,94 +124,132 @@ if make_lexicon:
    fame_functions.fix_single_quote(lexicon_htk)
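    # lex.htk is a tab-separated HTK dictionary, one entry per line, e.g.
    # (illustrative entry, not taken from the corpus):
    #
    #   WURD    w u r d
    #
    # fix_single_quote presumably normalises entries whose word starts with a
    # single quote, which HTK parses specially (see the MLF writing below).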

## ======================= make phonelist =======================
#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)

#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser'
#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt')
#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic')
#pyhtk.create_dictionary(
#    sentence, global_ded, log_txt, dictionary_file, lexicon_htk)
#pyhtk.create_dictionary_without_log(
#    sentence, global_ded, dictionary_file, lexicon_htk)
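# (global.ded, used in the commented calls above and below, is read here as the
# HDMan edit script applied when deriving dictionaries from lex.htk; this
# reading of its role is an assumption based on HTK's HDMan conventions.)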

## ======================= make label file =======================
if make_mlf:
    print("==== make mlf ====\n")

    print("generating word level transcription...\n")
    for dataset in dataset_list:
        # per-dataset file names, rooted in the directories defined above
        # (the scp lists are the ones written by the extract_features step).
        hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
        hcompv_scp2 = os.path.join(tmp_dir, dataset + '_all_words_in_lexicon.scp')
        script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
        mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
        mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
        timer_start = time.time()
        print("==== generating word level transcription on dataset {} ====\n".format(dataset))

        # lexicon: read into its own DataFrame so the lex.htk path in
        # lexicon_htk stays usable on later datasets.
        lexicon_htk_df = pd.read_table(lexicon_htk, names=['word', 'pronunciation'])

        # list of features
        with open(hcompv_scp) as fin:
            features = fin.read().split('\n')

        wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
        dictionary_file = os.path.join(wav_dir, 'temp.dic')

        # list of scripts
        with open(script_list, "rt", encoding="utf-8") as fin:
            scripts = pd.Series(fin.read().split('\n'))

        for line in scripts:
        #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
            # sample line:
            # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
            filename_ = line.split(' ')[0]
            filename = '_'.join(filename_.split('_')[1:])
            sentence = ' '.join(line.split(' ')[1:])

            wav_file = os.path.join(wav_dir, filename + '.wav')
            if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence)) == 0:
                try:
                    sentence_ascii = bytes(sentence, 'ascii')
                except UnicodeEncodeError:
                    print(sentence)
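            # the regex matches words containing Frisian diacritics (â ê û ô ú)
            # or an internal quote; only sentences without such words reach the
            # ascii round-trip, which then prints any line that is still non-ascii.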
            #if os.path.exists(wav_file):
            #    #dictionary_file = os.path.join(wav_dir, filename + '.dic')
            #    if pyhtk.create_dictionary_without_log(
            #            sentence, global_ded, dictionary_file, lexicon_htk) == 0:
            #        # when the file name is too long, the HDMan command does not work,
            #        # so a temporary dictionary_file is made first, then renamed.
            #        shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
            #        label_file = os.path.join(wav_dir, filename + '.lab')
            #        pyhtk.create_label_file(sentence, label_file)
            #    else:
            #        os.remove(dictionary_file)

        print("elapsed time: {}".format(time.time() - timer_start))

        i = 0
        missing_words = []
        fscp = open(hcompv_scp2, 'wt')
        fmlf = open(mlf_word, "wt", encoding="utf-8")
        fmlf.write("#!MLF!#\n")
        feature_nr = 1
        for feature in features:
            sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
            sys.stdout.flush()
            feature_nr += 1
            if len(feature) == 0:
                # skip the empty entry that the trailing newline in the scp list leaves behind.
                continue
            file_basename = os.path.basename(feature).replace('.mfc', '')

            # get words from scripts.
            try:
                script = scripts[scripts.str.contains(file_basename)]
            except IndexError:
                script = []

            if len(script) != 0:
                script_id = script.index[0]
                script_txt = script.get(script_id)
                script_words = script_txt.split(' ')
                del script_words[0]

                # check if all words can be found in the lexicon.
                SCRIPT_WORDS = []
                script_prons = []
                is_in_lexicon = 1
                for word in script_words:
                    WORD = word.upper()
                    SCRIPT_WORDS.append(WORD)
                    extracted = lexicon_htk_df[lexicon_htk_df['word'] == WORD]
                    if len(extracted) == 0:
                        missing_words.append(word)
                    script_prons.append(extracted)
                    is_in_lexicon *= len(extracted)
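                # is_in_lexicon is a product of pronunciation counts, so it is
                # nonzero only when every word has at least one lexicon entry.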

                # if all pronunciations are found in the lexicon, update scp and mlf files.
                if is_in_lexicon:
                    # add the feature filename into the .scp file.
                    fscp.write("{}\n".format(feature))
                    i += 1

                    # add the words to the mlf file.
                    fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
                    #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
                    for word_ in SCRIPT_WORDS:
                        if word_[0] == '\'':
                            # escape a leading single quote for HTK.
                            word_ = '\\' + word_
                        fmlf.write('{}\n'.format(word_))
                    fmlf.write('.\n')
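                    # a written entry then looks like (sketch):
                    #   "*/sp0457m_test_1968_plakkenfryslanterhorne_2168.lab"
                    #   EN
                    #   DAN
                    #   ...
                    #   .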
print("\n{0} has {1} samples.\n".format(dataset, i)) |
|
|
|
|
np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words) |
|
|
|
|
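        # (the saved array can be re-inspected later, e.g. np.load(path).tolist())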

        fscp.close()
        fmlf.close()

        ## generate phone level transcription
        print("generating phone level transcription...\n")
        mkphones = os.path.join(label_dir, 'mkphones0.txt')
        # expand the word-level MLF into a phone-level MLF with HLEd,
        # using the htk lexicon (lex.htk) as the dictionary.
        subprocessStr = r"HLEd -l * -d " + lexicon_htk + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
        subprocess.call(subprocessStr, shell=True)
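        # mkphones0.txt is an HLEd edit script; a typical minimal version
        # (an assumption here, following the HTKBook tutorial) contains:
        #   EX           expand each word into its dictionary pronunciation
        #   IS sil sil   insert silence models at utterance boundaries
        #   DE sp        delete short-pause labels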
# print("generating phone level transcription...\n") |
|
|
|
|
# mkphones = output_dir + '\\label\\mkphones0.txt' |
|
|
|
|
# subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word |
|
|
|
|
# subprocess.call(subprocessStr, shell=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|

## ======================= combine scps and mlfs =======================