Fix the bug where the lexicon contains characters that cannot be encoded in ASCII.

yemaozi88 2019-02-03 00:34:35 +01:00
parent dc6b7b84b6
commit 22cccfb61d
9 changed files with 199 additions and 103 deletions

Binary file not shown.


@@ -4,8 +4,7 @@
     <SchemaVersion>2.0</SchemaVersion>
     <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
     <ProjectHome>.</ProjectHome>
-    <StartupFile>
-    </StartupFile>
+    <StartupFile>fame_hmm.py</StartupFile>
     <SearchPath>
     </SearchPath>
     <WorkingDirectory>.</WorkingDirectory>


@@ -39,11 +39,11 @@ toolbox_dir = os.path.join(repo_dir, 'toolbox')
 #config_hvite = os.path.join(htk_config_dir, 'config.HVite')
 #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
 #acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')
 
 WSL_dir = r'C:\OneDrive\WSL'
 #fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'd:\_corpus\fame'
+fame_dir = r'c:\OneDrive\Research\rug\_data\FAME'
 fame_s5_dir = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')


@@ -290,15 +290,17 @@ def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
     """
     lex_asr = load_lexicon(lexicon_file_asr)
 
+    def word2htk_(row):
+        return word2htk(row['word'])
     def asr2htk_space_delimited_(row):
         return asr2htk_space_delimited(row['pronunciation'])
 
     lex_htk = pd.DataFrame({
-        'word': lex_asr['word'],
+        'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
        'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
     })
     lex_htk = lex_htk.ix[:, ['word', 'pronunciation']]
 
-    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
 
     return
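A side note on the hunk above: the HTK-side word forms are now produced by running each word through word2htk before upper-casing, and the lexicon is written with an explicit encoding. Below is a minimal, self-contained sketch of the same pattern with toy data and a made-up translation table (not the repository's real fame_asr mapping); note that .ix is deprecated in recent pandas, so the sketch uses plain column selection instead.

import pandas as pd

# hypothetical, simplified stand-in for fame_asr.translation_key_word2htk
toy_word2htk = {'ú': 'u1', 'â': 'a3'}

def word2htk(word):
    # replace characters that cannot be encoded in ascii by ascii-safe codes
    return ''.join(toy_word2htk.get(c, c) for c in word)

# toy lexicon; the pronunciations are made up for illustration
lex_asr = pd.DataFrame({
    'word': ['brúntsje', 'dei'],
    'pronunciation': ['b r u n t s j @', 'd a i']
})

lex_htk = pd.DataFrame({
    'word': lex_asr['word'].apply(word2htk).str.upper(),
    'pronunciation': lex_asr['pronunciation']
})
lex_htk = lex_htk[['word', 'pronunciation']]
lex_htk.to_csv('lex.htk', header=None, index=None, sep='\t', encoding='utf-8')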
@@ -316,20 +318,26 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
     lex2 = load_lexicon(lexicon_file2)
     lex = pd.concat([lex1, lex2])
     lex = lex.sort_values(by='word', ascending=True)
-    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')
 
 
 def fix_single_quote(lexicon_file):
     """ add '\' before all single quote at the beginning of words.
+    convert special characters to ascii compatible characters.
 
     Args:
         lexicon_file (path): lexicon file, which will be overwitten.
 
     """
     lex = load_lexicon(lexicon_file)
+    lex = lex.dropna() # remove N/A.
     for i in lex[lex['word'].str.startswith('\'')].index.values:
         lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
     # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
     #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
 
     return
+
+
+def word2htk(word):
+    return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
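For context, here is a rough standalone sketch of what the updated fix_single_quote is meant to guarantee (toy data, not the repository code): empty rows are dropped before the strings are touched, and a leading apostrophe is escaped so HTK dictionary tools do not misparse the word field.

import pandas as pd

# toy lexicon with a word starting with an apostrophe and an empty row
lex = pd.DataFrame({
    'word': ["'t", 'wetter', None],
    'pronunciation': ['@ t', 'w E t @ r', None]
})

lex = lex.dropna()  # remove N/A rows first
mask = lex['word'].str.startswith("'")
lex.loc[mask, 'word'] = '\\' + lex.loc[mask, 'word']

print(lex['word'].tolist())  # ["\\'t", 'wetter']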


@@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 
 import tempfile
+import shutil
 #import configparser
 #import subprocess
 import time
@@ -11,6 +12,7 @@ import numpy as np
 import pandas as pd
 
 import fame_functions
+from phoneset import fame_ipa, fame_asr
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 import file_handling as fh
@@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']
 
 # procedure
 extract_features = 0
-make_lexicon = 0
+make_lexicon = 1
 make_mlf = 0
 combine_files = 0
 flat_start = 0
@@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
 lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
 lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
 
+global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
+
+
 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
 tmp_dir = os.path.join(default.htk_dir, 'tmp')
 if not os.path.exists(tmp_dir):
     os.makedirs(tmp_dir)
+label_dir = os.path.join(default.htk_dir, 'label')
+if not os.path.exists(label_dir):
+    os.makedirs(label_dir)
 
 
 ## ======================= extract features =======================
 if extract_features:
-    print('==== extract features ====\n')
     for dataset in dataset_list:
-        print('==== dataset: {} ===='.format(dataset))
+        print('==== extract features on dataset {} ====\n'.format(dataset))
 
         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
@@ -89,6 +97,8 @@ if extract_features:
         hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
 
+        os.remove(hcopy_scp.name)
+
 
 ## ======================= make lexicon for HTK =======================
 if make_lexicon:
@@ -114,94 +124,132 @@ if make_lexicon:
     fame_functions.fix_single_quote(lexicon_htk)
 
 
+## ======================= make phonelist =======================
+#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
+#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)
+
+#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser'
+#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt')
+#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic')
+#pyhtk.create_dictionary(
+#    sentence, global_ded, log_txt, dictionary_file, lexicon_htk)
+#pyhtk.create_dictionary_without_log(
+#    sentence, global_ded, dictionary_file, lexicon_htk)
+
+
 ## ======================= make label file =======================
 if make_mlf:
-    print("==== make mlf ====\n")
-    print("generating word level transcription...\n")
     for dataset in dataset_list:
-        hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
-        hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
-        script_list = FAME_dir + '\\data\\' + dataset + '\\text'
-        mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
-        mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+        timer_start = time.time()
+        print("==== generating word level transcription on dataset {}\n".format(dataset))
 
-        # lexicon
-        lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-
-        # list of features
-        with open(hcompv_scp) as fin:
-            features = fin.read()
-            features = features.split('\n')
+        #hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
+        #hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+        script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+        #mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
+        #mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+        wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+        dictionary_file = os.path.join(wav_dir, 'temp.dic')
 
         # list of scripts
         with open(script_list, "rt", encoding="utf-8") as fin:
-            scripts = fin.read()
-            scripts = pd.Series(scripts.split('\n'))
+            scripts = fin.read().split('\n')
 
-        i = 0
-        missing_words = []
-        fscp = open(hcompv_scp2, 'wt')
-        fmlf = open(mlf_word, "wt", encoding="utf-8")
-        fmlf.write("#!MLF!#\n")
-        feature_nr = 1
-        for feature in features:
-            sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
-            sys.stdout.flush()
-            feature_nr += 1
-            file_basename = os.path.basename(feature).replace('.mfc', '')
-
-            # get words from scripts.
-            try:
-                script = scripts[scripts.str.contains(file_basename)]
-            except IndexError:
-                script = []
-
-            if len(script) != 0:
-                script_id = script.index[0]
-                script_txt = script.get(script_id)
-                script_words = script_txt.split(' ')
-                del script_words[0]
-
-                # check if all words can be found in the lexicon.
-                SCRIPT_WORDS = []
-                script_prons = []
-                is_in_lexicon = 1
-                for word in script_words:
-                    WORD = word.upper()
-                    SCRIPT_WORDS.append(WORD)
-                    extracted = lexicon_htk[lexicon_htk['word']==WORD]
-                    if len(extracted) == 0:
-                        missing_words.append(word)
-                    script_prons.append(extracted)
-                    is_in_lexicon *= len(extracted)
-
-                # if all pronunciations are found in the lexicon, update scp and mlf files.
-                if is_in_lexicon:
-                    # add the feature filename into the .scp file.
-                    fscp.write("{}\n".format(feature))
-                    i += 1
-
-                    # add the words to the mlf file.
-                    fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
-                    #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
-                    for word_ in SCRIPT_WORDS:
-                        if word_[0] == '\'':
-                            word_ = '\\' + word_
-                        fmlf.write('{}\n'.format(word_))
-                    fmlf.write('.\n')
-
-        print("\n{0} has {1} samples.\n".format(dataset, i))
-        np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
-        fscp.close()
-        fmlf.close()
+        for line in scripts:
+        #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
+            # sample line:
+            # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
+            filename_ = line.split(' ')[0]
+            filename = '_'.join(filename_.split('_')[1:])
+            sentence = ' '.join(line.split(' ')[1:])
+
+            wav_file = os.path.join(wav_dir, filename + '.wav')
+            if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0:
+                try:
+                    sentence_ascii = bytes(sentence, 'ascii')
+                except UnicodeEncodeError:
+                    print(sentence)
+            #if os.path.exists(wav_file):
+            #    #dictionary_file = os.path.join(wav_dir, filename + '.dic')
+            #    if pyhtk.create_dictionary_without_log(
+            #        sentence, global_ded, dictionary_file, lexicon_htk) == 0:
+            #        # when the file name is too long, HDMan command does not work.
+            #        # therefore first temporary dictionary_file is made, then renamed.
+            #        shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
+            #        label_file = os.path.join(wav_dir, filename + '.lab')
+            #        pyhtk.create_label_file(sentence, label_file)
+            #    else:
+            #        os.remove(dictionary_file)
+        print("elapsed time: {}".format(time.time() - timer_start))
+
+        # lexicon
+        #lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
+
+        # list of features
+        #with open(hcompv_scp) as fin:
+        #    features = fin.read()
+        #    features = features.split('\n')
+
+        #i = 0
+        #missing_words = []
+        #fscp = open(hcompv_scp2, 'wt')
+        #fmlf = open(mlf_word, "wt", encoding="utf-8")
+        #fmlf.write("#!MLF!#\n")
+        #feature_nr = 1
+        #for feature in features:
+        #    sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
+        #    sys.stdout.flush()
+        #    feature_nr += 1
+        #    file_basename = os.path.basename(feature).replace('.mfc', '')
+
+        #    # get words from scripts.
+        #    try:
+        #        script = scripts[scripts.str.contains(file_basename)]
+        #    except IndexError:
+        #        script = []
+
+        #    if len(script) != 0:
+        #        script_id = script.index[0]
+        #        script_txt = script.get(script_id)
+        #        script_words = script_txt.split(' ')
+        #        del script_words[0]
+
+        #        # check if all words can be found in the lexicon.
+        #        SCRIPT_WORDS = []
+        #        script_prons = []
+        #        is_in_lexicon = 1
+        #        for word in script_words:
+        #            WORD = word.upper()
+        #            SCRIPT_WORDS.append(WORD)
+        #            extracted = lexicon_htk[lexicon_htk['word']==WORD]
+        #            if len(extracted) == 0:
+        #                missing_words.append(word)
+        #            script_prons.append(extracted)
+        #            is_in_lexicon *= len(extracted)
+
+        #        # if all pronunciations are found in the lexicon, update scp and mlf files.
+        #        if is_in_lexicon:
+        #            # add the feature filename into the .scp file.
+        #            fscp.write("{}\n".format(feature))
+        #            i += 1
+
+        #            # add the words to the mlf file.
+        #            fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
+        #            #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
+        #            for word_ in SCRIPT_WORDS:
+        #                if word_[0] == '\'':
+        #                    word_ = '\\' + word_
+        #                fmlf.write('{}\n'.format(word_))
+        #            fmlf.write('.\n')
+
+        # print("\n{0} has {1} samples.\n".format(dataset, i))
+        # np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
+        # fscp.close()
+        # fmlf.close()
 
     ## generate phone level transcription
-    print("generating phone level transcription...\n")
-    mkphones = output_dir + '\\label\\mkphones0.txt'
-    subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
-    subprocess.call(subprocessStr, shell=True)
+    # print("generating phone level transcription...\n")
+    # mkphones = output_dir + '\\label\\mkphones0.txt'
+    # subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
+    # subprocess.call(subprocessStr, shell=True)
 
 
 ## ======================= combined scps and mlfs =======================
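The new loop above is essentially a scan for transcriptions that would break ASCII-only HTK tooling. A stripped-down sketch of that check follows; the helper name and the standalone form are mine, not the repository's, and the third input line uses a made-up utterance id with words taken from the diff.

def find_non_ascii_sentences(lines):
    """Yield (utterance_id, sentence) pairs whose sentence cannot be encoded in ascii."""
    for line in lines:
        if not line.strip():
            continue
        utterance_id = line.split(' ')[0]
        sentence = ' '.join(line.split(' ')[1:])
        try:
            bytes(sentence, 'ascii')          # same check as in the commit
        except UnicodeEncodeError:
            yield utterance_id, sentence

lines = [
    'sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik',
    'sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer',
    'toy_fragment_0001 sels brúntsje visser',   # hypothetical id, non-ascii word
]
for utterance_id, sentence in find_non_ascii_sentences(lines):
    print(utterance_id, sentence)   # only the third line is reported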


@@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 from collections import Counter
 import time
+import re
 
 import numpy as np
 import pandas as pd
@@ -82,22 +83,52 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
 ## check if all the phones in lexicon.htk are in fame_asr.py.
-timer_start = time.time()
-phoneset_htk = fame_asr.phoneset_htk
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
-phoneset_lex.remove('')
-print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
-    set(phoneset_htk) - set(phoneset_lex)))
-print("elapsed time: {}".format(time.time() - timer_start))
+#timer_start = time.time()
+#phoneset_htk = fame_asr.phoneset_htk
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+#phoneset_lex.remove('')
+#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+#    set(phoneset_htk) - set(phoneset_lex)))
+#print("elapsed time: {}".format(time.time() - timer_start))
 
-# statistics over the lexicon
-lex_htk = fame_functions.load_lexicon(lexicon_htk)
-phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
-c = Counter(phones_all)
+## statistics over the lexicon
+#lex_htk = fame_functions.load_lexicon(lexicon_htk)
+#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+#c = Counter(phones_all)
 
-lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
-for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
-    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
-# to_csv does not work with space seperator. therefore all tabs should manually be replaced.
-#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+#    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+## to_csv does not work with space seperator. therefore all tabs should manually be replaced.
+##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+## check which letters are not coded in ascii.
+print('asr phones which cannot be coded in ascii:\n')
+for i in fame_asr.phoneset_short:
+    try:
+        i_encoded = i.encode("ascii")
+        #print("{0} --> {1}".format(i, i.encode("ascii")))
+    except UnicodeEncodeError:
+        print(">>> {}".format(i))
+
+print("letters in the scripts which is not coded in ascii:\n")
+for dataset in ['train', 'devel', 'test']:
+    timer_start = time.time()
+
+    script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+    with open(script_list, "rt", encoding="utf-8") as fin:
+        scripts = fin.read().split('\n')
+
+    for line in scripts:
+        sentence = ' '.join(line.split(' ')[1:])
+        sentence_htk = fame_functions.word2htk(sentence)
+
+        #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
+        try:
+            sentence_htk = bytes(sentence_htk, 'ascii')
+        except UnicodeEncodeError:
+            print(sentence)
+            print(sentence_htk)


@@ -103,12 +103,22 @@ translation_key_asr2htk = {
 }
 phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
 
-## check
-#for i in phoneset_short:
-#    try:
-#        print("{0} --> {1}".format(i, i.encode("ascii")))
-#    except UnicodeEncodeError:
-#        print(">>> {}".format(i))
+#not_in_ascii = [
+#    '\'',
+#    'â', 'ê', 'ô', 'û', 'č',
+#    'à', 'í', 'é', 'è', 'ú', 'ć',
+#    'ä', 'ë', 'ï', 'ö', 'ü'
+#]
+translation_key_word2htk = {
+    '\'': '\\\'',
+    'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
+    'à':'a2', 'è':'e2',
+    'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
+    'č':'c4',
+    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
+}
+#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
 
 ## the list of multi character phones.
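To illustrate the mapping above (a quick check added here for the reader, not part of the commit): applying translation_key_word2htk character by character turns a Frisian word with diacritics into a purely ASCII form, which is what word2htk in fame_functions relies on.

translation_key_word2htk = {
    '\'': '\\\'',
    'í': 'i1', 'é': 'e1', 'ú': 'u1', 'ć': 'c1',
    'à': 'a2', 'è': 'e2',
    'â': 'a3', 'ê': 'e3', 'ô': 'o3', 'û': 'u3',
    'č': 'c4',
    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
}

word = 'brúntsje'
word_htk = ''.join(translation_key_word2htk.get(c, c) for c in word)
print(word_htk)                  # bru1ntsje
print(word_htk.encode('ascii'))  # b'bru1ntsje' -- no UnicodeEncodeError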

acoustic_model/test.txt (new file)