fix the bug where the lexicon contains characters that cannot be encoded in ascii.
parent dc6b7b84b6
commit 22cccfb61d
Binary file not shown.
Binary file not shown.
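The check behind the commit message is simply an attempted ASCII encode: any character that raises UnicodeEncodeError cannot be written to an ASCII-only HTK lexicon. A minimal sketch of that check (illustrative only, not part of the diff below; the sample word is one quoted in the changed code):

    for ch in "brúntsje":
        try:
            ch.encode("ascii")
        except UnicodeEncodeError:
            print(">>> {}".format(ch))   # e.g. 'ú' is not representable in ascii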
@ -4,8 +4,7 @@
     <SchemaVersion>2.0</SchemaVersion>
     <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
     <ProjectHome>.</ProjectHome>
-    <StartupFile>
-    </StartupFile>
+    <StartupFile>fame_hmm.py</StartupFile>
     <SearchPath>
     </SearchPath>
     <WorkingDirectory>.</WorkingDirectory>
@ -39,11 +39,11 @@ toolbox_dir = os.path.join(repo_dir, 'toolbox')
 #config_hvite = os.path.join(htk_config_dir, 'config.HVite')
 #acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
 #acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')

 WSL_dir = r'C:\OneDrive\WSL'
 #fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'd:\_corpus\fame'
+fame_dir = r'c:\OneDrive\Research\rug\_data\FAME'

 fame_s5_dir = os.path.join(fame_dir, 's5')
 fame_corpus_dir = os.path.join(fame_dir, 'corpus')
@ -290,15 +290,17 @@ def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):

     """
     lex_asr = load_lexicon(lexicon_file_asr)
+    def word2htk_(row):
+        return word2htk(row['word'])
     def asr2htk_space_delimited_(row):
         return asr2htk_space_delimited(row['pronunciation'])

     lex_htk = pd.DataFrame({
-        'word': lex_asr['word'],
+        'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
         'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
         })
     lex_htk = lex_htk.ix[:, ['word', 'pronunciation']]
-    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
     return


@ -316,20 +318,26 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
     lex2 = load_lexicon(lexicon_file2)
     lex = pd.concat([lex1, lex2])
     lex = lex.sort_values(by='word', ascending=True)
-    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')


 def fix_single_quote(lexicon_file):
     """ add '\' before all single quote at the beginning of words.
+    convert special characters to ascii compatible characters.

     Args:
         lexicon_file (path): lexicon file, which will be overwitten.

     """
     lex = load_lexicon(lexicon_file)
+    lex = lex.dropna() # remove N/A.
     for i in lex[lex['word'].str.startswith('\'')].index.values:
         lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
     # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
     #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-    lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+    lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
     return


+def word2htk(word):
+    return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
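The new word2htk helper maps each character through fame_asr.translation_key_word2htk (the table added later in this commit), leaving plain characters untouched, so the HTK lexicon ends up with ASCII-only word forms. A rough usage sketch, assuming the translation table is importable as shown in the diff:

    from phoneset import fame_asr

    def word2htk(word):
        return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])

    print(word2htk('brúntsje').upper())   # 'ú' -> 'u1', giving BRU1NTSJE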
@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

 import tempfile
+import shutil
 #import configparser
 #import subprocess
 import time
@ -11,6 +12,7 @@ import numpy as np
 import pandas as pd

 import fame_functions
+from phoneset import fame_ipa, fame_asr
 import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 import file_handling as fh
@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']

 # procedure
 extract_features = 0
-make_lexicon = 0
+make_lexicon = 1
 make_mlf = 0
 combine_files = 0
 flat_start = 0
@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
 lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
 lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')

+global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
+
+
 #hcompv_scp = output_dir + '\\scp\\combined.scp'
 #combined_mlf = output_dir + '\\label\\combined.mlf'

@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
 tmp_dir = os.path.join(default.htk_dir, 'tmp')
 if not os.path.exists(tmp_dir):
     os.makedirs(tmp_dir)
+label_dir = os.path.join(default.htk_dir, 'label')
+if not os.path.exists(label_dir):
+    os.makedirs(label_dir)



 ## ======================= extract features =======================
 if extract_features:
-    print('==== extract features ====\n')

     for dataset in dataset_list:
-        print('==== dataset: {} ===='.format(dataset))
+        print('==== extract features on dataset {} ====\n'.format(dataset))

         # a script file for HCopy
         print(">>> making a script file for HCopy... \n")
@ -89,6 +97,8 @@ if extract_features:
         hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
         fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')

+        os.remove(hcopy_scp.name)
+

 ## ======================= make lexicon for HTK =======================
 if make_lexicon:
@ -114,94 +124,132 @@ if make_lexicon:
     fame_functions.fix_single_quote(lexicon_htk)


+## ======================= make phonelist =======================
+#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
+#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)
+#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser'
+#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt')
+#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic')
+#pyhtk.create_dictionary(
+# sentence, global_ded, log_txt, dictionary_file, lexicon_htk)
+#pyhtk.create_dictionary_without_log(
+# sentence, global_ded, dictionary_file, lexicon_htk)


 ## ======================= make label file =======================
 if make_mlf:
-    print("==== make mlf ====\n")

-    print("generating word level transcription...\n")
     for dataset in dataset_list:
-        hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
-        hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
-        script_list = FAME_dir + '\\data\\' + dataset + '\\text'
-        mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
-        mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+        timer_start = time.time()
+        print("==== generating word level transcription on dataset {}\n".format(dataset))

-        # lexicon
-        lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-        # list of features
-        with open(hcompv_scp) as fin:
-            features = fin.read()
-            features = features.split('\n')
+        #hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
+        #hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+        script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+        #mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
+        #mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+        wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+        dictionary_file = os.path.join(wav_dir, 'temp.dic')

         # list of scripts
         with open(script_list, "rt", encoding="utf-8") as fin:
-            scripts = fin.read()
-            scripts = pd.Series(scripts.split('\n'))
+            scripts = fin.read().split('\n')

-        i = 0
-        missing_words = []
-        fscp = open(hcompv_scp2, 'wt')
-        fmlf = open(mlf_word, "wt", encoding="utf-8")
-        fmlf.write("#!MLF!#\n")
-        feature_nr = 1
-        for feature in features:
-            sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
-            sys.stdout.flush()
-            feature_nr += 1
-            file_basename = os.path.basename(feature).replace('.mfc', '')
+        for line in scripts:
+        #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
+            # sample line:
+            # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
+            filename_ = line.split(' ')[0]
+            filename = '_'.join(filename_.split('_')[1:])
+            sentence = ' '.join(line.split(' ')[1:])

-            # get words from scripts.
+            wav_file = os.path.join(wav_dir, filename + '.wav')
+            if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0:
             try:
-                script = scripts[scripts.str.contains(file_basename)]
-            except IndexError:
-                script = []
+                sentence_ascii = bytes(sentence, 'ascii')
+            except UnicodeEncodeError:
+                print(sentence)
+            #if os.path.exists(wav_file):
+            # #dictionary_file = os.path.join(wav_dir, filename + '.dic')
+            # if pyhtk.create_dictionary_without_log(
+            # sentence, global_ded, dictionary_file, lexicon_htk) == 0:
+            # # when the file name is too long, HDMan command does not work.
+            # # therefore first temporary dictionary_file is made, then renamed.
+            # shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
+            # label_file = os.path.join(wav_dir, filename + '.lab')
+            # pyhtk.create_label_file(sentence, label_file)
+            # else:
+            # os.remove(dictionary_file)
+        print("elapsed time: {}".format(time.time() - timer_start))
+        # lexicon
+        #lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])

-            if len(script) != 0:
-                script_id = script.index[0]
-                script_txt = script.get(script_id)
-                script_words = script_txt.split(' ')
-                del script_words[0]
+        # list of features
+        #with open(hcompv_scp) as fin:
+        # features = fin.read()
+        # features = features.split('\n')
+        #i = 0
+        #missing_words = []
+        #fscp = open(hcompv_scp2, 'wt')
+        #fmlf = open(mlf_word, "wt", encoding="utf-8")
+        #fmlf.write("#!MLF!#\n")
+        #feature_nr = 1
+        #for feature in features:
+        # sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
+        # sys.stdout.flush()
+        # feature_nr += 1
+        # file_basename = os.path.basename(feature).replace('.mfc', '')

+        # # get words from scripts.
+        # try:
+        # script = scripts[scripts.str.contains(file_basename)]
+        # except IndexError:
+        # script = []

+        # if len(script) != 0:
+        # script_id = script.index[0]
+        # script_txt = script.get(script_id)
+        # script_words = script_txt.split(' ')
+        # del script_words[0]

                 # check if all words can be found in the lexicon.
-                SCRIPT_WORDS = []
-                script_prons = []
-                is_in_lexicon = 1
-                for word in script_words:
-                    WORD = word.upper()
-                    SCRIPT_WORDS.append(WORD)
-                    extracted = lexicon_htk[lexicon_htk['word']==WORD]
-                    if len(extracted) == 0:
-                        missing_words.append(word)
-                    script_prons.append(extracted)
-                    is_in_lexicon *= len(extracted)
+        # SCRIPT_WORDS = []
+        # script_prons = []
+        # is_in_lexicon = 1
+        # for word in script_words:
+        # WORD = word.upper()
+        # SCRIPT_WORDS.append(WORD)
+        # extracted = lexicon_htk[lexicon_htk['word']==WORD]
+        # if len(extracted) == 0:
+        # missing_words.append(word)
+        # script_prons.append(extracted)
+        # is_in_lexicon *= len(extracted)

                 # if all pronunciations are found in the lexicon, update scp and mlf files.
-                if is_in_lexicon:
+        # if is_in_lexicon:
                     # add the feature filename into the .scp file.
-                    fscp.write("{}\n".format(feature))
-                    i += 1
+        # fscp.write("{}\n".format(feature))
+        # i += 1

                     # add the words to the mlf file.
-                    fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
+        # fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
                     #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
-                    for word_ in SCRIPT_WORDS:
-                        if word_[0] == '\'':
-                            word_ = '\\' + word_
-                        fmlf.write('{}\n'.format(word_))
-                    fmlf.write('.\n')
-        print("\n{0} has {1} samples.\n".format(dataset, i))
-        np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
+        # for word_ in SCRIPT_WORDS:
+        # if word_[0] == '\'':
+        # word_ = '\\' + word_
+        # fmlf.write('{}\n'.format(word_))
+        # fmlf.write('.\n')
+        # print("\n{0} has {1} samples.\n".format(dataset, i))
+        # np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)

-        fscp.close()
-        fmlf.close()
+        # fscp.close()
+        # fmlf.close()


     ## generate phone level transcription
-    print("generating phone level transcription...\n")
-    mkphones = output_dir + '\\label\\mkphones0.txt'
-    subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
-    subprocess.call(subprocessStr, shell=True)
+    # print("generating phone level transcription...\n")
+    # mkphones = output_dir + '\\label\\mkphones0.txt'
+    # subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
+    # subprocess.call(subprocessStr, shell=True)


 ## ======================= combined scps and mlfs =======================
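In the reworked make_mlf block above, each line of the transcription file is split into a file id and a sentence, and sentences that still contain characters outside ASCII are reported by attempting bytes(sentence, 'ascii'). A condensed sketch of that pattern, using the sample line quoted in the diff:

    line = 'sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik'
    filename = '_'.join(line.split(' ')[0].split('_')[1:])
    sentence = ' '.join(line.split(' ')[1:])
    try:
        bytes(sentence, 'ascii')
    except UnicodeEncodeError:
        print(sentence)   # flag transcriptions with characters outside ascii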
@ -3,6 +3,7 @@ import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 from collections import Counter
 import time
+import re

 import numpy as np
 import pandas as pd
@ -82,22 +83,52 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)


 ## check if all the phones in lexicon.htk are in fame_asr.py.
+#timer_start = time.time()
+#phoneset_htk = fame_asr.phoneset_htk
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+#phoneset_lex.remove('')
+#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+# set(phoneset_htk) - set(phoneset_lex)))
+#print("elapsed time: {}".format(time.time() - timer_start))

+## statistics over the lexicon
+#lex_htk = fame_functions.load_lexicon(lexicon_htk)
+#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+#c = Counter(phones_all)

+#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+## to_csv does not work with space seperator. therefore all tabs should manually be replaced.
+##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')


+## check which letters are not coded in ascii.
+print('asr phones which cannot be coded in ascii:\n')
+for i in fame_asr.phoneset_short:
+    try:
+        i_encoded = i.encode("ascii")
+        #print("{0} --> {1}".format(i, i.encode("ascii")))
+    except UnicodeEncodeError:
+        print(">>> {}".format(i))

+print("letters in the scripts which is not coded in ascii:\n")
+for dataset in ['train', 'devel', 'test']:
     timer_start = time.time()
-    phoneset_htk = fame_asr.phoneset_htk
-    phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
-    phoneset_lex.remove('')
-    print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
-        set(phoneset_htk) - set(phoneset_lex)))
-    print("elapsed time: {}".format(time.time() - timer_start))

-    # statistics over the lexicon
-    lex_htk = fame_functions.load_lexicon(lexicon_htk)
-    phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
-    c = Counter(phones_all)
+    script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+    with open(script_list, "rt", encoding="utf-8") as fin:
+        scripts = fin.read().split('\n')
+    for line in scripts:
+        sentence = ' '.join(line.split(' ')[1:])
+        sentence_htk = fame_functions.word2htk(sentence)

+        #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
+        try:
+            sentence_htk = bytes(sentence_htk, 'ascii')
+        except UnicodeEncodeError:
+            print(sentence)
+            print(sentence_htk)

-    lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
-    for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
-        lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
-    # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
-    #lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-    lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
@ -103,12 +103,22 @@ translation_key_asr2htk = {
 }
 phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]

-## check
-#for i in phoneset_short:
-# try:
-# print("{0} --> {1}".format(i, i.encode("ascii")))
-# except UnicodeEncodeError:
-# print(">>> {}".format(i))
+#not_in_ascii = [
+# '\'',
+# 'â', 'ê', 'ô', 'û', 'č',
+# 'à', 'í', 'é', 'è', 'ú', 'ć',
+# 'ä', 'ë', 'ï', 'ö', 'ü'
+#]
+translation_key_word2htk = {
+    '\'': '\\\'',
+    'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
+    'à':'a2', 'è':'e2',
+    'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
+    'č':'c4',
+    'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
+}
+#[translation_key_word2htk.get(i, i) for i in not_in_ascii]



 ## the list of multi character phones.
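Every substitute in translation_key_word2htk is plain ASCII: digits mark the diacritic (acute becomes 1, grave 2, circumflex 3, caron 4) and the diaereses map to two-letter vowel spellings ('ao', 'ee', 'ie', 'oe', 'ue'). A quick sanity check, as a sketch:

    for original, substitute in translation_key_word2htk.items():
        substitute.encode('ascii')   # would raise UnicodeEncodeError if a substitute were not ascii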
0 acoustic_model/test.txt Normal file