novoapi_functions.py is adjusted to use convert_phoneset.py.

This commit is contained in:
yemaozi88 2019-04-22 00:59:53 +02:00
parent b444b70af9
commit 2004399179
5 changed files with 283 additions and 132 deletions

Binary file not shown.

View File

@@ -14,18 +14,20 @@ def multi_character_tokenize(line, multi_character_tokens):
            line = line[1:]
 
 
-def split_word(word, multi_character_phones):
+def split_word(word, phoneset):
     """
     split a word by the given phoneset.
 
     Args:
         word (str): a word written in the given phoneset.
-        multi_character_phones (list): the list of multi-character phones which are treated as one phone. this can be obtained with a phoneset definition such as fame_ipa.py.
+        #multi_character_phones (list): the list of multi-character phones which are treated as one phone. this can be obtained with a phoneset definition such as fame_ipa.py.
+        phoneset (list): the list of phones.
 
     Returns:
         (word_separated) (list): the word split into the phones of the given phoneset.
     """
+    multi_character_phones = extract_multi_character_phones(phoneset)
     return [phone
             for phone in multi_character_tokenize(word.strip(), multi_character_phones)
             ]
@@ -43,4 +45,14 @@ def convert_phoneset(word_list, translation_key):
 
 def phone_reduction(phones, reduction_key):
     multi_character_tokenize(wo.strip(), multi_character_phones)
     return [reduction_key.get(i, i) for i in phones
             if not i in phones_to_be_removed]
+
+
+def extract_multi_character_phones(phoneset):
+    """
+    Args:
+        phoneset (list):
+    """
+    multi_character_phones = [i for i in phoneset if len(i) > 1]
+    multi_character_phones.sort(key=len, reverse=True)
+    return multi_character_phones
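
For reference, a minimal self-contained sketch of how the reworked split_word() is expected to behave. The phone list below is an illustrative toy subset, not the real fame_ipa phoneset, and the helpers are simplified stand-ins for the module functions:

    # toy phoneset; multi-character phones must win over their single-character prefixes
    phoneset = ['a', 'aː', 'o', 'oː', 'ts', 't', 's', 'j', 'ə', 'm', 'n']

    def extract_multi_character_phones(phoneset):
        multi = [p for p in phoneset if len(p) > 1]
        multi.sort(key=len, reverse=True)          # longest first, so greedy matching prefers them
        return multi

    def multi_character_tokenize(line, multi_character_tokens):
        while line != '':
            for token in multi_character_tokens:   # try multi-character phones first
                if line.startswith(token):
                    yield token
                    line = line[len(token):]
                    break
            else:                                  # no multi-character phone matched
                yield line[:1]
                line = line[1:]

    def split_word(word, phoneset):
        multi = extract_multi_character_phones(phoneset)
        return list(multi_character_tokenize(word.strip(), multi))

    print(split_word('moːntsjə', phoneset))        # ['m', 'oː', 'n', 'ts', 'j', 'ə']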

View File

@@ -352,6 +352,9 @@ def fix_lexicon(lexicon_file):
     return
 
 
+#def add_sp_to_lexicon(lexicon_file):
+
+
 def word2htk(word):
     return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
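
A tiny illustration of the dict.get(i, i) fallback that word2htk() relies on. fame_asr.translation_key_word2htk is not shown in this diff, so the two-entry mapping below is hypothetical:

    # hypothetical mapping: characters without an entry pass through unchanged
    translation_key_word2htk = {'â': 'A2', 'ô': 'O2'}

    def word2htk(word):
        return ''.join([translation_key_word2htk.get(i, i) for i in word])

    print(word2htk('hôf'))   # 'hO2f'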

View File

@@ -16,50 +16,53 @@ import defaultfiles as default
 sys.path.append(default.toolbox_dir)
 import file_handling as fh
 from htk import pyhtk
+#from scripts import run_command
 
 
 ## ======================= user define =======================
 # procedure
+combine_all = 1
 make_lexicon = 0
 make_label = 0 # it takes roughly 4800 sec on Surface pro 2.
 make_mlf = 0
 extract_features = 0
-flat_start = 0
-train_monophone_without_sp = 0
-add_sp = 0
-train_monophone_with_re_aligned_mlf = 0
+flat_start = 1
+train_monophone_without_sp = 1
+add_sp = 1
+train_monophone_with_re_aligned_mlf = 1
+increase_mixture = 1
 train_triphone = 0
-train_triphone_tied = 1
+train_triphone_tied = 0
 
 
 # pre-defined values.
 dataset_list = ['devel', 'test', 'train']
-feature_size = 39
+feature_size = 30
 improvement_threshold = 0.3
 
-hmmdefs_name = 'hmmdefs'
-proto_name = 'proto'
 
 lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
 lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
 
 config_dir = os.path.join(default.htk_dir, 'config')
 phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt')
 tree_hed = os.path.join(config_dir, 'tree.hed')
-quest_hed = os.path.join(config_dir, 'quests.hed')
+quests_hed = os.path.join(config_dir, 'quests.hed')
 
 model_dir = os.path.join(default.htk_dir, 'model')
 model_mono0_dir = os.path.join(model_dir, 'mono0')
 model_mono1_dir = os.path.join(model_dir, 'mono1')
 model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
 model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
 model_tri1_dir = os.path.join(model_dir, 'tri1')
+model_tri1tied_dir = os.path.join(model_dir, 'tri1tied')
 
 # directories / files to be made.
 lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
 lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
 lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
 lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
+lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
 lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk')
 
 feature_dir = os.path.join(default.htk_dir, 'mfc')
@@ -71,10 +74,20 @@ fh.make_new_directory(label_dir, existing_dir='leave')
 
 ## training
-hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
-mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
-mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
-mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
+if combine_all:
+    hcompv_scp_train = os.path.join(tmp_dir, 'all.scp')
+    mlf_file_train = os.path.join(label_dir, 'all_phone.mlf')
+    mlf_file_train_word = os.path.join(label_dir, 'all_word.mlf')
+    mlf_file_train_with_sp = os.path.join(label_dir, 'all_phone_with_sp.mlf')
+    mlf_file_train_aligned = os.path.join(label_dir, 'all_phone_aligned.mlf')
+    triphone_mlf = os.path.join(label_dir, 'all_triphone.mlf')
+else:
+    hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
+    mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
+    mlf_file_train_word = os.path.join(label_dir, 'train_word.mlf')
+    mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
+    mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
+    triphone_mlf = os.path.join(label_dir, 'train_triphone.mlf')
 
 hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')
 
 
 ## testing
@@ -104,19 +117,18 @@ if make_lexicon:
     print('>>> fixing the lexicon...')
     fame_functions.fix_lexicon(lexicon_htk)
 
-    ## add sp to the end of each line.
-    #print('>>> adding sp...')
-    #with open(lexicon_htk) as f:
-    #    lines = f.read().split('\n')
-    #lines = [line + ' sp' for line in lines]
-    #with open(lexicon_htk_with_sp, 'wb') as f:
-    #    f.write(bytes('\n'.join(lines), 'ascii'))
+    ## adding sp to the lexicon for HTK.
+    print('>>> adding sp to the lexicon...')
+    with open(lexicon_htk) as f:
+        lines = f.read().split('\n')
+    with open(lexicon_htk_with_sp, 'wb') as f:
+        f.write(bytes(' sp\n'.join(lines), 'ascii'))
 
     print("elapsed time: {}".format(time.time() - timer_start))
 
 
 ## initialize the instance for HTK.
-chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk, feature_size)
+chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk_with_sp, feature_size)
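
The ' sp\n'.join(lines) above relies on lex.htk ending with a newline: split('\n') then yields a trailing empty element, so every real entry gets ' sp' appended. A small self-contained sketch of that behaviour, on toy entries rather than the real lexicon:

    lexicon_text = 'wurd w u r d\nhûs h u: s\n'          # toy lexicon ending with a newline
    lines = lexicon_text.split('\n')                     # ['wurd w u r d', 'hûs h u: s', '']
    print(' sp\n'.join(lines))
    # wurd w u r d sp
    # hûs h u: s sp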
 ## ======================= make label files =======================
@@ -152,7 +164,7 @@ if make_label:
                 shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))
 
                 label_file = os.path.join(label_dir_, filename + '.lab')
-                chtk.create_label_file(sentence_htk, label_file)
+                chtk.make_label_file(sentence_htk, label_file)
             else:
                 os.remove(dictionary_file)
@@ -174,7 +186,6 @@ if make_mlf:
     os.remove(empty_dic_file)
 
     for dataset in dataset_list:
-        #wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
         feature_dir_ = os.path.join(feature_dir, dataset)
         label_dir_ = os.path.join(label_dir, dataset)
         mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
@@ -183,11 +194,11 @@ if make_mlf:
         print(">>> generating a word level mlf file for {}...".format(dataset))
         chtk.label2mlf(label_dir_, mlf_word)
 
         print(">>> generating a phone level mlf file for {}...".format(dataset))
         chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
         chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)
 
     print("elapsed time: {}".format(time.time() - timer_start))
@@ -197,7 +208,7 @@ if extract_features:
         timer_start = time.time()
         print('==== extract features on dataset {} ===='.format(dataset))
 
         wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
         label_dir_ = os.path.join(label_dir, dataset)
         feature_dir_ = os.path.join(feature_dir, dataset)
         fh.make_new_directory(feature_dir_, existing_dir='delete')
@@ -217,8 +228,8 @@ if extract_features:
                         + os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
                         for lab_file in lab_list]
 
-        if os.path.exists(empty_mfc_file):
-            os.remove(empty_mfc_file)
+        #if os.path.exists(empty_mfc_file):
+        #    os.remove(empty_mfc_file)
 
         with open(hcopy_scp.name, 'wb') as f:
             f.write(bytes('\n'.join(feature_list), 'ascii'))
@@ -235,9 +246,64 @@ if extract_features:
         with open(hcompv_scp, 'wb') as f:
             f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
 
+    print(">>> extracting features on stimmen...")
+    chtk.wav2mfc(os.path.join(htk_stimmen_dir, 'hcopy.scp'))
+
     print("elapsed time: {}".format(time.time() - timer_start))
 
 
+## ======================= flat start monophones =======================
+if combine_all:
+    # script files.
+    fh.concatenate(
+        os.path.join(tmp_dir, 'devel.scp'),
+        os.path.join(tmp_dir, 'test.scp'),
+        hcompv_scp_train
+    )
+    fh.concatenate(
+        hcompv_scp_train,
+        os.path.join(tmp_dir, 'train.scp'),
+        hcompv_scp_train
+    )
+
+    # phone level mlfs.
+    fh.concatenate(
+        os.path.join(label_dir, 'devel_phone.mlf'),
+        os.path.join(label_dir, 'test_phone.mlf'),
+        mlf_file_train
+    )
+    fh.concatenate(
+        mlf_file_train,
+        os.path.join(label_dir, 'train_phone.mlf'),
+        mlf_file_train
+    )
+
+    # phone level mlfs with sp.
+    fh.concatenate(
+        os.path.join(label_dir, 'devel_phone_with_sp.mlf'),
+        os.path.join(label_dir, 'test_phone_with_sp.mlf'),
+        mlf_file_train_with_sp
+    )
+    fh.concatenate(
+        mlf_file_train_with_sp,
+        os.path.join(label_dir, 'train_phone_with_sp.mlf'),
+        mlf_file_train_with_sp
+    )
+
+    # word level mlfs.
+    fh.concatenate(
+        os.path.join(label_dir, 'devel_word.mlf'),
+        os.path.join(label_dir, 'test_word.mlf'),
+        mlf_file_train_word
+    )
+    fh.concatenate(
+        mlf_file_train_word,
+        os.path.join(label_dir, 'train_word.mlf'),
+        mlf_file_train_word
+    )
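
The block above builds the combined 'all' script and mlf files in two passes: devel plus test into the output file, then that output plus train back into the same file. fh.concatenate comes from the project's file_handling module and its implementation is not shown in this diff; the sketch below assumes it simply joins two text files into a third, reading both inputs before the output is reopened so that the output may safely equal one of the inputs:

    import os, tempfile

    def concatenate(file1, file2, output):
        # assumed behaviour: read both inputs fully, then write them to the output file
        with open(file1) as f1, open(file2) as f2:
            text = f1.read() + f2.read()
        with open(output, 'w') as f:
            f.write(text)

    tmp_dir = tempfile.mkdtemp()                          # stand-in for the script's tmp_dir
    for name in ['devel', 'test', 'train']:
        with open(os.path.join(tmp_dir, name + '.scp'), 'w') as f:
            f.write(name + '_utt1.mfc\n')

    all_scp = os.path.join(tmp_dir, 'all.scp')
    concatenate(os.path.join(tmp_dir, 'devel.scp'), os.path.join(tmp_dir, 'test.scp'), all_scp)
    concatenate(all_scp, os.path.join(tmp_dir, 'train.scp'), all_scp)
    print(open(all_scp).read())                           # devel_utt1.mfc, test_utt1.mfc, train_utt1.mfc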
 ## ======================= flat start monophones =======================
 if flat_start:
     timer_start = time.time()
@@ -246,17 +312,14 @@ if flat_start:
     chtk.flat_start(hcompv_scp_train, model_mono0_dir)
 
-    # create macros.
+    # make macros.
     vFloors = os.path.join(model_mono0_dir, 'vFloors')
     if os.path.exists(vFloors):
-        chtk.create_macros(vFloors)
+        chtk.make_macros(vFloors)
 
     # allocate mean & variance to all phones in the phone list
     print('>>> allocating mean & variance to all phones in the phone list...')
-    chtk.create_hmmdefs(
-        os.path.join(model_mono0_dir, proto_name),
-        os.path.join(model_mono0_dir, 'hmmdefs')
-        )
+    chtk.make_hmmdefs(model_mono0_dir)
 
     print("elapsed time: {}".format(time.time() - timer_start))
@@ -320,8 +383,9 @@ if train_monophone_with_re_aligned_mlf:
         os.path.join(modeln_dir, 'macros'),
         os.path.join(modeln_dir, 'hmmdefs'),
         mlf_file_train_aligned,
-        os.path.join(label_dir, 'train_word.mlf'),
+        mlf_file_train_word,
         hcompv_scp_train)
+    chtk.fix_mlf(mlf_file_train_aligned)
 
     print('>>> updating the script file... ')
     chtk.update_script_file(
@@ -349,24 +413,55 @@ if train_monophone_with_re_aligned_mlf:
     print("elapsed time: {}".format(time.time() - timer_start))
 
 
-## ======================= train triphone =======================
-if train_triphone:
-    print('==== traina triphone model ====')
+## ======================= increase mixture =======================
+if increase_mixture:
+    print('==== increase mixture ====')
     timer_start = time.time()
 
-    triphonelist_txt = os.path.join(config_dir, 'triphonelist.txt')
-    triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')
-
-    print('>>> making triphone list... ')
-    chtk.make_triphonelist(
-        triphonelist_txt,
-        triphone_mlf,
-        mlf_file_train_aligned)
-
-    print('>>> making triphone header... ')
-    chtk.make_tri_hed(
-        os.path.join(config_dir, 'mktri.hed')
-        )
+    for nmix in [2, 4, 8, 16]:
+        if nmix == 2:
+            modeln_dir_ = model_mono1sp2_dir
+        else:
+            modeln_dir_ = os.path.join(model_dir, 'mono'+str(nmix_))
+        modeln_dir = os.path.join(model_dir, 'mono'+str(nmix))
+
+        print('mixture: {}'.format(nmix))
+        fh.make_new_directory(modeln_dir, existing_dir='delete')
+
+        niter = chtk.get_niter_max(modeln_dir_)
+        chtk.increase_mixture(
+            os.path.join(modeln_dir_, 'iter'+str(niter), 'hmmdefs'),
+            nmix,
+            os.path.join(modeln_dir, 'iter0'),
+            model_type='monophone_with_sp')
+        shutil.copy2(os.path.join(modeln_dir_, 'iter'+str(niter), 'macros'),
+                     os.path.join(modeln_dir, 'iter0', 'macros'))
+
+        #improvement_threshold = -10
+        niter = chtk.re_estimation_until_saturated(
+            modeln_dir,
+            os.path.join(modeln_dir_, 'iter0'),
+            improvement_threshold,
+            hcompv_scp_train_updated,
+            os.path.join(htk_stimmen_dir, 'mfc'),
+            'mfc',
+            os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
+            mlf_file=mlf_file_train_aligned,
+            lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
+            model_type='monophone_with_sp'
+            )
+
+        nmix_ = nmix
+
+    print("elapsed time: {}".format(time.time() - timer_start))
+
+
+## ======================= train triphone =======================
+print('>>> making triphone list... ')
+chtk.make_triphonelist(
+    mlf_file_train_aligned,
+    triphone_mlf)
+
+if train_triphone:
+    print('==== train triphone model ====')
+    timer_start = time.time()
 
     print('>>> init triphone model... ')
     niter = chtk.get_niter_max(model_mono1sp2_dir)
@@ -377,8 +472,8 @@ if train_triphone:
         )
 
     print('>>> re-estimation... ')
-    # I wanted to train until saturated:
-    # #niter = chtk.re_estimation_until_saturated(
+    ## I wanted to train until saturated:
+    #niter = chtk.re_estimation_until_saturated(
    #    model_tri1_dir,
    #    os.path.join(model_tri1_dir, 'iter0'),
    #    improvement_threshold,
@@ -395,7 +490,6 @@ if train_triphone:
    #    ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
    # therefore only two times re-estimation is performed.
     output_dir = model_tri1_dir
 
     for niter in range(1, 4):
         hmm_n = 'iter' + str(niter)
         hmm_n_pre = 'iter' + str(niter-1)
@@ -414,18 +508,59 @@ if train_triphone:
     print("elapsed time: {}".format(time.time() - timer_start))
 
 
-## ======================= train triphone =======================
+## ======================= train tied-state triphones =======================
 if train_triphone_tied:
-    print('==== traina tied-state triphone ====')
+    print('==== train tied-state triphones ====')
     timer_start = time.time()
 
     print('>>> making lexicon for triphone... ')
-    chtk.make_triphone_full(phonelist_full_txt, lexicon_htk_triphone)
+    chtk.make_lexicon_triphone(phonelist_full_txt, lexicon_htk_triphone)
+    chtk.combine_phonelists(phonelist_full_txt)
 
-    print('>>> making headers... ')
-    chtk.make_tree_header(tree_hed)
-    fame_phonetics.make_quests_hed(quest_hed)
+    print('>>> making a tree header... ')
+    fame_phonetics.make_quests_hed(quests_hed)
+    stats = os.path.join(r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\model\tri1\iter3', 'stats')
+    chtk.make_tree_header(tree_hed, quests_hed, stats, config_dir)
 
-    print("elapsed time: {}".format(time.time() - timer_start))
+    print('>>> init triphone model... ')
+    niter = chtk.get_niter_max(model_tri1_dir)
+    fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')
+    chtk.init_triphone(
+        os.path.join(model_tri1_dir, 'iter'+str(niter)),
+        os.path.join(model_tri1tied_dir, 'iter0'),
+        tied=True)
+
+    ## I wanted to train until saturated:
+    #niter = chtk.re_estimation_until_saturated(
+    #    model_tri1tied_dir,
+    #    os.path.join(model_tri1tied_dir, 'iter0'),
+    #    improvement_threshold,
+    #    hcompv_scp_train_updated,
+    #    os.path.join(htk_stimmen_dir, 'mfc'),
+    #    'mfc',
+    #    os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
+    #    mlf_file=triphone_mlf,
+    #    lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
+    #    model_type='triphone'
+    #    )
+    #
+    # but because the data size is limited, some triphones cannot be trained and the following error is raised:
+    #    ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
+    # therefore only 3 times re-estimation is performed.
+    output_dir = model_tri1tied_dir
+
+    for niter in range(1, 4):
+        hmm_n = 'iter' + str(niter)
+        hmm_n_pre = 'iter' + str(niter-1)
+        _modeln_dir = os.path.join(output_dir, hmm_n)
+        _modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
+
+        fh.make_new_directory(_modeln_dir, 'leave')
+        chtk.re_estimation(
+            os.path.join(_modeln_dir_pre, 'hmmdefs'),
+            _modeln_dir,
+            hcompv_scp_train_updated,
+            mlf_file=triphone_mlf,
+            macros=os.path.join(_modeln_dir_pre, 'macros'),
+            model_type='triphone')
+
+    print("elapsed time: {}".format(time.time() - timer_start))

View File

@@ -1,20 +1,19 @@
 ## this script should be used only by Aki Kunikoshi.
 
+import os
 import numpy as np
+import pandas as pd
 import argparse
 import json
 
 from novoapi.backend import session
 
-import os
-#os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import defaultfiles as default
+import convert_phoneset
 
 
-def load_phonset():
-    translation_key_ipa2novo70 = dict()
-    translation_key_novo702ipa = dict()
-
+def load_novo70_phoneset():
     #phonelist_novo70_ = pd.ExcelFile(default.phonelist_novo70_xlsx)
     #df = pd.read_excel(phonelist_novo70_, 'list')
     ## *_simple includes columns which has only one phone in.
@@ -23,21 +22,23 @@ def load_phonset():
    #        print('{0}:{1}'.format(ipa, novo70))
    #        translation_key[ipa] = novo70
    #phonelist_novo70 = np.unique(list(df['novo70_simple']))
 
+    novo70_phoneset = pd.read_csv(default.novo70_phoneset, delimiter='\t', header=None, encoding="utf-8")
+    novo70_phoneset.rename(columns={0: 'novo70', 1: 'ipa', 2: 'description'}, inplace=True)
 
-    phoneset_ipa = []
-    phoneset_novo70 = []
-    with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
-        lines = fin.read()
-        lines = lines.split('\n')
-        for line in lines:
-            words = line.split('\t')
-            if len(words) > 1:
-                novo70 = words[0]
-                ipa = words[1]
-                phoneset_ipa.append(ipa)
-                phoneset_novo70.append(novo70)
-                translation_key_ipa2novo70[ipa] = novo70
-                translation_key_novo702ipa[novo70] = ipa
+    #phoneset_ipa = []
+    #phoneset_novo70 = []
+    #with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
+    #    lines = fin.read()
+    #    lines = lines.split('\n')
+    #    for line in lines:
+    #        words = line.split('\t')
+    #        if len(words) > 1:
+    #            novo70 = words[0]
+    #            ipa = words[1]
+    #            phoneset_ipa.append(ipa)
+    #            phoneset_novo70.append(novo70)
+    #            translation_key_ipa2novo70[ipa] = novo70
+    #            translation_key_novo702ipa[novo70] = ipa
 
     # As per Nederlandse phoneset_aki.xlsx received from David
     # [ɔː] oh / ohr    # from ipa->novo70, only oh is used.
@@ -47,15 +48,26 @@ def load_phonset():
     # [ɛː] eh
     # [w] wv in IPA written as ʋ.
     extra_ipa = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'ʋ']
     extra_novo70 = ['oh', 'ih', 'iy', 'uh', 'eh', 'wv']
-    for ipa, novo70 in zip(extra_ipa, extra_novo70):
-        phoneset_ipa.append(ipa)
-        phoneset_novo70.append(novo70)
+
+    phoneset_ipa = list(novo70_phoneset['ipa'])
+    phoneset_ipa.extend(extra_ipa)
+    phoneset_ipa = [i.replace('ː', ':') for i in phoneset_ipa]
+
+    phoneset_novo70 = list(novo70_phoneset['novo70'])
+    phoneset_novo70.extend(extra_novo70)
+    phoneset_novo70 = [i.replace('ː', ':') for i in phoneset_novo70]
+
+    translation_key_ipa2novo70 = dict()
+    translation_key_novo702ipa = dict()
+    for ipa, novo70 in zip(phoneset_ipa, phoneset_novo70):
+        #phoneset_ipa.append(ipa)
+        #phoneset_novo70.append(novo70)
         translation_key_ipa2novo70[ipa] = novo70
         translation_key_novo702ipa[novo70] = ipa
-    translation_key_novo702ipa['ohr'] = 'ɔː'
-    translation_key_novo702ipa['ihr'] = 'ɪː'
+    translation_key_novo702ipa['ohr'] = 'ɔ:'
+    translation_key_novo702ipa['ihr'] = 'ɪ:'
 
     phoneset_ipa = np.unique(phoneset_ipa)
     phoneset_novo70 = np.unique(phoneset_novo70)
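
A small self-contained sketch of the key-building step above, using a two-phone toy list rather than the full novo70 table; the point is that the IPA length mark 'ː' (U+02D0) is normalised to an ASCII ':' before the two lists are zipped into the translation dictionaries:

    phoneset_ipa = ['ɔː', 'ɪː']                                   # toy list, not the full set
    phoneset_novo70 = ['oh', 'ih']
    phoneset_ipa = [i.replace('ː', ':') for i in phoneset_ipa]    # 'ɔː' -> 'ɔ:', 'ɪː' -> 'ɪ:'

    translation_key_ipa2novo70 = dict(zip(phoneset_ipa, phoneset_novo70))
    translation_key_novo702ipa = dict(zip(phoneset_novo70, phoneset_ipa))
    print(translation_key_ipa2novo70)                             # {'ɔ:': 'oh', 'ɪ:': 'ih'}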
@@ -63,25 +75,6 @@ def load_phonset():
 
     return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa
 
 
-def multi_character_tokenize(line, multi_character_tokens):
-    """
-    Tries to match one of the tokens in multi_character_tokens at each position of line,
-    starting at position 0,
-    if so tokenizes and eats that token. Otherwise tokenizes a single character.
-
-    Copied from forced_alignment.convert_phone_set.py
-    """
-    while line != '':
-        for token in multi_character_tokens:
-            if line.startswith(token) and len(token) > 0:
-                yield token
-                line = line[len(token):]
-                break
-        else:
-            yield line[:1]
-            line = line[1:]
-
-
 def split_ipa(line):
     """
     Split a line by IPA phones.
@@ -89,13 +82,16 @@ def split_ipa(line):
     :param string line: one line written in IPA.
     :return string lineSeperated: the line split into IPA phones.
     """
+    phoneset_ipa, _, _, _ = load_novo70_phoneset()
+    #multi_character_phones = [i for i in phoneset_ipa if len(i) > 1]
+    #multi_character_phones.sort(key=len, reverse=True)
+    #multi_character_phones = [
+    #    # IPAs in CGN.
+    #    u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
+    #    ]
+    #return [phone for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
 
-    multi_character_phones = [
-        # IPAs in CGN.
-        u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
-        ]
-
-    return [phone for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
+    return convert_phoneset.split_word(line, phoneset_ipa)
 
 
 def split_novo70(line):
@@ -104,30 +100,33 @@ def split_novo70(line):
     :param string line: one line written in novo70.
     :return string lineSeperated: the line split into novo70 phones.
     """
-    _, phoneset_novo70, _, _ = load_phonset()
-    multi_character_phones = [p for p in phoneset_novo70 if len(p) > 1]
-    multi_character_phones = sorted(multi_character_phones, key=len, reverse=True)
+    _, phoneset_novo70, _, _ = load_novo70_phoneset()
+    #multi_character_phones = [p for p in phoneset_novo70 if len(p) > 1]
+    #multi_character_phones = sorted(multi_character_phones, key=len, reverse=True)
+    multi_character_phones = convert_phoneset.extract_multi_character_phones(phoneset_novo70)
 
     return ['sp' if phone == ' ' else phone
             for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
 
 
-def novo702ipa(tokens):
-    pronunciation = []
-    _, _, _, translation_key = load_phonset()
-    for phone in split_novo70(tokens):
-        pronunciation.append(translation_key.get(phone, phone))
-    return ' '.join(pronunciation)
+def novo702ipa(line):
+    #pronunciation = []
+    _, _, _, translation_key = load_novo70_phoneset()
+    #for phone in split_novo70(tokens):
+    #    pronunciation.append(translation_key.get(phone, phone))
+    #return ' '.join(pronunciation)
+    return ' '.join(convert_phoneset.convert_phoneset(split_novo70(line), translation_key))
 
 
 # numbering of novo70 should be checked.
-def ipa2novo70(tokens):
-    pronunciation = []
-    _, _, translation_key, _ = load_phonset()
-    for phone in split_ipa(tokens):
-        pronunciation.append(translation_key.get(phone, phone))
-    return ' '.join(pronunciation)
+def ipa2novo70(line):
+    #pronunciation = []
+    _, _, translation_key, _ = load_novo70_phoneset()
+    #for phone in split_ipa(tokens):
+    #    pronunciation.append(translation_key.get(phone, phone))
+    #return ' '.join(pronunciation)
+    return ' '.join(convert_phoneset.convert_phoneset(split_ipa(line), translation_key))
 
 
 def make_grammar(word, pronunciation_ipa):
     """
@@ -174,6 +173,7 @@ def forced_alignment(wav_file, word, pronunciation_ipa):
     p = argparse.ArgumentParser()
     p.add_argument("--user", default='martijn.wieling')
     p.add_argument("--password", default='xxxxxx')
     args = p.parse_args()
 
     rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
@@ -196,4 +196,5 @@ def result2pronunciation(result, word):
 
 if __name__ == 'main':
     pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə']
-    grammar = make_grammar('reus', pronunciation_ipa)
+    #grammar = make_grammar('reus', pronunciation_ipa)
+    phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = load_novo70_phoneset()