correspondence between lex.asr and lex.ipa is automatically obtained. header is added to the functions in fame_functions.py.

This commit is contained in:
yemaozi88 2019-01-27 23:52:33 +01:00
parent 813f013d7a
commit 87abbbb95a
9 changed files with 244 additions and 144 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -16,14 +16,14 @@ def multi_character_tokenize(line, multi_character_tokens):
def split_word(word, multi_character_phones):
"""
Split a line by given phoneset.
split a line by given phoneset.
Args:
word (str): one word written in given phoneset.
multi_character_phones:
word (str): a word written in given phoneset.
multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_phoneset.py.
Returns:
word_seperated (str): the word splitted in given phoneset.
"""
(word_seperated) (list): the word splitted in given phoneset.
"""
return [phone for phone in multi_character_tokenize(word.strip(), multi_character_phones)]

View File

@ -33,7 +33,6 @@ repo_dir = r'C:\Users\Aki\source\repos'
ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter')
forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment')
accent_classification_dir = os.path.join(repo_dir, 'accent_classification', 'accent_classification')
#pyhtk_dir = os.path.join(repo_dir, 'pyhtk', 'pyhtk')
toolbox_dir = os.path.join(repo_dir, 'toolbox')
#htk_config_dir = r'c:\Users\A.Kunikoshi\source\repos\forced_alignment\forced_alignment\data\htk\preset_models\aki_dutch_2017'

View File

@ -12,24 +12,6 @@ import defaultfiles as default
import fame_phoneset
import convert_phone_set
#sys.path.append(default.forced_alignment_module_dir)
#from forced_alignment import convert_phone_set
#def find_phone(lexicon_file, phone):
# """ Search where the phone is used in the lexicon. """
# with open(lexicon_file, "rt", encoding="utf-8") as fin:
# lines = fin.read()
# lines = lines.split('\n')
# extracted = []
# for line in lines:
# line = line.split('\t')
# if len(line) > 1:
# pronunciation = line[1]
# if phone in pronunciation:
# extracted.append(line)
# return extracted
#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
# """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """
@ -128,25 +110,6 @@ import convert_phone_set
# return ipa
def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
""" Make a script file for HCopy using the filelist in FAME! corpus. """
filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
with open(filelist_txt) as fin:
filelist = fin.read()
filelist = filelist.split('\n')
with open(hcopy_scp, 'w') as fout:
for filename_ in filelist:
filename = filename_.replace('.TextGrid', '')
if len(filename) > 3: # remove '.', '..' and ''
wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
mfc_file = os.path.join(feature_dir, filename + '.mfc')
fout.write(wav_file + '\t' + mfc_file + '\n')
#def make_filelist(input_dir, output_txt):
# """ Make a list of files in the input_dir. """
# filenames = os.listdir(input_dir)
@ -191,98 +154,147 @@ def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_s
# f.write('{0}\t{1}\n'.format(WORD, key))
def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
""" Make a script file for HCopy using the filelist in FAME! corpus.
Args:
fame_dir (path): the directory of FAME corpus.
dataset (str): 'devel', 'test' or 'train'.
feature_dir (path): the directory where feature will be stored.
hcopy_scp (path): a script file for HCopy to be made.
"""
filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
with open(filelist_txt) as fin:
filelist = fin.read()
filelist = filelist.split('\n')
with open(hcopy_scp, 'w') as fout:
for filename_ in filelist:
filename = filename_.replace('.TextGrid', '')
if len(filename) > 3: # remove '.', '..' and ''
wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
mfc_file = os.path.join(feature_dir, filename + '.mfc')
fout.write(wav_file + '\t' + mfc_file + '\n')
def load_lexicon(lexicon_file):
""" load lexicon file as Data Frame.
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
Returns:
lex (df): lexicon as Data Frame, which has columns 'word' and 'pronunciation'.
"""
lex = pd.read_csv(lexicon_file, delimiter='\t', header=None, encoding="utf-8")
lex.rename(columns={0: 'word', 1: 'pronunciation'}, inplace=True)
return lex
def get_phonelist(lexicon_asr):
""" Make a list of phones which appears in the lexicon. """
def get_phoneset_from_lexicon(lexicon_file, phoneset='asr'):
""" Make a list of phones which appears in the lexicon.
#with open(lexicon_file, "rt", encoding="utf-8") as fin:
# lines = fin.read()
# lines = lines.split('\n')
# phonelist = set([])
# for line in lines:
# line = line.split('\t')
# if len(line) > 1:
# pronunciation = set(line[1].split())
# phonelist = phonelist | pronunciation
lex = load_lexicon(lexicon_asr)
return set(' '.join(lex['pronunciation']).split(' '))
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
phoneset (str): the phoneset with which lexicon_file is written. 'asr'(default) or 'ipa'.
Returns:
(list_of_phones) (set): the set of phones included in the lexicon_file.
"""
assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
lex = load_lexicon(lexicon_file)
if phoneset == 'asr':
return set(' '.join(lex['pronunciation']).split(' '))
elif phoneset == 'ipa':
join_pronunciations = ''.join(lex['pronunciation'])
return set(convert_phone_set.split_word(join_pronunciations, fame_phoneset.multi_character_phones_ipa))
def extract_unknown_phones(word_list, known_phones):
return [i for i in word_list if not i in known_phones]
def extract_unknown_phones(ipa, known_phones):
"""extract unknown phones in the pronunciation written in IPA.
Args:
ipa (str): a pronunciation written in IPA.
known_phones (list): list of phones already know.
Returns:
(list_of_phones) (list): unknown phones not included in 'known_phones'.
"""
ipa_split = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
return [i for i in ipa_split if not i in known_phones]
if __name__ == '__main__':
import time
timer_start = time.time()
def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
""" get correspondence between lexicon_file_ipa and lexicon_file_asr.
#def get_translation_key():
dir_tmp = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
lexicon_ipa = r'd:\_corpus\FAME\lexicon\lex.ipa'
lexicon_asr = r'd:\_corpus\FAME\lexicon\lex.asr'
Args:
lexicon_file_ipa (path): lexicon in the format of 'word' /t 'pronunciation (IPA)'.
lexicon_file_asr (path): lexicon in the format of 'word' /t 'pronunciation (asr)'.
the each character of 'pronunciation' should be delimited by ' '.
lex_ipa = load_lexicon(lexicon_ipa)
lex_asr = load_lexicon(lexicon_asr)
if 1:
phone_to_be_searched = fame_phoneset.phoneset_ipa[:]
translation_key = dict()
for word in lex_ipa['word']:
if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
Returns:
translation_key (dict): translation key from ipa to asr.
(phone_unknown) (list): the list of IPA phones, which does not appear in lexicon_file_asr.
ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
asr_list = asr.split(' ')
"""
lex_ipa = load_lexicon(lexicon_file_ipa)
lex_asr = load_lexicon(lexicon_file_asr)
phone_unknown = fame_phoneset.phoneset_ipa[:]
translation_key = dict()
for word in lex_ipa['word']:
if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
# if there are phones which is not in phone_to_be_searched
#if len([True for i in asr_list if i in phone_to_be_searched]) > 0:
if(len(ipa_list) == len(asr_list)):
print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
for ipa_, asr_ in zip(ipa_list, asr_list):
if ipa_ in phone_to_be_searched:
translation_key[ipa_] = asr_
phone_to_be_searched.remove(ipa_)
ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
asr_list = asr.split(' ')
print("elapsed time: {}".format(time.time() - timer_start))
np.save(os.path.join(dir_tmp, 'translation_key.npy'), translation_key)
np.save(os.path.join(dir_tmp, 'phone_to_be_searched.npy'), phone_to_be_searched)
else:
translation_key = np.load(os.path.join(dir_tmp, 'translation_key.npy')).item()
phone_to_be_searched = np.load(os.path.join(dir_tmp, 'phone_to_be_searched.npy')).item()
# if there are phones which is not in phone_unknown
#if len([True for i in asr_list if i in phone_unknown]) > 0:
if(len(ipa_list) == len(asr_list)):
print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
for ipa_, asr_ in zip(ipa_list, asr_list):
if ipa_ in phone_unknown:
translation_key[ipa_] = asr_
phone_unknown.remove(ipa_)
return translation_key, list(phone_unknown)
#phone_unknown = list(phone_to_be_searched)
##phone_unknown.remove('')
#phone_known = list(translation_key.keys())
def find_phone(lexicon_file, phone, phoneset='ipa'):
""" extract rows where the phone is used in the lexicon_file.
#p = phone_unknown[0]
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
phone (str): the phone to be searched.
phoneset (str): the phoneset with which lexicon_file is written. 'asr' or 'ipa'(default).
### extract lines which contains 'unknown' phone.
#lex_ipa_ = lex_ipa[lex_ipa['pronunciation'].str.count(p)>0]
##phone_unknown_ = phone_unknown[:]
##phone_unknown_.remove(p)
#phone_known_ = phone_known[:]
#phone_known_.append(p)
#for index, row in lex_ipa_.iterrows():
# ipa = row['pronunciation']
# phone_extract_unknown_phones(asr_list, phone_known_):
Returns:
extracted (df): rows where the phone is used.
# # check the number of phones in phone_unknown_
# if len([True for i in asr_list if i in phone_unknown_]) == 0:
# word = row['word']
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
# print("{0}: {1} --> {2}".format(word, ipa, asr))
# #print("{0}:{1}".format(index, row['pronunciation']))
ToDo:
* develop when the phonset == 'asr'.
"""
assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
lex = load_lexicon(lexicon_file)
# to reduce the calculation time, only target rows which include 'phone' at least once.
lex_ = lex[lex['pronunciation'].str.count(phone)>0]
extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
for index, row in lex_.iterrows():
if phoneset == 'ipa':
pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_phoneset.multi_character_phones_ipa)
if phone in pronunciation:
extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
extracted = extracted.append(extracted_, ignore_index=True)
return extracted

View File

@ -6,6 +6,7 @@ import tempfile
#import configparser
#import subprocess
#from collections import Counter
import time
#import numpy as np
#import pandas as pd
@ -27,8 +28,8 @@ from htk import pyhtk
dataset_list = ['devel', 'test', 'train']
# procedure
extract_features = 1
#conv_lexicon = 0
extract_features = 0
conv_lexicon = 1
#check_lexicon = 0
#make_mlf = 0
#combine_files = 0
@ -84,16 +85,14 @@ if not os.path.exists(tmp_dir):
## ======================= extract features =======================
if extract_features:
for dataset in dataset_list:
#for dataset in ['test']:
print('==== {} ===='.format(dataset))
# a script file for HCopy
print(">>> making a script file for HCopy... \n")
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
hcopy_scp.close()
#hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'HCopy.scp')
## get a list of features (hcopy.scp) from the filelist in FAME! corpus
# get a list of features (hcopy.scp) from the filelist in FAME! corpus
feature_dir_ = os.path.join(feature_dir, dataset)
if not os.path.exists(feature_dir_):
os.makedirs(feature_dir_)
@ -101,26 +100,11 @@ if extract_features:
# extract features
print(">>> extracting features... \n")
fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
#subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
#subprocess.call(subprocessStr, shell=True)
pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
# a script file for HCompV
print(">>> making a script file for HCompV... \n")
## ======================= make a list of features =======================
#if make_feature_list:
# print("==== make a list of features ====\n")
# for dataset in dataset_list:
# print(dataset)
#feature_dir = output_dir + '\\mfc\\' + dataset
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
#am_func.make_filelist(feature_dir, hcompv_scp)
fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
@ -128,6 +112,59 @@ if extract_features:
if conv_lexicon:
print('==== convert lexicon from ipa 2 fame ====\n')
#dir_out = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
# get the correspondence between lex_ipa and lex_asr.
lex_asr = fame_functions.load_lexicon(lexicon_asr)
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
if 1:
timer_start = time.time()
translation_key, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
print("elapsed time: {}".format(time.time() - timer_start))
np.save('translation_key_ipa2asr.npy', translation_key)
np.save('phone_unknown.npy', phone_unknown)
else:
translation_key = np.load('translation_key_ipa2asr.npy').item()
phone_unknown = np.load('phone_unknown.npy')
phone_unknown = list(phone_unknown)
## manually check the correspondence for the phone in phone_unknown.
#p = phone_unknown[0]
#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
#for word in lex_ipa_['word']:
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
# if np.sum(lex_asr['word'] == word) > 0:
# asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
# ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
# asr_list = asr.split(' ')
# if p in ipa_list and (len(ipa_list) == len(asr_list)):
# print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
# for ipa_, asr_ in zip(ipa_list, asr_list):
# if ipa_ in phone_unknown:
# translation_key[ipa_] = asr_
# phone_unknown.remove(ipa_)
## check if all the phones in lexicon_ipa are in fame_phoneset.py.
#timer_start = time.time()
#phoneset_lex = get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
#print("elapsed time: {}".format(time.time() - timer_start))
#phoneset_py = fame_phoneset.phoneset_ipa
#set(phoneset_lex) - set(phoneset_py)
##timer_start = time.time()
##extracted = find_phone(lexicon_ipa, 'ⁿ')
##print("elapsed time: {}".format(time.time() - timer_start))
# lex.asr is Kaldi compatible version of lex.ipa.
# to check...
#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
@ -140,13 +177,13 @@ if conv_lexicon:
# fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
# convert each lexicon from ipa description to fame_htk phoneset.
am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
#am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
#am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
# combine lexicon
# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
# therefore there is no overlap between lex_asr and lex_oov.
am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
#am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
## ======================= check if all the phones are successfully converted =======================

View File

@ -1,41 +1,79 @@
""" definition of the phones to be used. """
## phones in IPA.
phoneset_ipa = [
# vowels
'',
'i̯ⁿ',
'y',
'i',
'i.',
'iⁿ',
'i:',
'i:ⁿ',
'ɪ',
'ɪ:',
'ɪⁿ',
'ɪ.',
#'ɪ:', # not included in lex.ipa
'ɪ:ⁿ',
'e',
'e:',
'e:ⁿ',
'ə',
'əⁿ',
'ə:',
'ɛ',
'ɛ.',
'ɛⁿ',
'ɛ:',
'ɛ:ⁿ',
'a',
'aⁿ',
'a.',
'a:',
'a:ⁿ',
'',
'ú',
'ṷ.',
'ṷⁿ',
#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
'u',
'uⁿ',
'u.',
'u:',
'u:ⁿ',
'ü',
'ü.',
'üⁿ',
'ü:',
'ü:ⁿ',
'o',
'oⁿ',
'o.',
'o:',
'o:ⁿ',
'ö',
'ö.',
'öⁿ',
'ö:',
'ö:ⁿ',
'ɔ',
'ɔ.',
'ɔⁿ',
'ɔ:',
'ɔ̈',
'ɔ:ⁿ',
#'ɔ̈', # not included in lex.ipa
'ɔ̈.',
'ɔ̈:',
# plosives
'p',
'b',
't',
'tⁿ',
'd',
'k',
'g',
'ɡ', # = 'g'
# nasals
'm',
@ -48,8 +86,22 @@ phoneset_ipa = [
's',
's:',
'z',
'zⁿ',
'x',
'h',
# tap and flip
'r',
'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
# approximant
'j',
'j.',
'l'
]
## the list of multi character phones.
# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
multi_character_phones_ipa = [i for i in phoneset_ipa if len(i) > 1]
multi_character_phones_ipa.sort(key=len, reverse=True)