
139 lines
5.3 KiB
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import sys
import os
from collections import Counter
import time
import re
import numpy as np
import pandas as pd
import fame_functions
import defaultfiles as default
from phoneset import fame_ipa, fame_asr
import convert_phoneset
# Paths to the lexicon files: the IPA and ASR versions sit side by side in the
# 'lexicon' folder under the FAME directory, the HTK version in the 'lexicon'
# folder under the HTK directory.
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa, lexicon_asr = (
    os.path.join(lexicon_dir, filename) for filename in ('lex.ipa', 'lex.asr')
)
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
## check if all the phones in lexicon.ipa are in fame_ipa.phoneset.
#timer_start = time.time()
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
#phoneset_py = fame_ipa.phoneset
#print("phones which is in lexicon.ipa but not in\n{}".format(
# set(phoneset_lex) - set(phoneset_py)))
#print("elapsed time: {}".format(time.time() - timer_start))
# check which word has the phone.
#timer_start = time.time()
#extracted = find_phone(lexicon_ipa, 'ⁿ')
#print("elapsed time: {}".format(time.time() - timer_start))
## get the correspondence between lex_ipa and lex_asr.
lex_asr = fame_functions.load_lexicon(lexicon_asr)
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)

# Disabled branch: recomputes the translation key from the two lexicon files
# and writes both results to the .npy files that are loaded below.
# Change the condition to a true value to regenerate those files.
if 0:
    timer_start = time.time()
    translation_key_ipa2asr, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
    print("elapsed time: {}".format(time.time() - timer_start))

    np.save(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'), translation_key_ipa2asr)
    np.save(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'), phone_unknown)

# The translation key is used as a dict further down, so np.save stored it as
# a pickled 0-d object array; .item() unwraps it again. NumPy >= 1.16.3
# refuses to unpickle unless allow_pickle=True is passed.
# NOTE(review): unpickling executes arbitrary code -- only load a file that
# was produced by the branch above, never one from an untrusted source.
translation_key_ipa2asr = np.load(
    os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'),
    allow_pickle=True).item()
# Converted to a plain list; the commented-out manual check below calls
# phone_unknown.remove(), which ndarray does not provide.
phone_unknown = list(np.load(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy')))
# manually check the correspondence for the phone in phone_unknown.
#p = phone_unknown[0]
#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
#for word in lex_ipa_['word']:
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
# if np.sum(lex_asr['word'] == word) > 0:
# asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
# ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
# asr_list = asr.split(' ')
# if p in ipa_list and (len(ipa_list) == len(asr_list)):
# print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
# for ipa_, asr_ in zip(ipa_list, asr_list):
# if ipa_ in phone_unknown:
# translation_key_ipa2asr[ipa_] = asr_
# phone_unknown.remove(ipa_)
# Hand-written IPA -> ASR entries added on top of the loaded translation key.
translation_key_ipa2asr['ə:'] = 'ə'
translation_key_ipa2asr['r.'] = 'r'
translation_key_ipa2asr['r:'] = 'r'
# added for stimmen.
translation_key_ipa2asr['ɪ:'] = 'ɪ:'
translation_key_ipa2asr['y:'] = 'y'

# Store the completed translation key (a dict, so np.save pickles it).
np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
#timer_start = time.time()
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
#phoneset_asr = list(set(translation_key_ipa2asr.values()))
#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
# set(phoneset_lex) - set(phoneset_asr)))
#print("elapsed time: {}".format(time.time() - timer_start))
## check if all the phones in lexicon.htk are in fame_asr.phoneset_htk.
#timer_start = time.time()
#phoneset_htk = fame_asr.phoneset_htk
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
#print("phones which is in lexicon.htk but not in the are:\n{}".format(
# set(phoneset_htk) - set(phoneset_lex)))
#print("elapsed time: {}".format(time.time() - timer_start))
## statistics over the lexicon
#lex_htk = fame_functions.load_lexicon(lexicon_htk)
#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
#c = Counter(phones_all)
#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
## to_csv does not work with a space separator; therefore all tabs should be replaced manually.
##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
## check which letters are not coded in ascii.
#print('asr phones which cannot be coded in ascii:\n')
#for i in fame_asr.phoneset_short:
# try:
# i_encoded = i.encode("ascii")
# #print("{0} --> {1}".format(i, i.encode("ascii")))
# except UnicodeEncodeError:
# print(">>> {}".format(i))
#print("letters in the scripts which is not coded in ascii:\n")
#for dataset in ['train', 'devel', 'test']:
# timer_start = time.time()
# script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
# with open(script_list, "rt", encoding="utf-8") as fin:
# scripts = fin.read().split('\n')
# for line in scripts:
# sentence = ' '.join(line.split(' ')[1:])
# sentence_htk = fame_functions.word2htk(sentence)
# #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
# try:
# sentence_htk = bytes(sentence_htk, 'ascii')
# except UnicodeEncodeError:
# print(sentence)
# print(sentence_htk)