139 lines
5.3 KiB
Python
139 lines
5.3 KiB
Python
import sys
|
||
import os
|
||
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||
from collections import Counter
|
||
import time
|
||
import re
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
import fame_functions
|
||
import defaultfiles as default
|
||
sys.path.append(default.toolbox_dir)
|
||
from phoneset import fame_ipa, fame_asr
|
||
import convert_phoneset
|
||
|
||
# Paths to the lexicon files: the .ipa and .asr variants sit side by side
# under the FAME directory, the .htk variant lives under the HTK directory.
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa, lexicon_asr = (
    os.path.join(lexicon_dir, 'lex.' + extension)
    for extension in ('ipa', 'asr')
)
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
|
||
|
||
## check if all the phones in lexicon.ipa are in fame_ipa.py.
|
||
#timer_start = time.time()
|
||
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
|
||
#phoneset_py = fame_ipa.phoneset
|
||
#print("phones which is in lexicon.ipa but not in fame_ipa.py:\n{}".format(
|
||
# set(phoneset_lex) - set(phoneset_py)))
|
||
#print("elapsed time: {}".format(time.time() - timer_start))
|
||
|
||
# check which word has the phone.
|
||
#timer_start = time.time()
|
||
#extracted = find_phone(lexicon_ipa, 'ⁿ')
|
||
#print("elapsed time: {}".format(time.time() - timer_start))
|
||
|
||
|
||
## get the correspondence between lex_ipa and lex_asr.
lex_asr = fame_functions.load_lexicon(lexicon_asr)
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)

# Cache files holding the two outputs of fame_functions.get_translation_key().
# The paths are relative, so they are resolved against the current working
# directory.
translation_key_npy = os.path.join('phoneset', 'output_get_translation_key_translation_key.npy')
phone_unknown_npy = os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy')

# Set to True to recompute the translation key from the two lexicons and
# refresh the cache files; leave False to reuse the cached result.
recompute_translation_key = False

if recompute_translation_key:
    timer_start = time.time()
    translation_key_ipa2asr, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
    print("elapsed time: {}".format(time.time() - timer_start))

    np.save(translation_key_npy, translation_key_ipa2asr)
    np.save(phone_unknown_npy, phone_unknown)
else:
    # The translation key is unwrapped with .item(), i.e. it was saved as a
    # 0-d object array (presumably wrapping a dict -- confirm against
    # get_translation_key). Object arrays are pickled by np.save, and
    # np.load refuses them unless allow_pickle=True (the default has been
    # False since NumPy 1.16.3).
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were written by this script.
    translation_key_ipa2asr = np.load(translation_key_npy, allow_pickle=True).item()
    phone_unknown = np.load(phone_unknown_npy)
    # Convert the loaded array to a plain Python list.
    phone_unknown = list(phone_unknown)
|
||
|
||
# manually check the correspondence for the phone in phone_unknown.
|
||
#p = phone_unknown[0]
|
||
#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
|
||
|
||
#for word in lex_ipa_['word']:
|
||
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
|
||
# if np.sum(lex_asr['word'] == word) > 0:
|
||
# asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
|
||
|
||
# ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
|
||
# asr_list = asr.split(' ')
|
||
# if p in ipa_list and (len(ipa_list) == len(asr_list)):
|
||
# print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
|
||
# for ipa_, asr_ in zip(ipa_list, asr_list):
|
||
# if ipa_ in phone_unknown:
|
||
# translation_key_ipa2asr[ipa_] = asr_
|
||
# phone_unknown.remove(ipa_)
|
||
|
||
# Hand-written IPA -> ASR entries added on top of the loaded translation key.
# Later entries overwrite earlier ones with the same key, exactly as
# individual assignments would.
translation_key_ipa2asr.update({
    'ə:': 'ə',
    'r.': 'r',
    'r:': 'r',
    # added for stimmen.
    'ɪ:': 'ɪ:',
    'y:': 'y',
})

# Store the completed translation key next to the other phoneset files.
fame_ipa2asr_npy = os.path.join('phoneset', 'fame_ipa2asr.npy')
np.save(fame_ipa2asr_npy, translation_key_ipa2asr)
|
||
|
||
|
||
## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
|
||
#timer_start = time.time()
|
||
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
|
||
#phoneset_lex.remove("")
|
||
#phoneset_asr = list(set(translation_key_ipa2asr.values()))
|
||
#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
|
||
# set(phoneset_lex) - set(phoneset_asr)))
|
||
#print("elapsed time: {}".format(time.time() - timer_start))
|
||
|
||
|
||
## check if all the phones in lexicon.htk are in fame_asr.py.
|
||
#timer_start = time.time()
|
||
#phoneset_htk = fame_asr.phoneset_htk
|
||
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
|
||
#phoneset_lex.remove('')
|
||
#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
|
||
# set(phoneset_htk) - set(phoneset_lex)))
|
||
#print("elapsed time: {}".format(time.time() - timer_start))
|
||
|
||
## statistics over the lexicon
|
||
#lex_htk = fame_functions.load_lexicon(lexicon_htk)
|
||
#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
|
||
#c = Counter(phones_all)
|
||
|
||
#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
|
||
#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
|
||
# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
|
||
## to_csv does not work with a space separator; therefore all tabs should be replaced manually.
|
||
##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
|
||
#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
|
||
|
||
|
||
## check which letters are not coded in ascii.
|
||
#print('asr phones which cannot be coded in ascii:\n')
|
||
#for i in fame_asr.phoneset_short:
|
||
# try:
|
||
# i_encoded = i.encode("ascii")
|
||
# #print("{0} --> {1}".format(i, i.encode("ascii")))
|
||
# except UnicodeEncodeError:
|
||
# print(">>> {}".format(i))
|
||
|
||
#print("letters in the scripts which is not coded in ascii:\n")
|
||
#for dataset in ['train', 'devel', 'test']:
|
||
# timer_start = time.time()
|
||
|
||
# script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
|
||
# with open(script_list, "rt", encoding="utf-8") as fin:
|
||
# scripts = fin.read().split('\n')
|
||
|
||
# for line in scripts:
|
||
# sentence = ' '.join(line.split(' ')[1:])
|
||
# sentence_htk = fame_functions.word2htk(sentence)
|
||
|
||
# #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
|
||
# try:
|
||
# sentence_htk = bytes(sentence_htk, 'ascii')
|
||
# except UnicodeEncodeError:
|
||
# print(sentence)
|
||
# print(sentence_htk)
|
||
|