import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

from collections import Counter
import time
import re

import numpy as np
import pandas as pd

import fame_functions
import defaultfiles as default
sys.path.append(default.toolbox_dir)
from phoneset import fame_ipa, fame_asr
import convert_phoneset

lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')


## check if all the phones in lexicon.ipa are in fame_ipa.py.
#timer_start = time.time()
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
#phoneset_py = fame_ipa.phoneset
#print("phones which are in lexicon.ipa but not in fame_ipa.py:\n{}".format(
#    set(phoneset_lex) - set(phoneset_py)))
#print("elapsed time: {}".format(time.time() - timer_start))

# check which words contain the phone.
#timer_start = time.time()
#extracted = fame_functions.find_phone(lexicon_ipa, 'ⁿ')
#print("elapsed time: {}".format(time.time() - timer_start))


## get the correspondence between lex_ipa and lex_asr.
lex_asr = fame_functions.load_lexicon(lexicon_asr)
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
if 0:
    timer_start = time.time()
    translation_key_ipa2asr, phone_unknown = fame_functions.get_translation_key(
        lexicon_ipa, lexicon_asr)
    print("elapsed time: {}".format(time.time() - timer_start))

    np.save(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'),
            translation_key_ipa2asr)
    np.save(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'),
            phone_unknown)
else:
    # allow_pickle is needed to load object arrays on numpy >= 1.16.3.
    translation_key_ipa2asr = np.load(
        os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'),
        allow_pickle=True).item()
    phone_unknown = np.load(
        os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'),
        allow_pickle=True)
    phone_unknown = list(phone_unknown)

# manually check the correspondence for the phones in phone_unknown.
#p = phone_unknown[0]
#lex_ipa_ = fame_functions.find_phone(lexicon_ipa, p, phoneset='ipa')
#for word in lex_ipa_['word']:
#    ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
#    if np.sum(lex_asr['word'] == word) > 0:
#        asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
#        ipa_list = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
#        asr_list = asr.split(' ')
#        if p in ipa_list and (len(ipa_list) == len(asr_list)):
#            print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
#            for ipa_, asr_ in zip(ipa_list, asr_list):
#                if ipa_ in phone_unknown:
#                    translation_key_ipa2asr[ipa_] = asr_
#                    phone_unknown.remove(ipa_)

translation_key_ipa2asr['ə:'] = 'ə'
translation_key_ipa2asr['r.'] = 'r'
translation_key_ipa2asr['r:'] = 'r'
np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)


## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
#timer_start = time.time()
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
#phoneset_lex.remove("")
#phoneset_asr = list(set(translation_key_ipa2asr.values()))
#print("phones which are in lexicon.asr but not in translation_key_ipa2asr:\n{}".format(
#    set(phoneset_lex) - set(phoneset_asr)))
#print("elapsed time: {}".format(time.time() - timer_start))
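
## sketch: apply the saved ipa-to-asr key to a single ipa pronunciation.
## the split_word call follows the commented-out block above; unmapped
## phones are kept as-is. ipa2asr is an illustrative helper, not part of
## fame_functions.
def ipa2asr(pronunciation_ipa):
    """Convert an ipa pronunciation string into a space-separated asr one."""
    ipa_list = convert_phoneset.split_word(pronunciation_ipa, fame_ipa.multi_character_phones)
    return ' '.join(translation_key_ipa2asr.get(phone, phone) for phone in ipa_list)
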
## check if all the phones in lexicon.htk are in fame_asr.py.
#timer_start = time.time()
#phoneset_htk = fame_asr.phoneset_htk
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
#phoneset_lex.remove('')
#print("phones which are in lexicon.htk but not in fame_asr.py:\n{}".format(
#    set(phoneset_lex) - set(phoneset_htk)))
#print("elapsed time: {}".format(time.time() - timer_start))


## statistics over the lexicon.
#lex_htk = fame_functions.load_lexicon(lexicon_htk)
#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
#c = Counter(phones_all)

#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
## escape the apostrophe at the start of words so HTK can read them.
#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
#    lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
## to_csv does not work with a space separator, therefore the tabs have to be replaced manually afterwards.
##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')


## check which letters cannot be encoded in ascii.
print('asr phones which cannot be encoded in ascii:\n')
for i in fame_asr.phoneset_short:
    try:
        i_encoded = i.encode("ascii")
        #print("{0} --> {1}".format(i, i_encoded))
    except UnicodeEncodeError:
        print(">>> {}".format(i))

print("letters in the scripts which are not encoded in ascii:\n")
for dataset in ['train', 'devel', 'test']:
    timer_start = time.time()

    script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
    with open(script_list, "rt", encoding="utf-8") as fin:
        scripts = fin.read().split('\n')

    for line in scripts:
        # each line starts with an utterance id; the rest is the sentence.
        sentence = ' '.join(line.split(' ')[1:])
        sentence_htk = fame_functions.word2htk(sentence)

        #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence)) == 0:
        try:
            sentence_htk = bytes(sentence_htk, 'ascii')
        except UnicodeEncodeError:
            print(sentence)
            print(sentence_htk)
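
## sketch: a small helper to tally which non-ascii characters occur in a
## list of sentences, reusing Counter from the imports above. the helper
## name is an illustration, not part of fame_functions.
def count_non_ascii(sentences):
    """Return a Counter of the characters outside the ascii range."""
    return Counter(c for sentence in sentences for c in sentence if ord(c) > 127)

## example: count_non_ascii(scripts).most_common(10) shows the ten most
## frequent offending characters in the last dataset loaded above.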