diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo index d1feede..649e0b1 100644 Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py index 295ed79..10f16cd 100644 --- a/acoustic_model/fame_functions.py +++ b/acoustic_model/fame_functions.py @@ -343,6 +343,16 @@ def word2htk(word): return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word]) +def ipa2asr(ipa): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) + + ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) + ipa_splitted = fame_ipa.phone_reduction(ipa_splitted) + asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr) + return ''.join(asr_splitted) + + def ipa2htk(ipa): curr_dir = os.path.dirname(os.path.abspath(__file__)) translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0) diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py index 9095b3c..00c699c 100644 --- a/acoustic_model/htk_vs_kaldi.py +++ b/acoustic_model/htk_vs_kaldi.py @@ -9,7 +9,7 @@ import sys import shutil import glob -#import numpy as np +import numpy as np import pandas as pd #import matplotlib.pyplot as plt #from sklearn.metrics import confusion_matrix @@ -75,24 +75,22 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc') hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp') -## ======================= make test data ====================== -# copy wav files which is in the stimmen data. +## ======================= load test data ====================== stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test' -fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') -df = stimmen_functions.load_transcriptions() +df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir) +df = stimmen_functions.add_row_asr(df) +df = stimmen_functions.add_row_htk(df) + word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = sorted(word_list) +# pronunciation variants +for word in word_list: + df_ = df[df['word']==word] + print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))) -# after manually removed files which does not contain clear sound, -# update df as df_test. -wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav')) -df_test = pd.DataFrame(index=[], columns=list(df.keys())) -for wav_file in wav_file_list: - filename = os.path.basename(wav_file) - df_ = df[df['filename'].str.match(filename)] - df_test = pd.concat([df_test, df_]) +#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') #output = pyhtk.recognition( # os.path.join(default.htk_dir, 'config', 'config.rec', @@ -102,58 +100,21 @@ for wav_file in wav_file_list: # os.path.join(config_dir, 'phonelist.txt'), # hvite_scp) -htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']] + #pyhtk.create_label_file( + # row['word'], + # os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) -ipa = 'e:χ' -fame_functions.ipa2htk(ipa) - - - -# Filename, Word, Self Xsampa -df = pd.read_excel(xls, 'original') - -ipas = [] -famehtks = [] -for xsampa in df['Self Xsampa']: - if not isinstance(xsampa, float): # 'NaN' - # typo? - xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t') - xsampa = xsampa.replace(';', ':') - - ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa) - ipa = ipa.replace('ː', ':') - ipa = ipa.replace(' ', '') - ipas.append(ipa) - famehtk = convert_phone_set.ipa2famehtk(ipa) - famehtks.append(famehtk) - else: - ipas.append('') - famehtks.append('') - -# extract interesting cols. -df = pd.DataFrame({'filename': df['Filename'], - 'word': df['Word'], - 'xsampa': df['Self Xsampa'], - 'ipa': pd.Series(ipas), - 'famehtk': pd.Series(famehtks)}) -# cleansing. -df = df[~df['famehtk'].isin(['/', ''])] - -word_list = np.unique(df['word']) - - -## ======================= make dict files used for HTK. ====================== -if make_htk_dict_files: - output_type = 3 - - for word in word_list: - htk_dict_file = htk_dict_dir + '\\' + word + '.dic' - - # pronunciation variant of the target word. - pronvar_ = df['famehtk'][df['word'].str.match(word)] +## ======================= make a HTK dic file ====================== +#if make_htk_dic_file: +# output_type = 3 +dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic') +#for word in word_list: +word = word_list[2] +# pronunciation variant of the target word. +pronunciations = df_test['asr'][df_test['word'].str.match(word)] # make dic file. - am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type) + #am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type) ## ======================= forced alignment using HTK ======================= diff --git a/acoustic_model/stimmen_functions.py b/acoustic_model/stimmen_functions.py index 9d28093..a272d42 100644 --- a/acoustic_model/stimmen_functions.py +++ b/acoustic_model/stimmen_functions.py @@ -1,13 +1,15 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') +import glob import pandas as pd import convert_xsampa2ipa import defaultfiles as default +import fame_functions -def load_transcriptions(): +def _load_transcriptions(): stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx) df = pd.read_excel(stimmen_transcription, 'original') @@ -34,5 +36,48 @@ def load_transcriptions(): 'word': df['Word'], 'xsampa': df['Self Xsampa'], 'ipa': pd.Series(ipas)}) - df_ = df_[~df_['ipa'].str.contains('/')] + + # not valid inputs, but seperator. + df_ = df_[~df_['ipa'].str.contains('/')] return df_.dropna() + + +def load_transcriptions(): + """ in default.stimmen_transcription_xlsx + rows of which wav files can be easily found""" + df = _load_transcriptions() + df_ = pd.DataFrame(index=[], columns=list(df.keys())) + for index, row in df.iterrows(): + filename = row['filename'] + if isinstance(filename, str): + wav_file = os.path.join(default.stimmen_wav_dir, filename) + if os.path.exists(wav_file): + df_ = df_.append(row, ignore_index=True) + return df_ + + +def load_transcriptions_clean(clean_wav_dir): + df = _load_transcriptions() + wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav')) + df_clean = pd.DataFrame(index=[], columns=list(df.keys())) + for wav_file in wav_file_list: + filename = os.path.basename(wav_file) + df_ = df[df['filename'].str.match(filename)] + df_clean = pd.concat([df_clean, df_]) + return df_clean + + +def add_row_htk(df): + """ df['htk'] is made from df['ipa'] and added. """ + htk = [] + for index, row in df.iterrows(): + htk.append(fame_functions.ipa2htk(row['ipa'])) + return df.assign(htk=htk) + + +def add_row_asr(df): + """ df['asr'] is made from df['ipa'] and added. """ + asr = [] + for index, row in df.iterrows(): + asr.append(fame_functions.ipa2asr(row['ipa'])) + return df.assign(asr=asr) diff --git a/acoustic_model/stimmen_test.py b/acoustic_model/stimmen_test.py index 8cbdace..60e96eb 100644 --- a/acoustic_model/stimmen_test.py +++ b/acoustic_model/stimmen_test.py @@ -1,9 +1,7 @@ import os os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') import sys - import shutil -import glob #import numpy as np import pandas as pd @@ -24,23 +22,27 @@ from htk import pyhtk ## ======================= make test data ====================== -# copy wav files which is in the stimmen data. stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test' -fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav') +## copy wav files which is in the stimmen data. df = stimmen_functions.load_transcriptions() -#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] -#word_list = sorted(word_list) - #for index, row in df.iterrows(): # filename = row['filename'] -# if isinstance(filename, str): -# wav_file = os.path.join(default.stimmen_wav_dir, filename) -# if os.path.exists(wav_file): -# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename)) -# pyhtk.create_label_file( -# row['word'], -# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) +# wav_file = os.path.join(default.stimmen_wav_dir, filename) +# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename)) + +# after manually removed files which has too much noise and multiple words... +# update the info. +df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir) + +# count how many files are removed due to the quality. +word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] +word_list = sorted(word_list) +for word in word_list: + df_ = df[df['word']==word] + df_clean_ = df_clean[df_clean['word']==word] + print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format( + word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100)) ## check phones included in stimmen but not in FAME! @@ -59,3 +61,4 @@ for ipa in df['ipa']: ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones) if ':' in ipa_splitted: print(ipa_splitted) +