import os
# the working directory is switched before the project modules are imported.
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import shutil
from collections import Counter

import numpy as np
import pandas as pd

import defaultfiles as default
import convert_xsampa2ipa
import stimmen_functions
import fame_functions
import convert_phoneset
from phoneset import fame_ipa, fame_asr
# NOTE(review): presumably file_handling and htk live in the toolbox
# directory, which is why it is appended to sys.path first -- confirm.
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk


## ======================= user define =======================

## ======================= make test data ======================
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'

## copy the wav files that are listed in the stimmen transcriptions.
## (the copy loop is kept commented out; only the transcriptions are loaded.)
df = stimmen_functions.load_transcriptions()
#for index, row in df.iterrows():
#    filename = row['filename']
#    wav_file = os.path.join(default.stimmen_wav_dir, filename)
#    shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))

# files with too much noise or with multiple words were removed by hand
# from stimmen_test_dir; reload the transcriptions for what is left there.
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)

# report, per word, how many files survived the manual quality check.
word_list = sorted(w for w in set(df['word']) if not pd.isnull(w))
for word in word_list:
    df_ = df[df['word'] == word]
    df_clean_ = df_clean[df_clean['word'] == word]
    n_clean = len(df_clean_)
    n_total = len(df_)
    print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
        word, n_clean, n_total, n_clean / n_total * 100))


## check phones included in stimmen but not in FAME!
# every transcription, split into phones and re-joined with single spaces.
# NOTE(review): assumes split_word() returns a sequence of phone strings -- confirm.
splitted_ipas = [' '.join(
    convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
    for ipa in df['ipa']]

# collect phones (whitespace-separated tokens), not single characters:
# set() applied directly to the joined string would break multi-character
# phones apart and would count the separator itself as a phone.
stimmen_phones = sorted(set(' '.join(splitted_ipas).split()))
# sort a copy so that the phoneset owned by the fame_ipa module is not
# reordered as a side effect of this script.
fame_phones = sorted(fame_ipa.phoneset)
print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
    set(stimmen_phones) - set(fame_phones)
    ))

# show the transcriptions in which ':' is left over as an item of its own.
for ipa in df['ipa']:
    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    if ':' in ipa_splitted:
        print(ipa_splitted)


## check pronunciation variants
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df_clean = stimmen_functions.add_row_asr(df_clean)
df_clean = stimmen_functions.add_row_htk(df_clean)

for word in word_list:
    #word = word_list[1]
    df_ = df_clean[df_clean['word'] == word]
    c = Counter(df_['htk'])
    # keep only the variants which occur more than three times.
    pronunciations = {key: value for key, value in c.items() if value > 3}
    print(pronunciations)


monophone_mlf = os.path.join(default.htk_dir, 'label', 'train_phone_aligned.mlf')
triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')


def filenames_in_mlf(file_mlf):
    """Return the label file names listed in a master label file (MLF).

    A file entry is recognised as a line enclosed in double quotes, e.g.
    "*/name.lab". The quotes and the '*/' prefix are removed from the
    returned names. Selecting on the quotes (instead of "any line with a
    single token") keeps single-token label lines and blank lines out of
    the result, and does not depend on the header being the first line or
    on the file ending with a newline.

    Args:
        file_mlf: path to the MLF file.

    Returns:
        list of str: the file names in the order they appear in the file.
    """
    with open(file_mlf) as f:
        lines = f.read().splitlines()

    filenames = []
    for line in lines:
        entry = line.strip()
        # len >= 2 rules out a line that consists of one lone quote.
        if len(entry) >= 2 and entry.startswith('"') and entry.endswith('"'):
            filenames.append(entry.replace('"', '').replace('*/', ''))
    return filenames


filenames_mono = filenames_in_mlf(monophone_mlf)
filenames_tri = filenames_in_mlf(triphone_mlf)