# Builds the HVite file list for the Stimmen test set and reports which
# phones occur in the Stimmen transcriptions but not in the FAME phone set.
#
# NOTE: the statement order below is deliberate. The working directory is
# changed before the project-local imports, and ``default.toolbox_dir`` is
# appended to ``sys.path`` before ``file_handling`` / ``htk`` are imported.
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import shutil
import glob

#import numpy as np
import pandas as pd

import defaultfiles as default
import convert_xsampa2ipa
import stimmen_functions
import fame_functions
import convert_phoneset
from phoneset import fame_ipa, fame_asr
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk


## ======================= user define =======================
# directory holding the wav files of the stimmen test data.
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'

# script file handed to fh.make_filelist below.
# NOTE(review): this name was used but never defined in the script, which
# raised NameError on a fresh run. The location chosen here is an
# assumption -- confirm where the HVite script file should live.
hvite_scp = os.path.join(stimmen_test_dir, 'hvite.scp')


## ======================= make test data ======================
# list the wav files that are in the stimmen test directory.
fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')

df = stimmen_functions.load_transcriptions()

# The commented-out code below copied the wav files listed in the
# transcriptions into stimmen_test_dir and created a label file for each.
#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
#word_list = sorted(word_list)
#for index, row in df.iterrows():
#    filename = row['filename']
#    if isinstance(filename, str):
#        wav_file = os.path.join(default.stimmen_wav_dir, filename)
#        if os.path.exists(wav_file):
#            shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
#            pyhtk.create_label_file(
#                row['word'],
#                os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))


## check phones included in stimmen but not in FAME!
# each transcription is split into phones and re-joined with single spaces.
splitted_ipas = [
    ' '.join(
        convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
    for ipa in df['ipa']]

# Split on whitespace before building the set: calling set() on the joined
# string itself yields single characters, which breaks multi-character
# phones apart and adds the space separator to the result.
stimmen_phones = sorted(set(' '.join(splitted_ipas).split()))

# sorted() returns a copy, so the list owned by the phoneset module is not
# reordered in place as a side effect of this script.
fame_phones = sorted(fame_ipa.phoneset)

print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
    set(stimmen_phones) - set(fame_phones)
    ))

# print every split transcription that contains a bare ':' element.
for ipa in df['ipa']:
    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    if ':' in ipa_splitted:
        print(ipa_splitted)