import os
# NOTE(review): the working directory is switched at import time, before the
# project-local imports below -- presumably so that those modules can be
# found. Confirm before moving or removing this line.
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import glob

import pandas as pd

import convert_xsampa2ipa
import defaultfiles as default
import fame_functions
import novoapi_functions


def _filter_rows(df, keep):
    """Return the rows of df flagged in keep, re-indexed from 0.

    Args:
        df (pandas.DataFrame): table to filter.
        keep (list of bool): one flag per row of df, in row order.

    Returns:
        (pandas.DataFrame): the selected rows with a fresh 0..n-1 index.
    """
    # A numpy bool array is always interpreted as a positional row mask,
    # even when it is empty or when df has duplicated index labels.
    mask = pd.Series(keep, dtype=bool).values
    return df.loc[mask].reset_index(drop=True)


def _load_transcriptions():
    """Load the sheet 'original' of default.stimmen_transcription_xlsx.

    The column 'Self Xsampa' is converted into IPA.

    Returns:
        (pandas.DataFrame): columns 'filename', 'word', 'xsampa' and 'ipa'.
            Rows of which ipa contains '/' and rows with missing values
            are removed.
    """
    # the workbook handle is closed as soon as the sheet is read.
    with pd.ExcelFile(default.stimmen_transcription_xlsx) as stimmen_transcription:
        df = pd.read_excel(stimmen_transcription, 'original')

    # mapping from ipa to xsampa
    mapping = convert_xsampa2ipa.load_converter(
        'xsampa', 'ipa', default.ipa_xsampa_converter_dir)
    # sanity check of the converter against the 'IPA' column (disabled):
    #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
    #    ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
    #    if not ipa_converted == ipa:
    #        print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))

    ipas = []
    for xsampa in df['Self Xsampa']:
        if isinstance(xsampa, float):
            # empty cell ('NaN'): keep a placeholder so that the rows stay aligned.
            ipas.append('')
            continue
        # typo?
        xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t').replace(';', ':')
        ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
        ipas.append(ipa.replace('ː', ':').replace(' ', ''))

    df_ = pd.DataFrame({'filename': df['Filename'],
                        'word': df['Word'],
                        'xsampa': df['Self Xsampa'],
                        'ipa': pd.Series(ipas)})

    # '/' is not a valid input, but a separator.
    df_ = df_[~df_['ipa'].str.contains('/')]
    return df_.dropna()


def load_transcriptions():
    """Load the transcriptions of which wav files can be found.

    Returns:
        (pandas.DataFrame): rows of _load_transcriptions() whose 'filename'
            is a string and exists in default.stimmen_wav_dir.
            The index is renumbered from 0.
    """
    df = _load_transcriptions()
    has_wav = [
        isinstance(filename, str)
        and os.path.exists(os.path.join(default.stimmen_wav_dir, filename))
        for filename in df['filename']
    ]
    return _filter_rows(df, has_wav)


def load_transcriptions_clean(clean_wav_dir):
    """Load the transcriptions of the wav files stored in clean_wav_dir.

    Args:
        clean_wav_dir (path): directory which is searched for '*.wav'.

    Returns:
        (pandas.DataFrame): for each wav file found (in glob order), the rows
            of _load_transcriptions() whose 'filename' starts with the
            basename of that wav file. The original index is kept.
    """
    df = _load_transcriptions()
    wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))

    # the leading empty frame keeps the column layout when nothing matches.
    frames = [pd.DataFrame(index=[], columns=list(df.keys()))]
    for wav_file in wav_file_list:
        filename = os.path.basename(wav_file)
        # literal prefix comparison: the basename must not be treated as a
        # regular expression (e.g. '.' would match any character).
        # na=False skips cells which are not strings.
        frames.append(df[df['filename'].str.startswith(filename, na=False)])
    return pd.concat(frames)


def load_transcriptions_novo70(clean_wav_dir):
    """Extract rows of which ipa is written in novo70 phoneset.

    Args:
        clean_wav_dir (path): passed to load_transcriptions_clean().

    Returns:
        (pandas.DataFrame): rows for which
            novoapi_functions.phones_not_in_novo70() returns nothing.
            The index is renumbered from 0.
    """
    df = load_transcriptions_clean(clean_wav_dir)
    in_novo70 = [
        len(novoapi_functions.phones_not_in_novo70(ipa)) == 0
        for ipa in df['ipa']
    ]
    return _filter_rows(df, in_novo70)


def add_row_htk(df):
    """ df['htk'] is made from df['ipa'] and added.

    Args:
        df (pandas.DataFrame): table with the column 'ipa'.

    Returns:
        (pandas.DataFrame): copy of df with the extra column 'htk'.
    """
    htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
    return df.assign(htk=htk)


def add_row_asr(df):
    """ df['asr'] is made from df['ipa'] and added.

    Args:
        df (pandas.DataFrame): table with the column 'ipa'.

    Returns:
        (pandas.DataFrame): copy of df with the extra column 'asr'.
    """
    asr = [fame_functions.ipa2asr(ipa) for ipa in df['ipa']]
    return df.assign(asr=asr)


def load_pronunciations(WORD, htk_dic):
    """ load pronunciation variants from HTK dic file.

    Args:
        WORD (str): word in capital letters.
        htk_dic (path): HTK dict file.

    Returns:
        (pronunciations) (list): pronunciation variants of WORD.

    Notes:
        Because this function loads all contents from htk_dic file,
        it is not recommended to use for large lexicon.

    """
    with open(htk_dic) as f:
        # every occurrence of ' sil' is removed before the lines are parsed.
        lines = f.read().replace(' sil', '').split('\n')

    pronunciations = []
    for line in lines:
        # the first space separates the word from its pronunciation.
        word, _, pronunciation = line.partition(' ')
        if word == WORD:
            pronunciations.append(pronunciation)
    return pronunciations