84 lines
2.4 KiB
Python
84 lines
2.4 KiB
Python
import os
|
||
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||
import glob
|
||
|
||
import pandas as pd
|
||
|
||
import convert_xsampa2ipa
|
||
import defaultfiles as default
|
||
import fame_functions
|
||
|
||
|
||
def _load_transcriptions():
    """Load the transcription sheet and add an IPA column to it.

    Reads the 'original' sheet of ``default.stimmen_transcription_xlsx`` and
    converts each 'Self Xsampa' entry with ``convert_xsampa2ipa.xsampa2ipa``.

    Returns:
        A DataFrame with the columns 'filename', 'word', 'xsampa' and 'ipa'.
        Rows whose IPA string contains '/' and rows with a missing value in
        any column are removed.
    """
    # Open the workbook in a ``with`` block so the file handle is released as
    # soon as the sheet has been read.
    with pd.ExcelFile(default.stimmen_transcription_xlsx) as workbook:
        df = pd.read_excel(workbook, 'original')

    # Conversion table handed to xsampa2ipa (loaded for 'xsampa' -> 'ipa').
    mapping = convert_xsampa2ipa.load_converter(
        'xsampa', 'ipa', default.ipa_xsampa_converter_dir)

    ipas = []
    for xsampa in df['Self Xsampa']:
        # Empty cells are read as float NaN; keep a placeholder so that
        # ``ipas`` stays aligned with the rows of ``df``.
        if isinstance(xsampa, float):
            ipas.append('')
            continue

        # Repair what look like typos in the sheet before converting.
        xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t').replace(';', ':')

        ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
        # Use a plain colon as length mark and drop spaces.
        ipa = ipa.replace('ː', ':').replace(' ', '')
        ipas.append(ipa)

    df_ = pd.DataFrame({'filename': df['Filename'],
                        'word': df['Word'],
                        'xsampa': df['Self Xsampa'],
                        'ipa': pd.Series(ipas)})

    # A '/' is not a valid input but a separator, so such rows are discarded.
    df_ = df_[~df_['ipa'].str.contains('/')]
    return df_.dropna()
|
||
|
||
|
||
def load_transcriptions():
    """Return the transcription rows whose wav file exists on disk.

    Takes the rows produced by ``_load_transcriptions()`` and keeps those
    whose 'filename' is a string naming an existing file inside
    ``default.stimmen_wav_dir``.

    Returns:
        A DataFrame with the same columns as ``_load_transcriptions()`` and a
        fresh 0..n-1 index. It is empty (columns kept) when no wav file is
        found.
    """
    df = _load_transcriptions()

    def _wav_exists(filename):
        # Non-string cells cannot name a file, so they never match.
        if not isinstance(filename, str):
            return False
        return os.path.exists(os.path.join(default.stimmen_wav_dir, filename))

    # A boolean mask replaces the row-by-row ``DataFrame.append``, which was
    # removed in pandas 2.0 and copied the whole frame on every call.
    # ``astype(bool)`` keeps the mask boolean even when ``df`` is empty.
    found = df['filename'].map(_wav_exists).astype(bool)
    return df.loc[found].reset_index(drop=True)
|
||
|
||
|
||
def load_transcriptions_clean(clean_wav_dir):
    """Return the transcription rows for the wav files in ``clean_wav_dir``.

    Args:
        clean_wav_dir: directory that is searched (non-recursively) for
            ``*.wav`` files.

    Returns:
        A DataFrame with the same columns as ``_load_transcriptions()``,
        holding the rows whose 'filename' equals the base name of one of the
        wav files. Rows are grouped in the order ``glob`` lists the files and
        keep their original index labels. It is empty when nothing matches.
    """
    df = _load_transcriptions()
    wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))

    # Compare file names literally. ``Series.str.match`` would treat the name
    # as a regular expression anchored only at the start, so '.' would match
    # any character and 'a.wav' would also select 'a.wav.bak'.
    matches = [df[df['filename'] == os.path.basename(wav_file)]
               for wav_file in wav_file_list]

    if not matches:
        # ``pd.concat`` rejects an empty list; return an empty frame that
        # still has the expected columns.
        return pd.DataFrame(index=[], columns=list(df.keys()))
    return pd.concat(matches)
|
||
|
||
|
||
def add_row_htk(df):
    """Return a copy of ``df`` extended with an 'htk' column.

    Each 'htk' value is ``fame_functions.ipa2htk`` applied to the 'ipa' value
    of the same row. The frame passed in is left unchanged.
    """
    converted = [fame_functions.ipa2htk(record['ipa'])
                 for _, record in df.iterrows()]
    return df.assign(htk=converted)
|
||
|
||
|
||
def add_row_asr(df):
    """Return a copy of ``df`` extended with an 'asr' column.

    Each 'asr' value is ``fame_functions.ipa2asr`` applied to the 'ipa' value
    of the same row. The frame passed in is left unchanged.
    """
    converted = [fame_functions.ipa2asr(record['ipa'])
                 for _, record in df.iterrows()]
    return df.assign(asr=converted)
|