You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

119 lines
3.4 KiB

import os
import glob
import pandas as pd
import convert_xsampa2ipa
import defaultfiles as default
import fame_functions
import novoapi_functions
def _load_transcriptions():
stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription, 'original')
# mapping from ipa to xsampa
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
ipas = []
for xsampa in df['Self Xsampa']:
if not isinstance(xsampa, float): # 'NaN'
# typo?
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t').replace(';', ':')
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
ipa = ipa.replace('ː', ':').replace(' ', '')
df_ = pd.DataFrame({'filename': df['Filename'],
'word': df['Word'],
'xsampa': df['Self Xsampa'],
'ipa': pd.Series(ipas)})
# not valid inputs, but seperator.
df_ = df_[~df_['ipa'].str.contains('/')]
return df_.dropna()
def load_transcriptions():
""" in default.stimmen_transcription_xlsx
rows of which wav files can be easily found"""
df = _load_transcriptions()
df_ = pd.DataFrame(index=[], columns=list(df.keys()))
for index, row in df.iterrows():
filename = row['filename']
if isinstance(filename, str):
wav_file = os.path.join(default.stimmen_wav_dir, filename)
if os.path.exists(wav_file):
df_ = df_.append(row, ignore_index=True)
return df_
def load_transcriptions_clean(clean_wav_dir):
df = _load_transcriptions()
wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))
df_clean = pd.DataFrame(index=[], columns=list(df.keys()))
for wav_file in wav_file_list:
filename = os.path.basename(wav_file)
df_ = df[df['filename'].str.match(filename)]
df_clean = pd.concat([df_clean, df_])
return df_clean
def load_transcriptions_novo70(clean_wav_dir):
""" extract rows of which ipa is written in novo70 phonset. """
df = load_transcriptions_clean(clean_wav_dir)
df_novo70 = pd.DataFrame(index=[], columns=list(df.keys()))
for index, row in df.iterrows():
not_in_novo70 = novoapi_functions.phones_not_in_novo70(row['ipa'])
if len(not_in_novo70) == 0:
df_novo70 = df_novo70.append(row, ignore_index=True)
return df_novo70
def add_row_htk(df):
""" df['htk'] is made from df['ipa'] and added. """
htk = []
for index, row in df.iterrows():
return df.assign(htk=htk)
def add_row_asr(df):
""" df['asr'] is made from df['ipa'] and added. """
asr = []
for index, row in df.iterrows():
return df.assign(asr=asr)
def load_pronunciations(WORD, htk_dic):
""" load pronunciation variants from HTK dic file.
WORD (str): word in capital letters.
htk_dic (path): HTK dict file.
(pronunciations) (list): pronunciation variants of WORD.
Because this function loads all contents from htk_dic file,
it is not recommended to use for large lexicon.
with open(htk_dic) as f:
lines =' sil', '')
lines = lines.split('\n')
return [' '.join(line.split(' ')[1:])
for line in lines if line.split(' ')[0]==WORD]