dataset is made.
This commit is contained in:
parent
f6e563ecd3
commit
8f89f60538
Binary file not shown.
@ -343,6 +343,16 @@ def word2htk(word):
|
||||
return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
|
||||
|
||||
|
||||
def ipa2asr(ipa):
    """Convert an IPA transcription into the ASR phone notation.

    The transcription is split into phones with
    ``convert_phoneset.split_word`` (using
    ``fame_ipa.multi_character_phones``), passed through
    ``fame_ipa.phone_reduction`` and then mapped with the translation key
    stored in ``phoneset/fame_ipa2asr.npy`` next to this module.

    Args:
        ipa: the transcription to convert (presumably a string holding one
            word in IPA -- confirm against the callers).

    Returns:
        The converted phones joined into one string without a separator.
    """
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    key_file = os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')
    # NOTE(review): the key is presumably a dict pickled inside a 0-d object
    # array (hence .item(0)).  numpy >= 1.16.3 refuses to load object arrays
    # unless allow_pickle=True is given.  The file ships with this package,
    # so it is treated as trusted input.
    translation_key_ipa2asr = np.load(key_file, allow_pickle=True).item(0)

    ipa_splitted = convert_phoneset.split_word(
        ipa, fame_ipa.multi_character_phones)
    ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
    asr_splitted = convert_phoneset.convert_phoneset(
        ipa_splitted, translation_key_ipa2asr)
    return ''.join(asr_splitted)
|
||||
|
||||
|
||||
def ipa2htk(ipa):
|
||||
curr_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
|
||||
|
@ -9,7 +9,7 @@ import sys
|
||||
import shutil
|
||||
import glob
|
||||
|
||||
#import numpy as np
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
#import matplotlib.pyplot as plt
|
||||
#from sklearn.metrics import confusion_matrix
|
||||
@ -75,24 +75,22 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc')
|
||||
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
|
||||
|
||||
|
||||
## ======================= make test data ======================
|
||||
# copy wav files which are in the stimmen data.
|
||||
## ======================= load test data ======================
|
||||
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
|
||||
fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
|
||||
|
||||
df = stimmen_functions.load_transcriptions()
|
||||
df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
|
||||
df = stimmen_functions.add_row_asr(df)
|
||||
df = stimmen_functions.add_row_htk(df)
|
||||
|
||||
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
|
||||
word_list = sorted(word_list)
|
||||
|
||||
# pronunciation variants
|
||||
for word in word_list:
|
||||
df_ = df[df['word']==word]
|
||||
print('{0} has {1} variants'.format(word, len(np.unique(df_['htk'])))
|
||||
|
||||
# after manually removing files which do not contain clear sound,
|
||||
# update df as df_test.
|
||||
wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
|
||||
df_test = pd.DataFrame(index=[], columns=list(df.keys()))
|
||||
for wav_file in wav_file_list:
|
||||
filename = os.path.basename(wav_file)
|
||||
df_ = df[df['filename'].str.match(filename)]
|
||||
df_test = pd.concat([df_test, df_])
|
||||
#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
|
||||
|
||||
#output = pyhtk.recognition(
|
||||
# os.path.join(default.htk_dir, 'config', 'config.rec',
|
||||
@ -102,58 +100,21 @@ for wav_file in wav_file_list:
|
||||
# os.path.join(config_dir, 'phonelist.txt'),
|
||||
# hvite_scp)
|
||||
|
||||
htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
|
||||
#pyhtk.create_label_file(
|
||||
# row['word'],
|
||||
# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
|
||||
|
||||
ipa = 'e:χ'
|
||||
fame_functions.ipa2htk(ipa)
|
||||
|
||||
|
||||
|
||||
# Filename, Word, Self Xsampa
|
||||
df = pd.read_excel(xls, 'original')
|
||||
|
||||
ipas = []
|
||||
famehtks = []
|
||||
for xsampa in df['Self Xsampa']:
|
||||
if not isinstance(xsampa, float): # 'NaN'
|
||||
# typo?
|
||||
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
|
||||
xsampa = xsampa.replace(';', ':')
|
||||
|
||||
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
|
||||
ipa = ipa.replace('ː', ':')
|
||||
ipa = ipa.replace(' ', '')
|
||||
ipas.append(ipa)
|
||||
famehtk = convert_phone_set.ipa2famehtk(ipa)
|
||||
famehtks.append(famehtk)
|
||||
else:
|
||||
ipas.append('')
|
||||
famehtks.append('')
|
||||
|
||||
# extract interesting cols.
|
||||
df = pd.DataFrame({'filename': df['Filename'],
|
||||
'word': df['Word'],
|
||||
'xsampa': df['Self Xsampa'],
|
||||
'ipa': pd.Series(ipas),
|
||||
'famehtk': pd.Series(famehtks)})
|
||||
# cleansing.
|
||||
df = df[~df['famehtk'].isin(['/', ''])]
|
||||
|
||||
word_list = np.unique(df['word'])
|
||||
|
||||
|
||||
## ======================= make dict files used for HTK. ======================
|
||||
if make_htk_dict_files:
|
||||
output_type = 3
|
||||
|
||||
for word in word_list:
|
||||
htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
|
||||
|
||||
# pronunciation variant of the target word.
|
||||
pronvar_ = df['famehtk'][df['word'].str.match(word)]
|
||||
## ======================= make a HTK dic file ======================
|
||||
#if make_htk_dic_file:
|
||||
# output_type = 3
|
||||
dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic')
|
||||
#for word in word_list:
|
||||
word = word_list[2]
|
||||
# pronunciation variant of the target word.
|
||||
pronunciations = df_test['asr'][df_test['word'].str.match(word)]
|
||||
|
||||
# make dic file.
|
||||
am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
|
||||
#am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
|
||||
|
||||
|
||||
## ======================= forced alignment using HTK =======================
|
||||
|
@ -1,13 +1,15 @@
|
||||
import os
|
||||
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||
import glob
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import convert_xsampa2ipa
|
||||
import defaultfiles as default
|
||||
import fame_functions
|
||||
|
||||
|
||||
def load_transcriptions():
|
||||
def _load_transcriptions():
|
||||
stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
|
||||
df = pd.read_excel(stimmen_transcription, 'original')
|
||||
|
||||
@ -34,5 +36,48 @@ def load_transcriptions():
|
||||
'word': df['Word'],
|
||||
'xsampa': df['Self Xsampa'],
|
||||
'ipa': pd.Series(ipas)})
|
||||
|
||||
# not valid inputs, but separators.
|
||||
df_ = df_[~df_['ipa'].str.contains('/')]
|
||||
return df_.dropna()
|
||||
|
||||
|
||||
def load_transcriptions():
    """Load the transcriptions whose wav file can be found.

    The rows come from ``_load_transcriptions()``; only those whose
    'filename' is a string naming an existing file in
    ``default.stimmen_wav_dir`` are kept.

    Returns:
        A DataFrame with the same columns as ``_load_transcriptions()``
        and a fresh 0..n-1 index.
    """
    df = _load_transcriptions()

    def _wav_exists(filename):
        # non-string cells (e.g. NaN) can never name a wav file.
        if not isinstance(filename, str):
            return False
        return os.path.exists(os.path.join(default.stimmen_wav_dir, filename))

    # DataFrame.append() was removed in pandas 2.0 and copied the whole frame
    # for every row; select all matching rows at once instead.
    mask = df['filename'].map(_wav_exists).astype(bool)
    return df.loc[mask].reset_index(drop=True)
|
||||
|
||||
|
||||
def load_transcriptions_clean(clean_wav_dir):
    """Load the transcriptions of the wav files found in clean_wav_dir.

    Args:
        clean_wav_dir: directory that is searched (non-recursively) for
            '*.wav' files.

    Returns:
        A DataFrame with the columns of ``_load_transcriptions()`` holding,
        for every wav file in glob order, the rows whose 'filename' equals
        the base name of that file.  The original row index is kept.
    """
    df = _load_transcriptions()
    wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))

    # start with an empty frame so that the result has the expected columns
    # (and pd.concat has something to work on) when no wav file is found.
    frames = [pd.DataFrame(index=[], columns=list(df.keys()))]
    for wav_file in wav_file_list:
        filename = os.path.basename(wav_file)
        # compare literally: str.match() would read the file name as a
        # regular expression anchored only at the start, so '.' would match
        # any character and longer file names would be selected as well.
        frames.append(df[df['filename'] == filename])
    return pd.concat(frames)
|
||||
|
||||
|
||||
def add_row_htk(df):
    """Return df extended with an 'htk' column.

    Each value is ``fame_functions.ipa2htk`` applied to the 'ipa' value of
    the same row.
    """
    htk_column = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
    return df.assign(htk=htk_column)
|
||||
|
||||
|
||||
def add_row_asr(df):
    """Return df extended with an 'asr' column.

    Each value is ``fame_functions.ipa2asr`` applied to the 'ipa' value of
    the same row.
    """
    asr_column = [fame_functions.ipa2asr(ipa) for ipa in df['ipa']]
    return df.assign(asr=asr_column)
|
||||
|
@ -1,9 +1,7 @@
|
||||
import os
|
||||
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||
import sys
|
||||
|
||||
import shutil
|
||||
import glob
|
||||
|
||||
#import numpy as np
|
||||
import pandas as pd
|
||||
@ -24,23 +22,27 @@ from htk import pyhtk
|
||||
|
||||
|
||||
## ======================= make test data ======================
|
||||
# copy wav files which are in the stimmen data.
|
||||
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
|
||||
fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
|
||||
|
||||
## copy wav files which are in the stimmen data.
|
||||
df = stimmen_functions.load_transcriptions()
|
||||
#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
|
||||
#word_list = sorted(word_list)
|
||||
|
||||
#for index, row in df.iterrows():
|
||||
# filename = row['filename']
|
||||
# if isinstance(filename, str):
|
||||
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
|
||||
# if os.path.exists(wav_file):
|
||||
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
|
||||
# pyhtk.create_label_file(
|
||||
# row['word'],
|
||||
# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
|
||||
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
|
||||
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
|
||||
|
||||
# after manually removing files which have too much noise and multiple words...
|
||||
# update the info.
|
||||
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
|
||||
|
||||
# count how many files are removed due to the quality.
|
||||
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
|
||||
word_list = sorted(word_list)
|
||||
for word in word_list:
|
||||
df_ = df[df['word']==word]
|
||||
df_clean_ = df_clean[df_clean['word']==word]
|
||||
print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
|
||||
word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100))
|
||||
|
||||
|
||||
## check phones included in stimmen but not in FAME!
|
||||
@ -59,3 +61,4 @@ for ipa in df['ipa']:
|
||||
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
|
||||
if ':' in ipa_splitted:
|
||||
print(ipa_splitted)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user