dataset is made.

This commit is contained in:
yemaozi88 2019-02-08 14:10:32 +01:00
parent f6e563ecd3
commit 8f89f60538
5 changed files with 97 additions and 78 deletions

Binary file not shown.

View File

@ -343,6 +343,16 @@ def word2htk(word):
return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
def ipa2asr(ipa):
    """Convert an IPA transcription into the FAME ASR phone set.

    The word is split into phones with ``convert_phoneset.split_word``
    (using ``fame_ipa.multi_character_phones``), reduced with
    ``fame_ipa.phone_reduction``, mapped through the translation key stored
    in ``phoneset/fame_ipa2asr.npy`` next to this module, and joined back
    into a single string.

    Args:
        ipa: transcription written with IPA symbols.

    Returns:
        The concatenation of the converted phones.
    """
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    key_file = os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')
    # The .npy file presumably holds a pickled mapping (it is unpacked with
    # .item(0) and used as a translation key).  NumPy >= 1.16.3 refuses to
    # load pickled object arrays unless allow_pickle is given explicitly.
    # NOTE(review): only load this file from a trusted location.
    translation_key_ipa2asr = np.load(key_file, allow_pickle=True).item(0)

    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
    asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
    return ''.join(asr_splitted)
def ipa2htk(ipa):
curr_dir = os.path.dirname(os.path.abspath(__file__))
translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)

View File

@ -9,7 +9,7 @@ import sys
import shutil
import glob
#import numpy as np
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#from sklearn.metrics import confusion_matrix
@ -75,24 +75,22 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc')
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
## ======================= make test data ======================
# copy the wav files which are in the stimmen data.
## ======================= load test data ======================
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
df = stimmen_functions.load_transcriptions()
df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df = stimmen_functions.add_row_asr(df)
df = stimmen_functions.add_row_htk(df)
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)
# pronunciation variants
# Report, for every word, how many distinct HTK transcriptions it has.
for word in word_list:
    df_ = df[df['word']==word]
    # the original call was missing the parenthesis that closes print(...).
    print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))))
# after manually removing the files which do not contain clear sound,
# update df as df_test.
wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
df_test = pd.DataFrame(index=[], columns=list(df.keys()))
for wav_file in wav_file_list:
filename = os.path.basename(wav_file)
df_ = df[df['filename'].str.match(filename)]
df_test = pd.concat([df_test, df_])
#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
#output = pyhtk.recognition(
# os.path.join(default.htk_dir, 'config', 'config.rec',
@ -102,58 +100,21 @@ for wav_file in wav_file_list:
# os.path.join(config_dir, 'phonelist.txt'),
# hvite_scp)
htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
#pyhtk.create_label_file(
# row['word'],
# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
ipa = 'e:χ'
fame_functions.ipa2htk(ipa)
# Filename, Word, Self Xsampa
df = pd.read_excel(xls, 'original')
ipas = []
famehtks = []
for xsampa in df['Self Xsampa']:
if not isinstance(xsampa, float): # 'NaN'
# typo?
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
xsampa = xsampa.replace(';', ':')
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
ipa = ipa.replace('ː', ':')
ipa = ipa.replace(' ', '')
ipas.append(ipa)
famehtk = convert_phone_set.ipa2famehtk(ipa)
famehtks.append(famehtk)
else:
ipas.append('')
famehtks.append('')
# extract interesting cols.
df = pd.DataFrame({'filename': df['Filename'],
'word': df['Word'],
'xsampa': df['Self Xsampa'],
'ipa': pd.Series(ipas),
'famehtk': pd.Series(famehtks)})
# cleansing.
df = df[~df['famehtk'].isin(['/', ''])]
word_list = np.unique(df['word'])
## ======================= make dict files used for HTK. ======================
if make_htk_dict_files:
output_type = 3
for word in word_list:
htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
# pronunciation variant of the target word.
pronvar_ = df['famehtk'][df['word'].str.match(word)]
## ======================= make a HTK dic file ======================
#if make_htk_dic_file:
# output_type = 3
dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic')
#for word in word_list:
word = word_list[2]
# pronunciation variant of the target word.
pronunciations = df_test['asr'][df_test['word'].str.match(word)]
# make dic file.
am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
#am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
## ======================= forced alignment using HTK =======================

View File

@ -1,13 +1,15 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import glob
import pandas as pd
import convert_xsampa2ipa
import defaultfiles as default
import fame_functions
def load_transcriptions():
def _load_transcriptions():
stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription, 'original')
@ -34,5 +36,48 @@ def load_transcriptions():
'word': df['Word'],
'xsampa': df['Self Xsampa'],
'ipa': pd.Series(ipas)})
# not valid inputs, but separators.
df_ = df_[~df_['ipa'].str.contains('/')]
return df_.dropna()
def load_transcriptions():
    """Return the transcription rows whose wav file can be found.

    Rows come from ``_load_transcriptions()``.  A row is kept when its
    'filename' is a string and that file exists in
    ``default.stimmen_wav_dir``.

    Returns:
        A DataFrame with the same columns as ``_load_transcriptions()`` and
        a fresh 0..n-1 index.
    """
    df = _load_transcriptions()

    def _has_wav(filename):
        # non-string entries (e.g. NaN) cannot point to a wav file.
        if not isinstance(filename, str):
            return False
        return os.path.exists(os.path.join(default.stimmen_wav_dir, filename))

    # Build one boolean mask instead of appending row by row:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    keep = pd.Series(
        [_has_wav(filename) for filename in df['filename']],
        index=df.index, dtype=bool)
    return df[keep].reset_index(drop=True)
def load_transcriptions_clean(clean_wav_dir):
    """Return the transcription rows that have a wav file in clean_wav_dir.

    Args:
        clean_wav_dir: directory holding the wav files that were kept after
            manual inspection.

    Returns:
        The rows of ``_load_transcriptions()`` whose 'filename' equals the
        base name of a ``*.wav`` file in ``clean_wav_dir``, grouped in the
        order in which glob lists those files.
    """
    df = _load_transcriptions()

    wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))
    # Compare file names literally.  Series.str.match interprets its argument
    # as a regular expression anchored only at the start, so '.' matched any
    # character, 'a.wav' also selected 'a.wav.bak', and names containing
    # regex metacharacters could fail or mismatch.
    matched = [df[df['filename'] == os.path.basename(wav_file)]
               for wav_file in wav_file_list]
    if not matched:
        # pd.concat refuses an empty list; return an empty frame that keeps
        # the columns.
        return df.iloc[0:0]
    # A single concat avoids copying the accumulated frame per wav file.
    return pd.concat(matched)
def add_row_htk(df):
    """Return a copy of df with an 'htk' column derived from df['ipa'].

    Each value of the 'ipa' column is passed through
    ``fame_functions.ipa2htk``.
    """
    converted = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
    return df.assign(htk=converted)
def add_row_asr(df):
    """Return a copy of df with an 'asr' column derived from df['ipa'].

    Each value of the 'ipa' column is passed through
    ``fame_functions.ipa2asr``.
    """
    converted = [fame_functions.ipa2asr(ipa) for ipa in df['ipa']]
    return df.assign(asr=converted)

View File

@ -1,9 +1,7 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import shutil
import glob
#import numpy as np
import pandas as pd
@ -24,23 +22,27 @@ from htk import pyhtk
## ======================= make test data ======================
# copy wav files which is in the stimmen data.
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
## copy wav files which is in the stimmen data.
df = stimmen_functions.load_transcriptions()
#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
#word_list = sorted(word_list)
#for index, row in df.iterrows():
# filename = row['filename']
# if isinstance(filename, str):
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
# if os.path.exists(wav_file):
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
# pyhtk.create_label_file(
# row['word'],
# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
# after manually removing the files which have too much noise or contain multiple words,
# update the info.
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
# count how many files are removed due to the quality.
# unique, non-null words in alphabetical order.
word_list = sorted(w for w in set(df['word']) if not pd.isnull(w))
for word in word_list:
    df_ = df[df['word']==word]
    df_clean_ = df_clean[df_clean['word']==word]
    n_all = len(df_)
    n_clean = len(df_clean_)
    # share of this word's recordings that are still in the clean set.
    message = 'word {0} has {1} clean files among {2} files ({3:.2f} [%]).'
    print(message.format(word, n_clean, n_all, n_clean/n_all*100))
## check phones included in stimmen but not in FAME!
@ -59,3 +61,4 @@ for ipa in df['ipa']:
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
if ':' in ipa_splitted:
print(ipa_splitted)