commit 8f89f60538 (parent f6e563ecd3)

    dataset is made.

Binary file not shown.

@@ -343,6 +343,16 @@ def word2htk(word):
     return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
 
 
+def ipa2asr(ipa):
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
+
+    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
+    ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
+    asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
+    return ''.join(asr_splitted)
+
+
 def ipa2htk(ipa):
     curr_dir = os.path.dirname(os.path.abspath(__file__))
     translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)

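The new ipa2asr mirrors the existing ipa2htk but targets the FAME ASR phone set. A minimal sketch of how it might be called, assuming the repo modules are importable and the regenerated fame_ipa2asr.npy (the binary file above) covers the input; the example IPA string itself is hypothetical:

```python
# Hedged sketch: exercises the ipa2asr added in this commit.
# Assumes phoneset/fame_ipa2asr.npy exists and maps every phone in the input.
import fame_functions

ipa = 'o:r'  # hypothetical IPA transcription (length marks use ':' as in the repo)
print(fame_functions.ipa2asr(ipa))  # joined FAME ASR phone string
```
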
@@ -9,7 +9,7 @@ import sys
 import shutil
 import glob
 
-#import numpy as np
+import numpy as np
 import pandas as pd
 #import matplotlib.pyplot as plt
 #from sklearn.metrics import confusion_matrix

@@ -75,24 +75,22 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc')
 hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
 
 
-## ======================= make test data ======================
-# copy wav files which are in the stimmen data.
+## ======================= load test data ======================
 stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
-fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
 
-df = stimmen_functions.load_transcriptions()
+df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
+df = stimmen_functions.add_row_asr(df)
+df = stimmen_functions.add_row_htk(df)
 
 word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
 word_list = sorted(word_list)
 
+# pronunciation variants
+for word in word_list:
+    df_ = df[df['word']==word]
+    print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))))
 
-# after manually removed files which do not contain clear sound,
-# update df as df_test.
-wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
-df_test = pd.DataFrame(index=[], columns=list(df.keys()))
-for wav_file in wav_file_list:
-    filename = os.path.basename(wav_file)
-    df_ = df[df['filename'].str.match(filename)]
-    df_test = pd.concat([df_test, df_])
+#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
 
 #output = pyhtk.recognition(
 #    os.path.join(default.htk_dir, 'config', 'config.rec',

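The rewritten block replaces the ad-hoc df_test assembly with three helpers added later in this commit. A sketch of the resulting flow, under the assumption that the cleaned wav directory and the transcription spreadsheet line up; column names follow the diff:

```python
# Sketch of the new loading pipeline introduced by this commit.
import stimmen_functions

df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df = stimmen_functions.add_row_asr(df)   # adds an 'asr' column derived from 'ipa'
df = stimmen_functions.add_row_htk(df)   # adds an 'htk' column derived from 'ipa'
print(df[['filename', 'word', 'ipa', 'asr', 'htk']].head())
```
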
@@ -102,58 +100,21 @@ for wav_file in wav_file_list:
 #    os.path.join(config_dir, 'phonelist.txt'),
 #    hvite_scp)
 
-htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
+#pyhtk.create_label_file(
+#    row['word'],
+#    os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
 
-ipa = 'e:χ'
-fame_functions.ipa2htk(ipa)
-
-# Filename, Word, Self Xsampa
-df = pd.read_excel(xls, 'original')
-ipas = []
-famehtks = []
-for xsampa in df['Self Xsampa']:
-    if not isinstance(xsampa, float):  # 'NaN'
-        # typo?
-        xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
-        xsampa = xsampa.replace(';', ':')
-
-        ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
-        ipa = ipa.replace('ː', ':')
-        ipa = ipa.replace(' ', '')
-        ipas.append(ipa)
-        famehtk = convert_phone_set.ipa2famehtk(ipa)
-        famehtks.append(famehtk)
-    else:
-        ipas.append('')
-        famehtks.append('')
-
-# extract interesting cols.
-df = pd.DataFrame({'filename': df['Filename'],
-                   'word': df['Word'],
-                   'xsampa': df['Self Xsampa'],
-                   'ipa': pd.Series(ipas),
-                   'famehtk': pd.Series(famehtks)})
-# cleansing.
-df = df[~df['famehtk'].isin(['/', ''])]
-
-word_list = np.unique(df['word'])
-
-
-## ======================= make dict files used for HTK. ======================
-if make_htk_dict_files:
-    output_type = 3
-
-    for word in word_list:
-        htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
-
-        # pronunciation variant of the target word.
-        pronvar_ = df['famehtk'][df['word'].str.match(word)]
-
-        # make dic file.
-        am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
+## ======================= make a HTK dic file ======================
+#if make_htk_dic_file:
+#    output_type = 3
+dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic')
+#for word in word_list:
+word = word_list[2]
+
+# pronunciation variant of the target word.
+pronunciations = df_test['asr'][df_test['word'].str.match(word)]
+
+# make dic file.
+#am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
 
 
 ## ======================= forced alignment using HTK =======================

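The dictionary write itself stays commented out (#am_func.make_htk_dict). If one wanted to materialize stimmen.dic from the collected pronunciations, a hedged sketch could look like the following; write_htk_dic is a hypothetical helper, not part of the repo, and it assumes each pronunciation is already a space-separated HTK phone string:

```python
# Hypothetical stand-in for the commented-out am_func.make_htk_dict call.
def write_htk_dic(word, pronunciations, dic_path):
    """Write one HTK dictionary entry per distinct pronunciation variant."""
    with open(dic_path, 'w', encoding='utf-8') as f:
        for pron in sorted(set(pronunciations)):
            f.write('{0}\t{1}\n'.format(word.upper(), pron))

write_htk_dic(word, pronunciations, dictionary_txt)
```
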
@@ -1,13 +1,15 @@
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+import glob
 
 import pandas as pd
 
 import convert_xsampa2ipa
 import defaultfiles as default
+import fame_functions
 
 
-def load_transcriptions():
+def _load_transcriptions():
     stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
     df = pd.read_excel(stimmen_transcription, 'original')

@@ -34,5 +36,48 @@ def load_transcriptions():
                        'word': df['Word'],
                        'xsampa': df['Self Xsampa'],
                        'ipa': pd.Series(ipas)})
-    df_ = df_[~df_['ipa'].str.contains('/')]
+    # not valid inputs, but separator.
+    df_ = df_[~df_['ipa'].str.contains('/')]
     return df_.dropna()
+
+
+def load_transcriptions():
+    """ in default.stimmen_transcription_xlsx,
+    rows for which wav files can easily be found """
+    df = _load_transcriptions()
+    df_ = pd.DataFrame(index=[], columns=list(df.keys()))
+    for index, row in df.iterrows():
+        filename = row['filename']
+        if isinstance(filename, str):
+            wav_file = os.path.join(default.stimmen_wav_dir, filename)
+            if os.path.exists(wav_file):
+                df_ = df_.append(row, ignore_index=True)
+    return df_
+
+
+def load_transcriptions_clean(clean_wav_dir):
+    df = _load_transcriptions()
+    wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))
+    df_clean = pd.DataFrame(index=[], columns=list(df.keys()))
+    for wav_file in wav_file_list:
+        filename = os.path.basename(wav_file)
+        df_ = df[df['filename'].str.match(filename)]
+        df_clean = pd.concat([df_clean, df_])
+    return df_clean
+
+
+def add_row_htk(df):
+    """ df['htk'] is made from df['ipa'] and added. """
+    htk = []
+    for index, row in df.iterrows():
+        htk.append(fame_functions.ipa2htk(row['ipa']))
+    return df.assign(htk=htk)
+
+
+def add_row_asr(df):
+    """ df['asr'] is made from df['ipa'] and added. """
+    asr = []
+    for index, row in df.iterrows():
+        asr.append(fame_functions.ipa2asr(row['ipa']))
+    return df.assign(asr=asr)

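add_row_htk and add_row_asr walk the frame with iterrows(); a more idiomatic pandas form is a single assign over the column. A behavior-equivalent sketch, assuming ipa2htk/ipa2asr are pure functions and df['ipa'] holds no NaNs (which _load_transcriptions' dropna() should guarantee):

```python
import fame_functions

# Equivalent sketch using Series.apply instead of row-by-row iteration.
df = df.assign(htk=df['ipa'].apply(fame_functions.ipa2htk),
               asr=df['ipa'].apply(fame_functions.ipa2asr))
```
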
@@ -1,9 +1,7 @@
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import sys
 
 import shutil
-import glob
-
 #import numpy as np
 import pandas as pd

@@ -24,23 +22,27 @@ from htk import pyhtk
 
 
 ## ======================= make test data ======================
-# copy wav files which are in the stimmen data.
 stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
-fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
 
+## copy wav files which are in the stimmen data.
 df = stimmen_functions.load_transcriptions()
-#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
-#word_list = sorted(word_list)
 
 #for index, row in df.iterrows():
 #    filename = row['filename']
-#    if isinstance(filename, str):
-#        wav_file = os.path.join(default.stimmen_wav_dir, filename)
-#        if os.path.exists(wav_file):
-#            shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
-#            pyhtk.create_label_file(
-#                row['word'],
-#                os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
+#    wav_file = os.path.join(default.stimmen_wav_dir, filename)
+#    shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
+
+# after manually removing files which have too much noise or multiple words,
+# update the info.
+df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
+
+# count how many files were removed due to quality.
+word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
+word_list = sorted(word_list)
+for word in word_list:
+    df_ = df[df['word']==word]
+    df_clean_ = df_clean[df_clean['word']==word]
+    print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
+        word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100))
 
 
 ## check phones included in stimmen but not in FAME!

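Beyond the per-word counts printed above, the overall attrition from the manual cleaning step can be read off the same two frames. A one-line sketch, relying on df and df_clean as defined in the new code:

```python
# Overall attrition: how many transcribed files survived manual cleaning.
n_all, n_clean = len(df), len(df_clean)
print('{0} of {1} files kept ({2:.2f} [%]).'.format(n_clean, n_all, n_clean / n_all * 100))
```
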
@@ -59,3 +61,4 @@ for ipa in df['ipa']:
     ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
     if ':' in ipa_splitted:
         print(ipa_splitted)
+

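The loop above only flags splits that still contain a bare ':' length mark. To list every stimmen phone missing from the FAME inventory, a sketch like the following could work; note that fame_ipa.phoneset is an assumed attribute name for the FAME IPA phone container, not confirmed by this diff:

```python
# Sketch: collect stimmen phones absent from the FAME phone inventory.
# 'fame_ipa.phoneset' is an assumption; substitute the actual container name.
import convert_phoneset
import fame_ipa

missing = set()
for ipa in df['ipa']:
    for phone in convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones):
        if phone not in fame_ipa.phoneset:
            missing.add(phone)
print(sorted(missing))
```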