Moved the testing parts of htk_vs_kaldi.py into stimmen_test.py

This commit is contained in:
yemaozi88 2019-02-06 09:35:23 +01:00
parent da0242b0e1
commit f6e563ecd3
4 changed files with 69 additions and 40 deletions

Binary file not shown.

View File

@ -52,6 +52,7 @@
<Compile Include="phoneset\fame_asr.py" /> <Compile Include="phoneset\fame_asr.py" />
<Compile Include="phoneset\fame_ipa.py" /> <Compile Include="phoneset\fame_ipa.py" />
<Compile Include="stimmen_functions.py" /> <Compile Include="stimmen_functions.py" />
<Compile Include="stimmen_test.py" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Content Include="config.ini" /> <Content Include="config.ini" />

View File

@ -59,10 +59,6 @@ from htk import pyhtk
#load_forced_alignment_kaldi = 1 #load_forced_alignment_kaldi = 1
#eval_forced_alignment_kaldi = 1 #eval_forced_alignment_kaldi = 1
### ======================= add paths =======================
#sys.path.append(os.path.join(default.repo_dir, 'forced_alignment')) #sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
#from forced_alignment import convert_phone_set #from forced_alignment import convert_phone_set
#from forced_alignment import pyhtk #from forced_alignment import pyhtk
@ -78,6 +74,7 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc')
# lattice_file) # lattice_file)
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp') hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
## ======================= make test data ====================== ## ======================= make test data ======================
# copy wav files which is in the stimmen data. # copy wav files which is in the stimmen data.
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test' stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
@ -87,25 +84,15 @@ df = stimmen_functions.load_transcriptions()
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list) word_list = sorted(word_list)
#for index, row in df.iterrows():
# filename = row['filename']
# if isinstance(filename, str):
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
# if os.path.exists(wav_file):
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
# pyhtk.create_label_file(
# row['word'],
# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
# after manually removed files which does not contain clear sound, # after manually removed files which does not contain clear sound,
# update df as df_test. # update df as df_test.
#wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav')) wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
#df_test = pd.DataFrame(index=[], columns=list(df.keys())) df_test = pd.DataFrame(index=[], columns=list(df.keys()))
#for wav_file in wav_file_list: for wav_file in wav_file_list:
# filename = os.path.basename(wav_file) filename = os.path.basename(wav_file)
# df_ = df[df['filename'].str.match(filename)] df_ = df[df['filename'].str.match(filename)]
# df_test = pd.concat([df_test, df_]) df_test = pd.concat([df_test, df_])
#output = pyhtk.recognition( #output = pyhtk.recognition(
# os.path.join(default.htk_dir, 'config', 'config.rec', # os.path.join(default.htk_dir, 'config', 'config.rec',
@ -115,26 +102,6 @@ word_list = sorted(word_list)
# os.path.join(config_dir, 'phonelist.txt'), # os.path.join(config_dir, 'phonelist.txt'),
# hvite_scp) # hvite_scp)
## check phones included in stimmen but not in FAME!
splitted_ipas = [' '.join(
convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
for ipa in df['ipa']]
stimmen_phones = set(' '.join(splitted_ipas))
stimmen_phones = list(stimmen_phones)
#stimmen_phones = list(set(fame_asr.phone_reduction(list(stimmen_phones))))
#fame_phones = fame_asr.phoneset_short
fame_phones = fame_ipa.phoneset
stimmen_phones.sort()
fame_phones.sort()
print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
set(stimmen_phones) - set(fame_phones)
))
for ipa in df['ipa']:
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
if ':' in ipa_splitted:
print(ipa_splitted)
htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']] htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
ipa = 'e:χ' ipa = 'e:χ'

View File

@ -0,0 +1,61 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import shutil
import glob
#import numpy as np
import pandas as pd
import defaultfiles as default
import convert_xsampa2ipa
import stimmen_functions
import fame_functions
import convert_phoneset
from phoneset import fame_ipa, fame_asr
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
## ======================= user define =======================
# HTK script (.scp) file that receives the list of test wav files.
# This is the same path htk_vs_kaldi.py uses for its `hvite_scp`; it has to be
# defined here as well, otherwise the make_filelist call below raises NameError.
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')


## ======================= make test data ======================
# directory with the wav files which were copied from the stimmen data.
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'

# list the wav files found in stimmen_test_dir
# (presumably written to hvite_scp -- confirm against file_handling.make_filelist).
fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')

# transcriptions of the stimmen data; the 'ipa' column is used further below.
df = stimmen_functions.load_transcriptions()
#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
#word_list = sorted(word_list)

# how the test data was originally made (kept for reference):
#for index, row in df.iterrows():
#    filename = row['filename']
#    if isinstance(filename, str):
#        wav_file = os.path.join(default.stimmen_wav_dir, filename)
#        if os.path.exists(wav_file):
#            shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
#            pyhtk.create_label_file(
#                row['word'],
#                os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
## check phones included in stimmen but not in FAME!
# Split every IPA transcription into phones and join them with single spaces.
# fame_ipa.multi_character_phones is handed to split_word so that phones
# written with more than one character can be kept as one unit.
splitted_ipas = [' '.join(
    convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
    for ipa in df['ipa']]

# Distinct phones used in the stimmen transcriptions.
# The joined text must be split back into phone tokens before building the
# set: calling set() on the string itself yields single characters (including
# the ' ' separator) and breaks every multi-character phone apart again.
stimmen_phones = sorted(set(' '.join(splitted_ipas).split()))

# Work on a sorted copy so that the module-level list fame_ipa.phoneset is not
# reordered as a side effect of running this check.
fame_phones = sorted(fame_ipa.phoneset)

print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
    set(stimmen_phones) - set(fame_phones)
    ))

# Show the transcriptions in which ':' ends up as a separate element after
# splitting, i.e. it was not merged into a multi-character phone.
for ipa in df['ipa']:
    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    if ':' in ipa_splitted:
        print(ipa_splitted)