diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index ef753d5..d1feede 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj
index 11ca83b..715d6c0 100644
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -52,6 +52,7 @@
+
diff --git a/acoustic_model/htk_vs_kaldi.py b/acoustic_model/htk_vs_kaldi.py
index c1e5c97..9095b3c 100644
--- a/acoustic_model/htk_vs_kaldi.py
+++ b/acoustic_model/htk_vs_kaldi.py
@@ -59,10 +59,6 @@ from htk import pyhtk
#load_forced_alignment_kaldi = 1
#eval_forced_alignment_kaldi = 1
-
-
-
-### ======================= add paths =======================
#sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
#from forced_alignment import convert_phone_set
#from forced_alignment import pyhtk
@@ -78,6 +74,7 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc')
# lattice_file)
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
+
## ======================= make test data ======================
# copy wav files which is in the stimmen data.
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
@@ -87,25 +84,15 @@ df = stimmen_functions.load_transcriptions()
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)
-#for index, row in df.iterrows():
-# filename = row['filename']
-# if isinstance(filename, str):
-# wav_file = os.path.join(default.stimmen_wav_dir, filename)
-# if os.path.exists(wav_file):
-# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
-# pyhtk.create_label_file(
-# row['word'],
-# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
-
# after manually removed files which does not contain clear sound,
# update df as df_test.
-#wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
-#df_test = pd.DataFrame(index=[], columns=list(df.keys()))
-#for wav_file in wav_file_list:
-# filename = os.path.basename(wav_file)
-# df_ = df[df['filename'].str.match(filename)]
-# df_test = pd.concat([df_test, df_])
+wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
+df_test = pd.DataFrame(index=[], columns=list(df.keys()))  # empty frame with the columns of df
+for wav_file in wav_file_list:
+    filename = os.path.basename(wav_file)
+    df_ = df[df['filename'] == filename]  # exact comparison; str.match() would read the name as a regex prefix
+    df_test = pd.concat([df_test, df_])
#output = pyhtk.recognition(
# os.path.join(default.htk_dir, 'config', 'config.rec',
@@ -115,26 +102,6 @@ word_list = sorted(word_list)
# os.path.join(config_dir, 'phonelist.txt'),
# hvite_scp)
-
-## check phones included in stimmen but not in FAME!
-splitted_ipas = [' '.join(
- convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
- for ipa in df['ipa']]
-stimmen_phones = set(' '.join(splitted_ipas))
-stimmen_phones = list(stimmen_phones)
-#stimmen_phones = list(set(fame_asr.phone_reduction(list(stimmen_phones))))
-#fame_phones = fame_asr.phoneset_short
-fame_phones = fame_ipa.phoneset
-stimmen_phones.sort()
-fame_phones.sort()
-print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
- set(stimmen_phones) - set(fame_phones)
- ))
-for ipa in df['ipa']:
- ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
- if ':' in ipa_splitted:
- print(ipa_splitted)
-
htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
ipa = 'e:χ'
diff --git a/acoustic_model/stimmen_test.py b/acoustic_model/stimmen_test.py
new file mode 100644
index 0000000..8cbdace
--- /dev/null
+++ b/acoustic_model/stimmen_test.py
@@ -0,0 +1,61 @@
+import os
+os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+import sys
+
+import shutil
+import glob
+
+#import numpy as np
+import pandas as pd
+
+import defaultfiles as default
+import convert_xsampa2ipa
+import stimmen_functions
+import fame_functions
+import convert_phoneset
+from phoneset import fame_ipa, fame_asr
+sys.path.append(default.toolbox_dir)
+import file_handling as fh
+from htk import pyhtk
+
+
+## ======================= user define =======================
+# script file which lists the test wav files (same path as used in htk_vs_kaldi.py).
+hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
+
+## ======================= make test data ======================
+# copy wav files which are in the stimmen data.
+stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
+fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
+
+df = stimmen_functions.load_transcriptions()
+#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
+#word_list = sorted(word_list)
+# disabled: copies each existing transcribed wav file into stimmen_test_dir and creates its label file.
+#for index, row in df.iterrows():
+# filename = row['filename']
+# if isinstance(filename, str):
+# wav_file = os.path.join(default.stimmen_wav_dir, filename)
+# if os.path.exists(wav_file):
+# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
+# pyhtk.create_label_file(
+# row['word'],
+# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
+
+
+## check phones included in stimmen but not in FAME!
+# collect the elements returned by split_word (presumably whole phones - TODO confirm)
+# instead of the single characters of the joined string, which also contained ' '.
+stimmen_phones = sorted({
+    phone
+    for ipa in df['ipa']
+    for phone in convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)})
+# sorted() returns a new list, so fame_ipa.phoneset itself is not reordered.
+fame_phones = sorted(fame_ipa.phoneset)
+print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
+    set(stimmen_phones) - set(fame_phones)
+    ))
+for ipa in df['ipa']:
+    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
+    if ':' in ipa_splitted:
+        print(ipa_splitted)