commit 8f89f60538 (parent f6e563ecd3)

    dataset is made.

Binary file not shown.

@@ -343,6 +343,16 @@ def word2htk(word):
     return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
 
 
+def ipa2asr(ipa):
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
+
+    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
+    ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
+    asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
+    return ''.join(asr_splitted)
+
+
 def ipa2htk(ipa):
     curr_dir = os.path.dirname(os.path.abspath(__file__))
     translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)

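The new ipa2asr mirrors the existing ipa2htk but targets the FAME ASR phone set. A minimal sketch of how it might be called, assuming the repo modules are importable and the regenerated fame_ipa2asr.npy (the binary file above) covers the input; the example IPA string itself is hypothetical:

```python
# Hedged sketch: exercises the ipa2asr added in this commit.
# Assumes phoneset/fame_ipa2asr.npy exists and maps every phone in the input.
import fame_functions

ipa = 'o:r'  # hypothetical IPA transcription (length marks use ':' as in the repo)
print(fame_functions.ipa2asr(ipa))  # joined FAME ASR phone string
```
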
@@ -9,7 +9,7 @@ import sys
 import shutil
 import glob
 
-#import numpy as np
+import numpy as np
 import pandas as pd
 #import matplotlib.pyplot as plt
 #from sklearn.metrics import confusion_matrix

@@ -75,24 +75,22 @@ lattice_file = os.path.join(config_dir, 'stimmen.ltc')
 hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
 
 
-## ======================= make test data ======================
-# copy wav files which are in the stimmen data.
+## ======================= load test data ======================
 stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
-fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
 
-df = stimmen_functions.load_transcriptions()
+df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
+df = stimmen_functions.add_row_asr(df)
+df = stimmen_functions.add_row_htk(df)
 
 word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
 word_list = sorted(word_list)
 
+# pronunciation variants
+for word in word_list:
+    df_ = df[df['word']==word]
+    print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))))
 
-# after manually removed files which do not contain clear sound,
-# update df as df_test.
-wav_file_list = glob.glob(os.path.join(stimmen_test_dir, '*.wav'))
-df_test = pd.DataFrame(index=[], columns=list(df.keys()))
-for wav_file in wav_file_list:
-    filename = os.path.basename(wav_file)
-    df_ = df[df['filename'].str.match(filename)]
-    df_test = pd.concat([df_test, df_])
+#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
 
 #output = pyhtk.recognition(
 #    os.path.join(default.htk_dir, 'config', 'config.rec',

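The rewritten block replaces the ad-hoc df_test assembly with three helpers added later in this commit. A sketch of the resulting flow, under the assumption that the cleaned wav directory and the transcription spreadsheet line up; column names follow the diff:

```python
# Sketch of the new loading pipeline introduced by this commit.
import stimmen_functions

df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df = stimmen_functions.add_row_asr(df)   # adds an 'asr' column derived from 'ipa'
df = stimmen_functions.add_row_htk(df)   # adds an 'htk' column derived from 'ipa'
print(df[['filename', 'word', 'ipa', 'asr', 'htk']].head())
```
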
@@ -102,58 +100,21 @@ for wav_file in wav_file_list:
 #    os.path.join(config_dir, 'phonelist.txt'),
 #    hvite_scp)
 
-htk = [fame_functions.ipa2htk(ipa) for ipa in df['ipa']]
+#pyhtk.create_label_file(
+#    row['word'],
+#    os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
 
-ipa = 'e:χ'
-fame_functions.ipa2htk(ipa)
-
-# Filename, Word, Self Xsampa
-df = pd.read_excel(xls, 'original')
-ipas = []
-famehtks = []
-for xsampa in df['Self Xsampa']:
-    if not isinstance(xsampa, float):  # 'NaN'
-        # typo?
-        xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
-        xsampa = xsampa.replace(';', ':')
-
-        ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
-        ipa = ipa.replace('ː', ':')
-        ipa = ipa.replace(' ', '')
-        ipas.append(ipa)
-        famehtk = convert_phone_set.ipa2famehtk(ipa)
-        famehtks.append(famehtk)
-    else:
-        ipas.append('')
-        famehtks.append('')
-
-# extract interesting cols.
-df = pd.DataFrame({'filename': df['Filename'],
-                   'word': df['Word'],
-                   'xsampa': df['Self Xsampa'],
-                   'ipa': pd.Series(ipas),
-                   'famehtk': pd.Series(famehtks)})
-# cleansing.
-df = df[~df['famehtk'].isin(['/', ''])]
-
-word_list = np.unique(df['word'])
-
-
-## ======================= make dict files used for HTK. ======================
-if make_htk_dict_files:
-    output_type = 3
-
-    for word in word_list:
-        htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
-
-        # pronunciation variant of the target word.
-        pronvar_ = df['famehtk'][df['word'].str.match(word)]
-
-        # make dic file.
-        am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
+## ======================= make a HTK dic file ======================
+#if make_htk_dic_file:
+#    output_type = 3
+dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic')
+#for word in word_list:
+word = word_list[2]
+
+# pronunciation variant of the target word.
+pronunciations = df_test['asr'][df_test['word'].str.match(word)]
+
+# make dic file.
+#am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
 
 
 ## ======================= forced alignment using HTK =======================

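The dictionary write itself stays commented out (#am_func.make_htk_dict). If one wanted to materialize stimmen.dic from the collected pronunciations, a hedged sketch could look like the following; write_htk_dic is a hypothetical helper, not part of the repo, and it assumes each pronunciation is already a space-separated HTK phone string:

```python
# Hypothetical stand-in for the commented-out am_func.make_htk_dict call.
def write_htk_dic(word, pronunciations, dic_path):
    """Write one HTK dictionary entry per distinct pronunciation variant."""
    with open(dic_path, 'w', encoding='utf-8') as f:
        for pron in sorted(set(pronunciations)):
            f.write('{0}\t{1}\n'.format(word.upper(), pron))

write_htk_dic(word, pronunciations, dictionary_txt)
```
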
@@ -1,13 +1,15 @@
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
+import glob
 
 import pandas as pd
 
 import convert_xsampa2ipa
 import defaultfiles as default
+import fame_functions
 
 
-def load_transcriptions():
+def _load_transcriptions():
     stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
     df = pd.read_excel(stimmen_transcription, 'original')

@@ -34,5 +36,48 @@ def load_transcriptions():
                        'word': df['Word'],
                        'xsampa': df['Self Xsampa'],
                        'ipa': pd.Series(ipas)})
-    df_ = df_[~df_['ipa'].str.contains('/')]
+    # not valid inputs, but separator.
+    df_ = df_[~df_['ipa'].str.contains('/')]
     return df_.dropna()
+
+
+def load_transcriptions():
+    """ in default.stimmen_transcription_xlsx,
+    rows for which wav files can easily be found """
+    df = _load_transcriptions()
+    df_ = pd.DataFrame(index=[], columns=list(df.keys()))
+    for index, row in df.iterrows():
+        filename = row['filename']
+        if isinstance(filename, str):
+            wav_file = os.path.join(default.stimmen_wav_dir, filename)
+            if os.path.exists(wav_file):
+                df_ = df_.append(row, ignore_index=True)
+    return df_
+
+
+def load_transcriptions_clean(clean_wav_dir):
+    df = _load_transcriptions()
+    wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))
+    df_clean = pd.DataFrame(index=[], columns=list(df.keys()))
+    for wav_file in wav_file_list:
+        filename = os.path.basename(wav_file)
+        df_ = df[df['filename'].str.match(filename)]
+        df_clean = pd.concat([df_clean, df_])
+    return df_clean
+
+
+def add_row_htk(df):
+    """ df['htk'] is made from df['ipa'] and added. """
+    htk = []
+    for index, row in df.iterrows():
+        htk.append(fame_functions.ipa2htk(row['ipa']))
+    return df.assign(htk=htk)
+
+
+def add_row_asr(df):
+    """ df['asr'] is made from df['ipa'] and added. """
+    asr = []
+    for index, row in df.iterrows():
+        asr.append(fame_functions.ipa2asr(row['ipa']))
+    return df.assign(asr=asr)

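add_row_htk and add_row_asr walk the frame with iterrows(); a more idiomatic pandas form is a single assign over the column. A behavior-equivalent sketch, assuming ipa2htk/ipa2asr are pure functions and df['ipa'] holds no NaNs (which _load_transcriptions' dropna() should guarantee):

```python
import fame_functions

# Equivalent sketch using Series.apply instead of row-by-row iteration.
df = df.assign(htk=df['ipa'].apply(fame_functions.ipa2htk),
               asr=df['ipa'].apply(fame_functions.ipa2asr))
```
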
@@ -1,9 +1,7 @@
 import os
 os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
 import sys
 
 import shutil
-import glob
-
 #import numpy as np
 import pandas as pd

@@ -24,23 +22,27 @@ from htk import pyhtk
 
 
 ## ======================= make test data ======================
-# copy wav files which are in the stimmen data.
 stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
-fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
 
+## copy wav files which are in the stimmen data.
 df = stimmen_functions.load_transcriptions()
-#word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
-#word_list = sorted(word_list)
 
 #for index, row in df.iterrows():
 #    filename = row['filename']
-#    if isinstance(filename, str):
-#        wav_file = os.path.join(default.stimmen_wav_dir, filename)
-#        if os.path.exists(wav_file):
-#            shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
-#            pyhtk.create_label_file(
-#                row['word'],
-#                os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab')))
+#    wav_file = os.path.join(default.stimmen_wav_dir, filename)
+#    shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
+
+# after manually removing files which have too much noise or multiple words,
+# update the info.
+df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
+
+# count how many files were removed due to quality.
+word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
+word_list = sorted(word_list)
+for word in word_list:
+    df_ = df[df['word']==word]
+    df_clean_ = df_clean[df_clean['word']==word]
+    print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
+        word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100))
 
 
 ## check phones included in stimmen but not in FAME!

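Beyond the per-word counts printed above, the overall attrition from the manual cleaning step can be read off the same two frames. A one-line sketch, relying on df and df_clean as defined in the new code:

```python
# Overall attrition: how many transcribed files survived manual cleaning.
n_all, n_clean = len(df), len(df_clean)
print('{0} of {1} files kept ({2:.2f} [%]).'.format(n_clean, n_all, n_clean / n_all * 100))
```
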
@@ -59,3 +61,4 @@ for ipa in df['ipa']:
     ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
     if ':' in ipa_splitted:
         print(ipa_splitted)
+

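The loop above only flags splits that still contain a bare ':' length mark. To list every stimmen phone missing from the FAME inventory, a sketch like the following could work; note that fame_ipa.phoneset is an assumed attribute name for the FAME IPA phone container, not confirmed by this diff:

```python
# Sketch: collect stimmen phones absent from the FAME phone inventory.
# 'fame_ipa.phoneset' is an assumption; substitute the actual container name.
import convert_phoneset
import fame_ipa

missing = set()
for ipa in df['ipa']:
    for phone in convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones):
        if phone not in fame_ipa.phoneset:
            missing.add(phone)
print(sorted(missing))
```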