94 lines
3.0 KiB
Python
94 lines
3.0 KiB
Python
import os
|
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
|
import sys
|
|
import shutil
|
|
from collections import Counter
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
import defaultfiles as default
|
|
import convert_xsampa2ipa
|
|
import stimmen_functions
|
|
import fame_functions
|
|
import convert_phoneset
|
|
from phoneset import fame_ipa, fame_asr
|
|
sys.path.append(default.toolbox_dir)
|
|
import file_handling as fh
|
|
from htk import pyhtk
|
|
|
|
|
|
## ======================= user define =======================
|
|
|
|
|
|
|
|
## ======================= make test data ======================
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'

## transcriptions of the stimmen data.
df = stimmen_functions.load_transcriptions()

## one-off step, kept for reference: copy the wav files which are in the
## stimmen data into stimmen_test_dir.
#for index, row in df.iterrows():
#    filename = row['filename']
#    wav_file = os.path.join(default.stimmen_wav_dir, filename)
#    shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))

## files with too much noise or with multiple words were then removed from
## stimmen_test_dir by hand, so the info is loaded again for that directory.
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)

## per word, report how many files survived the manual quality check.
## null entries in the 'word' column are skipped.
word_list = sorted(w for w in set(df['word']) if not pd.isnull(w))
for word in word_list:
    df_ = df[df['word'] == word]
    df_clean_ = df_clean[df_clean['word'] == word]
    n_clean = len(df_clean_)
    n_all = len(df_)
    print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
        word, n_clean, n_all, n_clean / n_all * 100))
|
|
|
|
|
|
## check phones included in stimmen but not in FAME!
# Each IPA transcription is split with the multi-character phones declared in
# fame_ipa and stored as one space-separated string per transcription.
splitted_ipas = [' '.join(
    convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
    for ipa in df['ipa']]

# Collect the phone inventory from the space-separated tokens.
# Building the set directly from the joined string would iterate over single
# characters: multi-character phones would be broken apart and the separator
# ' ' itself would be reported as a phone.
stimmen_phones = sorted(
    {phone for joined in splitted_ipas for phone in joined.split()})

# NOTE: fame_phones is the very list object held by fame_ipa, so sort()
# reorders fame_ipa.phoneset in place.
fame_phones = fame_ipa.phoneset
fame_phones.sort()

print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
    set(stimmen_phones) - set(fame_phones)
    ))

# Show every split transcription in which ':' appears as an element of its own.
for ipa in df['ipa']:
    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    if ':' in ipa_splitted:
        print(ipa_splitted)
|
|
|
|
|
|
## check pronunciation variants
# Load the cleaned transcriptions again and pass them through add_row_asr and
# add_row_htk (the loop below reads the 'htk' column).
df_clean = stimmen_functions.add_row_htk(
    stimmen_functions.add_row_asr(
        stimmen_functions.load_transcriptions_clean(stimmen_test_dir)))

for word in word_list:
    df_ = df_clean[df_clean['word'] == word]
    c = Counter(df_['htk'])
    # keep only the variants which occur more than 3 times for this word.
    pronunciations = {
        variant: count for variant, count in c.items() if count > 3}
    print(pronunciations)
|
|
|
|
|
|
## MLF files located in the 'label' sub-directory of the HTK directory.
htk_label_dir = os.path.join(default.htk_dir, 'label')
monophone_mlf = os.path.join(htk_label_dir, 'train_phone_aligned.mlf')
triphone_mlf = os.path.join(htk_label_dir, 'train_triphone.mlf')
|
|
def filenames_in_mlf(file_mlf):
    """Return the file names listed in a master label file (MLF).

    A line is taken as a file-name entry when it contains no space and is
    not empty, not the ``#!MLF!#`` header and not a ``.`` terminator.
    Lines containing a space (e.g. ``start end label``) are skipped.
    Entries are selected by content rather than by position, so the result
    does not depend on whether the file ends with a newline or contains
    blank lines.

    Args:
        file_mlf (str): path to the MLF file.

    Returns:
        list of str: the entries in file order, with every ``"`` and
            every ``*/`` removed.
    """
    filenames = []
    with open(file_mlf) as f:
        for raw_line in f:
            line = raw_line.rstrip('\n')
            if not line or ' ' in line:
                continue
            if line in ('#!MLF!#', '.'):
                continue
            filenames.append(line.replace('"', '').replace('*/', ''))
    return filenames
|
|
# file names listed in the monophone MLF and in the triphone MLF.
filenames_mono, filenames_tri = map(
    filenames_in_mlf, (monophone_mlf, triphone_mlf))
|
|
|