forced alignment by novoapi is performed.
This commit is contained in:
parent
6edde06a4f
commit
05e8a671c1
Binary file not shown.
@ -139,6 +139,71 @@ df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
|
|||||||
|
|
||||||
for word in word_list:
|
for word in word_list:
|
||||||
df_samples_ = df_samples[df_samples['word']==word]
|
df_samples_ = df_samples[df_samples['word']==word]
|
||||||
df_samples_ = df_samples_[df_samples_['frequency']>1]
|
df_samples_ = df_samples_[df_samples_['frequency']>2]
|
||||||
df_per_word = df_per_word.append(df_samples_, ignore_index=True)
|
df_per_word = df_per_word.append(df_samples_, ignore_index=True)
|
||||||
df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")
|
#df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
## ===== forced alignment =====
|
||||||
|
if forced_alignment:
|
||||||
|
Results = pd.DataFrame(index=[],
|
||||||
|
columns=['filename', 'ipa', 'word', 'result_ipa', 'result_novo70', 'llh'])
|
||||||
|
for word in word_list:
|
||||||
|
#for word in ['Oor']:
|
||||||
|
# pronunciation variants top 3
|
||||||
|
df_per_word_ = df_per_word[df_per_word['word']==word]
|
||||||
|
df_per_word_ = df_per_word_.sort_values('frequency', ascending=False)
|
||||||
|
if len(df_per_word_) < 3: # pauw, rozen
|
||||||
|
pronunciation_ipa = list(df_per_word_['ipa'])
|
||||||
|
elif word=='Reuzenrad':
|
||||||
|
pronunciation_ipa = [
|
||||||
|
df_per_word_.iloc[0]['ipa'],
|
||||||
|
df_per_word_.iloc[1]['ipa'],
|
||||||
|
df_per_word_.iloc[2]['ipa'],
|
||||||
|
df_per_word_.iloc[3]['ipa']]
|
||||||
|
else:
|
||||||
|
# oog, oor, reus, roeiboot
|
||||||
|
pronunciation_ipa = [
|
||||||
|
df_per_word_.iloc[0]['ipa'],
|
||||||
|
df_per_word_.iloc[1]['ipa'],
|
||||||
|
df_per_word_.iloc[2]['ipa']]
|
||||||
|
#print("{0}: {1}".format(word, pronunciation_ipa))
|
||||||
|
|
||||||
|
# samples for the word
|
||||||
|
df_ = df[df['word']==word]
|
||||||
|
|
||||||
|
# samples in which all pronunciations are written in novo70.
|
||||||
|
samples = df_.query("ipa in @pronunciation_ipa")
|
||||||
|
|
||||||
|
results = pd.DataFrame(index=[],
|
||||||
|
columns=['filename', 'ipa', 'word', 'result_ipa', 'result_novo70', 'llh'])
|
||||||
|
|
||||||
|
#j = 0
|
||||||
|
for i in range(0, len(samples)):
|
||||||
|
sample = samples.iloc[i]
|
||||||
|
wav_file = os.path.join(default.stimmen_wav_dir, sample['filename'])
|
||||||
|
if os.path.exists(wav_file):
|
||||||
|
#j += 1
|
||||||
|
#print('{0} - {1}'.format(word, i))
|
||||||
|
pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]
|
||||||
|
result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
|
||||||
|
result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
|
||||||
|
result_ = pd.Series([
|
||||||
|
sample['filename'],
|
||||||
|
sample['ipa'],
|
||||||
|
sample['word'],
|
||||||
|
' '.join(result_ipa),
|
||||||
|
' '.join(result_novo70),
|
||||||
|
llh
|
||||||
|
], index=results.columns)
|
||||||
|
results = results.append(result_, ignore_index = True)
|
||||||
|
print('{0}/{1}: answer {2} - prediction {3}'.format(
|
||||||
|
i+1, len(samples), result_['ipa'], result_['result_ipa']))
|
||||||
|
|
||||||
|
if len(results) > 0:
|
||||||
|
Results = Results.append(results, ignore_index = True)
|
||||||
|
Results.to_excel(os.path.join(default.stimmen_dir, 'Results.xlsx'), encoding="utf-8")
|
||||||
|
else:
|
||||||
|
Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_dir, 'Results.xlsx'), encoding="utf-8")
|
||||||
|
R = pd.read_excel(Results_xlsx, 'Sheet1')
|
||||||
|
|
||||||
|
@ -36,10 +36,29 @@ def load_phonset():
|
|||||||
phoneset_novo70.append(novo70)
|
phoneset_novo70.append(novo70)
|
||||||
translation_key_ipa2novo70[ipa] = novo70
|
translation_key_ipa2novo70[ipa] = novo70
|
||||||
translation_key_novo702ipa[novo70] = ipa
|
translation_key_novo702ipa[novo70] = ipa
|
||||||
|
|
||||||
|
# As per Nederlandse phoneset_aki.xlsx received from David
|
||||||
|
# [ɔː] oh / ohr # from ipa->novo70, only oh is used.
|
||||||
|
# [ɪː] ih / ihr # from ipa->novo70, only ih is used.
|
||||||
|
# [iː] iy
|
||||||
|
# [œː] uh
|
||||||
|
# [ɛː] eh
|
||||||
|
# [w] wv in IPA written as ʋ.
|
||||||
|
extra_ipa = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'ʋ']
|
||||||
|
extra_novo70 = ['oh', 'ih', 'iy', 'uh', 'eh', 'wv']
|
||||||
|
for ipa, novo70 in zip(extra_ipa, extra_novo70):
|
||||||
|
phoneset_ipa.append(ipa)
|
||||||
|
phoneset_novo70.append(novo70)
|
||||||
|
translation_key_ipa2novo70[ipa] = novo70
|
||||||
|
translation_key_novo702ipa[novo70] = ipa
|
||||||
|
|
||||||
|
translation_key_novo702ipa['ohr'] = 'ɔː'
|
||||||
|
translation_key_novo702ipa['ihr'] = 'ɪː'
|
||||||
|
|
||||||
phoneset_ipa = np.unique(phoneset_ipa)
|
phoneset_ipa = np.unique(phoneset_ipa)
|
||||||
phoneset_novo70 = np.unique(phoneset_novo70)
|
phoneset_novo70 = np.unique(phoneset_novo70)
|
||||||
|
|
||||||
return
|
return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa
|
||||||
|
|
||||||
|
|
||||||
def multi_character_tokenize(line, multi_character_tokens):
|
def multi_character_tokenize(line, multi_character_tokens):
|
||||||
|
Loading…
Reference in New Issue
Block a user