|
|
|
@ -139,6 +139,71 @@ df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
@@ -139,6 +139,71 @@ df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
|
|
|
|
|
|
|
|
|
|
# Collect, for every word in `word_list`, the rows of `df_samples` whose
# frequency is greater than 2, and stack them onto `df_per_word`.
# (The original filtered on `> 1` and then immediately on `> 2`; the second
# filter subsumes the first, so a single mask is used here.)
frames = [df_per_word]
for word in word_list:
    df_samples_ = df_samples[
        (df_samples['word'] == word) & (df_samples['frequency'] > 2)
    ]
    frames.append(df_samples_)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; one concat yields the same rows in the same order.
df_per_word = pd.concat(frames, ignore_index=True)

# Export the selected variants. The `encoding` keyword of to_excel was removed
# in pandas 2.0, so it is no longer passed.
df_per_word.to_excel(
    os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## ===== forced alignment =====
# When `forced_alignment` is set, run the forced aligner on every available
# recording of every word and write the outcome to Results.xlsx; otherwise
# load the previously written Results.xlsx into `R`.
if forced_alignment:
    result_columns = ['filename', 'ipa', 'word', 'result_ipa', 'result_novo70', 'llh']
    result_rows = []

    for word in word_list:
        # Pronunciation variants of this word, most frequent first.
        df_per_word_ = df_per_word[df_per_word['word'] == word]
        df_per_word_ = df_per_word_.sort_values('frequency', ascending=False)

        # Top 3 variants (top 4 for 'Reuzenrad'). Slicing keeps whatever is
        # available when a word has fewer variants (e.g. pauw, rozen) instead
        # of raising IndexError as positional indexing would.
        n_variants = 4 if word == 'Reuzenrad' else 3
        pronunciation_ipa = list(df_per_word_['ipa'].iloc[:n_variants])

        # The aligner is given the variants with ':' replaced by 'ː'.
        # This does not depend on the sample, so it is computed once per word.
        pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]

        # Samples of this word whose annotated IPA is one of the selected variants.
        df_ = df[df['word'] == word]
        samples = df_.query("ipa in @pronunciation_ipa")

        for i, (_, sample) in enumerate(samples.iterrows(), start=1):
            wav_file = os.path.join(default.stimmen_wav_dir, sample['filename'])
            if not os.path.exists(wav_file):
                # No recording on disk for this sample: skip it.
                continue

            result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
            result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
            predicted_ipa = ' '.join(result_ipa)

            # Rows are gathered in a plain list; DataFrame.append was removed
            # in pandas 2.0 and re-copied the frame on every call.
            result_rows.append([
                sample['filename'],
                sample['ipa'],
                sample['word'],
                predicted_ipa,
                ' '.join(result_novo70),
                llh,
            ])
            print('{0}/{1}: answer {2} - prediction {3}'.format(
                i, len(samples), sample['ipa'], predicted_ipa))

    Results = pd.DataFrame(result_rows, columns=result_columns)
    # The `encoding` keyword of to_excel was removed in pandas 2.0.
    Results.to_excel(os.path.join(default.stimmen_dir, 'Results.xlsx'))
else:
    # pd.ExcelFile does not take an `encoding` argument in current pandas.
    Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_dir, 'Results.xlsx'))
    R = pd.read_excel(Results_xlsx, 'Sheet1')
|
|
|
|
|
|
|
|
|