acoustic_model/acoustic_model/check_novoapi.py
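
"""Check which Stimmen pronunciation variants are fully covered by the novo70 phoneset,
run forced alignment on those variants via the novo-api, and analyse the results."""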


import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import csv
from collections import Counter
import random
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import novoapi
import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import convert_phone_set
#import acoustic_model_functions as am_func
import convert_xsampa2ipa
import novoapi_functions
import stimmen_functions
sys.path.append(default.accent_classification_dir)
import output_confusion_matrix

## procedure
forced_alignment_novo70 = True


## ===== load novo phoneset =====
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_phonset()
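# Judging from the names and their use below, these are the novo phone inventory in IPA,
# the same inventory in novo70 labels, and the translation tables between the two
# notations (an inference; see novoapi_functions for the actual definitions).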


## ===== extract pronunciations written in novo70 only (not_in_novo70) =====
# Phones for which David suggested a novo70 equivalent,
# as per Nederlandse phoneset_aki.xlsx received from David:
# [ɔː] oh / ohr
# [ɪː] ih / ihr
# [iː] iy
# [œː] uh
# [ɛː] eh
# [w]  wv (in IPA written as ʋ)
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
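
# Added illustration (not used below): the suggestions above written out as an explicit
# IPA -> novo70 translation table.
david_suggestion_key = {'ɔː': 'oh / ohr', 'ɪː': 'ih / ihr', 'iː': 'iy', 'œː': 'uh', 'ɛː': 'eh', 'w': 'wv'}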

## read pronunciation variants.
stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription_, 'frequency')
transcription_ipa = list(df['IPA'])

# ';' looks like a transcription mistake for 'ː'; 'pypɪl' and empty cells are dropped.
transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa
                     if not ipa == 'pypɪl' and not pd.isnull(ipa)]
transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa]  # half-long mark: only one case.
not_in_novo70 = []
all_in_novo70 = []
for ipa in transcription_ipa:
    ipa = ipa.replace(':', 'ː')
    ipa = convert_phone_set.split_ipa(ipa)

    # list of phones not in novo70 phoneset.
    not_in_novo70_ = [phone for phone in ipa
                      if not phone in phoneset_ipa and not phone in david_suggestion]
    not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
    not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
    not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]

    if len(not_in_novo70_) == 0:
        all_in_novo70.append(''.join(ipa))
    #translation_key.get(phone, phone)
    not_in_novo70.extend(not_in_novo70_)

not_in_novo70_list = list(set(not_in_novo70))
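
# Added check (not part of the original script): tally how often each phone outside
# novo70 / david_suggestion occurs, using the Counter imported above.
not_in_novo70_counts = Counter(not_in_novo70)
#print(not_in_novo70_counts.most_common())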


## check which phones are used in stimmen but not in novo70
# 'ʀ', 'ʁ',
# 'ɒ', 'ɐ',
# 'o', 'a' (o:, a:?)
# [e] 'nyːver mɑntsjə' (1)
# [ɾ] 'ɪːɾ'(1)
# [ɹ] 'iːjəɹ' (1), 'ɪ:ɹ' (1)
# [ø] 'gʀøtəpi:r'(1), 'grøtəpi:r'(1)
# [æ] 'røːzəʀæt'(2), 'røːzəræt'(1)
# [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
# [χ] --> can be x??
def search_phone_ipa(x, phone_list):
    """ Return the original transcriptions in phone_list that contain phone x (short, not x + 'ː'). """
    x_in_item = []
    for ipa in phone_list:
        ipa_original = ipa
        ipa = ipa.replace(':', 'ː')
        ipa = convert_phone_set.split_ipa(ipa)
        if x in ipa and not x + 'ː' in ipa:  # ':' has been normalised to 'ː' above
            x_in_item.append(ipa_original)
    return x_in_item
#search_phone_ipa('ø', transcription_ipa)
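# For example, the call above would list the transcriptions containing short 'ø'
# (per the notes above, e.g. 'gʀøtəpi:r' and 'grøtəpi:r').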


## ===== load all transcriptions (df) =====
df = stimmen_functions.load_transcriptions()
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)

## check the frequency of each pronunciation variant
cols = ['word', 'ipa', 'frequency']
df_samples = pd.DataFrame(index=[], columns=cols)
for ipa in all_in_novo70:
    ipa = ipa.replace('ː', ':')
    samples = df[df['ipa'] == ipa]
    word = list(set(samples['word']))[0]
    samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns)
    df_samples = df_samples.append(samples_Series, ignore_index=True)
# per word, keep only the variants observed more than twice.
df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
for word in word_list:
    df_samples_ = df_samples[df_samples['word']==word]
    df_samples_ = df_samples_[df_samples_['frequency']>2]
    df_per_word = df_per_word.append(df_samples_, ignore_index=True)
#df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")
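
# df_per_word now holds, per word, the novo70-compatible variants observed more than twice
# (columns: 'word', 'ipa', 'frequency').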


## ===== forced alignment =====
rozen_dir = r'c:\Users\Aki\source\repos\acoustic_model\rozen-test'

if forced_alignment_novo70:
    Results = pd.DataFrame(index=[],
                           columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])

    #for word in word_list:
    for word in ['Rozen']:
        # pronunciation variants: top 3 (top 4 for Reuzenrad)
        df_per_word_ = df_per_word[df_per_word['word']==word]
        df_per_word_ = df_per_word_.sort_values('frequency', ascending=False)
        if len(df_per_word_) < 3: # pauw, rozen
            pronunciation_ipa = list(df_per_word_['ipa'])
        elif word=='Reuzenrad':
            pronunciation_ipa = [
                df_per_word_.iloc[0]['ipa'],
                df_per_word_.iloc[1]['ipa'],
                df_per_word_.iloc[2]['ipa'],
                df_per_word_.iloc[3]['ipa']]
        else:
            # oog, oor, reus, roeiboot
            pronunciation_ipa = [
                df_per_word_.iloc[0]['ipa'],
                df_per_word_.iloc[1]['ipa'],
                df_per_word_.iloc[2]['ipa']]
        #print("{0}: {1}".format(word, pronunciation_ipa))

        # samples for the word
        df_ = df[df['word']==word]

        # samples whose transcription is one of the selected variants (all written in novo70).
        samples = df_.query("ipa in @pronunciation_ipa")
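
        # Note: in DataFrame.query, '@pronunciation_ipa' refers to the local Python list
        # defined above (standard pandas behaviour).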
        results = pd.DataFrame(index=[],
                               columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])
        for i in range(0, len(samples)):
            sample = samples.iloc[i]
            filename = sample['filename']
            wav_file = os.path.join(default.stimmen_wav_dir, filename)
            if os.path.exists(wav_file):
                # for Martijn
                shutil.copy(wav_file, os.path.join(rozen_dir, filename))

                # pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]
                # result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
                # result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
                # result_ = pd.Series([
                #     sample['filename'],
                #     sample['word'],
                #     sample['xsampa'],
                #     sample['ipa'],
                #     ' '.join(result_ipa),
                #     ' '.join(result_novo70),
                #     llh
                #     ], index=results.columns)
                # results = results.append(result_, ignore_index = True)
                # print('{0}/{1}: answer {2} - prediction {3}'.format(
                #     i+1, len(samples), result_['ipa'], result_['result_ipa']))
        # #results.to_excel(os.path.join(default.stimmen_dir, 'results.xlsx'), encoding="utf-8")

        #if len(results) > 0:
        #    Results = Results.append(results, ignore_index = True)
    #Results.to_excel(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
else:
    Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
    Results = pd.read_excel(Results_xlsx, 'Sheet1')


## ===== analysis =====
#for word in word_list:
#    if not word == 'Oog':
#        Results_ = Results[Results['word'] == word]
#        y_true = list(Results_['ipa'])
#        y_pred_ = [ipa.replace(' ', '') for ipa in list(Results_['result_ipa'])]
#        y_pred = [ipa.replace('ː', ':') for ipa in y_pred_]
#        pronunciation_variants = list(set(y_true))
#
#        cm = confusion_matrix(y_true, y_pred, labels=pronunciation_variants)
#
#        plt.figure()
#        output_confusion_matrix.plot_confusion_matrix(cm, pronunciation_variants, normalize=False)
#        #plt.show()
#        plt.savefig(os.path.join(default.stimmen_result_novoapi_dir, word + '.png'))
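#
#        # Added sketch (not in the original analysis): overall accuracy per word,
#        # using the accuracy_score imported above.
#        acc = accuracy_score(y_true, y_pred)
#        print('{0}: accuracy {1:.3f} over {2} samples'.format(word, acc, len(y_true)))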