diff --git a/acoustic_model/check_novoapi.py b/acoustic_model/check_novoapi.py index 3fd4601..178706b 100644 --- a/acoustic_model/check_novoapi.py +++ b/acoustic_model/check_novoapi.py @@ -21,8 +21,11 @@ from forced_alignment import pyhtk, convert_phone_set import novoapi import novoapi_functions -## ======================= novo phoneset ====================== -phoneset_ipa, phoneset_novo70, translation_key = novoapi_functions.load_phonset() +## ===== load novo phoneset ===== +phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_phonset() + + +## ===== extract pronunciations written in novo70 only (not_in_novo70) ===== # As per Nederlandse phoneset_aki.xlsx recieved from David # [ɔː] oh / ohr @@ -33,10 +36,7 @@ phoneset_ipa, phoneset_novo70, translation_key = novoapi_functions.load_phonset( # [w] wv in IPA written as ʋ. david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w'] - -## ======================= extract words which is written only with novo70 ====================== -mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir) - +## read pronunciation variants. stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx) df = pd.read_excel(stimmen_transcription_, 'frequency') #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']): @@ -68,7 +68,8 @@ for ipa in transcription_ipa: not_in_novo70.extend(not_in_novo70_) not_in_novo70_list = list(set(not_in_novo70)) -## check which phone is used in stimmen but not in novo70 + +## check which phones used in stimmen but not in novo70 # 'ʀ', 'ʁ', # 'ɒ', 'ɐ', # 'o', 'a' (o:, a:?) 
@@ -92,10 +93,12 @@ def search_phone_ipa(x, phone_list): #search_phone_ipa('ø', transcription_ipa) - - +## ===== load all transcriptions (df) ===== df = pd.read_excel(stimmen_transcription_, 'original') +# mapping from xsampa to ipa +mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir) + ipas = [] famehtks = [] for xsampa in df['Self Xsampa']: @@ -117,15 +120,11 @@ df = pd.DataFrame({'filename': df['Filename'], 'xsampa': df['Self Xsampa'], 'ipa': pd.Series(ipas)}) -# find options which all phones are in novo70. -#word_list = list(set(df['word'])) -#word_list = [word for word in word_list if not pd.isnull(word)] -#word = word_list[1] +word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] +word_list = sorted(word_list) -## pronunciation variants of 'word' -#df_ = df[df['word'] == word]['xsampa'] -##pronunciation_variant = list(set(df_)) +## check the frequency of each pronunciation variant cols = ['word', 'ipa', 'frequency'] df_samples = pd.DataFrame(index=[], columns=cols) for ipa in all_in_novo70: @@ -133,4 +132,13 @@ for ipa in all_in_novo70: samples = df[df['ipa'] == ipa] word = list(set(samples['word']))[0] samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns) - df_samples = df_samples.append(samples_Series, ignore_index=True) \ No newline at end of file + df_samples = df_samples.append(samples_Series, ignore_index=True) + +# per word: keep only the variants that occur more than once +df_per_word = pd.DataFrame(index=[], columns=df_samples.keys()) + +for word in word_list: + df_samples_ = df_samples[df_samples['word']==word] + df_samples_ = df_samples_[df_samples_['frequency']>1] + df_per_word = df_per_word.append(df_samples_, ignore_index=True) +df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8") \ No newline at end of file diff --git a/acoustic_model/novoapi_functions.py b/acoustic_model/novoapi_functions.py index 0ab6aa8..bbf44ff 100644 --- 
a/acoustic_model/novoapi_functions.py +++ b/acoustic_model/novoapi_functions.py @@ -39,7 +39,7 @@ def load_phonset(): phoneset_ipa = np.unique(phoneset_ipa) phoneset_novo70 = np.unique(phoneset_novo70) - return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa + return def multi_character_tokenize(line, multi_character_tokens):