check frequency of the pronunciation variants of each word.
parent 1622655542
commit 6edde06a4f
@@ -21,8 +21,11 @@ from forced_alignment import pyhtk, convert_phone_set
 import novoapi
 import novoapi_functions
 
-## ======================= novo phoneset ======================
-phoneset_ipa, phoneset_novo70, translation_key = novoapi_functions.load_phonset()
+## ===== load novo phoneset =====
+phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_phonset()
+
+
+## ===== extract pronunciations written in novo70 only (not_in_novo70) =====
 
 # As per Nederlandse phoneset_aki.xlsx recieved from David
 # [ɔː] oh / ohr
@@ -33,10 +36,7 @@ phoneset_ipa, phoneset_novo70, translation_key = novoapi_functions.load_phonset(
 # [w] wv in IPA written as ʋ.
 david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
 
-## read pronunciation variants.
-## ======================= extract words which is written only with novo70 ======================
-mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
 
 stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
 df = pd.read_excel(stimmen_transcription_, 'frequency')
 #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
@@ -68,7 +68,8 @@ for ipa in transcription_ipa:
     not_in_novo70.extend(not_in_novo70_)
 not_in_novo70_list = list(set(not_in_novo70))
 
-## check which phone is used in stimmen but not in novo70
+## check which phones used in stimmen but not in novo70
+
 # 'ʀ', 'ʁ',
 # 'ɒ', 'ɐ',
 # 'o', 'a' (o:, a:?)
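
The check in this hunk boils down to a set difference: collect every phone symbol that occurs in the stimmen transcriptions and subtract the novo70 phone set. A stand-alone sketch of that idea with toy data (not the script's actual code; in the script, transcription_ipa comes from the spreadsheet and phoneset_ipa from novoapi_functions.load_phonset()):

# toy stand-ins for the script's variables
phoneset_ipa = {'a', 'oː', 's', 'p'}
transcription_ipa = [['p', 'a', 's'], ['p', 'ɒ', 's'], ['s', 'oː', 'ʁ']]

observed = set()
for pronunciation in transcription_ipa:      # each item: one tokenized pronunciation
    observed.update(pronunciation)

not_in_novo70_list = sorted(observed - phoneset_ipa)
print(not_in_novo70_list)                    # ['ɒ', 'ʁ'] -> phones with no novo70 counterpart
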
@@ -92,10 +93,12 @@ def search_phone_ipa(x, phone_list):
 #search_phone_ipa('ø', transcription_ipa)
 
-
+## ===== load all transcriptions (df) =====
 
 df = pd.read_excel(stimmen_transcription_, 'original')
 
+# mapping from ipa to xsampa
+mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
 
 ipas = []
 famehtks = []
 for xsampa in df['Self Xsampa']:
@@ -117,15 +120,11 @@ df = pd.DataFrame({'filename': df['Filename'],
 'xsampa': df['Self Xsampa'],
 'ipa': pd.Series(ipas)})
 
-# find options which all phones are in novo70.
-#word_list = list(set(df['word']))
-#word_list = [word for word in word_list if not pd.isnull(word)]
-#word = word_list[1]
+word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
+word_list = sorted(word_list)
 
-## pronunciation variants of 'word'
-#df_ = df[df['word'] == word]['xsampa']
-##pronunciation_variant = list(set(df_))
 
+## check frequency of each pronunciation variants
 cols = ['word', 'ipa', 'frequency']
 df_samples = pd.DataFrame(index=[], columns=cols)
 for ipa in all_in_novo70:
@@ -134,3 +133,12 @@ for ipa in all_in_novo70:
     word = list(set(samples['word']))[0]
     samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns)
     df_samples = df_samples.append(samples_Series, ignore_index=True)
+
+# each word
+df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
+
+for word in word_list:
+    df_samples_ = df_samples[df_samples['word']==word]
+    df_samples_ = df_samples_[df_samples_['frequency']>1]
+    df_per_word = df_per_word.append(df_samples_, ignore_index=True)
+df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")
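
The block above fills df_samples and df_per_word row by row with DataFrame.append, which is deprecated in newer pandas (and removed in 2.0). The same per-word frequency table can be built with a groupby; the sketch below uses toy data and assumes, as in this script, that df has 'word' and 'ipa' columns and that all_in_novo70 lists the pronunciations whose phones are all covered by novo70.

import pandas as pd

# toy stand-ins for the script's df (one row per recorded response) and all_in_novo70
df = pd.DataFrame({'word': ['w1', 'w1', 'w1', 'w2'],
                   'ipa':  ['pau', 'pau', 'paːu', 'røːs']})
all_in_novo70 = ['pau', 'paːu', 'røːs']

df_samples = (df[df['ipa'].isin(all_in_novo70)]
              .groupby(['word', 'ipa'])
              .size()
              .reset_index(name='frequency'))

# keep only the variants observed more than once, as the loop above does
df_per_word = df_samples[df_samples['frequency'] > 1].reset_index(drop=True)
print(df_per_word)                           # -> w1  pau  2
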
@@ -39,7 +39,7 @@ def load_phonset():
     phoneset_ipa = np.unique(phoneset_ipa)
     phoneset_novo70 = np.unique(phoneset_novo70)
 
-    return
+    return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa
 
 
 def multi_character_tokenize(line, multi_character_tokens):
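
multi_character_tokenize appears above only as unchanged context. For readers of the diff, a generic greedy tokenizer of this shape (a sketch, not the repository's implementation) splits a string while keeping multi-character phone symbols such as 'oː' intact:

def multi_character_tokenize_sketch(line, multi_character_tokens):
    # Greedy left-to-right split that prefers the longest matching multi-character symbol.
    tokens = []
    ordered = sorted(multi_character_tokens, key=len, reverse=True)
    i = 0
    while i < len(line):
        for symbol in ordered:
            if line.startswith(symbol, i):   # multi-character symbol found at position i
                tokens.append(symbol)
                i += len(symbol)
                break
        else:                                # no multi-character match: take one character
            tokens.append(line[i])
            i += 1
    return tokens

print(multi_character_tokenize_sketch('roːzə', ['oː']))   # ['r', 'oː', 'z', 'ə']
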