@ -29,48 +29,47 @@ forced_alignment_novo70 = True
## ===== load novo phoneset =====
# The two statements below assign the same four names from two differently
# named loaders on novoapi_functions.
# NOTE(review): they look like the old and the new form of a single call;
# confirm which one should remain. In the second one the attribute name
# `load_novo70_ phone set` is split by spaces and is not valid as written.
phoneset_ipa , phoneset_novo70 , translation_key_ipa2novo70 , translation_key_novo702ipa = novoapi_functions . load_phonset ( )
phoneset_ipa , phoneset_novo70 , translation_key_ipa2novo70 , translation_key_novo702ipa = novoapi_functions . load_novo70_ phone set ( )
## ===== extract pronunciations written in novo70 only (not_in_novo70) =====
# As per Nederlandse phoneset_aki.xlsx received from David
# [ɔː] oh / ohr
# [ɪː] ih / ihr
# [iː] iy
# [œː] uh
# [ɛː] eh
# [w] wv in IPA written as ʋ.
# Extra phones that the membership test further down accepts in addition to
# phoneset_ipa.
# NOTE(review): every literal here is padded with spaces, which looks like a
# formatting artifact rather than intended content -- confirm.
david_suggestion = [ ' ɔː ' , ' ɪ ː ' , ' iː ' , ' œː ' , ' ɛː ' , ' w ' ]
## read pronunciation variants.
# Open the workbook at default.stimmen_transcription_xlsx, read one sheet from
# it (the second positional argument of read_excel selects the sheet) and keep
# one column of that sheet as a plain list.
stimmen_transcription_ = pd . ExcelFile( default. stimmen_transcription_xlsx )
df = pd . read_excel( stimmen_transcription_, ' frequency' )
transcription_ipa = list ( df [ ' IPA ' ] )
# The next three lines are commented-out copies of the three statements above.
# stimmen_transcription_ = pd. ExcelFile( default. stimmen_transcription_xlsx)
#df = pd. read_excel( stimmen_transcription_, ' frequency' )
# transcription_ipa = list(df['IPA'] )
# transcription mistake?
# Drop null entries and the single entry equal to the literal compared below,
# and replace the ';' literal with the length-mark literal in the others.
# NOTE(review): the sheet name, column key and replacement literals carry
# padding spaces that look like a formatting artifact -- confirm against the
# workbook before relying on them.
transcription_ipa = [ ipa . replace ( ' ; ' , ' ː ' ) for ipa in transcription_ipa if not ipa == ' pypɪ l ' and not pd . isnull ( ipa ) ]
transcription_ipa = [ ipa . replace ( ' ˑ ' , ' ' ) for ipa in transcription_ipa ] # only one case.
# not_in_novo70 accumulates the phones rejected by the filter below;
# all_in_novo70 accumulates the transcriptions for which nothing was rejected.
not_in_novo70 = [ ]
all_in_novo70 = [ ]
# NOTE(review): the body of this loop has lost its indentation in this copy,
# so which of the following statements belong to the loop must be confirmed.
for ipa in transcription_ipa :
# Rewrite the ':' literal to the length-mark literal, then split the
# transcription with convert_phone_set.split_ipa.
ipa = ipa . replace ( ' : ' , ' ː ' )
ipa = convert_phone_set . split_ipa ( ipa )
# NOTE(review): the next two statements do not use the loop variable and
# rebind `df`; they look like they belong to a different revision of the
# script than the surrounding loop -- confirm where they should live.
stimmen_test_dir = r ' c: \ OneDrive \ Research \ rug \ _data \ stimmen_test '
df = stimmen_functions . load_transcriptions_novo70 ( stimmen_test_dir )
# list of phones not in novo70 phoneset.
# A phone is kept here when it is in neither phoneset_ipa nor david_suggestion.
not_in_novo70_ = [ phone for phone in ipa
if not phone in phoneset_ipa and not phone in david_suggestion ]
# The three replacements only edit the strings; the list keeps its length, so
# the emptiness test below reflects the filter above, not these substitutions.
not_in_novo70_ = [ phone . replace ( ' sp ' , ' ' ) for phone in not_in_novo70_ ]
not_in_novo70_ = [ phone . replace ( ' : ' , ' ' ) for phone in not_in_novo70_ ]
not_in_novo70_ = [ phone . replace ( ' ː ' , ' ' ) for phone in not_in_novo70_ ]
# Nothing rejected: record the transcription, re-joined from its phones.
if len ( not_in_novo70_ ) == 0 :
all_in_novo70 . append ( ' ' . join ( ipa ) )
## transcription mistake?
#transcription_ipa = [ipa.replace(';', 'ː ') for ipa in transcription_ipa if not ipa=='pypɪ l' and not pd.isnull(ipa)]
#transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
#translation_key.get(phone, phone)
# NOTE(review): the argument below reads `not _in_novo70_` (split by a space);
# presumably the name `not_in_novo70_` is meant -- confirm.
not_in_novo70 . extend ( not _in_novo70_ )
# De-duplicate the collected phones.
not_in_novo70_list = list ( set ( not_in_novo70 ) )
#not_in_novo70 = []
#all _in_novo70 = []
#for ipa in transcription_ipa:
# ipa = ipa.replace(':', 'ː ')
# ipa = convert_phone_set.split_ipa(ipa)
# # list of phones not in novo70 phoneset.
# not_in_novo70_ = [phone for phone in ipa
# if not phone in phoneset_ipa and not phone in david_suggestion]
# not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
# not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
# not_in_novo70_ = [phone.replace('ː ', '') for phone in not_in_novo70_]
# if len(not_in_novo70_) == 0:
# all_in_novo70.append(''.join(ipa))
# #translation_key.get(phone, phone)
# not_in_novo70.extend(not_in_novo70_)
#not_in_novo70_list = list(set(not_in_novo70))
## check which phones used in stimmen but not in novo70
@ -85,41 +84,43 @@ not_in_novo70_list = list(set(not_in_novo70))
# [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
# [χ] --> can be x??
def search_phone_ipa(x, phone_list):
    """Return the entries of phone_list whose split transcription contains x.

    Each entry has ':' rewritten to the IPA length mark 'ː' and is then split
    with convert_phone_set.split_ipa. An entry is kept when x is found in the
    split result and x + ':' is not.

    Args:
        x: the phone to look for.
        phone_list: iterable of IPA transcription strings.

    Returns:
        A list with the matching entries, unmodified and in input order.
    """
    x_in_item = []
    for ipa_original in phone_list:
        phones = convert_phone_set.split_ipa(ipa_original.replace(':', 'ː'))
        # NOTE(review): every ':' has just been rewritten to 'ː', so unless
        # split_ipa reintroduces ':' the second test can never fail. The
        # intended check may be x + 'ː' -- confirm before changing it.
        if x in phones and x + ':' not in phones:
            x_in_item.append(ipa_original)
    return x_in_item
# def search_phone_ipa(x, phone_list) :
# x_in_item = [ ]
# for ipa in phone_list:
# ipa_original = ipa
# ipa = ipa.replace(':', 'ː ' )
# ipa = convert_phone_set. split_ipa( ipa)
# if x in ipa and not x+':' in ipa:
# x_in_item. append( ipa_original)
# return x_in_item
#search_phone_ipa('ø', transcription_ipa)
## ===== load all transcriptions (df) =====
df = stimmen_functions.load_transcriptions()
#df = stimmen_functions. load_transcriptions( )
# Unique, non-null values of the 'word' column, in sorted order.
# The column key is written without padding spaces, matching the 'word' key
# used by the commented-out copies of this script's code.
word_list = sorted(w for w in set(df['word']) if not pd.isnull(w))
## check frequency of each pronunciation variant
# One row per entry of all_in_novo70: the word it belongs to, the variant
# itself and the number of rows of df that carry exactly this variant.
cols = ['word', 'ipa', 'frequency']
rows = []
for ipa in all_in_novo70:
    # Turn the length mark back into ':' before matching against df['ipa'].
    ipa = ipa.replace('ː', ':')
    samples = df[df['ipa'] == ipa]
    # NOTE(review): takes an arbitrary element of the set of words and raises
    # IndexError when no row of df matches this variant.
    word = list(set(samples['word']))[0]
    rows.append([word, ipa, len(samples)])
# DataFrame.append no longer exists in pandas 2.x, so the rows are collected
# first and the frame is built in one go; the index is 0..n-1 as before.
df_samples = pd.DataFrame(rows, columns=cols)
# cols = ['word', 'ipa', ' frequency' ]
# df_samples = pd.DataFrame(index=[], columns= cols)
# for ipa in all_in_novo70:
# ipa = ipa.replace('ː ', ':' )
# samples = df[df['ipa'] == ipa]
# word = list(set(samples['word']))[0 ]
# samples_Series = pd.Series([word, ipa, len(samples)], index= df_samples. columns)
# df_samples = df_samples. append( samples_Series, ignore_index= True)
# each word
# Start from an empty frame that has the same columns as df_samples.
df_per_word = pd . DataFrame ( index = [ ] , columns= df_samples. keys ( ) )
# df_per_word = pd.DataFrame(index=[], columns= df_samples.keys() )
# NOTE(review): the body of this loop has lost its indentation in this copy,
# and the statements after it look like a different revision that inspects a
# single word instead -- confirm which variant should remain.
for word in word_list :
# Rows of df_samples for this word whose frequency exceeds 2 are added to
# df_per_word.
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is the
# replacement if this loop is kept.
df_samples_ = df_samples [ df_samples [ ' word ' ] == word ]
df_samples_ = df_samples_ [ df_samples_ [ ' frequency ' ] > 2 ]
df_per_word = df_per_word . append ( df_samples_ , ignore_index = True )
# for word in word_list:
# Take the third entry of word_list, select its rows in df and compute the
# distinct values of their ipa column; the result of np.unique is not assigned.
word = word_list [ 2 ]
df_ = df[ df [ ' word ' ] == word ]
np . unique ( list ( df_ [ ' ipa ' ] ) )
#df_samples_ = df_samples_[df_samples_['frequency']>2]
#df_per_word = df_per_word.append(df_samples_, ignore_index=True)
#df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")