Check the frequency of the pronunciation variants of each word.
This commit is contained in:
		| @@ -21,8 +21,11 @@ from forced_alignment import pyhtk, convert_phone_set | |||||||
| import novoapi  | import novoapi  | ||||||
| import novoapi_functions | import novoapi_functions | ||||||
|  |  | ||||||
| ## ======================= novo phoneset ====================== | ## ===== load novo phoneset ===== | ||||||
| phoneset_ipa, phoneset_novo70, translation_key = novoapi_functions.load_phonset() | phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_phonset() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ## ===== extract pronunciations written in novo70 only (not_in_novo70) ===== | ||||||
|  |  | ||||||
| # As per Nederlandse phoneset_aki.xlsx received from David | # As per Nederlandse phoneset_aki.xlsx received from David | ||||||
| # [ɔː] oh / ohr | # [ɔː] oh / ohr | ||||||
| @@ -33,10 +36,7 @@ phoneset_ipa, phoneset_novo70, translation_key = novoapi_functions.load_phonset( | |||||||
| # [w] wv in IPA written as ʋ.  | # [w] wv in IPA written as ʋ.  | ||||||
| david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w'] | david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w'] | ||||||
|  |  | ||||||
|  | ## read pronunciation variants. | ||||||
| ## ======================= extract words which is written only with novo70 ====================== |  | ||||||
| mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir) |  | ||||||
|  |  | ||||||
| stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx) | stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx) | ||||||
| df = pd.read_excel(stimmen_transcription_, 'frequency') | df = pd.read_excel(stimmen_transcription_, 'frequency') | ||||||
| #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']): | #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']): | ||||||
| @@ -68,7 +68,8 @@ for ipa in transcription_ipa: | |||||||
| 	not_in_novo70.extend(not_in_novo70_) | 	not_in_novo70.extend(not_in_novo70_) | ||||||
| not_in_novo70_list = list(set(not_in_novo70)) | not_in_novo70_list = list(set(not_in_novo70)) | ||||||
|  |  | ||||||
| ## check which phone is used in stimmen but not in novo70 |  | ||||||
|  | ## check which phones are used in stimmen but not in novo70 | ||||||
| # 'ʀ', 'ʁ', | # 'ʀ', 'ʁ', | ||||||
| # 'ɒ', 'ɐ',  | # 'ɒ', 'ɐ',  | ||||||
| # 'o', 'a' (o:, a:?) | # 'o', 'a' (o:, a:?) | ||||||
| @@ -92,10 +93,12 @@ def search_phone_ipa(x, phone_list): | |||||||
| #search_phone_ipa('ø', transcription_ipa) | #search_phone_ipa('ø', transcription_ipa) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ## ===== load all transcriptions (df) ===== | ||||||
|  |  | ||||||
| df = pd.read_excel(stimmen_transcription_, 'original') | df = pd.read_excel(stimmen_transcription_, 'original') | ||||||
|  |  | ||||||
|  | # mapping from xsampa to ipa | ||||||
|  | mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir) | ||||||
|  |  | ||||||
| ipas     = [] | ipas     = [] | ||||||
| famehtks = [] | famehtks = [] | ||||||
| for xsampa in df['Self Xsampa']: | for xsampa in df['Self Xsampa']: | ||||||
| @@ -117,15 +120,11 @@ df = pd.DataFrame({'filename': df['Filename'], | |||||||
|                     'xsampa': df['Self Xsampa'], |                     'xsampa': df['Self Xsampa'], | ||||||
|                     'ipa': pd.Series(ipas)}) |                     'ipa': pd.Series(ipas)}) | ||||||
|  |  | ||||||
| # find options which all phones are in novo70. | word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] | ||||||
| #word_list = list(set(df['word'])) | word_list = sorted(word_list) | ||||||
| #word_list = [word for word in word_list if not pd.isnull(word)] |  | ||||||
| #word = word_list[1] |  | ||||||
|  |  | ||||||
| ## pronunciation variants of 'word'  |  | ||||||
| #df_ = df[df['word'] == word]['xsampa'] |  | ||||||
| ##pronunciation_variant = list(set(df_)) |  | ||||||
|  |  | ||||||
|  | ## check the frequency of each pronunciation variant | ||||||
| cols = ['word', 'ipa', 'frequency'] | cols = ['word', 'ipa', 'frequency'] | ||||||
| df_samples = pd.DataFrame(index=[], columns=cols) | df_samples = pd.DataFrame(index=[], columns=cols) | ||||||
| for ipa in all_in_novo70: | for ipa in all_in_novo70: | ||||||
| @@ -133,4 +132,13 @@ for ipa in all_in_novo70: | |||||||
| 	samples = df[df['ipa'] == ipa] | 	samples = df[df['ipa'] == ipa] | ||||||
| 	word = list(set(samples['word']))[0] | 	word = list(set(samples['word']))[0] | ||||||
| 	samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns) | 	samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns) | ||||||
| 	df_samples = df_samples.append(samples_Series, ignore_index=True) | 	df_samples = df_samples.append(samples_Series, ignore_index=True) | ||||||
|  |  | ||||||
|  | # each word | ||||||
|  | df_per_word = pd.DataFrame(index=[], columns=df_samples.keys()) | ||||||
|  |  | ||||||
|  | for word in word_list: | ||||||
|  | 	df_samples_ = df_samples[df_samples['word']==word] | ||||||
|  | 	df_samples_ = df_samples_[df_samples_['frequency']>1] | ||||||
|  | 	df_per_word = df_per_word.append(df_samples_, ignore_index=True) | ||||||
|  | df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8") | ||||||
| @@ -39,7 +39,7 @@ def load_phonset(): | |||||||
| 	phoneset_ipa    = np.unique(phoneset_ipa) | 	phoneset_ipa    = np.unique(phoneset_ipa) | ||||||
| 	phoneset_novo70 = np.unique(phoneset_novo70) | 	phoneset_novo70 = np.unique(phoneset_novo70) | ||||||
|  |  | ||||||
| 	return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa | 	return  | ||||||
|  |  | ||||||
|  |  | ||||||
| def multi_character_tokenize(line, multi_character_tokens): | def multi_character_tokenize(line, multi_character_tokens): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user