Started to check which words in the stimmen transcription consist of only phones in the novo70 phoneset.

This commit is contained in:
yemaozi88 2018-12-31 13:04:33 +01:00
parent e5cf182a18
commit dd9e3d820b
4 changed files with 44 additions and 5 deletions

Binary file not shown.

View File

@ -16,7 +16,7 @@ import acoustic_model_functions as am_func
import convert_xsampa2ipa
import defaultfiles as default
from forced_alignment import pyhtk
from forced_alignment import pyhtk, convert_phone_set
import novoapi
@ -35,7 +35,7 @@ translation_key = dict()
phoneset_ipa = []
phoneset_novo70 = []
with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin:
with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
lines = fin.read()
lines = lines.split('\n')
for line in lines:
@ -49,6 +49,14 @@ with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin:
phoneset_ipa = np.unique(phoneset_ipa)
phoneset_novo70 = np.unique(phoneset_novo70)
# As per Nederlandse phoneset_aki.xlsx received from David
# [ɔː] oh / ohr
# [ɪː] ih / ihr
# [iː] iy
# [œː] uh
# [ɛː] eh
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː']
## ======================= convert phones ======================
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
@ -56,7 +64,38 @@ mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_
stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription_, 'check')
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
# Clean up the 'IPA' column of the sheet in one pass:
#   - skip empty cells and the entry 'pypɪl';
#   - ';' is rewritten to ':' (presumably a transcription mistake -- to be confirmed);
#   - 'ˑ' is dropped (only one case).
transcription_ipa = [
    entry.replace(';', ':').replace('ˑ', '')
    for entry in df['IPA']
    if not (entry == 'pypɪl' or pd.isnull(entry))
]

# Collect every phone that is neither in phoneset_ipa nor in david_suggestion,
# with 'sp' and both length marks (':' and 'ː') stripped from it.
not_in_novo70 = []
for ipa in transcription_ipa:
    ipa = convert_phone_set.split_ipa(ipa)
    not_in_novo70_ = [
        unknown.replace('sp', '').replace(':', '').replace('ː', '')
        for unknown in ipa
        if not (unknown in phoneset_ipa or unknown in david_suggestion)
    ]
    #translation_key.get(phone, phone)
    not_in_novo70 += not_in_novo70_

# Unique phones only (order is not preserved by set()).
not_in_novo70_list = list(set(not_in_novo70))
def search_phone_ipa(x, phone_list):
    """Return every entry of *phone_list* whose split-up form contains *x*.

    Each entry is passed through ``convert_phone_set.split_ipa`` and is kept,
    in its original order, when ``x`` is found in the result.
    """
    matches = []
    for candidate in phone_list:
        if x in convert_phone_set.split_ipa(candidate):
            matches.append(candidate)
    return matches
# 'ɐ', 'ɒ', 'w', 'æ', 'ʀ', 'ʁ',
# 'œː', 'ɾ',
# 'o', 'a'
# [e] 'nyːver mɑntsjə' (1)
# [ɹ] 'iːjəɹ' (2)
search_phone_ipa('ˑ', transcription_ipa)

View File

@ -42,4 +42,4 @@ phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic
novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
#novo_api_dir = r'c:\Python36-32\Lib\site-packages\novoapi'
cmu69_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'en', 'cmu69.phoneset')
novo70_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'nl', 'novo70.phoneset')