Started to check which words in the stimmen transcription consist only of phones in the novo70 phoneset.
This commit is contained in:
parent
e5cf182a18
commit
dd9e3d820b
Binary file not shown.
Binary file not shown.
@ -16,7 +16,7 @@ import acoustic_model_functions as am_func
|
||||
import convert_xsampa2ipa
|
||||
import defaultfiles as default
|
||||
|
||||
from forced_alignment import pyhtk
|
||||
from forced_alignment import pyhtk, convert_phone_set
|
||||
|
||||
import novoapi
|
||||
|
||||
@ -35,7 +35,7 @@ translation_key = dict()
|
||||
|
||||
phoneset_ipa = []
|
||||
phoneset_novo70 = []
|
||||
with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin:
|
||||
with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
|
||||
lines = fin.read()
|
||||
lines = lines.split('\n')
|
||||
for line in lines:
|
||||
@ -49,6 +49,14 @@ with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin:
|
||||
phoneset_ipa = np.unique(phoneset_ipa)
|
||||
phoneset_novo70 = np.unique(phoneset_novo70)
|
||||
|
||||
# As per Nederlandse phoneset_aki.xlsx received from David
|
||||
# [ɔː] oh / ohr
|
||||
# [ɪː] ih / ihr
|
||||
# [iː] iy
|
||||
# [œː] uh
|
||||
# [ɛː] eh
|
||||
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː']
|
||||
|
||||
|
||||
## ======================= convert phones ======================
|
||||
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
|
||||
@ -56,7 +64,38 @@ mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_
|
||||
stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
|
||||
df = pd.read_excel(stimmen_transcription_, 'check')
|
||||
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
|
||||
# #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
|
||||
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
|
||||
# if not ipa_converted == ipa:
|
||||
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
|
||||
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
|
||||
transcription_ipa = list(df['IPA'])
|
||||
|
||||
# transcription mistake?
|
||||
transcription_ipa = [ipa.replace(';', ':') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
|
||||
transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
|
||||
|
||||
not_in_novo70 = []
|
||||
for ipa in transcription_ipa:
|
||||
ipa = convert_phone_set.split_ipa(ipa)
|
||||
|
||||
not_in_novo70_ = [phone for phone in ipa
|
||||
if not phone in phoneset_ipa and not phone in david_suggestion]
|
||||
not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
|
||||
not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
|
||||
not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]
|
||||
|
||||
#translation_key.get(phone, phone)
|
||||
not_in_novo70.extend(not_in_novo70_)
|
||||
not_in_novo70_list = list(set(not_in_novo70))
|
||||
|
||||
|
||||
def search_phone_ipa(x, phone_list):
    """Return the entries of ``phone_list`` in which ``x`` occurs.

    Each entry is passed through ``convert_phone_set.split_ipa`` and is kept
    when ``x`` is found in the result of that call. The original order of
    ``phone_list`` is preserved.
    """
    matches = []
    for entry in phone_list:
        pieces = convert_phone_set.split_ipa(entry)
        if x in pieces:
            matches.append(entry)
    return matches
|
||||
|
||||
|
||||
# 'ɐ', 'ɒ', 'w', 'æ', 'ʀ', 'ʁ',
|
||||
# 'œː', 'ɾ',
|
||||
# 'o', 'a'
|
||||
# [e] 'nyːver mɑntsjə' (1)
|
||||
# [ɹ] 'iːjəɹ' (2)
|
||||
|
||||
search_phone_ipa('ˑ', transcription_ipa)
|
@ -42,4 +42,4 @@ phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic
|
||||
|
||||
novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
|
||||
#novo_api_dir = r'c:\Python36-32\Lib\site-packages\novoapi'
|
||||
cmu69_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'en', 'cmu69.phoneset')
|
||||
novo70_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'nl', 'novo70.phoneset')
|
Loading…
Reference in New Issue
Block a user