novoapi_functions is added to novo70 specific functions.

find pronunciation variants which all phones are in novo70.
started to check which words in stimmen transcription consists of only phones in novo70 phoneset.
2019-01-07 23:27:02 +01:00 · 2019-01-07 11:50:24 +01:00 · 2018-12-31 13:04:33 +01:00
9 changed files with 369 additions and 33 deletions
--- a/.vs/acoustic_model/v15/.suo
+++ b/.vs/acoustic_model/v15/.suo
--- a/acoustic_model/pycache/acoustic_model_functions.cpython-36.pyc
+++ b/acoustic_model/pycache/acoustic_model_functions.cpython-36.pyc
--- a/acoustic_model/pycache/defaultfiles.cpython-36.pyc
+++ b/acoustic_model/pycache/defaultfiles.cpython-36.pyc
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@ -35,9 +35,15 @@
    <Compile Include="fa_test.py">
      <SubType>Code</SubType>
    </Compile>
+    <Compile Include="novoapi_forced_alignment.py">
+      <SubType>Code</SubType>
+    </Compile>
    <Compile Include="htk_vs_kaldi.py">
      <SubType>Code</SubType>
    </Compile>
+    <Compile Include="novoapi_functions.py">
+      <SubType>Code</SubType>
+    </Compile>
  </ItemGroup>
  <ItemGroup>
    <Content Include="config.ini" />
--- a/acoustic_model/acoustic_model_functions.py
+++ b/acoustic_model/acoustic_model_functions.py
@ -199,4 +199,4 @@ def make_fame2ipa_variants(fame):
    ipa.append(fame.replace('ɔ̈', 'ɒ'))
    ipa.append(fame.replace('ɔ̈:', 'ɒ'))

-    return ipa
+    return ipa
--- a/acoustic_model/check_novoapi.py
+++ b/acoustic_model/check_novoapi.py
@ -16,47 +16,121 @@ import acoustic_model_functions as am_func
 import convert_xsampa2ipa
 import defaultfiles as default

-from forced_alignment import pyhtk
+from forced_alignment import pyhtk, convert_phone_set

 import novoapi 
-
+import novoapi_functions

 ## ======================= novo phoneset ======================
-translation_key = dict()
+# load_phonset() returns four values: the IPA phones, the novo70 phones, the
+# IPA->novo70 map and the novo70->IPA map. Only the IPA->novo70 map is used as
+# translation_key in this script, so the reverse map is discarded; unpacking
+# into three names raised "ValueError: too many values to unpack".
+phoneset_ipa, phoneset_novo70, translation_key, _ = novoapi_functions.load_phonset()

-#phonelist_novo70_      = pd.ExcelFile(default.phonelist_novo70_xlsx)
-#df = pd.read_excel(phonelist_novo70_, 'list')
-## *_simple includes columns which has only one phone in.
-#for ipa, novo70 in zip(df['IPA_simple'], df['novo70_simple']):
-#    if not pd.isnull(ipa):
-#        print('{0}:{1}'.format(ipa, novo70))
-#        translation_key[ipa] = novo70
-#phonelist_novo70 = np.unique(list(df['novo70_simple']))
-
-phoneset_ipa = []
-phoneset_novo70 = []
-with open(default.cmu69_phoneset, "rt", encoding="utf-8") as fin:
-	lines = fin.read()
-	lines = lines.split('\n')
-	for line in lines:
-		words = line.split('\t')
-		if len(words) > 1:
-			novo70 = words[0]
-			ipa	   = words[1]
-			phoneset_ipa.append(ipa)
-			phoneset_novo70.append(novo70)
-			translation_key[ipa] = novo70
-phoneset_ipa    = np.unique(phoneset_ipa)
-phoneset_novo70 = np.unique(phoneset_novo70)
+# As per Nederlandse phoneset_aki.xlsx received from David
+# [ɔː] oh / ohr
+# [ɪː] ih / ihr
+# [iː] iy
+# [œː] uh
+# [ɛː] eh
+# [w] wv in IPA written as ʋ. 
+david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']


-## ======================= convert phones ======================
+## ======================= extract words which is written only with novo70 ======================
 mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)

 stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
-df = pd.read_excel(stimmen_transcription_, 'check')
+df = pd.read_excel(stimmen_transcription_, 'frequency')
 #for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
-#    #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
 #    ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
 #    if not ipa_converted == ipa:
-#        print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
+#        print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
+# IPA transcriptions taken from the 'IPA' column of the sheet read into df.
+transcription_ipa = list(df['IPA'])
+
+# transcription mistake?
+# Empty (NaN) cells and the entry 'pypɪl' are dropped, and ';' is read as the
+# length mark 'ː'.
+transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
+transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
+
+# not_in_novo70: phones found in neither phoneset_ipa nor david_suggestion.
+# all_in_novo70: transcriptions in which no such phone was found.
+not_in_novo70 = []
+all_in_novo70 = []
+for ipa in transcription_ipa:
+	# normalise the ASCII colon to the IPA length mark, then split into phones.
+	ipa = ipa.replace(':', 'ː')
+	ipa = convert_phone_set.split_ipa(ipa)
+
+	not_in_novo70_ = [phone for phone in ipa 
+				   if not phone in phoneset_ipa and not phone in david_suggestion]
+	# The replacements below only edit the strings, they never remove an
+	# element: a phone reduced to '' still counts in the length test that
+	# follows and still ends up in not_in_novo70.
+	not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
+	not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
+	not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]
+
+	# every phone is covered: keep the transcription, re-joined without separators.
+	if len(not_in_novo70_) == 0:
+		all_in_novo70.append(''.join(ipa))
+
+	#translation_key.get(phone, phone)
+	not_in_novo70.extend(not_in_novo70_)
+# unique phones that are not covered (order is not defined, it comes from a set).
+not_in_novo70_list = list(set(not_in_novo70))
+
+## check which phone is used in stimmen but not in novo70
+# 'ʀ', 'ʁ',
+# 'ɒ', 'ɐ', 
+# 'o', 'a' (o:, a:?)
+# [e] 'nyːver mɑntsjə' (1)
+# [ɾ] 'ɪːɾ'(1)
+# [ɹ] 'iːjəɹ' (1), 'ɪ:ɹ' (1)
+# [ø] 'gʀøtəpi:r'(1), 'grøtəpi:r'(1)
+# [æ] 'røːzəʀæt'(2), 'røːzəræt'(1)
+# [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
+# [χ] --> can be x??
+
+def search_phone_ipa(x, phone_list):
+	"""
+	Return the items of phone_list whose phone sequence contains x.
+
+	Each item has ':' replaced by 'ː' and is split with
+	convert_phone_set.split_ipa; the item is kept (in its original spelling)
+	when x is one of the resulting phones and x+':' is not.
+
+	NOTE(review): ':' has already been replaced by 'ː' before the split, so
+	unless split_ipa produces ASCII colons itself the x+':' test can never
+	exclude anything -- presumably x+'ː' was meant; confirm with the author.
+
+	:param string x: the phone to look for.
+	:param list phone_list: transcriptions written in IPA.
+	:return list: the matching items of phone_list, in their original order.
+	"""
+	hits = []
+	for item in phone_list:
+		phones = convert_phone_set.split_ipa(item.replace(':', 'ː'))
+		if x not in phones:
+			continue
+		if x+':' in phones:
+			continue
+		hits.append(item)
+	return hits
+#search_phone_ipa('ø', transcription_ipa)
+
+
+
+
+# Read the 'original' sheet and convert its 'Self Xsampa' column to IPA.
+df = pd.read_excel(stimmen_transcription_, 'original')
+
+# ipas gets one entry per row of the sheet ('' for empty cells), so that it
+# lines up with the other columns when the frame is rebuilt below.
+# famehtks is initialised here but not filled in the lines shown.
+ipas     = []
+famehtks = []
+for xsampa in df['Self Xsampa']:
+    if not isinstance(xsampa, float): # 'NaN'
+        # typo?
+        xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
+        xsampa = xsampa.replace(';', ':')
+
+        # store the IPA form with an ASCII colon as length mark and without spaces.
+        ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
+        ipa = ipa.replace('ː', ':')
+        ipa = ipa.replace(' ', '')
+        ipas.append(ipa)
+    else:
+        ipas.append('')
+        
+# extract interesting cols.
+# NOTE(review): pd.Series(ipas) carries a 0..n-1 index; this presumably matches
+# the default index of the sheet -- confirm, otherwise the 'ipa' column is misaligned.
+df = pd.DataFrame({'filename': df['Filename'], 
+                    'word': df['Word'], 
+                    'xsampa': df['Self Xsampa'],
+                    'ipa': pd.Series(ipas)})
+
+# find options which all phones are in novo70.
+#word_list = list(set(df['word']))
+#word_list = [word for word in word_list if not pd.isnull(word)]
+#word = word_list[1]
+
+## pronunciation variants of 'word' 
+#df_ = df[df['word'] == word]['xsampa']
+##pronunciation_variant = list(set(df_))
+
+# Frequency table of the transcriptions that are fully covered by novo70:
+# one row (word, ipa, number of matching samples) per entry of all_in_novo70.
+cols = ['word', 'ipa', 'frequency']
+
+# The rows are collected first and the frame is built once. DataFrame.append
+# copied the whole frame on every iteration and was removed in pandas 2.0.
+rows = []
+for ipa in all_in_novo70:
+	# df['ipa'] stores the length mark as an ASCII colon.
+	ipa = ipa.replace('ː', ':')
+	samples = df[df['ipa'] == ipa]
+	# One of the words carrying this pronunciation; which one is arbitrary
+	# when there are several. Raises IndexError when no sample matches.
+	word = list(set(samples['word']))[0]
+	rows.append([word, ipa, len(samples)])
+df_samples = pd.DataFrame(rows, columns=cols)
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@ -42,4 +42,4 @@ phonelist_friesian_txt     = os.path.join(experiments_dir, 'friesian', 'acoustic

 novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
 #novo_api_dir = r'c:\Python36-32\Lib\site-packages\novoapi'
-cmu69_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'en', 'cmu69.phoneset')
+novo70_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'nl', 'novo70.phoneset')
--- a/acoustic_model/novoapi_forced_alignment.py
+++ b/acoustic_model/novoapi_forced_alignment.py
@ -0,0 +1,118 @@
+#
+# forced alignment using novo-api.
+#
+# *** IMPORTANT ***
+# This file should be treated as confidential.
+# This file should not be copied or uploaded to public sites.
+#
+# NOTES:
+# The usage of novo api: https://bitbucket.org/novolanguage/python-novo-api
+# I couldn't make it work as I described in the mail to Martijn Bartelds on
+# 2018/12/03.
+# As per the advice from him, I modified testgrammer.py and made it a function.
+#
+# In order to run on Python 3.6, the following points are changed in novo-api.
+# (1) backend/__init__.py
+# - #import session
+#   from .  import session
+# (2) backend/session.py
+# - #except Exception, e:
+#   except Exception as e:
+# - #print self.last_message
+#   print(self.last_message)
+# (3) asr/segment/praat.py
+# - def print_tier(output, title, begin, end, segs, (format, formatter))
+#   def print_tier(output, title, begin, end, segs, format, formatter):
+# (4) asr/spraaklab/__init__.py
+# - #import session
+#   from .  import session
+# (5) asr/spraaklab/schema.py
+# - #print data, "validated not OK", e.message
+#   print("{0} validated not OK {1}".format(data, e.message))
+# - #print data, "validated OK"
+#   print("{} validated OK".format(data))
+# - #if isinstance(object, basestring):
+#	if isinstance(object, str)
+#
+# Aki Kunikoshi
+# 428968@gmail.com
+#
+import argparse
+import json
+
+from novoapi.backend import session
+import novoapi_functions
+
+# SECURITY: the account name and password used to be hard-coded here as the
+# argparse defaults. Secrets must not be stored in the repository, so the
+# defaults are now read from the NOVOAPI_USER / NOVOAPI_PASSWORD environment
+# variables (None when unset); --user / --password given on the command line
+# still take precedence. The password that was committed earlier should be
+# considered leaked and be rotated.
+import os
+
+p = argparse.ArgumentParser()
+p.add_argument("--user", default=os.environ.get("NOVOAPI_USER"))
+p.add_argument("--password", default=os.environ.get("NOVOAPI_PASSWORD"))
+args = p.parse_args()
+
+# Recording to be recognized (hard-coded local Windows path).
+wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
+
+# Recognizer session of novo-api, created with the user / password parsed above.
+# NOTE(review): lang="nl" presumably selects the Dutch models and snodeid=101 a
+# server node -- confirm against the novo-api documentation.
+rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
+grammar = {
+  "type": "confusion_network",
+  "version": "1.0",
+  "data": {
+	"kind": "sequence",
+	"elements": [{
+		"kind": "word",
+		"pronunciation": [{
+			"phones": ["wv",
+			  "a1",
+			  "n"],
+			"id": 0
+		  },
+		  {
+			"phones": ["wv",
+			  "uh1",
+			  "n"],
+			"id": 1
+		  }],
+		"label": "one"
+	  },
+	  {
+		"kind": "word",
+		"pronunciation": [{
+			"phones": ["t",
+			  "uw1"],
+			"id": 0
+		  }],
+		"label": "two"
+	  },
+	  {
+		"kind": "word",
+		"pronunciation": [{
+			"phones": ["t",
+			  "r",
+			  "iy1"],
+			"id": 0
+		  },
+		  {
+			"phones": ["s",
+			  "r",
+			  "iy1"],
+			"id": 1
+		  }],
+		"label": "three"
+	  }]
+  },
+  "return_objects": ["grammar"],
+  "phoneset": "novo70"
+}
+
+# Hand the grammar to the recognizer; the result is overwritten below without
+# being inspected.
+res = rec.setgrammar(grammar)
+#print "Set grammar result", res
+
+# Recognize the recording against the grammar that was just set.
+#res = rec.recognize_wav("test/onetwothree.wav")
+res = rec.recognize_wav(wav_file)
+#print "Recognition result:", json.dumps(res.export(), indent=4)
+
+# list of the pronunciation for each words
+# Builds a one-word grammar from IPA variants and rebinds `grammar`; in the
+# lines shown the new grammar is not passed to the recognizer.
+word = 'pauw'
+pronunciation_ipa = ['pau', 'pɑu']
+grammar = novoapi_functions.make_grammar(word, pronunciation_ipa)
--- a/acoustic_model/novoapi_functions.py
+++ b/acoustic_model/novoapi_functions.py
@ -0,0 +1,138 @@
+import numpy as np
+
+import defaultfiles as default
+
+def load_phonset():
+	"""
+	Load the novo70 phoneset file (default.novo70_phoneset).
+
+	Every line with at least two tab-separated fields is read as
+	'novo70 phone, IPA phone' (first and second field); other lines are
+	skipped. When a phone occurs on several lines, the last line wins in
+	the translation dictionaries.
+
+	:return tuple: (unique IPA phones, unique novo70 phones,
+		dict IPA -> novo70, dict novo70 -> IPA);
+		the two phone collections are what np.unique returns.
+	"""
+	ipa2novo70 = {}
+	novo702ipa = {}
+	ipa_phones = []
+	novo70_phones = []
+
+	with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
+		rows = fin.read().split('\n')
+
+	for row in rows:
+		fields = row.split('\t')
+		if len(fields) < 2:
+			continue
+		novo70, ipa = fields[0], fields[1]
+		ipa_phones.append(ipa)
+		novo70_phones.append(novo70)
+		ipa2novo70[ipa] = novo70
+		novo702ipa[novo70] = ipa
+
+	return np.unique(ipa_phones), np.unique(novo70_phones), ipa2novo70, novo702ipa
+
+
+def multi_character_tokenize(line, multi_character_tokens):
+	"""
+	Generate the tokens of line from left to right.
+
+	At each position the first non-empty entry of multi_character_tokens
+	that the remaining text starts with is yielded and consumed; when no
+	entry matches, a single character is yielded and consumed instead.
+	The entries are tried in the order given, so list longer tokens before
+	their own prefixes.
+
+	Copied from forced_alignment.convert_phone_set.py
+	"""
+	rest = line
+	while rest != '':
+		matched = None
+		for candidate in multi_character_tokens:
+			if rest.startswith(candidate) and len(candidate) > 0:
+				matched = candidate
+				break
+		if matched is None:
+			# no multi-character token here: fall back to one character.
+			matched = rest[:1]
+		yield matched
+		rest = rest[len(matched):]
+
+
+def split_ipa(line):
+	"""
+	Split a line written in IPA into phones.
+
+	Surrounding whitespace is stripped first. The multi-character phones
+	listed below are kept together; every other character becomes a phone
+	of its own.
+	NOTE (from the original author): a nasalized sound such as ɛ̃ː is said
+	to give an error -- this is not evident from the code here; confirm.
+
+	:param string line: one line written in IPA.
+	:return list: the phones of the line, in order.
+	"""
+	# IPAs in CGN.
+	cgn_multi_character_phones = (
+		'ʌu', 'ɛi', 'œy', 'aː', 'eː', 'iː', 'oː', 'øː',
+		'ɛː', 'œː', 'ɔː', 'ɛ̃ː', 'ɑ̃ː', 'ɔ̃ː', 'œ̃', 'ɪː',
+		)
+
+	return list(multi_character_tokenize(line.strip(), cgn_multi_character_phones))
+
+
+def split_novo70(line):
+	"""
+	Split a line written in novo70 into phones.
+
+	The phoneset is re-read with load_phonset() on every call. A space
+	inside the (stripped) line is returned as 'sp'.
+
+	:param string line: one line written in novo70.
+	:return list: the novo70 phones of the line, in order.
+	"""
+	phoneset_novo70 = load_phonset()[1]
+
+	# longest phones first, so that a long phone is tried before its own prefix.
+	candidates = [phone for phone in phoneset_novo70 if len(phone) > 1]
+	candidates.sort(key=len, reverse=True)
+
+	phones = []
+	for phone in multi_character_tokenize(line.strip(), candidates):
+		phones.append('sp' if phone == ' ' else phone)
+	return phones
+
+
+def novo702ipa(tokens):
+	"""
+	Convert a pronunciation written in novo70 into IPA.
+
+	:param string tokens: pronunciation written in novo70.
+	:return string: the IPA phones joined with single spaces; a phone that
+		has no entry in the novo70 -> IPA dictionary is kept as it is.
+	"""
+	novo70_to_ipa = load_phonset()[3]
+	return ' '.join(novo70_to_ipa.get(phone, phone) for phone in split_novo70(tokens))
+
+
+# numbering of novo70 should be checked.
+def ipa2novo70(tokens):
+	"""
+	Convert a pronunciation written in IPA into novo70.
+
+	:param string tokens: pronunciation written in IPA.
+	:return string: the novo70 phones joined with single spaces; a phone that
+		has no entry in the IPA -> novo70 dictionary is kept as it is.
+	"""
+	ipa_to_novo70 = load_phonset()[2]
+	return ' '.join(ipa_to_novo70.get(phone, phone) for phone in split_ipa(tokens))
+	
+
+def make_grammar(word, pronunciation_ipa):
+	"""
+	Build a novo-api grammar for one word and its pronunciation variants.
+
+	Args:
+		word: label of the word, e.g. 'pauw'.
+		pronunciation_ipa: list of pronunciation variants written in IPA,
+			e.g. ['pau', 'pɑu'].
+
+	Returns:
+		dict: a 'confusion_network' grammar (version 1.0, phoneset novo70)
+		whose data is a sequence with a single word element; every variant
+		becomes one pronunciation with its novo70 phones and its position
+		in pronunciation_ipa as id.
+	"""
+	# ipa2novo70 is defined in this very module, which does not import itself:
+	# the former novoapi_functions.ipa2novo70(...) call raised NameError.
+	# variant_id replaces the loop variable that shadowed the builtin id().
+	pronunciations = [
+		{
+			"phones": ipa2novo70(ipa).split(),
+			"id": variant_id
+			}
+		for variant_id, ipa in enumerate(pronunciation_ipa)
+		]
+
+	grammar_data = {
+		"kind": 'sequence',
+		"elements": [{
+			"kind": "word",
+			"pronunciation": pronunciations,
+			"label": word
+			}]
+		}
+
+	grammar = {
+		"type": "confusion_network",
+		"version": "1.0",
+		"data": grammar_data,
+		"return_objects": ["grammar"],
+		"phoneset": "novo70"
+		}
+
+	return grammar
Author	SHA1	Message	Date
yemaozi88	d6d5543d03	novoapi_functions is added to novo70 specific functions.	2019-01-07 23:27:02 +01:00
yemaozi88	d6e005b1cb	find pronunciation variants which all phones are in novo70.	2019-01-07 11:50:24 +01:00
yemaozi88	dd9e3d820b	started to check which words in stimmen transcription consists of only phones in novo70 phoneset.	2018-12-31 13:04:33 +01:00