diff --git a/.vs/acoustic_model/v15/.suo b/.vs/acoustic_model/v15/.suo
index 3ce8f85..45e0fe2 100644
Binary files a/.vs/acoustic_model/v15/.suo and b/.vs/acoustic_model/v15/.suo differ
diff --git a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc
index 869323d..a74cd44 100644
Binary files a/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc and b/acoustic_model/__pycache__/defaultfiles.cpython-36.pyc differ
diff --git a/acoustic_model/acoustic_model.pyproj b/acoustic_model/acoustic_model.pyproj
index 8faedc8..5319301 100644
--- a/acoustic_model/acoustic_model.pyproj
+++ b/acoustic_model/acoustic_model.pyproj
@@ -4,8 +4,7 @@
2.0
4d8c8573-32f0-4a62-9e62-3ce5cc680390
.
-
-
+ fame_hmm.py
.
diff --git a/acoustic_model/defaultfiles.py b/acoustic_model/defaultfiles.py
index 7c4a8cf..b10d247 100644
--- a/acoustic_model/defaultfiles.py
+++ b/acoustic_model/defaultfiles.py
@@ -39,11 +39,11 @@ toolbox_dir = os.path.join(repo_dir, 'toolbox')
#config_hvite = os.path.join(htk_config_dir, 'config.HVite')
#acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
#acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
-#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
+phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')
WSL_dir = r'C:\OneDrive\WSL'
#fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
-fame_dir = r'd:\_corpus\fame'
+fame_dir = r'c:\OneDrive\Research\rug\_data\FAME'
fame_s5_dir = os.path.join(fame_dir, 's5')
fame_corpus_dir = os.path.join(fame_dir, 'corpus')
diff --git a/acoustic_model/fame_functions.py b/acoustic_model/fame_functions.py
index 5fe60e5..cb87620 100644
--- a/acoustic_model/fame_functions.py
+++ b/acoustic_model/fame_functions.py
@@ -290,15 +290,17 @@ def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
"""
lex_asr = load_lexicon(lexicon_file_asr)
+ def word2htk_(row):
+ return word2htk(row['word'])
def asr2htk_space_delimited_(row):
return asr2htk_space_delimited(row['pronunciation'])
lex_htk = pd.DataFrame({
- 'word': lex_asr['word'],
+ 'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
})
lex_htk = lex_htk.ix[:, ['word', 'pronunciation']]
- lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t')
+ lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
return
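Note: the conversion above builds the HTK lexicon row by row; word2htk is the helper added at the bottom of this file, and the .ix reordering exists because older pandas versions sort dict-built columns alphabetically. A toy sketch with a made-up pronunciation (the real code also runs asr2htk_space_delimited on each row):

    import pandas as pd

    lex_asr = pd.DataFrame({'word': ['brúntsje'], 'pronunciation': ['b r u n t s j @']})
    lex_htk = pd.DataFrame({
        'word': lex_asr['word'].apply(word2htk).str.upper(),  # -> 'BRU1NTSJE'
        'pronunciation': lex_asr['pronunciation'],
    })
    lex_htk.to_csv('lex.htk', header=None, index=None, sep='\t', encoding='utf-8')
    # lex.htk now holds one tab-separated line: BRU1NTSJE<TAB>b r u n t s j @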
@@ -316,20 +318,26 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
lex2 = load_lexicon(lexicon_file2)
lex = pd.concat([lex1, lex2])
lex = lex.sort_values(by='word', ascending=True)
- lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+ lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')
def fix_single_quote(lexicon_file):
""" add '\' before all single quote at the beginning of words.
+ convert special characters to ascii compatible characters.
Args:
lexicon_file (path): lexicon file, which will be overwritten.
"""
lex = load_lexicon(lexicon_file)
+ lex = lex.dropna() # remove N/A.
for i in lex[lex['word'].str.startswith('\'')].index.values:
lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
- # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
- #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
- lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep='\t')
+ # to_csv does not work with a space separator, so all tabs must be replaced manually.
+ #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+ lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
return
+
+
+def word2htk(word):
+ return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
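word2htk maps every character through translation_key_word2htk (defined in phoneset/fame_asr.py further down in this patch) and passes unmapped characters through unchanged. A minimal sketch with the table trimmed to two entries:

    # two entries of the real table, shown for illustration only
    translation_key_word2htk = {'ú': 'u1', '\'': '\\\''}

    def word2htk(word):
        return ''.join(translation_key_word2htk.get(c, c) for c in word)

    word2htk('brúntsje')  # -> 'bru1ntsje'
    word2htk("'t")        # -> backslash, quote, t: the leading quote escaped for HTK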
diff --git a/acoustic_model/fame_hmm.py b/acoustic_model/fame_hmm.py
index ba2732c..9ce920b 100644
--- a/acoustic_model/fame_hmm.py
+++ b/acoustic_model/fame_hmm.py
@@ -3,6 +3,7 @@ import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import tempfile
+import shutil
+import re  # used by the transcription check in the make_mlf block below
#import configparser
#import subprocess
import time
@@ -11,6 +12,7 @@ import numpy as np
import pandas as pd
import fame_functions
+from phoneset import fame_ipa, fame_asr
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
@@ -28,7 +30,7 @@ dataset_list = ['devel', 'test', 'train']
# procedure
extract_features = 0
-make_lexicon = 0
+make_lexicon = 1
make_mlf = 0
combine_files = 0
flat_start = 0
@@ -44,6 +46,9 @@ lexicon_htk_asr = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_asr')
lexicon_htk_oov = os.path.join(default.htk_dir, 'lexicon', 'lex.htk_oov')
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
+global_ded = os.path.join(default.htk_dir, 'config', 'global.ded')
+
+
#hcompv_scp = output_dir + '\\scp\\combined.scp'
#combined_mlf = output_dir + '\\label\\combined.mlf'
@@ -60,14 +65,17 @@ if not os.path.exists(feature_dir):
tmp_dir = os.path.join(default.htk_dir, 'tmp')
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
+label_dir = os.path.join(default.htk_dir, 'label')
+if not os.path.exists(label_dir):
+ os.makedirs(label_dir)
+
## ======================= extract features =======================
if extract_features:
- print('==== extract features ====\n')
-
+
for dataset in dataset_list:
- print('==== dataset: {} ===='.format(dataset))
+ print('==== extract features on dataset {} ====\n'.format(dataset))
# a script file for HCopy
print(">>> making a script file for HCopy... \n")
@@ -89,6 +97,8 @@ if extract_features:
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
+ os.remove(hcopy_scp.name)
+
## ======================= make lexicon for HTK =======================
if make_lexicon:
@@ -114,94 +124,132 @@ if make_lexicon:
fame_functions.fix_single_quote(lexicon_htk)
+## ======================= make phonelist =======================
+#phonelist_txt = os.path.join(default.htk_dir, 'config', 'phonelist.txt')
+#pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt)
+#sentence = 'ien fan de minsken fan it deiferbliuw sels brúntsje visser'
+#log_txt = os.path.join(default.htk_dir, 'config', 'log.txt')
+#dictionary_file = os.path.join(default.htk_dir, 'config', 'test.dic')
+#pyhtk.create_dictionary(
+# sentence, global_ded, log_txt, dictionary_file, lexicon_htk)
+#pyhtk.create_dictionary_without_log(
+# sentence, global_ded, dictionary_file, lexicon_htk)
+
+
## ======================= make label file =======================
if make_mlf:
- print("==== make mlf ====\n")
-
- print("generating word level transcription...\n")
for dataset in dataset_list:
- hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
- hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
- script_list = FAME_dir + '\\data\\' + dataset + '\\text'
- mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
- mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+ timer_start = time.time()
+ print("==== generating word level transcription on dataset {}\n".format(dataset))
- # lexicon
- lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
-
- # list of features
- with open(hcompv_scp) as fin:
- features = fin.read()
- features = features.split('\n')
+ #hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
+ #hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
+ script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+ #mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
+ #mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
+ wav_dir = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
+ dictionary_file = os.path.join(wav_dir, 'temp.dic')
# list of scripts
with open(script_list, "rt", encoding="utf-8") as fin:
- scripts = fin.read()
- scripts = pd.Series(scripts.split('\n'))
+ scripts = fin.read().split('\n')
- i = 0
- missing_words = []
- fscp = open(hcompv_scp2, 'wt')
- fmlf = open(mlf_word, "wt", encoding="utf-8")
- fmlf.write("#!MLF!#\n")
- feature_nr = 1
- for feature in features:
- sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
- sys.stdout.flush()
- feature_nr += 1
- file_basename = os.path.basename(feature).replace('.mfc', '')
+ for line in scripts:
+ #for line in ['sp0035m_train_1975_fragmentenvraaggesprekkenruilverkaveling_15413 en dat kan men nog meer']:
+ # sample line:
+ # sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
+ filename_ = line.split(' ')[0]
+ filename = '_'.join(filename_.split('_')[1:])
+ sentence = ' '.join(line.split(' ')[1:])
- # get words from scripts.
- try:
- script = scripts[scripts.str.contains(file_basename)]
- except IndexError:
- script = []
+ wav_file = os.path.join(wav_dir, filename + '.wav')
+ if len(re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence))==0:
+ try:
+ sentence_ascii = bytes(sentence, 'ascii')
+ except UnicodeEncodeError:
+ print(sentence)
+ #if os.path.exists(wav_file):
+ # #dictionary_file = os.path.join(wav_dir, filename + '.dic')
+ # if pyhtk.create_dictionary_without_log(
+ # sentence, global_ded, dictionary_file, lexicon_htk) == 0:
+ # # when the file name is too long, the HDMan command does not work;
+ # # therefore a temporary dictionary_file is made first, then renamed.
+ # shutil.move(dictionary_file, os.path.join(wav_dir, filename + '.dic'))
+ # label_file = os.path.join(wav_dir, filename + '.lab')
+ # pyhtk.create_label_file(sentence, label_file)
+ # else:
+ # os.remove(dictionary_file)
+ print("elapsed time: {}".format(time.time() - timer_start))
+ # lexicon
+ #lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
- if len(script) != 0:
- script_id = script.index[0]
- script_txt = script.get(script_id)
- script_words = script_txt.split(' ')
- del script_words[0]
+ # list of features
+ #with open(hcompv_scp) as fin:
+ # features = fin.read()
+ # features = features.split('\n')
+ #i = 0
+ #missing_words = []
+ #fscp = open(hcompv_scp2, 'wt')
+ #fmlf = open(mlf_word, "wt", encoding="utf-8")
+ #fmlf.write("#!MLF!#\n")
+ #feature_nr = 1
+ #for feature in features:
+ # sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
+ # sys.stdout.flush()
+ # feature_nr += 1
+ # file_basename = os.path.basename(feature).replace('.mfc', '')
+
+ # # get words from scripts.
+ # try:
+ # script = scripts[scripts.str.contains(file_basename)]
+ # except IndexError:
+ # script = []
+
+ # if len(script) != 0:
+ # script_id = script.index[0]
+ # script_txt = script.get(script_id)
+ # script_words = script_txt.split(' ')
+ # del script_words[0]
# check if all words can be found in the lexicon.
- SCRIPT_WORDS = []
- script_prons = []
- is_in_lexicon = 1
- for word in script_words:
- WORD = word.upper()
- SCRIPT_WORDS.append(WORD)
- extracted = lexicon_htk[lexicon_htk['word']==WORD]
- if len(extracted) == 0:
- missing_words.append(word)
- script_prons.append(extracted)
- is_in_lexicon *= len(extracted)
+ # SCRIPT_WORDS = []
+ # script_prons = []
+ # is_in_lexicon = 1
+ # for word in script_words:
+ # WORD = word.upper()
+ # SCRIPT_WORDS.append(WORD)
+ # extracted = lexicon_htk[lexicon_htk['word']==WORD]
+ # if len(extracted) == 0:
+ # missing_words.append(word)
+ # script_prons.append(extracted)
+ # is_in_lexicon *= len(extracted)
# if all pronunciations are found in the lexicon, update scp and mlf files.
- if is_in_lexicon:
+ # if is_in_lexicon:
# add the feature filename into the .scp file.
- fscp.write("{}\n".format(feature))
- i += 1
+ # fscp.write("{}\n".format(feature))
+ # i += 1
# add the words to the mlf file.
- fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
+ # fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
- for word_ in SCRIPT_WORDS:
- if word_[0] == '\'':
- word_ = '\\' + word_
- fmlf.write('{}\n'.format(word_))
- fmlf.write('.\n')
- print("\n{0} has {1} samples.\n".format(dataset, i))
- np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
+ # for word_ in SCRIPT_WORDS:
+ # if word_[0] == '\'':
+ # word_ = '\\' + word_
+ # fmlf.write('{}\n'.format(word_))
+ # fmlf.write('.\n')
+ # print("\n{0} has {1} samples.\n".format(dataset, i))
+ # np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
- fscp.close()
- fmlf.close()
+ # fscp.close()
+ # fmlf.close()
## generate phone level transcription
- print("generating phone level transcription...\n")
- mkphones = output_dir + '\\label\\mkphones0.txt'
- subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
- subprocess.call(subprocessStr, shell=True)
+ # print("generating phone level transcription...\n")
+ # mkphones = output_dir + '\\label\\mkphones0.txt'
+ # subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
+ # subprocess.call(subprocessStr, shell=True)
## ======================= combined scps and mlfs =======================
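The active part of the make_mlf block screens each transcription with a regex pre-filter plus an ASCII encode test. A minimal standalone sketch of that check, assuming the same diacritic set (sentence_is_ascii is a hypothetical name):

    import re

    def sentence_is_ascii(sentence):
        # words containing the handled Frisian diacritics are filtered out first
        if re.findall(r'[\w]+[âêûô\'ú]+[\w]+', sentence):
            return False
        try:
            bytes(sentence, 'ascii')  # raises UnicodeEncodeError on other non-ASCII
            return True
        except UnicodeEncodeError:
            return False

    sentence_is_ascii('en dan begjinne je natuerlik')  # -> True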
diff --git a/acoustic_model/fame_test.py b/acoustic_model/fame_test.py
index d330e7f..c7b2e59 100644
--- a/acoustic_model/fame_test.py
+++ b/acoustic_model/fame_test.py
@@ -3,6 +3,7 @@ import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
from collections import Counter
import time
+import re
import numpy as np
import pandas as pd
@@ -82,22 +83,52 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
## check if all the phones in lexicon.htk are in fame_asr.py.
-timer_start = time.time()
-phoneset_htk = fame_asr.phoneset_htk
-phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
-phoneset_lex.remove('')
-print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
- set(phoneset_htk) - set(phoneset_lex)))
-print("elapsed time: {}".format(time.time() - timer_start))
+#timer_start = time.time()
+#phoneset_htk = fame_asr.phoneset_htk
+#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
+#phoneset_lex.remove('')
+#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
+# set(phoneset_htk) - set(phoneset_lex)))
+#print("elapsed time: {}".format(time.time() - timer_start))
-# statistics over the lexicon
-lex_htk = fame_functions.load_lexicon(lexicon_htk)
-phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
-c = Counter(phones_all)
+## statistics over the lexicon
+#lex_htk = fame_functions.load_lexicon(lexicon_htk)
+#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
+#c = Counter(phones_all)
+
+#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
+#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
+# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
+## to_csv does not work with a space separator, so all tabs must be replaced manually.
+##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
+#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
+
+
+## check which letters are not coded in ascii.
+print('asr phones which cannot be encoded in ascii:\n')
+for i in fame_asr.phoneset_short:
+ try:
+ i_encoded = i.encode("ascii")
+ #print("{0} --> {1}".format(i, i.encode("ascii")))
+ except UnicodeEncodeError:
+ print(">>> {}".format(i))
+
+print("letters in the scripts which is not coded in ascii:\n")
+for dataset in ['train', 'devel', 'test']:
+ timer_start = time.time()
+
+ script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
+ with open(script_list, "rt", encoding="utf-8") as fin:
+ scripts = fin.read().split('\n')
+
+ for line in scripts:
+ sentence = ' '.join(line.split(' ')[1:])
+ sentence_htk = fame_functions.word2htk(sentence)
+
+ #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
+ try:
+ sentence_htk = bytes(sentence_htk, 'ascii')
+ except UnicodeEncodeError:
+ print(sentence)
+ print(sentence_htk)
-lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
-for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
- lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
-# to_csv does not work with space seperator. therefore all tabs should manually be replaced.
-#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
-lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
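The scan above can be wrapped in a small generator; a sketch under the same assumptions (unmappable_sentences is a hypothetical helper, word2htk comes from fame_functions, and each FAME text line starts with an utterance id):

    def unmappable_sentences(script_file):
        # yield sentences that are still non-ASCII after character translation
        with open(script_file, 'rt', encoding='utf-8') as fin:
            for line in fin.read().split('\n'):
                sentence = ' '.join(line.split(' ')[1:])  # drop the utterance id
                sentence_htk = fame_functions.word2htk(sentence)
                try:
                    bytes(sentence_htk, 'ascii')
                except UnicodeEncodeError:
                    yield sentence, sentence_htk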
diff --git a/acoustic_model/phoneset/fame_asr.py b/acoustic_model/phoneset/fame_asr.py
index 8408646..a9f47a7 100644
--- a/acoustic_model/phoneset/fame_asr.py
+++ b/acoustic_model/phoneset/fame_asr.py
@@ -103,12 +103,22 @@ translation_key_asr2htk = {
}
phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
-## check
-#for i in phoneset_short:
-# try:
-# print("{0} --> {1}".format(i, i.encode("ascii")))
-# except UnicodeEncodeError:
-# print(">>> {}".format(i))
+#not_in_ascii = [
+# '\'',
+# 'â', 'ê', 'ô', 'û', 'č',
+# 'à', 'í', 'é', 'è', 'ú', 'ć',
+# 'ä', 'ë', 'ï', 'ö', 'ü'
+#]
+translation_key_word2htk = {
+ '\'': '\\\'',
+ 'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
+ 'à':'a2', 'è':'e2',
+ 'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
+ 'č':'c4',
+ 'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
+}
+#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
+
## the list of multi character phones.
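A quick sanity check on the new table (a sketch, not part of the repo): every replacement should itself be ASCII, and no two characters should map to the same string, otherwise word2htk output becomes ambiguous:

    values = list(translation_key_word2htk.values())
    for v in values:
        v.encode('ascii')  # raises UnicodeEncodeError if a mapping is still non-ASCII
    assert len(values) == len(set(values)), 'two characters share a replacement'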
diff --git a/acoustic_model/test.txt b/acoustic_model/test.txt
new file mode 100644
index 0000000..e69de29