Compare commits

...

9 Commits

16 changed files with 1340 additions and 641 deletions

Binary file not shown.


@@ -4,7 +4,7 @@
 <SchemaVersion>2.0</SchemaVersion>
 <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
 <ProjectHome>.</ProjectHome>
-<StartupFile>fame_hmm.py</StartupFile>
+<StartupFile>check_novoapi.py</StartupFile>
 <SearchPath>
 </SearchPath>
 <WorkingDirectory>.</WorkingDirectory>
@@ -51,6 +51,9 @@
 <Compile Include="fame_hmm.py" />
 <Compile Include="phoneset\fame_asr.py" />
 <Compile Include="phoneset\fame_ipa.py" />
+<Compile Include="phoneset\fame_phonetics.py">
+  <SubType>Code</SubType>
+</Compile>
 <Compile Include="stimmen_functions.py" />
 <Compile Include="stimmen_test.py" />
 </ItemGroup>


@@ -29,48 +29,47 @@ forced_alignment_novo70 = True
## ===== load novo phoneset ===== ## ===== load novo phoneset =====
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_phonset() phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_novo70_phoneset()
## ===== extract pronunciations written in novo70 only (not_in_novo70) ===== ## ===== extract pronunciations written in novo70 only (not_in_novo70) =====
# As per Nederlandse phoneset_aki.xlsx recieved from David
# [ɔː] oh / ohr
# [ɪː] ih / ihr
# [iː] iy
# [œː] uh
# [ɛː] eh
# [w] wv in IPA written as ʋ.
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
## read pronunciation variants. ## read pronunciation variants.
stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx) #stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription_, 'frequency') #df = pd.read_excel(stimmen_transcription_, 'frequency')
transcription_ipa = list(df['IPA']) #transcription_ipa = list(df['IPA'])
# transcription mistake?
transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
not_in_novo70 = [] stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
all_in_novo70 = [] df = stimmen_functions.load_transcriptions_novo70(stimmen_test_dir)
for ipa in transcription_ipa:
ipa = ipa.replace(':', 'ː')
ipa = convert_phone_set.split_ipa(ipa)
# list of phones not in novo70 phoneset.
not_in_novo70_ = [phone for phone in ipa
if not phone in phoneset_ipa and not phone in david_suggestion]
not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]
if len(not_in_novo70_) == 0: ## transcription mistake?
all_in_novo70.append(''.join(ipa)) #transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
#transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
#translation_key.get(phone, phone) #not_in_novo70 = []
not_in_novo70.extend(not_in_novo70_) #all_in_novo70 = []
not_in_novo70_list = list(set(not_in_novo70)) #for ipa in transcription_ipa:
# ipa = ipa.replace(':', 'ː')
# ipa = convert_phone_set.split_ipa(ipa)
# # list of phones not in novo70 phoneset.
# not_in_novo70_ = [phone for phone in ipa
# if not phone in phoneset_ipa and not phone in david_suggestion]
# not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
# not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
# not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]
# if len(not_in_novo70_) == 0:
# all_in_novo70.append(''.join(ipa))
# #translation_key.get(phone, phone)
# not_in_novo70.extend(not_in_novo70_)
#not_in_novo70_list = list(set(not_in_novo70))
## check which phones used in stimmen but not in novo70 ## check which phones used in stimmen but not in novo70
@@ -85,41 +84,43 @@ not_in_novo70_list = list(set(not_in_novo70))
# [ʊ] 'ʊ'(1) --> can be ʏ (uh)?? # [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
# [χ] --> can be x?? # [χ] --> can be x??
def search_phone_ipa(x, phone_list): #def search_phone_ipa(x, phone_list):
x_in_item = [] # x_in_item = []
for ipa in phone_list: # for ipa in phone_list:
ipa_original = ipa # ipa_original = ipa
ipa = ipa.replace(':', 'ː') # ipa = ipa.replace(':', 'ː')
ipa = convert_phone_set.split_ipa(ipa) # ipa = convert_phone_set.split_ipa(ipa)
if x in ipa and not x+':' in ipa: # if x in ipa and not x+':' in ipa:
x_in_item.append(ipa_original) # x_in_item.append(ipa_original)
return x_in_item # return x_in_item
#search_phone_ipa('ø', transcription_ipa) #search_phone_ipa('ø', transcription_ipa)
## ===== load all transcriptions (df) ===== ## ===== load all transcriptions (df) =====
df = stimmen_functions.load_transcriptions() #df = stimmen_functions.load_transcriptions()
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list) word_list = sorted(word_list)
## check frequency of each pronunciation variants ## check frequency of each pronunciation variants
cols = ['word', 'ipa', 'frequency'] #cols = ['word', 'ipa', 'frequency']
df_samples = pd.DataFrame(index=[], columns=cols) #df_samples = pd.DataFrame(index=[], columns=cols)
for ipa in all_in_novo70: #for ipa in all_in_novo70:
ipa = ipa.replace('ː', ':') # ipa = ipa.replace('ː', ':')
samples = df[df['ipa'] == ipa] # samples = df[df['ipa'] == ipa]
word = list(set(samples['word']))[0] # word = list(set(samples['word']))[0]
samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns) # samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns)
df_samples = df_samples.append(samples_Series, ignore_index=True) # df_samples = df_samples.append(samples_Series, ignore_index=True)
# each word # each word
df_per_word = pd.DataFrame(index=[], columns=df_samples.keys()) #df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
for word in word_list: #for word in word_list:
df_samples_ = df_samples[df_samples['word']==word] word = word_list[2]
df_samples_ = df_samples_[df_samples_['frequency']>2] df_ = df[df['word']==word]
df_per_word = df_per_word.append(df_samples_, ignore_index=True) np.unique(list(df_['ipa']))
#df_samples_ = df_samples_[df_samples_['frequency']>2]
#df_per_word = df_per_word.append(df_samples_, ignore_index=True)
#df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8") #df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")


@@ -14,18 +14,20 @@ def multi_character_tokenize(line, multi_character_tokens):
         line = line[1:]


-def split_word(word, multi_character_phones):
+def split_word(word, phoneset):
     """
     split a line by given phoneset.

     Args:
         word (str): a word written in given phoneset.
-        multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_ipa.py.
+        #multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_ipa.py.
+        phoneset (list): the list of phones.

     Returns:
         (word_seperated) (list): the word splitted in given phoneset.

     """
+    multi_character_phones = extract_multi_character_phones(phoneset)
     return [phone
            for phone in multi_character_tokenize(word.strip(), multi_character_phones)
            ]
@@ -38,3 +40,19 @@ def convert_phoneset(word_list, translation_key):
         translation_key (dict):

     """
     return [translation_key.get(phone, phone) for phone in word_list]
+
+
+def phone_reduction(phones, reduction_key):
+    multi_character_tokenize(wo.strip(), multi_character_phones)
+    return [reduction_key.get(i, i) for i in phones
+            if not i in phones_to_be_removed]
+
+
+def extract_multi_character_phones(phoneset):
+    """
+    Args:
+        phoneset (list):
+    """
+    multi_character_phones = [i for i in phoneset if len(i) > 1]
+    multi_character_phones.sort(key=len, reverse=True)
+    return multi_character_phones
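The new split_word delegates to multi_character_tokenize, which has to match multi-character phones (the ones returned by extract_multi_character_phones, sorted longest first) before single characters. A minimal sketch of that greedy longest-match behaviour, assuming longer phones always take priority; split_word_sketch and the example phones are illustrative only, not the repository code:

def split_word_sketch(word, phoneset):
    # sort multi-character phones longest first so e.g. 'oa' wins over 'o' + 'a'.
    multi_character_phones = sorted(
        [p for p in phoneset if len(p) > 1], key=len, reverse=True)
    phones = []
    i = 0
    word = word.strip()
    while i < len(word):
        for p in multi_character_phones:
            if word.startswith(p, i):
                phones.append(p)
                i += len(p)
                break
        else:
            # no multi-character phone starts here; fall back to one character.
            phones.append(word[i])
            i += 1
    return phones

#print(split_word_sketch('oant', ['a', 'n', 'o', 'oa', 't']))  # ['oa', 'n', 't']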


@@ -17,6 +17,7 @@ novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
 rug_dir = r'c:\OneDrive\Research\rug'
 experiments_dir = os.path.join(rug_dir, 'experiments')
 htk_dir = os.path.join(experiments_dir, 'acoustic_model', 'fame', 'htk')
+kaldi_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', '_stimmen')
 stimmen_dir = os.path.join(experiments_dir, 'stimmen')

 # data


@@ -12,6 +12,10 @@ import defaultfiles as default
 import convert_phoneset
 from phoneset import fame_ipa, fame_asr

+sys.path.append(default.toolbox_dir)
+from htk import pyhtk
+
+
 #def read_fileFA(fileFA):
 #    """
 #    read the result file of HTK forced alignment.
@@ -321,9 +325,11 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
     lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')


-def fix_single_quote(lexicon_file):
-    """ add '\' before all single quote at the beginning of words.
-    convert special characters to ascii compatible characters.
+def fix_lexicon(lexicon_file):
+    """ fix lexicon
+    - add '\' before all single quote at the beginning of words.
+    - convert special characters to ascii compatible characters.
+    - add silence.

     Args:
         lexicon_file (path): lexicon file, which will be overwitten.
@@ -331,8 +337,15 @@ def fix_single_quote(lexicon_file):
     """
     lex = load_lexicon(lexicon_file)
     lex = lex.dropna() # remove N/A.
+
+    # add 'sil'
+    row = pd.Series(['SILENCE', 'sil'], index=lex.columns)
+    lex = lex.append(row, ignore_index=True)
+    lex = lex.sort_values(by='word', ascending=True)
+
     for i in lex[lex['word'].str.startswith('\'')].index.values:
         lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
+
     # to_csv does not work with space seperator. therefore all tabs should manually be replaced.
     #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
     lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
@@ -346,19 +359,48 @@ def word2htk(word):
 def ipa2asr(ipa):
     curr_dir = os.path.dirname(os.path.abspath(__file__))
     translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)

+    #ipa_ = fame_asr.phone_reduction(ipa)
     ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
     ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
     asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
+    asr_splitted = fame_asr.phone_reduction(asr_splitted)
     return ''.join(asr_splitted)


 def ipa2htk(ipa):
     curr_dir = os.path.dirname(os.path.abspath(__file__))
     translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
+    #translation_key_ipa2asr = np.load(r'c:\Users\Aki\source\repos\acoustic_model\acoustic_model\phoneset\fame_ipa2asr.npy').item(0)

     ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
     ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
     asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
+    asr_splitted = fame_asr.phone_reduction(asr_splitted)
     htk_splitted = convert_phoneset.convert_phoneset(asr_splitted, fame_asr.translation_key_asr2htk)
     return ''.join(htk_splitted)
+
+
+def performance_on_stimmen(config_dir, stimmen_dir, hmmdefs):
+    lattice_file = os.path.join(stimmen_dir, 'word_lattice.ltc')
+    hvite_scp = os.path.join(stimmen_dir, 'hvite.scp')
+    #fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hvite_scp, file_type='mfc')
+
+    hresult_scp = os.path.join(stimmen_dir, 'hresult.scp')
+    #fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hresult_scp, file_type='rec')
+
+    lexicon_file = os.path.join(stimmen_dir, 'lexicon_recognition.dic')
+
+    # get feature_size from hmmdefs.
+    with open(hmmdefs) as f:
+        line = f.readline()
+        line = f.readline().strip()
+        feature_size = int(line.split(' ')[2])
+
+    chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_file, feature_size)
+
+    result = chtk.recognition(
+        lattice_file,
+        hmmdefs,
+        hvite_scp
+        )
+
+    per_sentence, per_word = chtk.calc_recognition_performance(hresult_scp)
+    return per_sentence['accuracy']
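performance_on_stimmen returns per_sentence['accuracy'] from the pyhtk wrapper, presumably computed HResults-style from the recognition output. For reference, HTK word accuracy is derived from the hit/deletion/substitution/insertion counts; a small sketch of that formula (the function name and the numbers are illustrative, not part of pyhtk):

def word_accuracy(n_ref, n_del, n_sub, n_ins):
    # HTK HResults convention: Acc = (N - D - S - I) / N * 100,
    # where N is the number of reference labels.
    return 100.0 * (n_ref - n_del - n_sub - n_ins) / n_ref

#print(word_accuracy(n_ref=460, n_del=5, n_sub=30, n_ins=10))  # about 90.2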


@@ -11,73 +11,87 @@ import numpy as np
import pandas as pd import pandas as pd
import fame_functions import fame_functions
from phoneset import fame_ipa, fame_asr from phoneset import fame_ipa, fame_asr, fame_phonetics
import defaultfiles as default import defaultfiles as default
sys.path.append(default.toolbox_dir) sys.path.append(default.toolbox_dir)
import file_handling as fh import file_handling as fh
from htk import pyhtk from htk import pyhtk
#from scripts import run_command
## ======================= user define ======================= ## ======================= user define =======================
# procedure # procedure
combine_all = 1
make_lexicon = 0 make_lexicon = 0
make_label = 0 # it takes roughly 4800 sec on Surface pro 2. make_label = 0 # it takes roughly 4800 sec on Surface pro 2.
make_htk_files = 0 make_mlf = 0
extract_features = 0 extract_features = 0
flat_start = 0 flat_start = 1
train_model_without_sp = 0 train_monophone_without_sp = 1
add_sp = 0 add_sp = 1
train_model_with_sp = 1 train_monophone_with_re_aligned_mlf = 1
increase_mixture = 1
train_triphone = 0
train_triphone_tied = 0
# pre-defined values. # pre-defined values.
dataset_list = ['devel', 'test', 'train'] dataset_list = ['devel', 'test', 'train']
hmmdefs_name = 'hmmdefs' feature_size = 30
proto_name = 'proto39' improvement_threshold = 0.3
lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr') lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov') lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
config_dir = os.path.join(default.htk_dir, 'config') config_dir = os.path.join(default.htk_dir, 'config')
config_hcopy = os.path.join(config_dir, 'config.HCopy') phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt')
config_train = os.path.join(config_dir, 'config.train') tree_hed = os.path.join(config_dir, 'tree.hed')
global_ded = os.path.join(config_dir, 'global.ded') quests_hed = os.path.join(config_dir, 'quests.hed')
mkphones_led = os.path.join(config_dir, 'mkphones.led')
sil_hed = os.path.join(config_dir, 'sil.hed')
prototype = os.path.join(config_dir, proto_name)
model_dir = os.path.join(default.htk_dir, 'model')
model_dir = os.path.join(default.htk_dir, 'model')
model_mono0_dir = os.path.join(model_dir, 'mono0')
model_mono1_dir = os.path.join(model_dir, 'mono1')
model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
model_tri1_dir = os.path.join(model_dir, 'tri1')
model_tri1tied_dir = os.path.join(model_dir, 'tri1tied')
# directories / files to be made. # directories / files to be made.
lexicon_dir = os.path.join(default.htk_dir, 'lexicon') lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr') lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov') lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk') lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
phonelist_txt = os.path.join(config_dir, 'phonelist.txt') lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk')
model0_dir = os.path.join(model_dir, 'hmm0')
model1_dir = os.path.join(model_dir, 'hmm1')
feature_dir = os.path.join(default.htk_dir, 'mfc') feature_dir = os.path.join(default.htk_dir, 'mfc')
if not os.path.exists(feature_dir): fh.make_new_directory(feature_dir, existing_dir='leave')
os.makedirs(feature_dir)
tmp_dir = os.path.join(default.htk_dir, 'tmp') tmp_dir = os.path.join(default.htk_dir, 'tmp')
if not os.path.exists(tmp_dir): fh.make_new_directory(tmp_dir, existing_dir='leave')
os.makedirs(tmp_dir)
label_dir = os.path.join(default.htk_dir, 'label') label_dir = os.path.join(default.htk_dir, 'label')
if not os.path.exists(label_dir): fh.make_new_directory(label_dir, existing_dir='leave')
os.makedirs(label_dir)
## training ## training
hcompv_scp_train = os.path.join(tmp_dir, 'train.scp') if combine_all:
mlf_file_train = os.path.join(label_dir, 'train_phone.mlf') hcompv_scp_train = os.path.join(tmp_dir, 'all.scp')
mlf_file_train = os.path.join(label_dir, 'all_phone.mlf')
mlf_file_train_word = os.path.join(label_dir, 'all_word.mlf')
mlf_file_train_with_sp = os.path.join(label_dir, 'all_phone_with_sp.mlf')
mlf_file_train_aligned = os.path.join(label_dir, 'all_phone_aligned.mlf')
triphone_mlf = os.path.join(label_dir, 'all_triphone.mlf')
else:
hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
mlf_file_train_word = os.path.join(label_dir, 'train_word.mlf')
mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
triphone_mlf = os.path.join(label_dir, 'train_triphone.mlf')
hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')
## train without sp ## testing
niter_max = 10 htk_stimmen_dir = os.path.join(default.htk_dir, 'stimmen')
## ======================= make lexicon for HTK ======================= ## ======================= make lexicon for HTK =======================
@@ -96,19 +110,29 @@ if make_lexicon:
# therefore there is no overlap between lex_asr and lex_oov. # therefore there is no overlap between lex_asr and lex_oov.
fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk) fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
## ======================= ## fixing the lexicon for HTK.
## manually make changes to the pronunciation dictionary and save it as lex.htk
## =======================
# (1) Replace all tabs with single space; # (1) Replace all tabs with single space;
# (2) Put a '\' before any dictionary entry beginning with single quote # (2) Put a '\' before any dictionary entry beginning with single quote
#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html # http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
fame_functions.fix_single_quote(lexicon_htk) print('>>> fixing the lexicon...')
fame_functions.fix_lexicon(lexicon_htk)
## adding sp to the lexicon for HTK.
print('>>> adding sp to the lexicon...')
with open(lexicon_htk) as f:
lines = f.read().split('\n')
with open(lexicon_htk_with_sp, 'wb') as f:
f.write(bytes(' sp\n'.join(lines), 'ascii'))
print("elapsed time: {}".format(time.time() - timer_start)) print("elapsed time: {}".format(time.time() - timer_start))
## intialize the instance for HTK.
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk_with_sp, feature_size)
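In the lexicon step above, ' sp\n'.join(lines) appends the short-pause phone sp to every pronunciation before the file is rewritten as lex_with_sp.htk. A tiny illustration of that transformation, using made-up placeholder entries:

lines = 'KIK\tk i k\nPOP\tp o p\n'.split('\n')   # ['KIK\tk i k', 'POP\tp o p', '']
print(' sp\n'.join(lines))
# prints:
# KIK	k i k sp
# POP	p o p sp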
## ======================= make label files ======================= ## ======================= make label files =======================
if make_label: if make_label:
# train_2002_gongfansaken_10347.lab is empty. should be removed.
for dataset in dataset_list: for dataset in dataset_list:
timer_start = time.time() timer_start = time.time()
print("==== making label files on dataset {}".format(dataset)) print("==== making label files on dataset {}".format(dataset))
@@ -117,7 +141,7 @@ if make_label:
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset) wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
label_dir_ = os.path.join(label_dir, dataset) label_dir_ = os.path.join(label_dir, dataset)
dictionary_file = os.path.join(label_dir_, 'temp.dic') dictionary_file = os.path.join(label_dir_, 'temp.dic')
fh.make_new_directory(label_dir_) fh.make_new_directory(label_dir_, existing_dir='leave')
# list of scripts # list of scripts
with open(script_list, "rt", encoding="utf-8") as fin: with open(script_list, "rt", encoding="utf-8") as fin:
@@ -132,56 +156,50 @@ if make_label:
sentence_htk = fame_functions.word2htk(sentence) sentence_htk = fame_functions.word2htk(sentence)
wav_file = os.path.join(wav_dir_, filename + '.wav') wav_file = os.path.join(wav_dir_, filename + '.wav')
if os.path.exists(wav_file) and pyhtk.can_be_ascii(sentence_htk) == 0: if os.path.exists(wav_file) and chtk.can_be_ascii(sentence_htk) == 0:
if pyhtk.create_dictionary_without_log( if chtk.get_number_of_missing_words(
sentence_htk, global_ded, dictionary_file, lexicon_htk) == 0: sentence_htk, dictionary_file) == 0:
# when the file name is too long, HDMan command does not work. # when the file name is too long, HDMan command does not work.
# therefore first temporary dictionary_file is made, then renamed. # therefore first temporary dictionary_file is made, then renamed.
shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic')) shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))
label_file = os.path.join(label_dir_, filename + '.lab') label_file = os.path.join(label_dir_, filename + '.lab')
pyhtk.create_label_file(sentence_htk, label_file) chtk.make_label_file(sentence_htk, label_file)
else: else:
os.remove(dictionary_file) os.remove(dictionary_file)
print("elapsed time: {}".format(time.time() - timer_start)) print("elapsed time: {}".format(time.time() - timer_start))
## ======================= make other required files ======================= ## ======================= make master label files =======================
if make_htk_files: if make_mlf:
timer_start = time.time() timer_start = time.time()
print("==== making files required for HTK ====") print("==== making master label files ====")
print(">>> making a phonelist...") # train_2002_gongfansaken_10347.lab is empty. should be removed.
pyhtk.create_phonelist_file(fame_asr.phoneset_htk, phonelist_txt) empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab')
empty_dic_file = empty_lab_file.replace('.lab', '.dic')
if os.path.exists(empty_lab_file):
os.remove(empty_lab_file)
if os.path.exists(empty_dic_file):
os.remove(empty_dic_file)
for dataset in dataset_list: for dataset in dataset_list:
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
feature_dir_ = os.path.join(feature_dir, dataset) feature_dir_ = os.path.join(feature_dir, dataset)
label_dir_ = os.path.join(label_dir, dataset) label_dir_ = os.path.join(label_dir, dataset)
mlf_word = os.path.join(label_dir, dataset + '_word.mlf') mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf') mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')
#print(">>> making a script file for {}...".format(dataset)) print(">>> generating a word level mlf file for {}...".format(dataset))
#listdir = glob.glob(os.path.join(wav_dir_, '*.dic')) chtk.label2mlf(label_dir_, mlf_word)
#mfc_list = [filename.replace(wav_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
#hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
#with open(hcompv_scp, 'wb') as f:
# f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
print(">>> making a mlf file for {}...".format(dataset)) print(">>> generating a phone level mlf file for {}...".format(dataset))
lab_list = glob.glob(os.path.join(label_dir_, '*.lab')) chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
with open(mlf_word, 'wb') as fmlf: chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)
fmlf.write(bytes('#!MLF!#\n', 'ascii'))
for label_file in lab_list: print("elapsed time: {}".format(time.time() - timer_start))
filename = os.path.basename(label_file)
fmlf.write(bytes('\"*/{}\"\n'.format(filename), 'ascii'))
with open(label_file) as flab:
lines = flab.read()
fmlf.write(bytes(lines + '.\n', 'ascii'))
print(">>> generating phone level transcription for {}...".format(dataset))
pyhtk.mlf_word2phone(lexicon_htk, mlf_phone, mlf_word, mkphones_led)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= extract features ======================= ## ======================= extract features =======================
@@ -190,10 +208,10 @@ if extract_features:
timer_start = time.time() timer_start = time.time()
print('==== extract features on dataset {} ===='.format(dataset)) print('==== extract features on dataset {} ===='.format(dataset))
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset) wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
label_dir_ = os.path.join(label_dir, dataset) label_dir_ = os.path.join(label_dir, dataset)
feature_dir_ = os.path.join(feature_dir, dataset) feature_dir_ = os.path.join(feature_dir, dataset)
fh.make_new_directory(feature_dir_) fh.make_new_directory(feature_dir_, existing_dir='delete')
# a script file for HCopy # a script file for HCopy
print(">>> making a script file for HCopy...") print(">>> making a script file for HCopy...")
@@ -209,12 +227,15 @@ if extract_features:
os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t' os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc')) + os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
for lab_file in lab_list] for lab_file in lab_list]
#if os.path.exists(empty_mfc_file):
# os.remove(empty_mfc_file)
with open(hcopy_scp.name, 'wb') as f: with open(hcopy_scp.name, 'wb') as f:
f.write(bytes('\n'.join(feature_list), 'ascii')) f.write(bytes('\n'.join(feature_list), 'ascii'))
# extract features. # extract features.
print(">>> extracting features on {}...".format(dataset)) print(">>> extracting features on {}...".format(dataset))
pyhtk.wav2mfc(config_hcopy, hcopy_scp.name) chtk.wav2mfc(hcopy_scp.name)
os.remove(hcopy_scp.name) os.remove(hcopy_scp.name)
# make hcompv.scp. # make hcompv.scp.
@@ -225,117 +246,321 @@ if extract_features:
with open(hcompv_scp, 'wb') as f: with open(hcompv_scp, 'wb') as f:
f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii')) f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
print(">>> extracting features on stimmen...")
chtk.wav2mfc(os.path.join(htk_stimmen_dir, 'hcopy.scp'))
print("elapsed time: {}".format(time.time() - timer_start)) print("elapsed time: {}".format(time.time() - timer_start))
## ======================= flat start monophones =======================
if combine_all:
# script files.
fh.concatenate(
os.path.join(tmp_dir, 'devel.scp'),
os.path.join(tmp_dir, 'test.scp'),
hcompv_scp_train
)
fh.concatenate(
hcompv_scp_train,
os.path.join(tmp_dir, 'train.scp'),
hcompv_scp_train
)
# phone level mlfs.
fh.concatenate(
os.path.join(label_dir, 'devel_phone.mlf'),
os.path.join(label_dir, 'test_phone.mlf'),
mlf_file_train
)
fh.concatenate(
mlf_file_train,
os.path.join(label_dir, 'train_phone.mlf'),
mlf_file_train
)
# phone level mlfs with sp.
fh.concatenate(
os.path.join(label_dir, 'devel_phone_with_sp.mlf'),
os.path.join(label_dir, 'test_phone_with_sp.mlf'),
mlf_file_train_with_sp
)
fh.concatenate(
mlf_file_train_with_sp,
os.path.join(label_dir, 'train_phone_with_sp.mlf'),
mlf_file_train_with_sp
)
# word level mlfs.
fh.concatenate(
os.path.join(label_dir, 'devel_word.mlf'),
os.path.join(label_dir, 'test_word.mlf'),
mlf_file_train_word
)
fh.concatenate(
mlf_file_train_word,
os.path.join(label_dir, 'train_word.mlf'),
mlf_file_train_word
)
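The combine_all branch builds the 'all' script and label files by repeatedly calling fh.concatenate(file1, file2, output), sometimes with output equal to file1 to append in place. fh.concatenate lives in the author's toolbox and is not shown in this diff; a minimal stand-in with the same call shape might look like the sketch below (reading both inputs before writing so in-place appends work; the real helper may additionally handle details such as duplicate #!MLF!# headers):

def concatenate(file1, file2, output):
    # read both inputs completely first, so output may safely be file1 itself.
    with open(file1, encoding='utf-8') as f:
        text1 = f.read()
    with open(file2, encoding='utf-8') as f:
        text2 = f.read()
    with open(output, 'w', encoding='utf-8') as f:
        f.write(text1 + text2)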
## ======================= flat start monophones ======================= ## ======================= flat start monophones =======================
if flat_start: if flat_start:
timer_start = time.time() timer_start = time.time()
print('==== flat start ====') print('==== flat start ====')
pyhtk.flat_start(config_train, hcompv_scp_train, model0_dir, prototype) fh.make_new_directory(model_mono0_dir, existing_dir='leave')
chtk.flat_start(hcompv_scp_train, model_mono0_dir)
# make macros.
vFloors = os.path.join(model_mono0_dir, 'vFloors')
if os.path.exists(vFloors):
chtk.make_macros(vFloors)
# allocate mean & variance to all phones in the phone list # allocate mean & variance to all phones in the phone list
print('>>> allocating mean & variance to all phones in the phone list...') print('>>> allocating mean & variance to all phones in the phone list...')
pyhtk.create_hmmdefs( chtk.make_hmmdefs(model_mono0_dir)
os.path.join(model0_dir, proto_name),
os.path.join(model0_dir, 'hmmdefs'),
phonelist_txt)
# make macros
print('>>> making macros...')
with open(os.path.join(model0_dir, 'vFloors')) as f:
lines = f.read()
with open(os.path.join(model0_dir, 'macros'), 'wb') as f:
f.write(bytes('~o <MFCC_0_D_A> <VecSize> 39\n' + lines, 'ascii'))
print("elapsed time: {}".format(time.time() - timer_start)) print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train model without short pause ======================= ## ======================= train model without short pause =======================
if train_model_without_sp: if train_monophone_without_sp:
fh.make_new_directory(model1_dir) print('==== train monophone without sp ====')
timer_start = time.time()
niter = chtk.re_estimation_until_saturated(
model_mono1_dir,
model_mono0_dir, improvement_threshold, hcompv_scp_train,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
)
print('==== train model without sp ====') print("elapsed time: {}".format(time.time() - timer_start))
if not os.path.exists(os.path.join(model1_dir, 'iter0')):
shutil.copytree(model0_dir, os.path.join(model1_dir, 'iter0'))
for niter in range(1, niter_max):
timer_start = time.time()
hmm_n = 'iter' + str(niter)
hmm_n_pre = 'iter' + str(niter-1)
modeln_dir = os.path.join(model1_dir, hmm_n)
modeln_dir_pre = os.path.join(model1_dir, hmm_n_pre)
# re-estimation
fh.make_new_directory(modeln_dir)
pyhtk.re_estimation(
config_train,
os.path.join(modeln_dir_pre, 'macros'),
os.path.join(modeln_dir_pre, hmmdefs_name),
modeln_dir,
hcompv_scp_train, phonelist_txt,
mlf_file=mlf_file_train)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= adding sp to the model ======================= ## ======================= adding sp to the model =======================
if add_sp: if add_sp:
print('==== adding sp to the model ====') print('==== adding sp to the model ====')
# reference:
# http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
timer_start = time.time()
# make model with sp. # make model with sp.
print('>>> modifying the last model in the previous step...') print('>>> adding sp state to the last model in the previous step...')
modeln_dir_pre = os.path.join(model1_dir, 'iter'+str(niter_max-1)) fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
modeln_dir = modeln_dir_pre.replace('iter' + str(niter_max-1), 'iter' + str(niter_max)) niter = chtk.get_niter_max(model_mono1_dir)
fh.make_new_directory(modeln_dir) modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter))
shutil.copy( modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')
os.path.join(modeln_dir_pre, 'macros'),
os.path.join(modeln_dir, 'macros'))
shutil.copy(
os.path.join(modeln_dir_pre, hmmdefs_name),
os.path.join(modeln_dir, hmmdefs_name))
## ======================= chtk.add_sp(modeln_dir_pre, modeln_dir)
## manually make changes to modeln_dir/hmmdefs
## =======================
# add states 'sil'.
# http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
#shutil.copy(
# os.path.join(model_dir, 'hmmdefs.txt'),
# os.path.join(modeln_dir, hmmdefs_name))
#hmmdefs_file_pre = os.path.join(modeln_dir_pre, hmmdefs_name)
hmmdefs_file = os.path.join(modeln_dir, hmmdefs_name)
macros_file = os.path.join(modeln_dir, 'macros')
#with open(hmmdefs_file_pre) as f:
# lines = f.read()
#lines_ = lines.split('~h ')
#sil_model = [line for line in lines_ if line.split('\n')[0].replace('"', '') == 'sil'][0]
# update hmmdefs and macros. print('>>> re-estimation...')
print('>>> updating hmmdefs and macros...') niter = chtk.re_estimation_until_saturated(
modeln_dir_pre = modeln_dir model_mono1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train,
modeln_dir = modeln_dir.replace('iter' + str(niter_max), 'iter' + str(niter_max+1)) os.path.join(htk_stimmen_dir, 'mfc'),
fh.make_new_directory(modeln_dir) 'mfc',
pyhtk.include_sil_in_hmmdefs(macros_file, hmmdefs_file, modeln_dir, sil_hed, phonelist_txt) os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train_with_sp,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
model_type='monophone_with_sp'
)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train model with re-aligned mlf =======================
if train_monophone_with_re_aligned_mlf:
print('==== traina monophone with re-aligned mlf ====')
timer_start = time.time()
print('>>> re-aligning the training data... ')
niter = chtk.get_niter_max(model_mono1sp_dir)
modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter))
chtk.make_aligned_label(
os.path.join(modeln_dir, 'macros'),
os.path.join(modeln_dir, 'hmmdefs'),
mlf_file_train_aligned,
mlf_file_train_word,
hcompv_scp_train)
chtk.fix_mlf(mlf_file_train_aligned)
print('>>> updating the script file... ')
chtk.update_script_file(
mlf_file_train_aligned,
mlf_file_train_with_sp,
hcompv_scp_train,
hcompv_scp_train_updated)
print('>>> re-estimation... ')
timer_start = time.time()
fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
niter = chtk.get_niter_max(model_mono1sp_dir)
niter = chtk.re_estimation_until_saturated(
model_mono1sp2_dir,
os.path.join(model_mono1sp_dir, 'iter'+str(niter)),
improvement_threshold,
hcompv_scp_train_updated,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train_aligned,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
model_type='monophone_with_sp'
)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train model with short pause ======================= ## ======================= increase mixture =======================
if train_model_with_sp: if increase_mixture:
print('==== train model with sp ====') print('==== increase mixture ====')
#for niter in range(niter_max+1, niter_max*2+1): timer_start = time.time()
for niter in range(20, 50): for nmix in [2, 4, 8, 16]:
timer_start = time.time() if nmix == 2:
modeln_dir_ = model_mono1sp2_dir
else:
modeln_dir_ = os.path.join(model_dir, 'mono'+str(nmix_))
modeln_dir = os.path.join(model_dir, 'mono'+str(nmix))
print('mixture: {}'.format(nmix))
fh.make_new_directory(modeln_dir, existing_dir='delete')
niter = chtk.get_niter_max(modeln_dir_)
chtk.increase_mixture(
os.path.join(modeln_dir_, 'iter'+str(niter), 'hmmdefs'),
nmix,
os.path.join(modeln_dir, 'iter0'),
model_type='monophone_with_sp')
shutil.copy2(os.path.join(modeln_dir_, 'iter'+str(niter), 'macros'),
os.path.join(modeln_dir, 'iter0', 'macros'))
#improvement_threshold = -10
niter = chtk.re_estimation_until_saturated(
modeln_dir,
os.path.join(modeln_dir_, 'iter0'),
improvement_threshold,
hcompv_scp_train_updated,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train_aligned,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
model_type='monophone_with_sp'
)
nmix_ = nmix
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train triphone =======================
print('>>> making triphone list... ')
chtk.make_triphonelist(
mlf_file_train_aligned,
triphone_mlf)
if train_triphone:
print('==== train triphone model ====')
timer_start = time.time()
print('>>> init triphone model... ')
niter = chtk.get_niter_max(model_mono1sp2_dir)
fh.make_new_directory(os.path.join(model_tri1_dir, 'iter0'), existing_dir='leave')
chtk.init_triphone(
os.path.join(model_mono1sp2_dir, 'iter'+str(niter)),
os.path.join(model_tri1_dir, 'iter0')
)
print('>>> re-estimation... ')
## I wanted to train until satulated:
#niter = chtk.re_estimation_until_saturated(
# model_tri1_dir,
# os.path.join(model_tri1_dir, 'iter0'),
# improvement_threshold,
# hcompv_scp_train_updated,
# os.path.join(htk_stimmen_dir, 'mfc'),
# 'mfc',
# os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
# mlf_file=triphone_mlf,
# lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
# model_type='triphone'
# )
#
# but because the data size is limited, some triphone cannot be trained and received the error:
# ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
# therefore only two times re-estimation is performed.
output_dir = model_tri1_dir
for niter in range(1, 4):
hmm_n = 'iter' + str(niter) hmm_n = 'iter' + str(niter)
hmm_n_pre = 'iter' + str(niter-1) hmm_n_pre = 'iter' + str(niter-1)
modeln_dir = os.path.join(model1_dir, hmm_n) _modeln_dir = os.path.join(output_dir, hmm_n)
modeln_dir_pre = os.path.join(model1_dir, hmm_n_pre) _modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
# re-estimation fh.make_new_directory(_modeln_dir, 'leave')
fh.make_new_directory(modeln_dir) chtk.re_estimation(
pyhtk.re_estimation( os.path.join(_modeln_dir_pre, 'hmmdefs'),
config_train, _modeln_dir,
os.path.join(modeln_dir_pre, 'macros'), hcompv_scp_train_updated,
os.path.join(modeln_dir_pre, hmmdefs_name), mlf_file=triphone_mlf,
modeln_dir, macros=os.path.join(_modeln_dir_pre, 'macros'),
hcompv_scp_train, phonelist_txt, model_type='triphone')
mlf_file=mlf_file_train)
print("elapsed time: {}".format(time.time() - timer_start)) print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train tied-state triphones =======================
if train_triphone_tied:
print('==== train tied-state triphones ====')
timer_start = time.time()
print('>>> making lexicon for triphone... ')
chtk.make_lexicon_triphone(phonelist_full_txt, lexicon_htk_triphone)
chtk.combine_phonelists(phonelist_full_txt)
print('>>> making a tree header... ')
fame_phonetics.make_quests_hed(quests_hed)
stats = os.path.join(r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\model\tri1\iter3', 'stats')
chtk.make_tree_header(tree_hed, quests_hed, stats, config_dir)
print('>>> init triphone model... ')
niter = chtk.get_niter_max(model_tri1_dir)
fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')
chtk.init_triphone(
os.path.join(model_tri1_dir, 'iter'+str(niter)),
os.path.join(model_tri1tied_dir, 'iter0'),
tied=True)
# I wanted to train until satulated:
#niter = chtk.re_estimation_until_saturated(
# model_tri1tied_dir,
# os.path.join(model_tri1tied_dir, 'iter0'),
# improvement_threshold,
# hcompv_scp_train_updated,
# os.path.join(htk_stimmen_dir, 'mfc'),
# 'mfc',
# os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
# mlf_file=triphone_mlf,
# lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
# model_type='triphone'
# )
#
# but because the data size is limited, some triphone cannot be trained and received the error:
# ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
# therefore only 3 times re-estimation is performed.
output_dir = model_tri1tied_dir
for niter in range(1, 4):
hmm_n = 'iter' + str(niter)
hmm_n_pre = 'iter' + str(niter-1)
_modeln_dir = os.path.join(output_dir, hmm_n)
_modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
fh.make_new_directory(_modeln_dir, 'leave')
chtk.re_estimation(
os.path.join(_modeln_dir_pre, 'hmmdefs'),
_modeln_dir,
hcompv_scp_train_updated,
mlf_file=triphone_mlf,
macros=os.path.join(_modeln_dir_pre, 'macros'),
model_type='triphone')
print("elapsed time: {}".format(time.time() - timer_start))


@@ -109,30 +109,30 @@ np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
## check which letters are not coded in ascii. ## check which letters are not coded in ascii.
print('asr phones which cannot be coded in ascii:\n') #print('asr phones which cannot be coded in ascii:\n')
for i in fame_asr.phoneset_short: #for i in fame_asr.phoneset_short:
try: # try:
i_encoded = i.encode("ascii") # i_encoded = i.encode("ascii")
#print("{0} --> {1}".format(i, i.encode("ascii"))) # #print("{0} --> {1}".format(i, i.encode("ascii")))
except UnicodeEncodeError: # except UnicodeEncodeError:
print(">>> {}".format(i)) # print(">>> {}".format(i))
print("letters in the scripts which is not coded in ascii:\n") #print("letters in the scripts which is not coded in ascii:\n")
for dataset in ['train', 'devel', 'test']: #for dataset in ['train', 'devel', 'test']:
timer_start = time.time() # timer_start = time.time()
script_list = os.path.join(default.fame_dir, 'data', dataset, 'text') # script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
with open(script_list, "rt", encoding="utf-8") as fin: # with open(script_list, "rt", encoding="utf-8") as fin:
scripts = fin.read().split('\n') # scripts = fin.read().split('\n')
for line in scripts: # for line in scripts:
sentence = ' '.join(line.split(' ')[1:]) # sentence = ' '.join(line.split(' ')[1:])
sentence_htk = fame_functions.word2htk(sentence) # sentence_htk = fame_functions.word2htk(sentence)
#if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0: # #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
try: # try:
sentence_htk = bytes(sentence_htk, 'ascii') # sentence_htk = bytes(sentence_htk, 'ascii')
except UnicodeEncodeError: # except UnicodeEncodeError:
print(sentence) # print(sentence)
print(sentence_htk) # print(sentence_htk)


@@ -11,6 +11,7 @@ import glob
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from collections import Counter
#import matplotlib.pyplot as plt #import matplotlib.pyplot as plt
#from sklearn.metrics import confusion_matrix #from sklearn.metrics import confusion_matrix
@@ -50,11 +51,14 @@ from htk import pyhtk
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr') #lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk') #lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')
## procedure # procedure
make_dic_file = 0
make_HTK_files = 0
extract_features = 0
#make_htk_dict_files = 0 #make_htk_dict_files = 0
#do_forced_alignment_htk = 0 #do_forced_alignment_htk = 0
#eval_forced_alignment_htk = 0 #eval_forced_alignment_htk = 0
#make_kaldi_data_files = 0 make_kaldi_files = 0
#make_kaldi_lexicon_txt = 0 #make_kaldi_lexicon_txt = 0
#load_forced_alignment_kaldi = 1 #load_forced_alignment_kaldi = 1
#eval_forced_alignment_kaldi = 1 #eval_forced_alignment_kaldi = 1
@@ -66,13 +70,34 @@ from htk import pyhtk
#sys.path.append(os.path.join(default.repo_dir, 'toolbox')) #sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
#from evaluation import plot_confusion_matrix #from evaluation import plot_confusion_matrix
config_dir = os.path.join(default.htk_dir, 'config') ## HTK related files.
model_dir = os.path.join(default.htk_dir, 'model') config_dir = os.path.join(default.htk_dir, 'config')
lattice_file = os.path.join(config_dir, 'stimmen.ltc') model_dir = os.path.join(default.htk_dir, 'model')
#pyhtk.create_word_lattice_file( feature_dir = os.path.join(default.htk_dir, 'mfc', 'stimmen')
# os.path.join(config_dir, 'stimmen.net'),
# lattice_file) config_hcopy = os.path.join(config_dir, 'config.HCopy')
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test.scp')
# files to be made.
lattice_file = os.path.join(config_dir, 'stimmen.ltc')
phonelist_txt = os.path.join(config_dir, 'phonelist.txt')
stimmen_dic = os.path.join(default.htk_dir, 'lexicon', 'stimmen_recognition.dic')
hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hcopy.scp')
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hvite.scp')
hresult_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_result.scp')
## Kaldi related files.
kaldi_data_dir = os.path.join(default.kaldi_dir, 'data')
# files to be made.
wav_scp = os.path.join(kaldi_data_dir, 'test', 'wav.scp')
text_file = os.path.join(kaldi_data_dir, 'test', 'text')
utt2spk = os.path.join(kaldi_data_dir, 'test', 'utt2spk')
corpus_txt = os.path.join(kaldi_data_dir, 'local', 'corpus.txt')
lexicon_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'lexicon.txt')
nonsilence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'nonsilence_phones.txt')
silence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'silence_phones.txt')
optional_silence_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'optional_silence.txt')
## ======================= load test data ====================== ## ======================= load test data ======================
@@ -85,392 +110,478 @@ df = stimmen_functions.add_row_htk(df)
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)] word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list) word_list = sorted(word_list)
# pronunciation variants
## ======================= make dic file to check pronunciation variants ======================
# dic file should be manually modified depends on the task - recognition / forced-alignemnt.
if make_dic_file:
# for HTK.
with open(stimmen_dic, mode='wb') as f:
for word in word_list:
df_ = df[df['word']==word]
pronunciations = list(np.unique(df_['htk']))
pronunciations_ = [word.upper() + ' sil ' + ' '.join(convert_phoneset.split_word(
htk, fame_asr.multi_character_phones_htk)) + ' sil'
for htk in pronunciations]
f.write(bytes('\n'.join(pronunciations_) + '\n', 'ascii'))
f.write(bytes('SILENCE sil\n', 'ascii'))
# for Kaldi.
fh.make_new_directory(os.path.join(kaldi_data_dir, 'local', 'dict'))
with open(lexicon_txt, mode='wb') as f:
f.write(bytes('!SIL sil\n', 'utf-8'))
f.write(bytes('<UNK> spn\n', 'utf-8'))
for word in word_list:
df_ = df[df['word']==word]
pronunciations = list(np.unique(df_['asr']))
pronunciations_ = [word.lower() + ' ' + ' '.join(convert_phoneset.split_word(
asr, fame_asr.multi_character_phones))
for asr in pronunciations]
f.write(bytes('\n'.join(pronunciations_) + '\n', 'utf-8'))
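Based on the writes above, each pronunciation variant becomes one dictionary line: upper-case word with sil-padded HTK phones for stimmen_recognition.dic (plus a final SILENCE sil entry), and lower-case word with space-separated ASR phones for the Kaldi lexicon.txt. For example (word and phones are placeholders):

# stimmen_recognition.dic (HTK) : WORD sil w oa t s sil
# lexicon.txt (Kaldi)           : word w oa t s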
## ======================= test data for recognition ======================
# only target pronunciation variants.
df_rec = pd.DataFrame(index=[], columns=list(df.keys()))
for word in word_list: for word in word_list:
df_ = df[df['word']==word] variants = [htk.replace(' ', '')
print('{0} has {1} variants'.format(word, len(np.unique(df_['htk']))) for htk in stimmen_functions.load_pronunciations(word.upper(), stimmen_dic)]
df_ = df[df['word'] == word]
for index, row in df_.iterrows():
if row['htk'] in variants:
df_rec = df_rec.append(row, ignore_index=True)
#fh.make_filelist(stimmen_test_dir, hvite_scp, file_type='wav')
#output = pyhtk.recognition( ## ======================= make files required for HTK ======================
# os.path.join(default.htk_dir, 'config', 'config.rec', if make_HTK_files:
# lattice_file, # make a word lattice file.
# os.path.join(model_dir, 'hmm1', 'iter13'), pyhtk.create_word_lattice_file(
# dictionary_file, os.path.join(config_dir, 'stimmen.net'),
# os.path.join(config_dir, 'phonelist.txt'), lattice_file)
# hvite_scp)
#pyhtk.create_label_file( # extract features.
# row['word'], with open(hcopy_scp, 'wb') as f:
# os.path.join(stimmen_test_dir, filename.replace('.wav', '.lab'))) filelist = [os.path.join(stimmen_test_dir, filename) + '\t'
+ os.path.join(feature_dir, os.path.basename(filename).replace('.wav', '.mfc'))
for filename in df['filename']]
f.write(bytes('\n'.join(filelist), 'ascii'))
pyhtk.wav2mfc(config_hcopy, hcopy_scp)
## ======================= make a HTK dic file ====================== # make label files.
#if make_htk_dic_file: for index, row in df.iterrows():
# output_type = 3 filename = row['filename'].replace('.wav', '.lab')
dictionary_txt = os.path.join(default.htk_dir, 'lexicon', 'stimmen.dic') label_file = os.path.join(feature_dir, filename)
#for word in word_list: with open(label_file, 'wb') as f:
word = word_list[2] label_string = 'SILENCE\n' + row['word'].upper() + '\nSILENCE\n'
# pronunciation variant of the target word. f.write(bytes(label_string, 'ascii'))
pronunciations = df_test['asr'][df_test['word'].str.match(word)]
## ======================= make files required for Kaldi =======================
if make_kaldi_files:
fh.make_new_directory(os.path.join(kaldi_data_dir, 'test'))
fh.make_new_directory(os.path.join(kaldi_data_dir, 'test', 'local'))
fh.make_new_directory(os.path.join(kaldi_data_dir, 'conf'))
# remove previous files.
if os.path.exists(wav_scp):
os.remove(wav_scp)
if os.path.exists(text_file):
os.remove(text_file)
if os.path.exists(utt2spk):
os.remove(utt2spk)
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
# make wav.scp, text, and utt2spk files.
for i, row in df_rec.iterrows():
filename = row['filename']
print('=== {0}: {1} ==='.format(i, filename))
wav_file = os.path.join(stimmen_test_dir, filename)
#if os.path.exists(wav_file):
speaker_id = 'speaker_' + str(i).zfill(4)
utterance_id = filename.replace('.wav', '')
utterance_id = utterance_id.replace(' ', '_')
utterance_id = speaker_id + '-' + utterance_id
# output
f_wav_scp.write('{0} {1}\n'.format(
utterance_id,
wav_file.replace('c:/', '/mnt/c/').replace('\\', '/'))) # convert path to unix format.
f_text_file.write('{0}\t{1}\n'.format(utterance_id, df_rec['word'][i].lower()))
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
f_wav_scp.close()
f_text_file.close()
f_utt2spk.close()
with open(corpus_txt, 'wb') as f:
f.write(bytes('\n'.join([word.lower() for word in word_list]) + '\n', 'utf-8'))
with open(nonsilence_phones_txt, 'wb') as f:
f.write(bytes('\n'.join(fame_asr.phoneset_short) + '\n', 'utf-8'))
with open(silence_phones_txt, 'wb') as f:
f.write(bytes('sil\nspn\n', 'utf-8'))
with open(optional_silence_txt, 'wb') as f:
f.write(bytes('sil\n', 'utf-8'))
with open(os.path.join(kaldi_data_dir, 'conf', 'decode.config'), 'wb') as f:
f.write(bytes('first_beam=10.0\n', 'utf-8'))
f.write(bytes('beam=13.0\n', 'utf-8'))
f.write(bytes('lattice_beam=6.0\n', 'utf-8'))
with open(os.path.join(kaldi_data_dir, 'conf', 'mfcc.conf'), 'wb') as f:
f.write(bytes('--use-energy=false', 'utf-8'))
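From the writes in this block, the Kaldi data files have the following line formats (the IDs themselves are built from the stimmen file names):

# wav.scp : <utterance_id> <wav path rewritten to the /mnt/c/... unix form>
# text    : <utterance_id><TAB><word, lower-cased>
# utt2spk : <utterance_id> <speaker_id>
# where <utterance_id> = <speaker_id> + '-' + <wav file name without extension>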
## ======================= recognition ======================
listdir = glob.glob(os.path.join(feature_dir, '*.mfc'))
with open(hvite_scp, 'wb') as f:
f.write(bytes('\n'.join(listdir), 'ascii'))
with open(hresult_scp, 'wb') as f:
f.write(bytes('\n'.join(listdir).replace('.mfc', '.rec'), 'ascii'))
# calculate result
performance = np.zeros((1, 2))
for niter in range(50, 60):
output = pyhtk.recognition(
os.path.join(config_dir, 'config.rec'),
lattice_file,
os.path.join(default.htk_dir, 'model', 'hmm1', 'iter' + str(niter), 'hmmdefs'),
stimmen_dic, phonelist_txt, hvite_scp)
output = pyhtk.calc_recognition_performance(
stimmen_dic, hresult_scp)
per_sentence, per_word = pyhtk.load_recognition_output_all(output)
performance_ = np.array([niter, per_sentence['accuracy']]).reshape(1, 2)
performance = np.r_[performance, performance_]
print('{0}: {1}[%]'.format(niter, per_sentence['accuracy']))
#output = run_command_with_output([
# 'HVite', '-T', '1',
# '-C', config_rec,
# '-w', lattice_file,
# '-H', hmm,
# dictionary_file, phonelist_txt,
# '-S', HVite_scp
#])
# make dic file.
#am_func.make_htk_dict(word, pronvar_, htk_dict_file, output_type)
## ======================= forced alignment using HTK ======================= ## ======================= forced alignment using HTK =======================
if do_forced_alignment_htk: if do_forced_alignment_htk:
#for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: #for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
for hmm_num in [256, 512, 1024]: for hmm_num in [256, 512, 1024]:
hmm_num_str = str(hmm_num) hmm_num_str = str(hmm_num)
acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs') acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')
predictions = pd.DataFrame({'filename': [''], predictions = pd.DataFrame({'filename': [''],
'word': [''], 'word': [''],
'xsampa': [''], 'xsampa': [''],
'ipa': [''], 'ipa': [''],
'famehtk': [''], 'famehtk': [''],
'prediction': ['']}) 'prediction': ['']})
for i, filename in enumerate(df['filename']): for i, filename in enumerate(df['filename']):
print('=== {0}/{1} ==='.format(i, len(df))) print('=== {0}/{1} ==='.format(i, len(df)))
if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)): if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)):
wav_file = os.path.join(wav_dir, filename) wav_file = os.path.join(wav_dir, filename)
if os.path.exists(wav_file): if os.path.exists(wav_file):
word = df['word'][i] word = df['word'][i]
WORD = word.upper() WORD = word.upper()
fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str) fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
#if not os.path.exists(fa_file): #if not os.path.exists(fa_file):
# make label file. # make label file.
label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab')) label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
with open(label_file, 'w') as f: with open(label_file, 'w') as f:
lines = f.write(WORD) lines = f.write(WORD)
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite,
default.phonelist, acoustic_model) default.phonelist, acoustic_model)
os.remove(label_file) os.remove(label_file)
prediction = am_func.read_fileFA(fa_file) prediction = am_func.read_fileFA(fa_file)
print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction)) print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
else: else:
prediction = '' prediction = ''
print('!!!!! file not found.') print('!!!!! file not found.')
line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i) line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i)
predictions = predictions.append(line) predictions = predictions.append(line)
else: else:
prediction = '' prediction = ''
print('!!!!! invalid entry.') print('!!!!! invalid entry.')
predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl'))
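Note that DataFrame.append is deprecated in newer pandas releases; a hedged sketch of an equivalent row accumulation, which could replace the pattern above without changing the stored result, is:
# sketch: collect rows in a plain list and build the DataFrame once at the end.
rows = []
# inside the loop, instead of predictions = predictions.append(line):
#     rows.append({'filename': df['filename'][i], 'word': df['word'][i], 'xsampa': df['xsampa'][i],
#                  'ipa': df['ipa'][i], 'famehtk': df['famehtk'][i], 'prediction': prediction})
predictions = pd.DataFrame(rows, columns=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'])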
## ======================= make files which are used for forced alignment by Kaldi =======================
if make_kaldi_data_files:
wav_scp = os.path.join(kaldi_data_dir, 'wav.scp')
text_file = os.path.join(kaldi_data_dir, 'text')
utt2spk = os.path.join(kaldi_data_dir, 'utt2spk')
# remove previous files.
if os.path.exists(wav_scp):
os.remove(wav_scp)
if os.path.exists(text_file):
os.remove(text_file)
if os.path.exists(utt2spk):
os.remove(utt2spk)
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
# make wav.scp, text, and utt2spk files.
for i in df.index:
filename = df['filename'][i]
print('=== {0}: {1} ==='.format(i, filename))
#if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)):
wav_file = os.path.join(wav_dir, filename)
if os.path.exists(wav_file):
speaker_id = 'speaker_' + str(i).zfill(4)
utterance_id = filename.replace('.wav', '')
utterance_id = utterance_id.replace(' ', '_')
utterance_id = speaker_id + '-' + utterance_id
# wav.scp file
wav_file_unix = wav_file.replace('\\', '/')
wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/')
f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix))
# text file
word = df['word'][i].lower()
f_text_file.write('{0}\t{1}\n'.format(utterance_id, word))
# utt2spk
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
f_wav_scp.close()
f_text_file.close()
f_utt2spk.close()
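Kaldi's data-preparation checks typically expect wav.scp, text and utt2spk to be sorted by utterance id; a small sketch (assuming the files fit in memory) that sorts the three files written above:
# sort the kaldi data files by utterance id (sketch).
for data_file in [wav_scp, text_file, utt2spk]:
    with open(data_file, 'r', encoding="utf-8") as f:
        sorted_lines = sorted(line for line in f.read().split('\n') if line != '')
    with open(data_file, 'w', encoding="utf-8", newline='\n') as f:
        f.write('\n'.join(sorted_lines) + '\n')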
## ======================= make lexicon txt which is used by Kaldi ======================= ## ======================= make lexicon txt which is used by Kaldi =======================
if make_kaldi_lexicon_txt: if make_kaldi_lexicon_txt:
option_num = 6 option_num = 6
# remove previous file. # remove previous file.
if os.path.exists(lexicon_txt): if os.path.exists(lexicon_txt):
os.remove(lexicon_txt) os.remove(lexicon_txt)
lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt') lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt')
if os.path.exists(lexiconp_txt): if os.path.exists(lexiconp_txt):
os.remove(lexiconp_txt) os.remove(lexiconp_txt)
# output lexicon.txt # output lexicon.txt
f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n') f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
pronvar_list_all = [] pronvar_list_all = []
for word in word_list: for word in word_list:
# pronunciation variant of the target word. # pronunciation variant of the target word.
pronunciation_variants = df['ipa'][df['word'].str.match(word)] pronunciation_variants = df['ipa'][df['word'].str.match(word)]
c = Counter(pronunciation_variants) c = Counter(pronunciation_variants)
total_num = sum(c.values()) total_num = sum(c.values())
#with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f: #with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f:
# for key in c.keys(): # for key in c.keys():
# f.write("{0},{1}\n".format(key,c[key])) # f.write("{0},{1}\n".format(key,c[key]))
for key, value in c.most_common(option_num): for key, value in c.most_common(option_num):
# make possible pronunciation variant list. # make possible pronunciation variant list.
pronvar_list = am_func.fame_pronunciation_variant(key) pronvar_list = am_func.fame_pronunciation_variant(key)
for pronvar_ in pronvar_list: for pronvar_ in pronvar_list:
split_ipa = convert_phone_set.split_fame_ipa(pronvar_) split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
pronvar_out = ' '.join(split_ipa) pronvar_out = ' '.join(split_ipa)
pronvar_list_all.append([word, pronvar_out]) pronvar_list_all.append([word, pronvar_out])
pronvar_list_all = np.array(pronvar_list_all) pronvar_list_all = np.array(pronvar_list_all)
pronvar_list_all = np.unique(pronvar_list_all, axis=0) pronvar_list_all = np.unique(pronvar_list_all, axis=0)
# output # output
f_lexicon_txt.write('<UNK>\tSPN\n') f_lexicon_txt.write('<UNK>\tSPN\n')
for line in pronvar_list_all: for line in pronvar_list_all:
f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1])) f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
f_lexicon_txt.close() f_lexicon_txt.close()
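Each lexicon.txt line written above is a word, a tab, and a space-separated pronunciation; a quick sketch that reads the file back into a word-to-variants dict can serve as a sanity check:
# read lexicon.txt back into {word: [pronunciation, ...]} (sketch).
lexicon = dict()
with open(lexicon_txt, 'r', encoding="utf-8") as f:
    for line in f.read().split('\n'):
        if '\t' in line:
            word_, pron = line.split('\t', 1)
            lexicon.setdefault(word_, []).append(pron)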
## ======================= load kaldi forced alignment result ======================= ## ======================= load kaldi forced alignment result =======================
if load_forced_alignment_kaldi: if load_forced_alignment_kaldi:
phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt') phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt')
merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt') merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt')
#filenames = np.load(data_dir + '\\filenames.npy') #filenames = np.load(data_dir + '\\filenames.npy')
#words = np.load(data_dir + '\\words.npy') #words = np.load(data_dir + '\\words.npy')
#pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy') #pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
#pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy') #pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
#word_list = np.unique(words) #word_list = np.unique(words)
# load the mapping between phones and ids. # load the mapping between phones and ids.
with open(phones_txt, 'r', encoding="utf-8") as f: with open(phones_txt, 'r', encoding="utf-8") as f:
mapping_phone2id = f.read().split('\n') mapping_phone2id = f.read().split('\n')
phones = [] phones = []
phone_ids = [] # ID of phones phone_ids = [] # ID of phones
for m in mapping_phone2id: for m in mapping_phone2id:
m = m.split(' ') m = m.split(' ')
if len(m) > 1: if len(m) > 1:
phones.append(m[0]) phones.append(m[0])
phone_ids.append(int(m[1])) phone_ids.append(int(m[1]))
# load the result of FA. # load the result of FA.
with open(merged_alignment_txt, 'r') as f: with open(merged_alignment_txt, 'r') as f:
lines = f.read() lines = f.read()
lines = lines.split('\n') lines = lines.split('\n')
predictions = pd.DataFrame({'filename': [''], predictions = pd.DataFrame({'filename': [''],
'word': [''], 'word': [''],
'xsampa': [''], 'xsampa': [''],
'ipa': [''], 'ipa': [''],
'famehtk': [''], 'famehtk': [''],
'prediction': ['']}) 'prediction': ['']})
#fa_filenames = [] #fa_filenames = []
#fa_pronunciations = [] #fa_pronunciations = []
utterance_id_ = '' utterance_id_ = ''
pronunciation = [] pronunciation = []
for line in lines: for line in lines:
line = line.split(' ') line = line.split(' ')
if len(line) == 5: if len(line) == 5:
utterance_id = line[0] utterance_id = line[0]
if utterance_id == utterance_id_: if utterance_id == utterance_id_:
phone_id = int(line[4]) phone_id = int(line[4])
#if not phone_id == 1: #if not phone_id == 1:
phone_ = phones[phone_ids.index(phone_id)] phone_ = phones[phone_ids.index(phone_id)]
phone = re.sub(r'_[A-Z]', '', phone_) phone = re.sub(r'_[A-Z]', '', phone_)
if not phone == 'SIL': if not phone == 'SIL':
pronunciation.append(phone) pronunciation.append(phone)
else: else:
filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_) filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_)
prediction = ''.join(pronunciation) prediction = ''.join(pronunciation)
df_ = df[df['filename'].str.match(filename)] df_ = df[df['filename'].str.match(filename)]
df_idx = df_.index[0] df_idx = df_.index[0]
prediction_ = pd.Series([#filename, prediction_ = pd.Series([#filename,
#df_['word'][df_idx], #df_['word'][df_idx],
#df_['xsampa'][df_idx], #df_['xsampa'][df_idx],
#df_['ipa'][df_idx], #df_['ipa'][df_idx],
#df_['famehtk'][df_idx], #df_['famehtk'][df_idx],
df_.iloc[0,1], df_.iloc[0,1],
df_.iloc[0,3], df_.iloc[0,3],
df_.iloc[0,4], df_.iloc[0,4],
df_.iloc[0,2], df_.iloc[0,2],
df_.iloc[0,0], df_.iloc[0,0],
prediction], prediction],
index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'],
name=df_idx) name=df_idx)
predictions = predictions.append(prediction_) predictions = predictions.append(prediction_)
#fa_filenames.append() #fa_filenames.append()
#fa_pronunciations.append(' '.join(pronunciation)) #fa_pronunciations.append(' '.join(pronunciation))
pronunciation = [] pronunciation = []
utterance_id_ = utterance_id utterance_id_ = utterance_id
predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl'))
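The lookup phones[phone_ids.index(phone_id)] in the loop above is a linear search per alignment line; a hedged sketch of the same mapping built once as a dict, which could be swapped in without changing the output:
# build an id -> phone dict once instead of calling list.index() per line (sketch).
id2phone = {int(m.split(' ')[1]): m.split(' ')[0]
            for m in mapping_phone2id if len(m.split(' ')) > 1}
# inside the loop: phone_ = id2phone[int(line[4])]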
## ======================= evaluate the result of forced alignment ======================= ## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment_htk: if eval_forced_alignment_htk:
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short') htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
compare_hmm_num = 1 compare_hmm_num = 1
if compare_hmm_num: if compare_hmm_num:
f_result = open(os.path.join(result_dir, 'result.csv'), 'w') f_result = open(os.path.join(result_dir, 'result.csv'), 'w')
f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n") f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n")
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
#for hmm_num in [256]: #for hmm_num in [256]:
hmm_num_str = str(hmm_num) hmm_num_str = str(hmm_num)
if compare_hmm_num: if compare_hmm_num:
f_result.write("{},".format(hmm_num_str)) f_result.write("{},".format(hmm_num_str))
#match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy') #match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
#prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy')) #prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy'))
#prediction = pd.Series(prediction, index=df.index, name='prediction') #prediction = pd.Series(prediction, index=df.index, name='prediction')
#result = pd.concat([df, prediction], axis=1) #result = pd.concat([df, prediction], axis=1)
result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl')) result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl'))
# load pronunciation variants # load pronunciation variants
for word in word_list: for word in word_list:
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic') htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
with open(htk_dict_file, 'r') as f: with open(htk_dict_file, 'r') as f:
lines = f.read().split('\n')[:-1] lines = f.read().split('\n')[:-1]
pronunciation_variants = [line.split('\t')[1] for line in lines] pronunciation_variants = [line.split('\t')[1] for line in lines]
# see only words which appear in the top 3. # see only words which appear in the top 3.
result_ = result[result['word'].str.match(word)] result_ = result[result['word'].str.match(word)]
result_ = result_[result_['famehtk'].isin(pronunciation_variants)] result_ = result_[result_['famehtk'].isin(pronunciation_variants)]
match_num = sum(result_['famehtk'] == result_['prediction']) match_num = sum(result_['famehtk'] == result_['prediction'])
total_num = len(result_) total_num = len(result_)
print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100)) print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100))
if compare_hmm_num: if compare_hmm_num:
f_result.write("{0},{1},".format(match_num, total_num)) f_result.write("{0},{1},".format(match_num, total_num))
else: else:
# output confusion matrix # output confusion matrix
cm = confusion_matrix(result_['famehtk'], result_['prediction']) cm = confusion_matrix(result_['famehtk'], result_['prediction'])
plt.figure() plt.figure()
plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False)
plt.savefig(result_dir + '\\cm_' + word + '.png') plt.savefig(result_dir + '\\cm_' + word + '.png')
if compare_hmm_num: if compare_hmm_num:
f_result.write('\n') f_result.write('\n')
if compare_hmm_num: if compare_hmm_num:
f_result.close() f_result.close()
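The per-word accuracy computed inside the loop above could also be factored into a small helper, sketched here so that both the csv branch and the confusion-matrix branch could reuse it:
# per-word match/total, restricted to the known pronunciation variants (sketch).
def word_accuracy(result, word, pronunciation_variants):
    result_ = result[result['word'].str.match(word)]
    result_ = result_[result_['famehtk'].isin(pronunciation_variants)]
    return sum(result_['famehtk'] == result_['prediction']), len(result_)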
## ======================= evaluate the result of forced alignment of kaldi ======================= ## ======================= evaluate the result of forced alignment of kaldi =======================
if eval_forced_alignment_kaldi: if eval_forced_alignment_kaldi:
result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl')) result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl'))
f_result = open(os.path.join(result_dir, 'result.csv'), 'w') f_result = open(os.path.join(result_dir, 'result.csv'), 'w')
f_result.write("word,total,valid,match,[%]\n") f_result.write("word,total,valid,match,[%]\n")
# load pronunciation variants # load pronunciation variants
with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f: with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f:
lines = f.read().split('\n')[:-1] lines = f.read().split('\n')[:-1]
pronunciation_variants_all = [line.split('\t') for line in lines] pronunciation_variants_all = [line.split('\t') for line in lines]
word_list = np.delete(word_list, [0], 0) # remove 'Oog' word_list = np.delete(word_list, [0], 0) # remove 'Oog'
for word in word_list: for word in word_list:
# load pronunciation variant of the word. # load pronunciation variant of the word.
pronunciation_variants = [] pronunciation_variants = []
for line in pronunciation_variants_all: for line in pronunciation_variants_all:
if line[0] == word.lower(): if line[0] == word.lower():
pronunciation_variants.append(line[1].replace(' ', '')) pronunciation_variants.append(line[1].replace(' ', ''))
# see only words which appear in the top 3. # see only words which appear in the top 3.
result_ = result[result['word'].str.match(word)] result_ = result[result['word'].str.match(word)]
result_tolerant = pd.DataFrame({ result_tolerant = pd.DataFrame({
'filename': [''], 'filename': [''],
'word': [''], 'word': [''],
'xsampa': [''], 'xsampa': [''],
'ipa': [''], 'ipa': [''],
'prediction': [''], 'prediction': [''],
'match': ['']}) 'match': ['']})
for i in range(0, len(result_)): for i in range(0, len(result_)):
line = result_.iloc[i] line = result_.iloc[i]
# make a list of all possible pronunciation variants of ipa description. # make a list of all possible pronunciation variants of ipa description.
# i.e. possible answers from forced alignment. # i.e. possible answers from forced alignment.
ipa = line['ipa'] ipa = line['ipa']
pronvar_list = [ipa] pronvar_list = [ipa]
pronvar_list_ = am_func.fame_pronunciation_variant(ipa) pronvar_list_ = am_func.fame_pronunciation_variant(ipa)
if not pronvar_list_ is None: if not pronvar_list_ is None:
pronvar_list += list(pronvar_list_) pronvar_list += list(pronvar_list_)
# only focus on pronunciations which can be estimated from ipa. # only focus on pronunciations which can be estimated from ipa.
if len(set(pronvar_list) & set(pronunciation_variants)) > 0: if len(set(pronvar_list) & set(pronunciation_variants)) > 0:
if line['prediction'] in pronvar_list: if line['prediction'] in pronvar_list:
ismatch = True ismatch = True
else: else:
ismatch = False ismatch = False
line_df = pd.DataFrame(result_.iloc[i]).T line_df = pd.DataFrame(result_.iloc[i]).T
df_idx = line_df.index[0] df_idx = line_df.index[0]
result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'], result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'],
line_df.loc[df_idx, 'word'], line_df.loc[df_idx, 'word'],
line_df.loc[df_idx, 'xsampa'], line_df.loc[df_idx, 'xsampa'],
line_df.loc[df_idx, 'ipa'], line_df.loc[df_idx, 'ipa'],
line_df.loc[df_idx, 'prediction'], line_df.loc[df_idx, 'prediction'],
ismatch], ismatch],
index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'], index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'],
name=df_idx) name=df_idx)
result_tolerant = result_tolerant.append(result_tolerant_) result_tolerant = result_tolerant.append(result_tolerant_)
# remove the first entry (dummy) # remove the first entry (dummy)
result_tolerant = result_tolerant.drop(0, axis=0) result_tolerant = result_tolerant.drop(0, axis=0)
total_num = len(result_) total_num = len(result_)
valid_num = len(result_tolerant) valid_num = len(result_tolerant)
match_num = np.sum(result_tolerant['match']) match_num = np.sum(result_tolerant['match'])
print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num)) print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num))
f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100)) f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100))
f_result.close() f_result.close()
## output confusion matrix ## output confusion matrix
#cm = confusion_matrix(result_['ipa'], result_['prediction']) #cm = confusion_matrix(result_['ipa'], result_['prediction'])
#plt.figure() #plt.figure()
#plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False) #plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False)
#plt.savefig(result_dir + '\\cm_' + word + '.png') #plt.savefig(result_dir + '\\cm_' + word + '.png')

View File

@@ -1,20 +1,19 @@
## this script should be used only by Aki Kunikoshi. ## this script should be used only by Aki Kunikoshi.
import os
import numpy as np import numpy as np
import pandas as pd
import argparse import argparse
import json import json
from novoapi.backend import session from novoapi.backend import session
import os
#os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import defaultfiles as default import defaultfiles as default
import convert_phoneset
def load_phonset(): def load_novo70_phoneset():
translation_key_ipa2novo70 = dict()
translation_key_novo702ipa = dict()
#phonelist_novo70_ = pd.ExcelFile(default.phonelist_novo70_xlsx) #phonelist_novo70_ = pd.ExcelFile(default.phonelist_novo70_xlsx)
#df = pd.read_excel(phonelist_novo70_, 'list') #df = pd.read_excel(phonelist_novo70_, 'list')
## *_simple includes columns which have only one phone in them. ## *_simple includes columns which have only one phone in them.
@@ -23,21 +22,23 @@ def load_phonset():
# print('{0}:{1}'.format(ipa, novo70)) # print('{0}:{1}'.format(ipa, novo70))
# translation_key[ipa] = novo70 # translation_key[ipa] = novo70
#phonelist_novo70 = np.unique(list(df['novo70_simple'])) #phonelist_novo70 = np.unique(list(df['novo70_simple']))
novo70_phoneset = pd.read_csv(default.novo70_phoneset, delimiter='\t', header=None, encoding="utf-8")
novo70_phoneset.rename(columns={0: 'novo70', 1: 'ipa', 2: 'description'}, inplace=True)
phoneset_ipa = [] #phoneset_ipa = []
phoneset_novo70 = [] #phoneset_novo70 = []
with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin: #with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
lines = fin.read() # lines = fin.read()
lines = lines.split('\n') # lines = lines.split('\n')
for line in lines: # for line in lines:
words = line.split('\t') # words = line.split('\t')
if len(words) > 1: # if len(words) > 1:
novo70 = words[0] # novo70 = words[0]
ipa = words[1] # ipa = words[1]
phoneset_ipa.append(ipa) # phoneset_ipa.append(ipa)
phoneset_novo70.append(novo70) # phoneset_novo70.append(novo70)
translation_key_ipa2novo70[ipa] = novo70 # translation_key_ipa2novo70[ipa] = novo70
translation_key_novo702ipa[novo70] = ipa # translation_key_novo702ipa[novo70] = ipa
# As per Nederlandse phoneset_aki.xlsx received from David # As per Nederlandse phoneset_aki.xlsx received from David
# [ɔː] oh / ohr # from ipa->novo70, only oh is used. # [ɔː] oh / ohr # from ipa->novo70, only oh is used.
@@ -47,15 +48,26 @@ def load_phonset():
# [ɛː] eh # [ɛː] eh
# [w] wv in IPA written as ʋ. # [w] wv in IPA written as ʋ.
extra_ipa = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'ʋ'] extra_ipa = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'ʋ']
extra_novo70 = ['oh', 'ih', 'iy', 'uh', 'eh', 'wv'] extra_novo70 = ['oh', 'ih', 'iy', 'uh', 'eh', 'wv']
for ipa, novo70 in zip(extra_ipa, extra_novo70):
phoneset_ipa.append(ipa) phoneset_ipa = list(novo70_phoneset['ipa'])
phoneset_novo70.append(novo70) phoneset_ipa.extend(extra_ipa)
phoneset_ipa = [i.replace('ː', ':') for i in phoneset_ipa]
phoneset_novo70 = list(novo70_phoneset['novo70'])
phoneset_novo70.extend(extra_novo70)
phoneset_novo70 = [i.replace('ː', ':') for i in phoneset_novo70]
translation_key_ipa2novo70 = dict()
translation_key_novo702ipa = dict()
for ipa, novo70 in zip(phoneset_ipa, phoneset_novo70):
#phoneset_ipa.append(ipa)
#phoneset_novo70.append(novo70)
translation_key_ipa2novo70[ipa] = novo70 translation_key_ipa2novo70[ipa] = novo70
translation_key_novo702ipa[novo70] = ipa translation_key_novo702ipa[novo70] = ipa
translation_key_novo702ipa['ohr'] = 'ɔː' translation_key_novo702ipa['ohr'] = 'ɔ:'
translation_key_novo702ipa['ihr'] = 'ɪː' translation_key_novo702ipa['ihr'] = 'ɪ:'
phoneset_ipa = np.unique(phoneset_ipa) phoneset_ipa = np.unique(phoneset_ipa)
phoneset_novo70 = np.unique(phoneset_novo70) phoneset_novo70 = np.unique(phoneset_novo70)
@@ -63,25 +75,6 @@ def load_phonset():
return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa
def multi_character_tokenize(line, multi_character_tokens):
"""
Tries to match one of the tokens in multi_character_tokens at each position of line,
starting at position 0,
if so tokenizes and eats that token. Otherwise tokenizes a single character.
Copied from forced_alignment.convert_phone_set.py
"""
while line != '':
for token in multi_character_tokens:
if line.startswith(token) and len(token) > 0:
yield token
line = line[len(token):]
break
else:
yield line[:1]
line = line[1:]
def split_ipa(line): def split_ipa(line):
""" """
Split a line by IPA phones. Split a line by IPA phones.
@@ -89,13 +82,16 @@ def split_ipa(line):
:param string line: one line written in IPA. :param string line: one line written in IPA.
:return string lineSeperated: the line split into IPA phones. :return string lineSeperated: the line split into IPA phones.
""" """
phoneset_ipa, _, _, _ = load_novo70_phoneset()
#multi_character_phones = [i for i in phoneset_ipa if len(i) > 1]
#multi_character_phones.sort(key=len, reverse=True)
#multi_character_phones = [
# # IPAs in CGN.
# u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
# ]
#return [phone for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
multi_character_phones = [ return convert_phoneset.split_word(line, phoneset_ipa)
# IPAs in CGN.
u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
]
return [phone for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
def split_novo70(line): def split_novo70(line):
@@ -104,30 +100,33 @@ def split_novo70(line):
:param string line: one line written in novo70. :param string line: one line written in novo70.
:return string lineSeperated: the line split by novo70 phones. :return string lineSeperated: the line split by novo70 phones.
""" """
_, phoneset_novo70, _, _ = load_phonset() _, phoneset_novo70, _, _ = load_novo70_phoneset()
multi_character_phones = [p for p in phoneset_novo70 if len(p) > 1] #multi_character_phones = [p for p in phoneset_novo70 if len(p) > 1]
multi_character_phones = sorted(multi_character_phones, key=len, reverse=True) #multi_character_phones = sorted(multi_character_phones, key=len, reverse=True)
multi_character_phones = convert_phoneset.extract_multi_character_phones(phoneset_novo70)
return ['sp' if phone == ' ' else phone return ['sp' if phone == ' ' else phone
for phone in multi_character_tokenize(line.strip(), multi_character_phones)] for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
def novo702ipa(tokens): def novo702ipa(line):
pronunciation = [] #pronunciation = []
_, _, _, translation_key = load_phonset() _, _, _, translation_key = load_novo70_phoneset()
for phone in split_novo70(tokens): #for phone in split_novo70(tokens):
pronunciation.append(translation_key.get(phone, phone)) # pronunciation.append(translation_key.get(phone, phone))
return ' '.join(pronunciation) #return ' '.join(pronunciation)
return ' '.join(convert_phoneset.convert_phoneset(split_novo70(line), translation_key))
# numbering of novo70 should be checked. # numbering of novo70 should be checked.
def ipa2novo70(tokens): def ipa2novo70(line):
pronunciation = [] #pronunciation = []
_, _, translation_key, _ = load_phonset() _, _, translation_key, _ = load_novo70_phoneset()
for phone in split_ipa(tokens): #for phone in split_ipa(tokens):
pronunciation.append(translation_key.get(phone, phone)) # pronunciation.append(translation_key.get(phone, phone))
return ' '.join(pronunciation) #return ' '.join(pronunciation)
return ' '.join(convert_phoneset.convert_phoneset(split_ipa(line), translation_key))
def make_grammar(word, pronunciation_ipa): def make_grammar(word, pronunciation_ipa):
""" """
@@ -174,6 +173,9 @@ def forced_alignment(wav_file, word, pronunciation_ipa):
p = argparse.ArgumentParser() p = argparse.ArgumentParser()
p.add_argument("--user", default='martijn.wieling') p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxxx') p.add_argument("--password", default='xxxxxx')
args = p.parse_args() args = p.parse_args()
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir) rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
@@ -194,6 +196,25 @@ def result2pronunciation(result, word):
return pronunciation_ipa, pronunciation_novo70, llh return pronunciation_ipa, pronunciation_novo70, llh
def phones_not_in_novo70(ipa):
""" extract phones which is not in novo70 phoneset. """
phoneset_ipa, _, _, _ = load_novo70_phoneset()
# As per Nederlandse phoneset_aki.xlsx received from David
# [ɔː] oh / ohr
# [ɪː] ih / ihr
# [iː] iy
# [œː] uh
# [ɛː] eh
# [w] wv in IPA written as ʋ.
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
return [phone for phone in split_ipa(ipa)
if phone not in phoneset_ipa and phone not in david_suggestion]
if __name__ == '__main__': if __name__ == '__main__':
pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə'] pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə']
grammar = make_grammar('reus', pronunciation_ipa) #grammar = make_grammar('reus', pronunciation_ipa)
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = load_novo70_phoneset()
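A usage sketch of phones_not_in_novo70 on the pronunciation list above; an empty list means the pronunciation can be written entirely in novo70 phones.
# check which phones of each example pronunciation fall outside novo70 (sketch).
for ipa in pronunciation_ipa:
    print(ipa, phones_not_in_novo70(ipa))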

View File

@@ -68,14 +68,24 @@ phoneset = [
# the phones which seldom occur are replaced with other, more popular phones. # the phones which seldom occur are replaced with other, more popular phones.
# replacements are based on the advice from Martijn Wieling. # replacements are based on the advice from Martijn Wieling.
reduction_key = { reduction_key = {
'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g' 'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g',
# aki added because this is used in stimmen_project.
'ɔ̈:':'ɔ:'
} }
# already removed beforehand in phoneset. Just to be sure. # already removed beforehand in phoneset. Just to be sure.
phones_to_be_removed = ['ú', 's:', 'ɔ̈:'] phones_to_be_removed = ['ú', 's:']
def phone_reduction(phones): def phone_reduction(phones):
"""
Args:
phones (list): list of phones.
"""
if sum([phone in phones for phone in phones_to_be_removed]) != 0:
print('input includes phone(s) which are not defined in fame_asr.')
print('those phone(s) are removed.')
return [reduction_key.get(i, i) for i in phones return [reduction_key.get(i, i) for i in phones
if not i in phones_to_be_removed] if i not in phones_to_be_removed]
phoneset_short = list(set(phone_reduction(phoneset))) phoneset_short = list(set(phone_reduction(phoneset)))
phoneset_short.sort() phoneset_short.sort()
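A quick illustrative check of phone_reduction with the reduction_key above (input phones chosen for illustration only):
# 'y' maps to 'i:' and 'ɡ' to 'g' per reduction_key; 'a' is left unchanged (sketch).
print(phone_reduction(['y', 'ɡ', 'a']))    # expected: ['i:', 'g', 'a']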
@@ -89,14 +99,15 @@ translation_key_asr2htk = {
'': 'u_', '': 'u_',
# on the analogy of German umlaut, 'e' is used. # on the analogy of German umlaut, 'e' is used.
'ö': 'oe', 'ö:': 'oe:', 'ö': 'oe', 'ö:': 'oe:',
'ü': 'ue', 'ü:': 'ue:', 'ü': 'ue', 'ü:': 'ue:',
# on the analogy of Chinese... # on the analogy of Chinese...
'ŋ': 'ng', 'ŋ': 'ng',
# refer to Xsampa. # refer to Xsampa.
'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe', 'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
#'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen.
'ɛ': 'E', 'ɛ:': 'E:', 'ɛ': 'E', 'ɛ:': 'E:',
'ɪ': 'I', 'ɪ:': 'I:', 'ɪ': 'I', 'ɪ:': 'I:',
@@ -120,7 +131,11 @@ translation_key_word2htk = {
'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue', 'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
} }
#[translation_key_word2htk.get(i, i) for i in not_in_ascii] #[translation_key_word2htk.get(i, i) for i in not_in_ascii]
#Stop: p, b, t, d, k, g
#Nasal: m, n, ng(ŋ)
#Fricative: s, z, f, v, h, x
#Liquid: l, r
#Vowel: a, a:, e:, i, i:, i_(i̯), o, o:, u, u:, u_(ṷ), oe(ö), oe:(ö:), ue(ü), ue:(ü:), O(ɔ), O:(ɔ:), Oe(ɔ̈), A(ə), E(ɛ), E:(ɛ:), I(ɪ), I:(ɪ:)
## the list of multi character phones. ## the list of multi character phones.

View File

@@ -61,7 +61,7 @@ phoneset = [
'ɔⁿ', 'ɔⁿ',
'ɔ:', 'ɔ:',
'ɔ:ⁿ', 'ɔ:ⁿ',
#'ɔ̈', # not included in lex.ipa 'ɔ̈', # not included in lex.ipa
'ɔ̈.', 'ɔ̈.',
'ɔ̈:', 'ɔ̈:',

View File

@@ -0,0 +1,197 @@
import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import fame_functions
from phoneset import fame_ipa, fame_asr
import convert_phoneset
## general
stop = 'p, b, t, d, k, g'
nasal = 'm, n, ŋ'
fricative = 's, z, f, v, h, x, j'
liquid = 'l, r'
vowel = 'a, a:, e:, i, i:, i̯, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈, ə, ɛ, ɛ:, ɪ, ɪ:'
## consonant
c_front = 'p, b, m, f, v'
c_central = 't, d, n, s, z, l, r'
c_back = 'k, g, ŋ, h, x, j'
fortis = 'p, t, k, f, s'
lenis = 'b, d, g, v, z, j'
neither_fortis_nor_lenis = 'm, n, ŋ, h, l, r, x'
coronal = 't, d, n, s, z, l, r, j'
non_coronal = 'p, b, m, k, g, ŋ, f, v, h, x'
anterior = 'p, b, m, t, d, n, f, v, s, z, l'
non_anterior = 'k, g, ŋ, h, x, j, r'
continuent = 'm, n, ŋ, f, v, s, z, h, l, r'
non_continuent = 'p, b, t, d, k, g, x, j'
strident = 's, z, j'
non_strident = 'f, v, h'
unstrident = 'p, b, t, d, m, n, ŋ, k, g, r, x'
glide = 'h, l, r'
syllabic = 'm, l, ŋ'
unvoiced = 'p, t, k, s, f, x, h'
voiced = 'b, d, g, z, v, m, n, ŋ, l, r, j'
#affricate: ???
non_affricate = 's, z, f, v'
voiced_stop = 'b, d, g'
unvoiced_stop = 'p, t, k'
front_stop = 'p, b'
central_stop = 't, d'
back_stop = 'k, g'
voiced_fricative = 'z, v'
unvoiced_fricative = 's, f'
front_fricative = 'f, v'
central_fricative = 's, z'
back_fricative = 'j'
## vowel
v_front = 'i, i:, i̯, ɪ, ɪ:, e:, ə, ɛ, ɛ:, a, a:'
v_central = 'ə, ɛ, ɛ:, a, a:'
v_back = 'u, u:, ü, ü:, ṷ, ɔ, ɔ:, ɔ̈, ö, ö:, o, o:'
long = 'a:, e:, i:, o:, u:, ö:, ü:, ɔ:, ɛ:, ɪ:'
short = 'a, i, i̯, o, u, ṷ, ö, ü, ɔ, ɔ̈, ə, ɛ, ɪ'
#Dipthong: ???
#Front-Start: ???
#Fronting: ???
high = 'i, i:, i̯, ɪ, ɪ:, u, u:, ṷ, ə, e:, o, o:, ö, ö:, ü, ü:'
medium = 'e:, ə, ɛ, ɛ:, ɔ, ɔ:, ɔ̈, o, o:, ö, ö:'
low = 'a, a:, ɛ, ɛ:, ɔ, ɔ:, ɔ̈'
rounded = 'a, a:, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈'
unrounded = 'i, i:, i̯, e:, ə, ɛ, ɛ:, ɪ, ɪ:'
i_vowel = 'i, i:, i̯, ɪ, ɪ:'
e_vowel = 'e:, ə, ɛ, ɛ:'
a_vowel = 'a, a:'
o_vowel = 'o, o:, ö, ö:, ɔ, ɔ:, ɔ̈'
u_vowel = 'u, u:, ṷ, ü, ü:'
## htk phoneset
phoneset = fame_asr.phoneset_htk
## convert ipa group to htk format for quests.hed.
def _ipa2quest(R_or_L, ipa_text):
assert R_or_L in ['R', 'L'], 'the first argument should be either R or L.'
ipa_list = ipa_text.replace(' ', '').split(',')
if R_or_L == 'R':
quests_list = ['*+' + fame_functions.ipa2htk(ipa) for ipa in ipa_list]
else:
quests_list = [fame_functions.ipa2htk(ipa) + '-*' for ipa in ipa_list]
return ','.join(quests_list)
def make_quests_hed(quest_hed):
def _add_quests_item(R_or_L, item_name_, ipa_text):
assert R_or_L in ['R', 'L'], 'the first argument should be either R or L.'
item_name = R_or_L + '_' + item_name_
with open(quest_hed, 'ab') as f:
f.write(bytes('QS "' + item_name + '"\t{ ' + _ipa2quest(R_or_L, ipa_text) + ' }\n', 'ascii'))
if os.path.exists(quest_hed):
os.remove(quest_hed)
for R_or_L in ['R', 'L']:
_add_quests_item(R_or_L, 'NonBoundary', '*')
_add_quests_item(R_or_L, 'Silence', 'sil')
_add_quests_item(R_or_L, 'Stop', stop)
_add_quests_item(R_or_L, 'Nasal', nasal)
_add_quests_item(R_or_L, 'Fricative', fricative)
_add_quests_item(R_or_L, 'Liquid', liquid)
_add_quests_item(R_or_L, 'Vowel', vowel)
_add_quests_item(R_or_L, 'C-Front', c_front)
_add_quests_item(R_or_L, 'C-Central', c_central)
_add_quests_item(R_or_L, 'C-Back', c_back)
_add_quests_item(R_or_L, 'V-Front', v_front)
_add_quests_item(R_or_L, 'V-Central', v_central)
_add_quests_item(R_or_L, 'V-Back', v_back)
_add_quests_item(R_or_L, 'Front', c_front + ', ' + v_front)
_add_quests_item(R_or_L, 'Central', c_central + ', ' + v_central)
_add_quests_item(R_or_L, 'Back', c_back + ', ' + v_back)
_add_quests_item(R_or_L, 'Fortis', fortis)
_add_quests_item(R_or_L, 'Lenis', lenis)
_add_quests_item(R_or_L, 'UnFortLenis', neither_fortis_nor_lenis)
_add_quests_item(R_or_L, 'Coronal', coronal)
_add_quests_item(R_or_L, 'NonCoronal', non_coronal)
_add_quests_item(R_or_L, 'Anterior', anterior)
_add_quests_item(R_or_L, 'NonAnterior', non_anterior)
_add_quests_item(R_or_L, 'Continuent', continuent)
_add_quests_item(R_or_L, 'NonContinuent', non_continuent)
_add_quests_item(R_or_L, 'Strident', strident)
_add_quests_item(R_or_L, 'NonStrident', non_strident)
_add_quests_item(R_or_L, 'UnStrident', unstrident)
_add_quests_item(R_or_L, 'Glide', glide)
_add_quests_item(R_or_L, 'Syllabic', syllabic)
_add_quests_item(R_or_L, 'Unvoiced-Cons', unvoiced)
_add_quests_item(R_or_L, 'Voiced-Cons', voiced)
_add_quests_item(R_or_L, 'Unvoiced-All', unvoiced + ', sil')
_add_quests_item(R_or_L, 'Long', long)
_add_quests_item(R_or_L, 'Short', short)
#_add_quests_item(R_or_L, 'Dipthong', xxx)
#_add_quests_item(R_or_L, 'Front-Start', xxx)
#_add_quests_item(R_or_L, 'Fronting', xxx)
_add_quests_item(R_or_L, 'High', high)
_add_quests_item(R_or_L, 'Medium', medium)
_add_quests_item(R_or_L, 'Low', low)
_add_quests_item(R_or_L, 'Rounded', rounded)
_add_quests_item(R_or_L, 'UnRounded', unrounded)
#_add_quests_item(R_or_L, 'Affricative', rounded)
_add_quests_item(R_or_L, 'NonAffricative', non_affricate)
_add_quests_item(R_or_L, 'IVowel', i_vowel)
_add_quests_item(R_or_L, 'EVowel', e_vowel)
_add_quests_item(R_or_L, 'AVowel', a_vowel)
_add_quests_item(R_or_L, 'OVowel', o_vowel)
_add_quests_item(R_or_L, 'UVowel', u_vowel)
_add_quests_item(R_or_L, 'Voiced-Stop', voiced_stop)
_add_quests_item(R_or_L, 'UnVoiced-Stop', unvoiced_stop)
_add_quests_item(R_or_L, 'Front-Stop', front_stop)
_add_quests_item(R_or_L, 'Central-Stop', central_stop)
_add_quests_item(R_or_L, 'Back-Stop', back_stop)
_add_quests_item(R_or_L, 'Voiced-Fric', voiced_fricative)
_add_quests_item(R_or_L, 'UnVoiced-Fric', unvoiced_fricative)
_add_quests_item(R_or_L, 'Front-Fric', front_fricative)
_add_quests_item(R_or_L, 'Central-Fric', central_fricative)
_add_quests_item(R_or_L, 'Back-Fric', back_fricative)
for p in phoneset:
_add_quests_item(R_or_L, p, p)
return
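A hedged usage sketch of make_quests_hed; the import and the quests.hed location are assumptions, not fixed by this file.
# write the question file for tree-based clustering (sketch; paths are assumptions).
import defaultfiles as default
quest_hed = os.path.join(default.htk_dir, 'model', 'quests.hed')
make_quests_hed(quest_hed)
# each written line looks something like: QS "R_Nasal" { *+m,*+n,*+ng }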

View File

@@ -7,6 +7,7 @@ import pandas as pd
import convert_xsampa2ipa import convert_xsampa2ipa
import defaultfiles as default import defaultfiles as default
import fame_functions import fame_functions
import novoapi_functions
def _load_transcriptions(): def _load_transcriptions():
@@ -67,6 +68,19 @@ def load_transcriptions_clean(clean_wav_dir):
return df_clean return df_clean
def load_transcriptions_novo70(clean_wav_dir):
""" extract rows of which ipa is written in novo70 phonset. """
df = load_transcriptions_clean(clean_wav_dir)
df_novo70 = pd.DataFrame(index=[], columns=list(df.keys()))
for index, row in df.iterrows():
not_in_novo70 = novoapi_functions.phones_not_in_novo70(row['ipa'])
if len(not_in_novo70) == 0:
df_novo70 = df_novo70.append(row, ignore_index=True)
return df_novo70
def add_row_htk(df): def add_row_htk(df):
""" df['htk'] is made from df['ipa'] and added. """ """ df['htk'] is made from df['ipa'] and added. """
htk = [] htk = []
@@ -81,3 +95,25 @@ def add_row_asr(df):
for index, row in df.iterrows(): for index, row in df.iterrows():
asr.append(fame_functions.ipa2asr(row['ipa'])) asr.append(fame_functions.ipa2asr(row['ipa']))
return df.assign(asr=asr) return df.assign(asr=asr)
def load_pronunciations(WORD, htk_dic):
""" load pronunciation variants from HTK dic file.
Args:
WORD (str): word in capital letters.
htk_dic (path): HTK dict file.
Returns:
(pronunciations) (list): pronunciation variants of WORD.
Notes:
Because this function loads all contents of the htk_dic file,
it is not recommended for a large lexicon.
"""
with open(htk_dic) as f:
lines = f.read().replace(' sil', '')
lines = lines.split('\n')
return [' '.join(line.split(' ')[1:])
for line in lines if line.split(' ')[0]==WORD]
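A usage sketch of load_pronunciations; the dictionary path follows the dic_short layout used elsewhere in this repository, but the exact file name is an assumption.
# load the pronunciation variants of one word from an HTK dic file (sketch; path is an assumption).
import os
htk_dic = os.path.join(default.experiments_dir, 'stimmen', 'dic_short', 'Reus.dic')
print(load_pronunciations('REUS', htk_dic))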

View File

@@ -2,8 +2,9 @@ import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model') os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys import sys
import shutil import shutil
from collections import Counter
#import numpy as np import numpy as np
import pandas as pd import pandas as pd
import defaultfiles as default import defaultfiles as default
@@ -62,3 +63,31 @@ for ipa in df['ipa']:
if ':' in ipa_splitted: if ':' in ipa_splitted:
print(ipa_splitted) print(ipa_splitted)
## check pronunciation variants
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df_clean = stimmen_functions.add_row_asr(df_clean)
df_clean = stimmen_functions.add_row_htk(df_clean)
for word in word_list:
#word = word_list[1]
df_ = df_clean[df_clean['word']==word]
c = Counter(df_['htk'])
pronunciations = dict()
for key, value in zip(c.keys(), c.values()):
if value > 3:
pronunciations[key] = value
print(pronunciations)
monophone_mlf = os.path.join(default.htk_dir, 'label', 'train_phone_aligned.mlf')
triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')
def filenames_in_mlf(file_mlf):
with open(file_mlf) as f:
lines_ = f.read().split('\n')
lines = [line for line in lines_ if len(line.split(' ')) == 1 and line != '.']
filenames = [line.replace('"', '').replace('*/', '') for line in lines[1:-1]]
return filenames
filenames_mono = filenames_in_mlf(monophone_mlf)
filenames_tri = filenames_in_mlf(triphone_mlf)
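A short sketch of one possible next step with these two lists: spotting utterances that appear in the monophone MLF but are missing from the triphone MLF (the intended use is an assumption).
# utterances aligned at monophone level but absent from the triphone mlf (sketch).
only_in_mono = set(filenames_mono) - set(filenames_tri)
print('{} files are only in the monophone mlf.'.format(len(only_in_mono)))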