Compare commits

..

No commits in common. "3a98e184fea13fd784fdf56a689926d833ea3b70" and "d56ef7f0759e5f1c143d98dbd5329926503c2574" have entirely different histories.

14 changed files with 310 additions and 1659 deletions

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -9,12 +9,13 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
ProjectSection(SolutionItems) = preProject
..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
..\toolbox\evaluation.py = ..\toolbox\evaluation.py
..\ipa-xsama-converter\converter.py = ..\ipa-xsama-converter\converter.py
..\forced_alignment\forced_alignment\defaultfiles.py = ..\forced_alignment\forced_alignment\defaultfiles.py
..\forced_alignment\forced_alignment\forced_alignment.pyproj = ..\forced_alignment\forced_alignment\forced_alignment.pyproj
..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py
..\toolbox\pyHTK.py = ..\toolbox\pyHTK.py
..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py
..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py
..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py

View File

@ -22,11 +22,12 @@ dataset_list = ['devel', 'test', 'train']
extract_features = 0
make_feature_list = 0
conv_lexicon = 0
check_lexicon = 0
check_lexicon = 1
make_mlf = 0
combine_files = 0
flat_start = 0
train_model = 1
train_model = 0
forced_alignment = 0
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
@ -287,7 +288,7 @@ if flat_start:
## ======================= estimate monophones =======================
if train_model:
iter_num_max = 3
for mix_num in [128, 256, 512, 1024]:
for mix_num in [16, 32, 64, 128]:
for iter_num in range(1, iter_num_max+1):
print("===== mix{}, iter{} =====".format(mix_num, iter_num))
iter_num_pre = iter_num - 1
@ -314,6 +315,5 @@ if train_model:
fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
subprocessStr = 'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
subprocess.call(subprocessStr, shell=True)

View File

@ -28,12 +28,6 @@
<Compile Include="convert_xsampa2ipa.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="defaultfiles.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="fa_test.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="performance_check.py">
<SubType>Code</SubType>
</Compile>

View File

@ -1,13 +1,17 @@
import os
import sys
from collections import Counter
import numpy as np
import pandas as pd
import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
## ======================= user define =======================
repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
curr_dir = repo_dir + '\\acoustic_model'
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set
@ -38,41 +42,6 @@ def make_filelist(input_dir, output_txt):
fout.write(input_dir + '\\' + filename + '\n')
def make_dic(word, pronvar_, fileDic, output_type):
    """
    Make a dict file which can be used for HTK.

    param word: target word. It is written to the file in upper case.
    param pronvar_: pronunciation variants observed for the word, one entry
        per observation (duplicates are what the frequencies are counted from).
    param fileDic: output dic file. It is overwritten.
    param output_type:
        0: full. every unique variant (sorted), as 'WORD<tab>variant'.
        1: statistics. 'count<tab>percentage<tab>WORD<tab>variant' per variant.
        2: variants whose frequency is below 2% are removed.
        3: top 3. the three most frequent variants.
    raises ValueError: when output_type is not 0, 1, 2 or 3.
    """
    if output_type not in (0, 1, 2, 3):
        raise ValueError('output_type should be an integer between 0 and 3.')

    WORD = word.upper()

    if output_type == 0:  # full
        entries = ['{0}\t{1}\n'.format(WORD, pvar) for pvar in np.unique(pronvar_)]
    else:
        c = Counter(pronvar_)
        total_num = sum(c.values())
        if output_type == 3:  # top 3
            entries = ['{0}\t{1}\n'.format(WORD, key) for key, _ in c.most_common(3)]
        elif output_type == 1:  # statistics of all variants
            entries = [
                '{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value / total_num * 100, WORD, key)
                for key, value in c.items()
            ]
        else:  # output_type == 2
            # Keep the variants that reach 2%: the rare ones are the entries
            # to be removed, not the ones to be written.
            entries = [
                '{0}\t{1}\n'.format(WORD, key)
                for key, value in c.items()
                if value / total_num * 100 >= 2
            ]

    with open(fileDic, 'w') as f:
        f.writelines(entries)
def get_phonelist(lexicon_file):
""" Make a list of phones which appears in the lexicon. """
@ -130,22 +99,4 @@ def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
lex = pd.concat([lex1, lex2])
lex = lex.sort_values(by='word', ascending=True)
lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
def read_fileFA(fileFA):
    """
    Read the result file of HTK forced alignment.

    This function only works when the input is one word.

    param fileFA: path to the forced alignment result file.
    return: the third whitespace-separated column of every line that has
        one, joined with a single space. Lines with fewer than three
        columns are skipped.
    """
    phones = []
    with open(fileFA, 'r') as f:
        for line in f:
            line_split = line.split()
            # index 2 is read below, so at least three columns are required.
            if len(line_split) > 2:
                phones.append(line_split[2])
    return ' '.join(phones)
lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')

View File

@ -2,4 +2,4 @@
config_hcopy = c:\cygwin64\home\Aki\acoustic_model\config\config.HCopy
config_train = c:\cygwin64\home\Aki\acoustic_model\config\config.train
mkhmmdefs_pl = c:\cygwin64\home\Aki\acoustic_model\src\acoustic_model\mkhmmdefs.pl
FAME_dir = C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus
FAME_dir = c:\OneDrive\Research\rug\experiments\friesian\corpus

View File

@ -7,155 +7,122 @@ import json
import sys
import os
import defaultfiles as default
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import convert_phone_set
#sys.path.append(ipa_xsampa_converter_dir)
#import converter
def load_converter(source, sink, ipa_xsampa_converter_dir):
    """load the converter.

    source and sink are either of "ipa", "xsampa" or "sassc".

    Args:
        source: phone set to convert from.
        sink: phone set to convert to.
        ipa_xsampa_converter_dir: directory which contains
            "ipa_xsampa_map.json".

    Returns:
        dict which maps a source symbol to its sink string. When neither
        side is "ipa" the two tables are chained through IPA and a source
        symbol that cannot be fully converted is mapped to None.

    Note:
        The process exits with status 1 when source or sink is not one of
        the supported choices. "sassc_ipa.json" is read from the directory
        of this script, only when source or sink is "sassc".
    """
    choices = ["ipa", "xsampa", "sassc"]

    # Validate params
    if source not in choices or sink not in choices:
        print("source and destination should be one of [ipa xsampa sassc].")
        sys.exit(1)
    if source == sink:
        print("source and destination format are the same.")

    # Mappings from disk
    # some may not be used if source or sink is already IPA
    source_to_ipa = {}
    ipa_to_sink = {}

    ipa_xsampa = []
    sassc_ipa = []

    # The IPAs that actually occur within SASSC
    # NOTE(review): this table is built but not returned; conversion() reads
    # a name 'sassc_active_ipa' - confirm how it is supposed to be shared.
    sassc_active_ipa = {}

    script_dir = os.path.dirname(os.path.realpath(__file__))

    with open(os.path.join(ipa_xsampa_converter_dir, "ipa_xsampa_map.json"), encoding="utf-8") as f:
        ipa_xsampa = json.load(f)

    sassc_active = source == "sassc" or sink == "sassc"
    if sassc_active:
        # the file holds IPA characters, so read it as utf-8 like the map above.
        with open(os.path.join(script_dir, "./sassc_ipa.json"), encoding="utf-8") as f:
            sassc_ipa = json.load(f)
        for pair in sassc_ipa:
            for char in pair[1]:
                sassc_active_ipa[char] = 1

    if source == "xsampa":
        for pair in ipa_xsampa:
            source_to_ipa[pair[1]] = pair[0]
    elif source == "sassc":
        for pair in sassc_ipa:
            source_to_ipa[pair[0]] = pair[1]

    if sink == "xsampa":
        for pair in ipa_xsampa:
            ipa_to_sink[pair[0]] = pair[1]
    elif sink == "sassc":
        for pair in sassc_ipa:
            ipa_to_sink[pair[1]] = pair[0]

    # Combine them into a single mapping
    mapping = {}
    if source == "ipa":
        mapping = ipa_to_sink
    elif sink == "ipa":
        mapping = source_to_ipa
    else:
        # dict.iteritems() does not exist in Python 3.
        for k, ipas in source_to_ipa.items():
            map_out = ""
            failed = False
            for ipa in ipas:
                val = ipa_to_sink.get(ipa)
                if not val:
                    failed = True
                    break
                map_out += val
            mapping[k] = map_out if not failed else None

    return mapping
def conversion(source, sink, mapping, line):
"""
conversion.
Args:
mapping: can be obtained with load_converter().
line: must be seperated, by default the seperator is whitespace.
"""
"""
conversion.
Args:
mapping: can be obtained with load_converter().
line: must be seperated, by default the seperator is whitespace.
"""
# Edit this to change the seperator
SEPERATOR = " "
# Edit this to change the seperator
SEPERATOR = " "
line = line.strip()
output = []
#if sassc_active:
# tokens = line.split(SEPERATOR)
#else:
tokens = line
for token in tokens:
if token.isspace():
output.append(token)
continue
# Remove extraneous chars that IPA does not accept
if sink == "sassc":
cleaned_token = u""
for char in token:
if sassc_active_ipa.get(char):
cleaned_token += char
token = cleaned_token
mapped = mapping.get(token)
if not mapped:
print("WARNING: could not map token ", token, file=sys.stderr)
else:
output.append(mapped)
#if sassc_active:
# output = SEPERATOR.join(output)
#else:
output = "".join(output)
return output
def xsampa2ipa(mapping, xsampa):
    """
    Convert a line written in xsampa into ipa.

    Args:
        mapping: can be obtained with load_converter().
        xsampa: a line written in xsampa.

    Returns:
        the converted line. A phone which is not found in mapping is kept
        as it is.

    Notes:
        this function is used because the function conversion does not work when:
        - the input is a word.
        - the line includes a backslash.
        - 'ɡ' and 'g' are considered to be different.
    """
    # symbols longer than one character are needed to split 'xsampa'.
    multi_character_list = [symbol for symbol in mapping if len(symbol) > 1]

    # conversion, phone by phone.
    phones = convert_phone_set.multi_character_tokenize(xsampa, multi_character_list)
    converted = ''.join(mapping.get(phone, phone) for phone in phones)

    # strange conversion: use the plain 'g' in place of 'ɡ'.
    return converted.replace('ɡ', 'g')
line = line.strip()
output = []
#if sassc_active:
# tokens = line.split(SEPERATOR)
#else:
tokens = line
for token in tokens:
if token.isspace():
output.append(token)
continue
# Remove extraneous chars that IPA does not accept
if sink == "sassc":
cleaned_token = u""
for char in token:
if sassc_active_ipa.get(char):
cleaned_token += char
token = cleaned_token
mapped = mapping.get(token)
if not mapped:
print("WARNING: could not map token ", token, file=sys.stderr)
else:
output.append(mapped)
#if sassc_active:
# output = SEPERATOR.join(output)
#else:
output = "".join(output)
return output

View File

@ -1,35 +0,0 @@
import os
#default_hvite_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'htk', 'config.HVite')
cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#config_hcopy = os.path.join(cygwin_dir, 'config', 'config.HCopy')
#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
config_hvite = os.path.join(cygwin_dir, 'config', 'config.HVite')
#mkhmmdefs_pl = os.path.join(cygwin_dir, 'src', 'acoustic_model', 'mkhmmdefs.pl')
#dbLexicon = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\lexicon.accdb
#scriptBarbara = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\pronvars_barbara.perl
#exeG2P = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\string2phon.exe
#[pyHTK]
#configHVite = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\config.HVite
#filePhoneList = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\phonelist_barbara.txt
#AcousticModel = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\hmmdefs_16-2_barbara.compo
#dbLexicon = config['cLexicon']['dbLexicon']
#scriptBarbara = config['cLexicon']['scriptBarbara']
#exeG2P = config['cLexicon']['exeG2P']
#configHVite = config['pyHTK']['configHVite']
#filePhoneList = config['pyHTK']['filePhoneList']
#AcousticModel = config['pyHTK']['AcousticModel']
repo_dir = r'C:\Users\Aki\source\repos'
ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter')
forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment')
fame_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus'
experiments_dir = r'c:\OneDrive\Research\rug\experiments'
phonelist = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')

View File

@ -1,16 +0,0 @@
import os
import sys
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import defaultfiles as default
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import forced_alignment
wav_file = r'C:\Users\Aki\source\repos\forced_alignment\notebooks\sample\10147-1464186409-1917281.wav'
forced_alignment(
wav_file,
#'Australië'
'BUFFETCOUPON COULISSEN DOUANE'
)

View File

@ -1,437 +1,257 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import csv
import subprocess
import configparser
from collections import Counter
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.metrics import confusion_matrix
import acoustic_model_functions as am_func
import convert_xsampa2ipa
import defaultfiles as default
## ======================= user define =======================
#curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
#config_ini = 'config.ini'
#repo_dir = r'C:\Users\Aki\source\repos'
#forced_alignment_module = repo_dir + '\\forced_alignment'
#forced_alignment_module_old = repo_dir + '\\aki_tools'
#ipa_xsampa_converter_dir = repo_dir + '\\ipa-xsama-converter'
#accent_classification_dir = repo_dir + '\\accent_classification\accent_classification'
excel_file = os.path.join(default.experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
## ======================= functions =======================
def read_fileFA(fileFA):
    """
    Read the result file of HTK forced alignment.

    This function only works when the input is one word.

    param fileFA: path to the forced alignment result file.
    return: the third whitespace-separated column of every line that has
        one, joined with a single space. Lines with fewer than three
        columns are skipped.
    """
    phones = []
    with open(fileFA, 'r') as f:
        for line in f:
            line_split = line.split()
            # index 2 is read below, so at least three columns are required.
            if len(line_split) > 2:
                phones.append(line_split[2])
    return ' '.join(phones)
#experiments_dir = r'C:\OneDrive\Research\rug\experiments'
data_dir = os.path.join(default.experiments_dir, 'stimmen', 'data')
#csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
wav_dir = os.path.join(default.experiments_dir, 'stimmen', 'wav')
acoustic_model_dir = os.path.join(default.experiments_dir, 'friesian', 'acoustic_model', 'model')
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
fa_dir = os.path.join(default.experiments_dir, 'stimmen', 'FA')
#cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')
#####################
## USER DEFINE ##
#####################
curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
config_ini = curr_dir + '\\config.ini'
forced_alignment_module = r'C:\Users\Aki\source\repos\forced_alignment'
forced_alignment_module_old = r'C:\OneDrive\Research\rug\code\forced_alignment\forced_alignment'
ipa_xsampa_converter_dir = r'C:\Users\Aki\source\repos\ipa-xsama-converter'
csvfile = r"C:\OneDrive\Research\rug\stimmen\Frisian Variants Picture Task Stimmen.csv"
experiments_dir = r'C:\OneDrive\Research\rug\experiments'
data_dir = experiments_dir + '\\stimmen\\data'
cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
# procedure
convert_phones = 0
make_dic_files = 0
do_forced_alignment_htk = 1
make_kaldi_data_files = 0
make_kaldi_lexicon_txt = 0
load_forced_alignment_kaldi = 0
eval_forced_alignment = 0
make_dic_files_short = 0
do_forced_alignment = 0
eval_forced_alignment = 1
## ======================= add paths =======================
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set
from forced_alignment import pyhtk
sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
#import pyHTK
from evaluation import plot_confusion_matrix
# for interactive window
sys.path.append(curr_dir)
import convert_xsampa2ipa
import acoustic_model_functions as am_func
# for forced-alignment
sys.path.append(forced_alignment_module_old)
import pyHTK
## ======================= load variables =======================
config = configparser.ConfigParser()
config.sections()
config.read(config_ini)
FAME_dir = config['Settings']['FAME_dir']
lex_asr = FAME_dir + '\\lexicon\\lex.asr'
lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
## ======================= convert phones ======================
if convert_phones:
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
## check phones included in FAME!
# the phones used in the lexicon.
#phonelist = am_func.get_phonelist(lex_htk)
xls = pd.ExcelFile(excel_file)
# the lines which include a specific phone.
#lines = am_func.find_phone(lex_asr, 'x')
## check conversion
#df = pd.read_excel(xls, 'frequency')
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
with open(csvfile, encoding="utf-8") as fin:
lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
next(lines, None) # skip the headers
filenames = []
words = []
pronunciations = []
for line in lines:
if line[1] is not '' and len(line) > 5:
filenames.append(line[0])
words.append(line[1])
pron_xsampa = line[3]
pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
pron_ipa = pron_ipa.replace('ː', ':')
pron_famehtk = convert_phone_set.ipa2famehtk(pron_ipa)
# adjust to phones used in the acoustic model.
pron_famehtk = pron_famehtk.replace('sp', 'sil')
pron_famehtk = pron_famehtk.replace('ce :', 'ce') # because ceh is ignored.
pron_famehtk = pron_famehtk.replace('w :', 'wh')
pron_famehtk = pron_famehtk.replace('e :', 'eh')
pron_famehtk = pron_famehtk.replace('eh :', 'eh')
pron_famehtk = pron_famehtk.replace('ih :', 'ih')
## check phones included in FAME!
# the phones used in the lexicon.
#phonelist = am_func.get_phonelist(lex_asr)
#translation_key = {'sp': 'sil', 'ce :': 'ceh', 'w :': 'wh'}
#pron = []
#for phoneme in pron_famehtk.split(' '):
# pron.append(translation_key.get(phoneme, phoneme))
#pronunciations.append(' '.join(pron_famehtk))
pronunciations.append(pron_famehtk)
# the lines which include a specific phone.
#lines = am_func.find_phone(lex_asr, 'x')
# check if all phones are in the phonelist of the acoustic model.
#phonelist = ' '.join(pronunciations)
#np.unique(phonelist.split(' '))
#phonelist.find(':')
filenames = np.array(filenames)
words = np.array(words)
pronunciations = np.array(pronunciations)
# Filename, Word, Self Xsampa
df = pd.read_excel(xls, 'original')
ipas = []
famehtks = []
for xsampa in df['Self Xsampa']:
if not isinstance(xsampa, float): # 'NaN'
# typo?
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
xsampa = xsampa.replace(';', ':')
del line, lines
del pron_xsampa, pron_ipa, pron_famehtk
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
ipa = ipa.replace('ː', ':')
ipa = ipa.replace(' ', '')
ipas.append(ipa)
famehtk = convert_phone_set.ipa2famehtk(ipa)
famehtks.append(famehtk)
else:
ipas.append('')
famehtks.append('')
# extract interesting cols.
df = pd.DataFrame({'filename': df['Filename'],
'word': df['Word'],
'xsampa': df['Self Xsampa'],
'ipa': pd.Series(ipas),
'famehtk': pd.Series(famehtks)})
# cleansing.
df = df[~df['famehtk'].isin(['/', ''])]
np.save(data_dir + '\\filenames.npy', filenames)
np.save(data_dir + '\\words.npy', words)
np.save(data_dir + '\\pronunciations.npy', pronunciations)
else:
filenames = np.load(data_dir + '\\filenames.npy')
words = np.load(data_dir + '\\words.npy')
pronunciations = np.load(data_dir + '\\pronunciations.npy')
word_list = np.unique(words)
## ======================= make dict files used for HTK. ======================
if make_dic_files:
word_list = np.unique(df['word'])
output_dir = experiments_dir + r'\stimmen\dic'
output_type = 3
for word in word_list:
htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
for word in word_list:
WORD = word.upper()
fileDic = output_dir + '\\' + word + '.dic'
# pronunciation variant of the target word.
pronvar_ = df['famehtk'][df['word'].str.match(word)]
# make dic file.
pronvar_ = pronunciations[words == word]
pronvar = np.unique(pronvar_)
# make dic file.
am_func.make_dic(word, pronvar_, htk_dict_file, output_type)
## ======================= forced alignment using HTK =======================
if do_forced_alignment_htk:
#hmm_num = 2
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
hmm_num_str = str(hmm_num)
acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')
predictions = []
for i, filename in enumerate(df['filename']):
print('=== {0}/{1} ==='.format(i, len(df)))
wav_file = os.path.join(wav_dir, filename)
if os.path.exists(wav_file) and i in df['filename'].keys():
word = df['word'][i]
WORD = word.upper()
# make label file.
label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
with open(label_file, 'w') as f:
lines = f.write(WORD)
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, default.phonelist, acoustic_model)
prediction = am_func.read_fileFA(fa_file)
predictions.append(prediction)
os.remove(label_file)
print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
else:
predictions.append('')
print('!!!!! file not found.')
predictions = np.array(predictions)
#match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
np.save(os.path.join(data_dir, 'predictions_hmm' + hmm_num_str + '.npy'), predictions)
with open(fileDic, 'w') as f:
for pvar in pronvar:
f.write('{0}\t{1}\n'.format(WORD, pvar))
## ======================= make files which is used for forced alignment by Kaldi =======================
if make_kaldi_data_files:
wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
kaldi_data_dir = os.path.join(kaldi_work_dir, 'data', 'alignme')
kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
htk_dict_dir = os.path.join(experiments_dir, 'stimmen', 'dic_top3')
## ======================= make dict files for most popular words. ======================
if make_dic_files_short:
output_dir = experiments_dir + r'\stimmen\dic'
wav_scp = os.path.join(kaldi_data_dir, 'wav.scp')
text_file = os.path.join(kaldi_data_dir, 'text')
utt2spk = os.path.join(kaldi_data_dir, 'utt2spk')
#word = word_list[3]
for word in word_list:
WORD = word.upper()
fileStat = output_dir + '\\' + word + '_stat.csv'
pronvar = pronunciations[words == word]
c = Counter(pronvar)
total_num = sum(c.values())
lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
predictions = []
file_num_max = len(filenames)
# remove previous files.
if os.path.exists(wav_scp):
os.remove(wav_scp)
if os.path.exists(text_file):
os.remove(text_file)
if os.path.exists(utt2spk):
os.remove(utt2spk)
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
# make wav.scp, text, and utt2spk files.
for i in range(0, file_num_max):
#for i in range(400, 410):
print('=== {0}/{1} ==='.format(i+1, file_num_max))
filename = filenames[i]
wav_file = wav_dir + '\\' + filename
if os.path.exists(wav_file):
speaker_id = 'speaker_' + str(i).zfill(4)
utterance_id = filename.replace('.wav', '')
utterance_id = utterance_id.replace(' ', '_')
utterance_id = speaker_id + '-' + utterance_id
# wav.scp file
wav_file_unix = wav_file.replace('\\', '/')
wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/')
f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix))
# text file
word = words[i].lower()
f_text_file.write('{0}\t{1}\n'.format(utterance_id, word))
# utt2spk
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
f_wav_scp.close()
f_text_file.close()
f_utt2spk.close()
with open(fileStat, 'w') as f:
for key, value in c.items():
f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, value/total_num*100, WORD, key))
## ======================= make lexicon txt which is used by Kaldi =======================
if make_kaldi_lexicon_txt:
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
option_num = 5
## ======================= forced alignment =======================
if do_forced_alignment:
configHVite = cygwin_dir + r'\config\config.HVite'
filePhoneList = experiments_dir + r'\friesian\acoustic_model\config\phonelist_friesian.txt'
wav_dir = experiments_dir + r'\stimmen\wav'
# remove previous file.
if os.path.exists(lexicon_txt):
os.remove(lexicon_txt)
#for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128]:
for hmm_num in [64]:
hmm_num_str = str(hmm_num)
AcousticModel = experiments_dir + r'\friesian\acoustic_model\model\hmm' + hmm_num_str + r'-3\hmmdefs'
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
with open(csvfile, encoding="utf-8") as fin:
lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
next(lines, None) # skip the headers
predictions = []
file_num_max = len(filenames)
for i in range(0, file_num_max):
print('=== {0}/{1} ==='.format(i, file_num_max))
filename = filenames[i]
fileWav = wav_dir + '\\' + filename
if os.path.exists(fileWav):
word = words[i]
WORD = word.upper()
filenames = []
words = []
pronunciations = []
p = []
for line in lines:
if line[1] is not '' and len(line) > 5:
filenames.append(line[0])
words.append(line[1])
pron_xsampa = line[3]
pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
pron_ipa = pron_ipa.replace('ː', ':')
# adjust to phones used in the acoustic model.
pronunciations.append(pron_ipa)
# make label file.
fileLab = wav_dir + '\\' + filename.replace('.wav', '.lab')
with open(fileLab, 'w') as f:
lines = f.write(WORD)
# check if all phones are in the phonelist of the acoustic model.
#'y', 'b', 'ɾ', 'u', 'ɔ:', 'ø', 't', 'œ', 'n', 'ɒ', 'ɐ', 'f', 'o', 'k', 'x', 'ɡ', 'v', 's', 'ɛ:', 'ɪ:', 'ɑ', 'ɛ', 'a', 'd', 'z', 'ɪ', 'ɔ', 'l', 'i:', 'm', 'p', 'a:', 'i', 'e', 'j', 'o:', 'ʁ', 'h', ':', 'e:', 'ə', 'æ', 'χ', 'w', 'r', 'ə:', 'sp', 'ʊ', 'u:', 'ŋ'
fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
fileFA = experiments_dir + r'\stimmen\FA_short' + '\\' + filename.replace('.wav', '.txt') + hmm_num_str
filenames = np.array(filenames)
words = np.array(words)
wordlist = np.unique(words)
pronunciations = np.array(pronunciations)
# output lexicon.txt
#f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
pronvar_list_all = []
for word in word_list:
pyHTK.doHVite(fileWav, fileLab, fileDic, fileFA, configHVite, filePhoneList, AcousticModel)
prediction = read_fileFA(fileFA)
predictions.append(prediction)
# pronunciation variant of the target word.
pronvar_ = pronunciations[words == word]
# remove ''
pronvar_ = np.delete(pronvar_, np.where(pronvar_==''))
os.remove(fileLab)
print('{0}: {1} -> {2}'.format(WORD, pronunciations[i], prediction))
else:
predictions.append('')
print('!!!!! file not found.')
c = Counter(pronvar_)
total_num = sum(c.values())
for key, value in c.most_common(option_num):
#print('{0}\t{1}\t{2}\t{3}'.format(word, key, value, total_num))
key = key.replace('æ', 'ɛ')
key = key.replace('ɐ', 'a')
key = key.replace('ɑ', 'a')
key = key.replace('ɾ', 'r')
key = key.replace('ʁ', 'r')
key = key.replace('ʊ', 'u')
key = key.replace('χ', 'x')
#print('-->{0}\t{1}\t{2}\t{3}\n'.format(word, key, value, total_num))
# make possible pronounciation variant list.
pronvar_list = [key]
while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
pronvar_list_ = []
for p in pronvar_list:
if 'ø:' in p:
pronvar_list_.append(p.replace('ø:', 'ö'))
pronvar_list_.append(p.replace('ø:', 'ö:'))
if 'œ' in p:
pronvar_list_.append(p.replace('œ', 'ɔ̈'))
pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
if 'ɒ' in p:
pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
pronvar_list = np.unique(pronvar_list_)
for pronvar_ in pronvar_list:
split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
pronvar_out = ' '.join(split_ipa)
pronvar_list_all.append([word, pronvar_out])
# output
pronvar_list_all = np.array(pronvar_list_all)
pronvar_list_all = np.unique(pronvar_list_all, axis=0)
#f_lexicon_txt.write('<UNK>\tSPN\n')
#for line in pronvar_list_all:
# f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
#f_lexicon_txt.close()
## ======================= load kaldi forced alignment result =======================
if load_forced_alignment_kaldi:
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
phones_txt = kaldi_work_dir + '\\data\\lang\\phones.txt'
merged_alignment_txt = kaldi_work_dir + '\\exp\\tri1_alignme\\merged_alignment.txt'
filenames = np.load(data_dir + '\\filenames.npy')
words = np.load(data_dir + '\\words.npy')
pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
word_list = np.unique(words)
# load the mapping between phones and ids.
with open(phones_txt, 'r', encoding="utf-8") as f:
mappings = f.read().split('\n')
phones = []
phone_ids = []
for m in mappings:
m = m.split(' ')
if len(m) > 1:
phones.append(m[0])
phone_ids.append(int(m[1]))
with open(merged_alignment_txt, 'r') as f:
lines = f.read()
lines = lines.split('\n')
fa_filenames = []
fa_pronunciations = []
filename_ = ''
pron = []
for line in lines:
line = line.split(' ')
if len(line) == 5:
filename = line[0]
if filename == filename_:
phone_id = int(line[4])
#if not phone_id == 1:
phone = phones[phone_ids.index(phone_id)]
pron_ = re.sub(r'_[A-Z]', '', phone)
if not pron_ == 'SIL':
pron.append(pron_)
else:
fa_filenames.append(re.sub(r'speaker_[0-9]{4}-', '', filename))
fa_pronunciations.append(' '.join(pron))
pron = []
filename_ = filename
# correct or not.
#for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):
predictions = np.array(predictions)
match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
np.save(data_dir + '\\match_hmm' + hmm_num_str + '.npy', match)
## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment:
match_num = []
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
#hmm_num = 256
hmm_num_str = str(hmm_num)
match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
# use dic_short?
if 1:
pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
for word in word_list:
fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
# see only words which appears in top 3.
match_short = []
for line in match:
word = line[0]
WORD = word.upper()
pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
if line[1] in pronvar:
match_short.append(line)
#for hmm_num in [1, 2, 4, 8, 16, 32, 64]:
hmm_num = 64
hmm_num_str = str(hmm_num)
match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
# use dic_short?
if 1:
pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
for word in word_list:
fileDic = experiments_dir + r'\stimmen\dic_short' + '\\' + word + '.dic'
pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
match_short = np.array(match_short)
match = np.copy(match_short)
match_short = []
for line in match:
word = line[0]
WORD = word.upper()
pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
if line[1] in pronvar:
match_short.append(line)
# number of match
total_match = sum(match[:, 1] == match[:, 2])
print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
match_num.append([hmm_num, total_match, match.shape[0]])
match_short = np.array(match_short)
match = np.copy(match_short)
# number of match
total_match = sum(match[:, 1] == match[:, 2])
print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
# number of mixtures vs accuracy
match_num = np.array(match_num)
plt.xscale("log")
plt.plot(match_num[:, 0], match_num[:, 1]/match_num[0, 2], 'o-')
plt.xlabel('number of mixtures', fontsize=14, fontweight='bold')
plt.ylabel('accuracy', fontsize=14, fontweight='bold')
plt.show()
# confusion matrix
#dir_out = r'C:\OneDrive\Research\rug\experiments\stimmen\result'
#word_list = np.unique(match[:, 0])
#for word in word_list:
# match_ = match[match[:, 0] == word, :]
# cm = confusion_matrix(match_[:, 1], match_[:, 2])
# pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
# plt.figure()
# plot_confusion_matrix(cm, classes=pronvar, normalize=True)
# plt.savefig(dir_out + '\\cm_' + word + '.png')