Compare commits

...

44 Commits

Author SHA1 Message Date
97486e5599 dataset for experiments in check_novoapi is updated. 2019-04-22 02:03:50 +02:00
2004399179 novoapi_functions.py is adjusted to use convert_phoneset.py. 2019-04-22 00:59:53 +02:00
b444b70af9 fame_phonetics.py and functions to make quests.hed to tie triphone are added. 2019-03-25 00:06:53 +01:00
bf586fcde5 triphone training is added. 2019-03-23 21:52:48 +01:00
fdd165ce6a re-aligned mlf file include less files than original mlf file. Therefore the scp file should also be updated accordingly, when re-estimation is performed. this bug is fixed. 2019-03-08 23:13:08 +01:00
fa81b70b27 monophone training is completed. 2019-03-07 22:16:50 +01:00
41d4fa5ff9 sp is added to the model. 2019-03-05 00:11:38 +01:00
b1b1942fa0 test on stimmen data is added. 2019-03-03 02:05:37 +01:00
c185072d5b label alignment using HVite is added. 2019-02-14 00:21:28 +01:00
8f89f60538 dataset is made. 2019-02-08 14:10:32 +01:00
f6e563ecd3 moved testing parts in htk_vs_kaldi into stimmen_test.py 2019-02-06 09:35:23 +01:00
da0242b0e1 make sure all the phones in stimmen transcription can be treated correctly. 2019-02-06 00:00:14 +01:00
ab3887c6ca sp is added to the model. 2019-02-04 20:32:12 +01:00
f6e7c8eefa bug related encoding on label file is fixed. 2019-02-04 13:46:27 +01:00
322a8a0079 label files are extracted. hcompv_scp is made. 2019-02-03 13:54:37 +01:00
22cccfb61d fix the bug there are characters in the lexicon which cannot be described in ascii. 2019-02-03 00:34:35 +01:00
dc6b7b84b6 lexicon is made. 2019-01-29 21:52:11 +01:00
8cda93de75 fame_asr phoneset is added including reduced version and htk compatible version. 2019-01-28 12:34:20 +01:00
87abbbb95a correspondence between lex.asr and lex.ipa is automatically obtained. header is added to the functions in fame_functions.py. 2019-01-27 23:52:33 +01:00
813f013d7a phonset is given as fame_phoneset.py. translation key is obtained based on the information. 2019-01-27 01:34:04 +01:00
7844a56281 HTK related functions are moved to pyhtk project. fame acoustic models are made using fame_hmm.py. feature extraction is completed. A function is being made to get translation key from ipa to asr. 2019-01-24 09:38:28 +01:00
04a862b2fd Merge branch 'master' of https://git.webhosting.rug.nl/p280427/acoustic_model 2019-01-21 21:57:46 +01:00
24ac56ac0e to transfer working environment to McRoberts laptop. 2019-01-21 21:56:55 +01:00
82a8e2302f rozen-test is added. 2019-01-21 10:35:50 +01:00
de5c9cecb9 The bug regarding novoapi for Python 3.6 is solved. The detail can be found in novoapi_for_python3x/readme.txt 2019-01-20 13:47:29 +01:00
8efb091715 confusion matrix is output. 2019-01-15 11:30:49 +01:00
05e8a671c1 forced alignment by novoapi is performed. 2019-01-13 23:36:02 +01:00
6edde06a4f check frequency of the pronunciation variants of each word. 2019-01-12 23:29:56 +01:00
1622655542 functions are added to perform forced_alignment using novoapi. results can be written in novo70 or IPA. 2019-01-10 23:39:02 +01:00
d6d5543d03 novoapi_functions is added to novo70 specific functions. 2019-01-07 23:27:02 +01:00
d6e005b1cb find pronunciation variants which all phones are in novo70. 2019-01-07 11:50:24 +01:00
dd9e3d820b started to check which words in stimmen transcription consists of only phones in novo70 phoneset. 2018-12-31 13:04:33 +01:00
e5cf182a18 novo_api for python 3.x is added. 2018-12-30 23:47:55 +01:00
a77ed9d4dd resolve the conflicts. 2018-12-30 23:34:20 +01:00
df046ffc26 forced_alignment_novo.py is removed (.gitignore). commit history is cleaned up. 2018-12-30 23:30:33 +01:00
beff33fdf9 revert "load novo phoneset" to continue working. 2018-12-30 23:19:18 +01:00
af785e51cf commit to clean up 2018-12-30 23:14:26 +01:00
9ec7c3c50b commit to clean up. 2018-12-30 21:36:12 +01:00
f04bb2a080 Revert "the script 'forced_alignment_novo.py' which is to run novo_api on Python 3.6 environment is added."
This reverts commit b87a81eb9d.
2018-12-30 21:34:37 +01:00
b771cbb7d6 just commit small change to revert. 2018-12-30 21:34:04 +01:00
3500a8cdf0 load novo phoneset. 2018-12-30 21:27:20 +01:00
b87a81eb9d the script 'forced_alignment_novo.py' which is to run novo_api on Python 3.6 environment is added. 2018-12-30 21:26:42 +01:00
0777735979 Code is cleaned up. 2018-09-16 23:33:31 +02:00
ea30b5c503 The Stimmen excel file is loaded as Data Frame. Default values are given by defaultfiles.py. 2018-09-15 17:28:57 +02:00
64 changed files with 4423 additions and 15693 deletions

265
.gitignore vendored Normal file
View File

@ -0,0 +1,265 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
## important ##
.acoustic_model/forced_alignment_novo.py
.acoustic_model/novoapi_functions.py
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# DNX
project.lock.json
project.fragment.lock.json
artifacts/
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
#*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignoreable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
node_modules/
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

Binary file not shown.

13152
HCompV.scp

File diff suppressed because it is too large Load Diff

View File

@ -10,17 +10,20 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py ..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py ..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
..\toolbox\evaluation.py = ..\toolbox\evaluation.py ..\toolbox\evaluation.py = ..\toolbox\evaluation.py
..\forced_alignment\forced_alignment\forced_alignment.pyproj = ..\forced_alignment\forced_alignment\forced_alignment.pyproj ..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py ..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py ..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py ..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py
..\toolbox\pyHTK.py = ..\toolbox\pyHTK.py ..\toolbox\pyHTK.py = ..\toolbox\pyHTK.py
..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py ..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py
..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py ..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py
..\..\..\..\..\Python36-32\Lib\site-packages\novoapi\backend\session.py = ..\..\..\..\..\Python36-32\Lib\site-packages\novoapi\backend\session.py
..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py ..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py
..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py ..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py
EndProjectSection EndProjectSection
EndProject EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "toolbox", "..\toolbox\toolbox.pyproj", "{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU Debug|Any CPU = Debug|Any CPU
@ -29,6 +32,8 @@ Global
GlobalSection(ProjectConfigurationPlatforms) = postSolution GlobalSection(ProjectConfigurationPlatforms) = postSolution
{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU {4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE

View File

@ -1,319 +0,0 @@
import os
import sys
import tempfile
import configparser
import subprocess
from collections import Counter
import numpy as np
import pandas as pd
## ======================= user define =======================
repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
curr_dir = repo_dir + '\\acoustic_model'
config_ini = curr_dir + '\\config.ini'
output_dir = 'C:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'
dataset_list = ['devel', 'test', 'train']
# procedure
extract_features = 0
make_feature_list = 0
conv_lexicon = 0
check_lexicon = 0
make_mlf = 0
combine_files = 0
flat_start = 0
train_model = 1
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set
import acoustic_model_functions as am_func
## ======================= load variables =======================
config = configparser.ConfigParser()
config.sections()
config.read(config_ini)
config_hcopy = config['Settings']['config_hcopy']
config_train = config['Settings']['config_train']
mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
FAME_dir = config['Settings']['FAME_dir']
lex_asr = FAME_dir + '\\lexicon\\lex.asr'
lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
lex_oov = FAME_dir + '\\lexicon\\lex.oov'
lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
#lex_ipa = FAME_dir + '\\lexicon\\lex.ipa'
#lex_ipa_ = FAME_dir + '\\lexicon\\lex.ipa_'
#lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
lex_htk = FAME_dir + '\\lexicon\\lex_original.htk'
lex_htk_ = FAME_dir + '\\lexicon\\lex.htk'
hcompv_scp = output_dir + '\\scp\\combined.scp'
combined_mlf = output_dir + '\\label\\combined.mlf'
model_dir = output_dir + '\\model'
model0_dir = model_dir + '\\hmm0'
proto_init = model_dir + '\\proto38'
proto_name = 'proto'
phonelist = output_dir + '\\config\\phonelist_friesian.txt'
hmmdefs_name = 'hmmdefs'
## ======================= extract features =======================
if extract_features:
print("==== extract features ====\n")
for dataset in dataset_list:
print(dataset)
# a script file for HCopy
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
hcopy_scp.close()
# get a list of features (hcopy.scp) from the filelist in FAME! corpus
feature_dir = output_dir + '\\mfc\\' + dataset
am_func.make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp.name)
# extract features
subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
subprocess.call(subprocessStr, shell=True)
## ======================= make a list of features =======================
if make_feature_list:
print("==== make a list of features ====\n")
for dataset in dataset_list:
print(dataset)
feature_dir = output_dir + '\\mfc\\' + dataset
hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
am_func.make_filelist(feature_dir, hcompv_scp)
## ======================= convert lexicon from ipa to fame_htk =======================
if conv_lexicon:
print('==== convert lexicon from ipa 2 fame ====\n')
# lex.asr is Kaldi compatible version of lex.ipa.
# to check...
#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
#with open(lex_ipa_, "w", encoding="utf-8") as fout:
# for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
# # ignore nasalization and '.'
# pronunciation_ = pronunciation.replace(u'ⁿ', '')
# pronunciation_ = pronunciation_.replace('.', '')
# pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
# fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
# convert each lexicon from ipa description to fame_htk phoneset.
am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
# combine lexicon
# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
# therefore there is no overlap between lex_asr and lex_oov.
am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
## ======================= check if all the phones are successfully converted =======================
if check_lexicon:
print("==== check if all the phones are successfully converted. ====\n")
# the phones used in the lexicon.
phonelist_asr = am_func.get_phonelist(lex_asr)
phonelist_oov = am_func.get_phonelist(lex_oov)
phonelist_htk = am_func.get_phonelist(lex_htk)
phonelist = phonelist_asr.union(phonelist_oov)
# the lines which include a specific phone.
lines = am_func.find_phone(lex_asr, 'g')
# statistics over the lexicon
lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
pronunciation = lexicon_htk['pronunciation']
phones_all = []
for word in pronunciation:
phones_all = phones_all + word.split()
c = Counter(phones_all)
## =======================
## manually make changes to the pronunciation dictionary and save it as lex.htk
## =======================
# (1) Replace all tabs with single space;
# (2) Put a '\' before any dictionary entry beginning with single quote
#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
## ======================= make label file =======================
if make_mlf:
print("==== make mlf ====\n")
print("generating word level transcription...\n")
for dataset in dataset_list:
hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
script_list = FAME_dir + '\\data\\' + dataset + '\\text'
mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'
# lexicon
lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
# list of features
with open(hcompv_scp) as fin:
features = fin.read()
features = features.split('\n')
# list of scripts
with open(script_list, "rt", encoding="utf-8") as fin:
scripts = fin.read()
scripts = pd.Series(scripts.split('\n'))
i = 0
missing_words = []
fscp = open(hcompv_scp2, 'wt')
fmlf = open(mlf_word, "wt", encoding="utf-8")
fmlf.write("#!MLF!#\n")
feature_nr = 1
for feature in features:
sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
sys.stdout.flush()
feature_nr += 1
file_basename = os.path.basename(feature).replace('.mfc', '')
# get words from scripts.
try:
script = scripts[scripts.str.contains(file_basename)]
except IndexError:
script = []
if len(script) != 0:
script_id = script.index[0]
script_txt = script.get(script_id)
script_words = script_txt.split(' ')
del script_words[0]
# check if all words can be found in the lexicon.
SCRIPT_WORDS = []
script_prons = []
is_in_lexicon = 1
for word in script_words:
WORD = word.upper()
SCRIPT_WORDS.append(WORD)
extracted = lexicon_htk[lexicon_htk['word']==WORD]
if len(extracted) == 0:
missing_words.append(word)
script_prons.append(extracted)
is_in_lexicon *= len(extracted)
# if all pronunciations are found in the lexicon, update scp and mlf files.
if is_in_lexicon:
# add the feature filename into the .scp file.
fscp.write("{}\n".format(feature))
i += 1
# add the words to the mlf file.
fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
#fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
for word_ in SCRIPT_WORDS:
if word_[0] == '\'':
word_ = '\\' + word_
fmlf.write('{}\n'.format(word_))
fmlf.write('.\n')
print("\n{0} has {1} samples.\n".format(dataset, i))
np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)
fscp.close()
fmlf.close()
## generate phone level transcription
print("generating phone level transcription...\n")
mkphones = output_dir + '\\label\\mkphones0.txt'
subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
subprocess.call(subprocessStr, shell=True)
## ======================= combined scps and mlfs =======================
if combine_files:
print("==== combine scps and mlfs ====\n")
fscp = open(hcompv_scp, 'wt')
fmlf = open(combined_mlf, 'wt')
for dataset in dataset_list:
fmlf.write("#!MLF!#\n")
for dataset in dataset_list:
each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
with open(each_mlf, 'r') as fin:
lines = fin.read()
lines = lines.split('\n')
fmlf.write('\n'.join(lines[1:]))
with open(each_scp, 'r') as fin:
lines = fin.read()
fscp.write(lines)
fscp.close()
fmlf.close()
## ======================= flat start monophones =======================
if flat_start:
subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
subprocess.call(subprocessStr, shell=True)
# allocate mean & variance to all phones in the phone list
subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name
subprocess.call(subprocessStr, shell=True)
## ======================= estimate monophones =======================
if train_model:
iter_num_max = 3
for mix_num in [128, 256, 512, 1024]:
for iter_num in range(1, iter_num_max+1):
print("===== mix{}, iter{} =====".format(mix_num, iter_num))
iter_num_pre = iter_num - 1
modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
if not os.path.exists(modelN_dir):
os.makedirs(modelN_dir)
if iter_num == 1 and mix_num == 1:
modelN_dir_pre = model0_dir
else:
modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)
## re-estimation
subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
subprocess.call(subprocessStr, shell=True)
mix_num_next = mix_num * 2
modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
if not os.path.exists(modelN_dir_next):
os.makedirs(modelN_dir_next)
header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
with open(header_file, 'w') as fout:
fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))
subprocessStr = 'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist
subprocess.call(subprocessStr, shell=True)

View File

@ -4,7 +4,7 @@
<SchemaVersion>2.0</SchemaVersion> <SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid> <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
<ProjectHome>.</ProjectHome> <ProjectHome>.</ProjectHome>
<StartupFile>performance_check.py</StartupFile> <StartupFile>check_novoapi.py</StartupFile>
<SearchPath> <SearchPath>
</SearchPath> </SearchPath>
<WorkingDirectory>.</WorkingDirectory> <WorkingDirectory>.</WorkingDirectory>
@ -21,8 +21,8 @@
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging> <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<Compile Include="acoustic_model.py" /> <Compile Include="check_novoapi.py" />
<Compile Include="acoustic_model_functions.py"> <Compile Include="convert_phoneset.py">
<SubType>Code</SubType> <SubType>Code</SubType>
</Compile> </Compile>
<Compile Include="convert_xsampa2ipa.py"> <Compile Include="convert_xsampa2ipa.py">
@ -31,15 +31,43 @@
<Compile Include="defaultfiles.py"> <Compile Include="defaultfiles.py">
<SubType>Code</SubType> <SubType>Code</SubType>
</Compile> </Compile>
<Compile Include="fame_test.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="fa_test.py"> <Compile Include="fa_test.py">
<SubType>Code</SubType> <SubType>Code</SubType>
</Compile> </Compile>
<Compile Include="performance_check.py"> <Compile Include="fame_functions.py" />
<Compile Include="forced_aligner_comparison.py" />
<Compile Include="novoapi_forced_alignment.py">
<SubType>Code</SubType> <SubType>Code</SubType>
</Compile> </Compile>
<Compile Include="htk_vs_kaldi.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="novoapi_functions.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="fame_hmm.py" />
<Compile Include="phoneset\fame_asr.py" />
<Compile Include="phoneset\fame_ipa.py" />
<Compile Include="phoneset\fame_phonetics.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="stimmen_functions.py" />
<Compile Include="stimmen_test.py" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Content Include="config.ini" /> <Content Include="config.ini" />
<Content Include="phoneset\fame_ipa2asr.npy" />
<Content Include="phoneset\output_get_translation_key_phone_unknown.npy" />
<Content Include="phoneset\output_get_translation_key_translation_key.npy" />
<Content Include="phoneset\__pycache__\fame_asr.cpython-36.pyc" />
<Content Include="phoneset\__pycache__\fame_ipa.cpython-36.pyc" />
</ItemGroup>
<ItemGroup>
<Folder Include="phoneset\" />
<Folder Include="phoneset\__pycache__\" />
</ItemGroup> </ItemGroup>
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" /> <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
<!-- Uncomment the CoreCompile target to enable the Build command in <!-- Uncomment the CoreCompile target to enable the Build command in

View File

@ -1,151 +0,0 @@
import os
import sys
from collections import Counter
import numpy as np
import pandas as pd
import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import convert_phone_set
def make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp):
""" Make a script file for HCopy using the filelist in FAME! corpus. """
filelist_txt = FAME_dir + '\\fame\\filelists\\' + dataset + 'list.txt'
with open(filelist_txt) as fin:
filelist = fin.read()
filelist = filelist.split('\n')
with open(hcopy_scp, 'w') as fout:
for filename_ in filelist:
filename = filename_.replace('.TextGrid', '')
if len(filename) > 3: # remove '.', '..' and ''
wav_file = FAME_dir + '\\fame\\wav\\' + dataset + '\\' + filename + '.wav'
mfc_file = feature_dir + '\\' + filename + '.mfc'
fout.write(wav_file + '\t' + mfc_file + '\n')
def make_filelist(input_dir, output_txt):
""" Make a list of files in the input_dir. """
filenames = os.listdir(input_dir)
with open(output_txt, 'w') as fout:
for filename in filenames:
fout.write(input_dir + '\\' + filename + '\n')
def make_dic(word, pronvar_, fileDic, output_type):
"""
make dict files which can be used for HTK.
param word: target word.
param pronvar_: pronunciation variant. nx2 (WORD /t pronunciation) ndarray.
param fileDic: output dic file.
param output_type: 0:full, 1:statistics, 2:frequency <2% entries are removed. 3:top 3.
"""
#assert(output_type < 4 and output_type >= 0, 'output_type should be an integer between 0 and 3.')
WORD = word.upper()
if output_type == 0: # full
pronvar = np.unique(pronvar_)
with open(fileDic, 'w') as f:
for pvar in pronvar:
f.write('{0}\t{1}\n'.format(WORD, pvar))
else:
c = Counter(pronvar_)
total_num = sum(c.values())
with open(fileDic, 'w') as f:
if output_type == 3:
for key, value in c.most_common(3):
f.write('{0}\t{1}\n'.format(WORD, key))
else:
for key, value in c.items():
percentage = value/total_num*100
if output_type == 1: # all
f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
elif output_type == 2: # less than 2 percent
if percentage < 2:
f.write('{0}\t{1}\n'.format(WORD, key))
def get_phonelist(lexicon_file):
""" Make a list of phones which appears in the lexicon. """
with open(lexicon_file, "rt", encoding="utf-8") as fin:
lines = fin.read()
lines = lines.split('\n')
phonelist = set([])
for line in lines:
line = line.split('\t')
if len(line) > 1:
pronunciation = set(line[1].split())
phonelist = phonelist | pronunciation
return phonelist
def find_phone(lexicon_file, phone):
""" Search where the phone is used in the lexicon. """
with open(lexicon_file, "rt", encoding="utf-8") as fin:
lines = fin.read()
lines = lines.split('\n')
extracted = []
for line in lines:
line = line.split('\t')
if len(line) > 1:
pron = line[1]
if phone in pron:
extracted.append(line)
return extracted
def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
""" Convert a lexicon file from IPA to HTK format for FAME! corpus. """
lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
with open(lexicon_file_out, "w", encoding="utf-8") as fout:
for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
pronunciation_no_space = pronunciation.replace(' ', '')
pronunciation_famehtk = convert_phone_set.ipa2famehtk(pronunciation_no_space)
if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
""" Combine two lexicon files and sort by words. """
with open(lexicon_file1, "rt", encoding="utf-8") as fin:
lines1 = fin.read()
lines1 = lines1.split('\n')
with open(lexicon_file2, "rt", encoding="utf-8") as fin:
lines2 = fin.read()
lines2 = lines2.split('\n')
lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
lex = pd.concat([lex1, lex2])
lex = lex.sort_values(by='word', ascending=True)
lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
def read_fileFA(fileFA):
"""
read the result file of HTK forced alignment.
this function only works when input is one word.
"""
with open(fileFA, 'r') as f:
lines = f.read()
lines = lines.split('\n')
phones = []
for line in lines:
line_split = line.split()
if len(line_split) > 1:
phones.append(line_split[2])
return ' '.join(phones)

View File

@ -0,0 +1,207 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import csv
from collections import Counter
import random
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import novoapi
import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import convert_phone_set
#import acoustic_model_functions as am_func
import convert_xsampa2ipa
import novoapi_functions
import stimmen_functions
sys.path.append(default.accent_classification_dir)
import output_confusion_matrix
## procedure
forced_alignment_novo70 = True
## ===== load novo phoneset =====
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_novo70_phoneset()
## ===== extract pronunciations written in novo70 only (not_in_novo70) =====
## read pronunciation variants.
#stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
#df = pd.read_excel(stimmen_transcription_, 'frequency')
#transcription_ipa = list(df['IPA'])
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
df = stimmen_functions.load_transcriptions_novo70(stimmen_test_dir)
## transcription mistake?
#transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
#transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.
#not_in_novo70 = []
#all_in_novo70 = []
#for ipa in transcription_ipa:
# ipa = ipa.replace(':', 'ː')
# ipa = convert_phone_set.split_ipa(ipa)
# # list of phones not in novo70 phoneset.
# not_in_novo70_ = [phone for phone in ipa
# if not phone in phoneset_ipa and not phone in david_suggestion]
# not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
# not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
# not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]
# if len(not_in_novo70_) == 0:
# all_in_novo70.append(''.join(ipa))
# #translation_key.get(phone, phone)
# not_in_novo70.extend(not_in_novo70_)
#not_in_novo70_list = list(set(not_in_novo70))
## check which phones used in stimmen but not in novo70
# 'ʀ', 'ʁ',
# 'ɒ', 'ɐ',
# 'o', 'a' (o:, a:?)
# [e] 'nyːver mɑntsjə' (1)
# [ɾ] 'ɪːɾ'(1)
# [ɹ] 'iːjəɹ' (1), 'ɪ:ɹ' (1)
# [ø] 'gʀøtəpi:r'(1), 'grøtəpi:r'(1)
# [æ] 'røːzəʀæt'(2), 'røːzəræt'(1)
# [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
# [χ] --> can be x??
#def search_phone_ipa(x, phone_list):
# x_in_item = []
# for ipa in phone_list:
# ipa_original = ipa
# ipa = ipa.replace(':', 'ː')
# ipa = convert_phone_set.split_ipa(ipa)
# if x in ipa and not x+':' in ipa:
# x_in_item.append(ipa_original)
# return x_in_item
#search_phone_ipa('ø', transcription_ipa)
## ===== load all transcriptions (df) =====
#df = stimmen_functions.load_transcriptions()
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)
## check frequency of each pronunciation variants
#cols = ['word', 'ipa', 'frequency']
#df_samples = pd.DataFrame(index=[], columns=cols)
#for ipa in all_in_novo70:
# ipa = ipa.replace('ː', ':')
# samples = df[df['ipa'] == ipa]
# word = list(set(samples['word']))[0]
# samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns)
# df_samples = df_samples.append(samples_Series, ignore_index=True)
# each word
#df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())
#for word in word_list:
word = word_list[2]
df_ = df[df['word']==word]
np.unique(list(df_['ipa']))
#df_samples_ = df_samples_[df_samples_['frequency']>2]
#df_per_word = df_per_word.append(df_samples_, ignore_index=True)
#df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")
## ===== forced alignment =====
rozen_dir = r'c:\Users\Aki\source\repos\acoustic_model\rozen-test'
if forced_alignment_novo70:
Results = pd.DataFrame(index=[],
columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])
#for word in word_list:
for word in ['Rozen']:
# pronunciation variants top 3
df_per_word_ = df_per_word[df_per_word['word']==word]
df_per_word_ = df_per_word_.sort_values('frequency', ascending=False)
if len(df_per_word_) < 3: # pauw, rozen
pronunciation_ipa = list(df_per_word_['ipa'])
elif word=='Reuzenrad':
pronunciation_ipa = [
df_per_word_.iloc[0]['ipa'],
df_per_word_.iloc[1]['ipa'],
df_per_word_.iloc[2]['ipa'],
df_per_word_.iloc[3]['ipa']]
else:
# oog, oor, reus, roeiboot
pronunciation_ipa = [
df_per_word_.iloc[0]['ipa'],
df_per_word_.iloc[1]['ipa'],
df_per_word_.iloc[2]['ipa']]
#print("{0}: {1}".format(word, pronunciation_ipa))
# samples for the word
df_ = df[df['word']==word]
# samples in which all pronunciations are written in novo70.
samples = df_.query("ipa in @pronunciation_ipa")
results = pd.DataFrame(index=[],
columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])
for i in range(0, len(samples)):
sample = samples.iloc[i]
filename = sample['filename']
wav_file = os.path.join(default.stimmen_wav_dir, filename)
if os.path.exists(wav_file):
# for Martijn
shutil.copy(wav_file, os.path.join(rozen_dir, filename))
# pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]
# result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
# result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
# result_ = pd.Series([
# sample['filename'],
# sample['word'],
# sample['xsampa'],
# sample['ipa'],
# ' '.join(result_ipa),
# ' '.join(result_novo70),
# llh
# ], index=results.columns)
# results = results.append(result_, ignore_index = True)
# print('{0}/{1}: answer {2} - prediction {3}'.format(
# i+1, len(samples), result_['ipa'], result_['result_ipa']))
# #results.to_excel(os.path.join(default.stimmen_dir, 'results.xlsx'), encoding="utf-8")
#if len(results) > 0:
# Results = Results.append(results, ignore_index = True)
#Results.to_excel(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
else:
Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
Results = pd.read_excel(Results_xlsx, 'Sheet1')
## ===== analysis =====
#for word in word_list:
# if not word == 'Oog':
# Results_ = Results[Results['word'] == word]
# y_true = list(Results_['ipa'])
# y_pred_ = [ipa.replace(' ', '') for ipa in list(Results_['result_ipa'])]
# y_pred = [ipa.replace('ː', ':') for ipa in y_pred_]
# pronunciation_variants = list(set(y_true))
# cm = confusion_matrix(y_true, y_pred, labels=pronunciation_variants)
# plt.figure()
# output_confusion_matrix.plot_confusion_matrix(cm, pronunciation_variants, normalize=False)
# #plt.show()
# plt.savefig(os.path.join(default.stimmen_result_novoapi_dir, word + '.png'))

View File

@ -0,0 +1,58 @@
"""Module to convert phonemes."""
def multi_character_tokenize(line, multi_character_tokens):
"""Tries to match one of the tokens in multi_character_tokens at each position of line, starting at position 0,
if so tokenizes and eats that token. Otherwise tokenizes a single character"""
while line != '':
for token in multi_character_tokens:
if line.startswith(token) and len(token) > 0:
yield token
line = line[len(token):]
break
else:
yield line[:1]
line = line[1:]
def split_word(word, phoneset):
"""
split a line by given phoneset.
Args:
word (str): a word written in given phoneset.
#multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_ipa.py.
phoneset (list): the list of phones.
Returns:
(word_seperated) (list): the word splitted in given phoneset.
"""
multi_character_phones = extract_multi_character_phones(phoneset)
return [phone
for phone in multi_character_tokenize(word.strip(), multi_character_phones)
]
def convert_phoneset(word_list, translation_key):
"""
Args:
word_list (str): a list of phones written in given phoneset.
translation_key (dict):
"""
return [translation_key.get(phone, phone) for phone in word_list]
def phone_reduction(phones, reduction_key):
multi_character_tokenize(wo.strip(), multi_character_phones)
return [reduction_key.get(i, i) for i in phones
if not i in phones_to_be_removed]
def extract_multi_character_phones(phoneset):
"""
Args:
phoneset (list):
"""
multi_character_phones = [i for i in phoneset if len(i) > 1]
multi_character_phones.sort(key=len, reverse=True)
return multi_character_phones

View File

@ -1,35 +1,42 @@
import os import os
# add path of the parent directory
#os.path.dirname(os.path.realpath(__file__))
#default_hvite_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'htk', 'config.HVite') # repos
cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#config_hcopy = os.path.join(cygwin_dir, 'config', 'config.HCopy')
#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
config_hvite = os.path.join(cygwin_dir, 'config', 'config.HVite')
#mkhmmdefs_pl = os.path.join(cygwin_dir, 'src', 'acoustic_model', 'mkhmmdefs.pl')
#dbLexicon = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\lexicon.accdb
#scriptBarbara = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\pronvars_barbara.perl
#exeG2P = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\string2phon.exe
#[pyHTK]
#configHVite = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\config.HVite
#filePhoneList = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\phonelist_barbara.txt
#AcousticModel = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\hmmdefs_16-2_barbara.compo
#dbLexicon = config['cLexicon']['dbLexicon']
#scriptBarbara = config['cLexicon']['scriptBarbara']
#exeG2P = config['cLexicon']['exeG2P']
#configHVite = config['pyHTK']['configHVite']
#filePhoneList = config['pyHTK']['filePhoneList']
#AcousticModel = config['pyHTK']['AcousticModel']
repo_dir = r'C:\Users\Aki\source\repos' repo_dir = r'C:\Users\Aki\source\repos'
ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter') ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter')
forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment') forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment')
accent_classification_dir = os.path.join(repo_dir, 'accent_classification', 'accent_classification')
toolbox_dir = os.path.join(repo_dir, 'toolbox')
WSL_dir = r'C:\OneDrive\WSL'
novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
#novo_api_dir = r'c:\Python36-32\Lib\site-packages\novoapi'
# working directories
rug_dir = r'c:\OneDrive\Research\rug'
experiments_dir = os.path.join(rug_dir, 'experiments')
htk_dir = os.path.join(experiments_dir, 'acoustic_model', 'fame', 'htk')
kaldi_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', '_stimmen')
stimmen_dir = os.path.join(experiments_dir, 'stimmen')
# data
fame_dir = os.path.join(rug_dir, '_data', 'FAME')
#fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
# 44.1 kHz
#stimmen_wav_dir = os.path.join(stimmen_dir, 'wav')
# 16 kHz
stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
stimmen_transcription_xlsx = os.path.join(stimmen_dir, 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
novo70_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'nl', 'novo70.phoneset')
#phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')
#fame_s5_dir = os.path.join(fame_dir, 's5')
#fame_corpus_dir = os.path.join(fame_dir, 'corpus')
#stimmen_result_novoapi_dir = os.path.join(stimmen_dir, 'result', 'novoapi')
# novoapi_functions
fame_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus'
experiments_dir = r'c:\OneDrive\Research\rug\experiments'
phonelist = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')

View File

@ -0,0 +1,406 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
from collections import Counter
import pickle
import numpy as np
import pandas as pd
import defaultfiles as default
import convert_phoneset
from phoneset import fame_ipa, fame_asr
sys.path.append(default.toolbox_dir)
from htk import pyhtk
#def read_fileFA(fileFA):
# """
# read the result file of HTK forced alignment.
# this function only works when input is one word.
# """
# with open(fileFA, 'r') as f:
# lines = f.read()
# lines = lines.split('\n')
# phones = []
# for line in lines:
# line_split = line.split()
# if len(line_split) > 1:
# phones.append(line_split[2])
# return ' '.join(phones)
#def fame_pronunciation_variant(ipa):
# ipa = ipa.replace('æ', 'ɛ')
# ipa = ipa.replace('ɐ', 'a')
# ipa = ipa.replace('ɑ', 'a')
# ipa = ipa.replace('ɾ', 'r')
# ipa = ipa.replace('ɹ', 'r') # ???
# ipa = ipa.replace('ʁ', 'r')
# ipa = ipa.replace('ʀ', 'r') # ???
# ipa = ipa.replace('ʊ', 'u')
# ipa = ipa.replace('χ', 'x')
# pronvar_list = [ipa]
# while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
# pronvar_list_ = []
# for p in pronvar_list:
# if 'ø:' in p:
# pronvar_list_.append(p.replace('ø:', 'ö'))
# pronvar_list_.append(p.replace('ø:', 'ö:'))
# if 'œ' in p:
# pronvar_list_.append(p.replace('œ', 'ɔ̈'))
# pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
# if 'ɒ' in p:
# pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
# pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
# pronvar_list = np.unique(pronvar_list_)
# return pronvar_list
#def make_fame2ipa_variants(fame):
# fame = 'rɛös'
# ipa = [fame]
# ipa.append(fame.replace('ɛ', 'æ'))
# ipa.append(fame.replace('a', 'ɐ'))
# ipa.append(fame.replace('a', 'ɑ'))
# ipa.append(fame.replace('r', 'ɾ'))
# ipa.append(fame.replace('r', 'ɹ'))
# ipa.append(fame.replace('r', 'ʁ'))
# ipa.append(fame.replace('r', 'ʀ'))
# ipa.append(fame.replace('u', 'ʊ'))
# ipa.append(fame.replace('x', 'χ'))
# ipa.append(fame.replace('ö', 'ø:'))
# ipa.append(fame.replace('ö:', 'ø:'))
# ipa.append(fame.replace('ɔ̈', 'œ'))
# ipa.append(fame.replace('ɔ̈:', 'œ'))
# ipa.append(fame.replace('ɔ̈', 'ɒ'))
# ipa.append(fame.replace('ɔ̈:', 'ɒ'))
# return ipa
#def make_htk_dict(word, pronvar_, fileDic, output_type):
# """
# make dict files which can be used for HTK.
# param word: target word.
# param pronvar_: pronunciation variant. nx2 (WORD /t pronunciation) ndarray.
# param fileDic: output dic file.
# param output_type: 0:full, 1:statistics, 2:frequency <2% entries are removed. 3:top 3.
# """
# #assert(output_type < 4 and output_type >= 0, 'output_type should be an integer between 0 and 3.')
# WORD = word.upper()
# if output_type == 0: # full
# pronvar = np.unique(pronvar_)
# with open(fileDic, 'w') as f:
# for pvar in pronvar:
# f.write('{0}\t{1}\n'.format(WORD, pvar))
# else:
# c = Counter(pronvar_)
# total_num = sum(c.values())
# with open(fileDic, 'w') as f:
# if output_type == 3:
# for key, value in c.most_common(3):
# f.write('{0}\t{1}\n'.format(WORD, key))
# else:
# for key, value in c.items():
# percentage = value/total_num*100
# if output_type == 1: # all
# f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
# elif output_type == 2: # less than 2 percent
# if percentage < 2:
# f.write('{0}\t{1}\n'.format(WORD, key))
def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
""" Make a script file for HCopy using the filelist in FAME! corpus.
Args:
fame_dir (path): the directory of FAME corpus.
dataset (str): 'devel', 'test' or 'train'.
feature_dir (path): the directory where feature will be stored.
hcopy_scp (path): a script file for HCopy to be made.
"""
filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
with open(filelist_txt) as fin:
filelist = fin.read()
filelist = filelist.split('\n')
with open(hcopy_scp, 'w') as fout:
for filename_ in filelist:
filename = filename_.replace('.TextGrid', '')
if len(filename) > 3: # remove '.', '..' and ''
wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
mfc_file = os.path.join(feature_dir, filename + '.mfc')
fout.write(wav_file + '\t' + mfc_file + '\n')
return
def load_lexicon(lexicon_file):
""" load lexicon file as data frame.
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
Returns:
lex (df): lexicon as Data Frame, which has columns 'word' and 'pronunciation'.
"""
lex = pd.read_csv(lexicon_file, delimiter='\t', header=None, encoding="utf-8")
lex.rename(columns={0: 'word', 1: 'pronunciation'}, inplace=True)
return lex
def get_phoneset_from_lexicon(lexicon_file, phoneset_name='asr'):
""" Make a list of phones which appears in the lexicon.
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
phoneset_name (str): the name of phoneset with which lexicon_file is written. 'asr'(default) or 'ipa'.
Returns:
(list_of_phones) (set): the set of phones included in the lexicon_file.
"""
assert phoneset_name in ['asr', 'ipa'], 'phoneset_name should be \'asr\' or \'ipa\''
lex = load_lexicon(lexicon_file)
if phoneset_name == 'asr':
return set(' '.join(lex['pronunciation']).split(' '))
elif phoneset_name == 'ipa':
join_pronunciations = ''.join(lex['pronunciation'])
return set(convert_phone_set.split_word(join_pronunciations, fame_ipa.multi_character_phones))
return
def extract_unknown_phones(ipa, known_phones):
"""extract unknown phones in the pronunciation written in IPA.
Args:
ipa (str): a pronunciation written in IPA.
known_phones (list): list of phones already know.
Returns:
(list_of_phones) (list): unknown phones not included in 'known_phones'.
"""
ipa_split = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
return [i for i in ipa_split if not i in known_phones]
def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
""" get correspondence between lexicon_file_ipa and lexicon_file_asr.
Args:
lexicon_file_ipa (path): lexicon in the format of 'word' /t 'pronunciation (IPA)'.
lexicon_file_asr (path): lexicon in the format of 'word' /t 'pronunciation (asr)'.
the each character of 'pronunciation' should be delimited by ' '.
Returns:
translation_key (dict): translation key from ipa to asr.
(phone_unknown) (list): the list of IPA phones, which does not appear in lexicon_file_asr.
"""
lex_ipa = load_lexicon(lexicon_file_ipa)
lex_asr = load_lexicon(lexicon_file_asr)
phone_unknown = fame_ipa.phoneset[:]
translation_key = dict()
for word in lex_ipa['word']:
if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
asr_list = asr.split(' ')
# if there are phones which is not in phone_unknown
#if len([True for i in asr_list if i in phone_unknown]) > 0:
if(len(ipa_list) == len(asr_list)):
print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
for ipa_, asr_ in zip(ipa_list, asr_list):
if ipa_ in phone_unknown:
translation_key[ipa_] = asr_
phone_unknown.remove(ipa_)
return translation_key, list(phone_unknown)
def find_phone(lexicon_file, phone, phoneset_name='ipa'):
""" extract rows where the phone is used in the lexicon_file.
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
phone (str): the phone to be searched.
phoneset_name (str): the name of phoneset_name with which lexicon_file is written. 'asr' or 'ipa'(default).
Returns:
extracted (df): rows where the phone is used.
ToDo:
* develop when the phonset == 'asr'.
"""
assert phoneset_name in ['asr', 'ipa'], 'phoneset_name should be \'asr\' or \'ipa\''
lex = load_lexicon(lexicon_file)
# to reduce the calculation time, only target rows which include 'phone' at least once.
lex_ = lex[lex['pronunciation'].str.count(phone)>0]
extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
for index, row in lex_.iterrows():
if phoneset_name == 'ipa':
pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_ipa.multi_character_phones)
if phone in pronunciation:
extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
extracted = extracted.append(extracted_, ignore_index=True)
return extracted
def asr2htk_space_delimited(pronunciation):
"""convert phoneset from asr to htk.
Args:
pronunciation (str): space delimited asr phones.
Returns:
(pronunciation) (str): space delimited asr phones in htk format (ascii).
"""
pronunciation_short = [fame_asr.reduction_key.get(i, i) for i in pronunciation.split(' ')
if not i in fame_asr.phones_to_be_removed]
return ' '.join(convert_phoneset.convert_phoneset(
pronunciation_short, fame_asr.translation_key_asr2htk))
def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
""" Convert a lexicon file from asr to htk format (ascii).
Args:
lexicon_file_asr (path): a lexicon file written in asr format e.g. fame/lex.asr.
lexicon_file_htk (path): a lexicon file written in htk format (ascii).
"""
lex_asr = load_lexicon(lexicon_file_asr)
def word2htk_(row):
return word2htk(row['word'])
def asr2htk_space_delimited_(row):
return asr2htk_space_delimited(row['pronunciation'])
lex_htk = pd.DataFrame({
'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
})
lex_htk = lex_htk.ix[:, ['word', 'pronunciation']]
lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
return
def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
""" Combine two lexicon files and sort by words.
Args:
lexicon_file1, lexicon_file2 (path): input lexicon files.
Returns:
lexicon_file_out (path): lexicon_file which lexcion_file1 and 2 are combined and sorted.
"""
lex1 = load_lexicon(lexicon_file1)
lex2 = load_lexicon(lexicon_file2)
lex = pd.concat([lex1, lex2])
lex = lex.sort_values(by='word', ascending=True)
lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')
def fix_lexicon(lexicon_file):
""" fix lexicon
- add '\' before all single quote at the beginning of words.
- convert special characters to ascii compatible characters.
- add silence.
Args:
lexicon_file (path): lexicon file, which will be overwitten.
"""
lex = load_lexicon(lexicon_file)
lex = lex.dropna() # remove N/A.
# add 'sil'
row = pd.Series(['SILENCE', 'sil'], index=lex.columns)
lex = lex.append(row, ignore_index=True)
lex = lex.sort_values(by='word', ascending=True)
for i in lex[lex['word'].str.startswith('\'')].index.values:
lex.iat[i, 0] = lex.iat[i, 0].replace('\'', '\\\'')
# to_csv does not work with space seperator. therefore all tabs should manually be replaced.
#lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
return
def word2htk(word):
return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])
def ipa2asr(ipa):
curr_dir = os.path.dirname(os.path.abspath(__file__))
translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
#ipa_ = fame_asr.phone_reduction(ipa)
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
asr_splitted = fame_asr.phone_reduction(asr_splitted)
return ''.join(asr_splitted)
def ipa2htk(ipa):
curr_dir = os.path.dirname(os.path.abspath(__file__))
translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
#translation_key_ipa2asr = np.load(r'c:\Users\Aki\source\repos\acoustic_model\acoustic_model\phoneset\fame_ipa2asr.npy').item(0)
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
asr_splitted = fame_asr.phone_reduction(asr_splitted)
htk_splitted = convert_phoneset.convert_phoneset(asr_splitted, fame_asr.translation_key_asr2htk)
return ''.join(htk_splitted)
def performance_on_stimmen(config_dir, stimmen_dir, hmmdefs):
lattice_file = os.path.join(stimmen_dir, 'word_lattice.ltc')
hvite_scp = os.path.join(stimmen_dir, 'hvite.scp')
#fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hvite_scp, file_type='mfc')
hresult_scp = os.path.join(stimmen_dir, 'hresult.scp')
#fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hresult_scp, file_type='rec')
lexicon_file = os.path.join(stimmen_dir, 'lexicon_recognition.dic')
# get feature_size from hmmdefs.
with open(hmmdefs) as f:
line = f.readline()
line = f.readline().strip()
feature_size = int(line.split(' ')[2])
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_file, feature_size)
result = chtk.recognition(
lattice_file,
hmmdefs,
hvite_scp
)
per_sentence, per_word = chtk.calc_recognition_performance(hresult_scp)
return per_sentence['accuracy']

566
acoustic_model/fame_hmm.py Normal file
View File

@ -0,0 +1,566 @@
import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import tempfile
import shutil
import glob
import time
import numpy as np
import pandas as pd
import fame_functions
from phoneset import fame_ipa, fame_asr, fame_phonetics
import defaultfiles as default
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
#from scripts import run_command
## ======================= user define =======================
# procedure
combine_all = 1
make_lexicon = 0
make_label = 0 # it takes roughly 4800 sec on Surface pro 2.
make_mlf = 0
extract_features = 0
flat_start = 1
train_monophone_without_sp = 1
add_sp = 1
train_monophone_with_re_aligned_mlf = 1
increase_mixture = 1
train_triphone = 0
train_triphone_tied = 0
# pre-defined values.
dataset_list = ['devel', 'test', 'train']
feature_size = 30
improvement_threshold = 0.3
lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
config_dir = os.path.join(default.htk_dir, 'config')
phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt')
tree_hed = os.path.join(config_dir, 'tree.hed')
quests_hed = os.path.join(config_dir, 'quests.hed')
model_dir = os.path.join(default.htk_dir, 'model')
model_mono0_dir = os.path.join(model_dir, 'mono0')
model_mono1_dir = os.path.join(model_dir, 'mono1')
model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
model_tri1_dir = os.path.join(model_dir, 'tri1')
model_tri1tied_dir = os.path.join(model_dir, 'tri1tied')
# directories / files to be made.
lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk')
feature_dir = os.path.join(default.htk_dir, 'mfc')
fh.make_new_directory(feature_dir, existing_dir='leave')
tmp_dir = os.path.join(default.htk_dir, 'tmp')
fh.make_new_directory(tmp_dir, existing_dir='leave')
label_dir = os.path.join(default.htk_dir, 'label')
fh.make_new_directory(label_dir, existing_dir='leave')
## training
if combine_all:
hcompv_scp_train = os.path.join(tmp_dir, 'all.scp')
mlf_file_train = os.path.join(label_dir, 'all_phone.mlf')
mlf_file_train_word = os.path.join(label_dir, 'all_word.mlf')
mlf_file_train_with_sp = os.path.join(label_dir, 'all_phone_with_sp.mlf')
mlf_file_train_aligned = os.path.join(label_dir, 'all_phone_aligned.mlf')
triphone_mlf = os.path.join(label_dir, 'all_triphone.mlf')
else:
hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
mlf_file_train_word = os.path.join(label_dir, 'train_word.mlf')
mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
triphone_mlf = os.path.join(label_dir, 'train_triphone.mlf')
hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')
## testing
htk_stimmen_dir = os.path.join(default.htk_dir, 'stimmen')
## ======================= make lexicon for HTK =======================
if make_lexicon:
timer_start = time.time()
print('==== making lexicon for HTK ====')
# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)
# combine lexicon
print('>>> combining lexicon files into one lexicon...')
# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
# therefore there is no overlap between lex_asr and lex_oov.
fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
## fixing the lexicon for HTK.
# (1) Replace all tabs with single space;
# (2) Put a '\' before any dictionary entry beginning with single quote
# http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
print('>>> fixing the lexicon...')
fame_functions.fix_lexicon(lexicon_htk)
## adding sp to the lexicon for HTK.
print('>>> adding sp to the lexicon...')
with open(lexicon_htk) as f:
lines = f.read().split('\n')
with open(lexicon_htk_with_sp, 'wb') as f:
f.write(bytes(' sp\n'.join(lines), 'ascii'))
print("elapsed time: {}".format(time.time() - timer_start))
## intialize the instance for HTK.
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk_with_sp, feature_size)
## ======================= make label files =======================
if make_label:
for dataset in dataset_list:
timer_start = time.time()
print("==== making label files on dataset {}".format(dataset))
script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
label_dir_ = os.path.join(label_dir, dataset)
dictionary_file = os.path.join(label_dir_, 'temp.dic')
fh.make_new_directory(label_dir_, existing_dir='leave')
# list of scripts
with open(script_list, "rt", encoding="utf-8") as fin:
scripts = fin.read().split('\n')
for line in scripts:
# sample line:
# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
filename_ = line.split(' ')[0]
filename = '_'.join(filename_.split('_')[1:])
sentence = ' '.join(line.split(' ')[1:])
sentence_htk = fame_functions.word2htk(sentence)
wav_file = os.path.join(wav_dir_, filename + '.wav')
if os.path.exists(wav_file) and chtk.can_be_ascii(sentence_htk) == 0:
if chtk.get_number_of_missing_words(
sentence_htk, dictionary_file) == 0:
# when the file name is too long, HDMan command does not work.
# therefore first temporary dictionary_file is made, then renamed.
shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))
label_file = os.path.join(label_dir_, filename + '.lab')
chtk.make_label_file(sentence_htk, label_file)
else:
os.remove(dictionary_file)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= make master label files =======================
if make_mlf:
timer_start = time.time()
print("==== making master label files ====")
# train_2002_gongfansaken_10347.lab is empty. should be removed.
empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab')
empty_dic_file = empty_lab_file.replace('.lab', '.dic')
if os.path.exists(empty_lab_file):
os.remove(empty_lab_file)
if os.path.exists(empty_dic_file):
os.remove(empty_dic_file)
for dataset in dataset_list:
feature_dir_ = os.path.join(feature_dir, dataset)
label_dir_ = os.path.join(label_dir, dataset)
mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')
print(">>> generating a word level mlf file for {}...".format(dataset))
chtk.label2mlf(label_dir_, mlf_word)
print(">>> generating a phone level mlf file for {}...".format(dataset))
chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= extract features =======================
if extract_features:
for dataset in dataset_list:
timer_start = time.time()
print('==== extract features on dataset {} ===='.format(dataset))
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
label_dir_ = os.path.join(label_dir, dataset)
feature_dir_ = os.path.join(feature_dir, dataset)
fh.make_new_directory(feature_dir_, existing_dir='delete')
# a script file for HCopy
print(">>> making a script file for HCopy...")
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
hcopy_scp.close()
# get a list of features (hcopy.scp)
# from the filelist in FAME! corpus.
#fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
# from the list of label files.
lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
feature_list = [
os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
for lab_file in lab_list]
#if os.path.exists(empty_mfc_file):
# os.remove(empty_mfc_file)
with open(hcopy_scp.name, 'wb') as f:
f.write(bytes('\n'.join(feature_list), 'ascii'))
# extract features.
print(">>> extracting features on {}...".format(dataset))
chtk.wav2mfc(hcopy_scp.name)
os.remove(hcopy_scp.name)
# make hcompv.scp.
print(">>> making a script file for {}...".format(dataset))
listdir = glob.glob(os.path.join(label_dir_, '*.dic'))
mfc_list = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
with open(hcompv_scp, 'wb') as f:
f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
print(">>> extracting features on stimmen...")
chtk.wav2mfc(os.path.join(htk_stimmen_dir, 'hcopy.scp'))
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= flat start monophones =======================
if combine_all:
# script files.
fh.concatenate(
os.path.join(tmp_dir, 'devel.scp'),
os.path.join(tmp_dir, 'test.scp'),
hcompv_scp_train
)
fh.concatenate(
hcompv_scp_train,
os.path.join(tmp_dir, 'train.scp'),
hcompv_scp_train
)
# phone level mlfs.
fh.concatenate(
os.path.join(label_dir, 'devel_phone.mlf'),
os.path.join(label_dir, 'test_phone.mlf'),
mlf_file_train
)
fh.concatenate(
mlf_file_train,
os.path.join(label_dir, 'train_phone.mlf'),
mlf_file_train
)
# phone level mlfs with sp.
fh.concatenate(
os.path.join(label_dir, 'devel_phone_with_sp.mlf'),
os.path.join(label_dir, 'test_phone_with_sp.mlf'),
mlf_file_train_with_sp
)
fh.concatenate(
mlf_file_train_with_sp,
os.path.join(label_dir, 'train_phone_with_sp.mlf'),
mlf_file_train_with_sp
)
# word level mlfs.
fh.concatenate(
os.path.join(label_dir, 'devel_word.mlf'),
os.path.join(label_dir, 'test_word.mlf'),
mlf_file_train_word
)
fh.concatenate(
mlf_file_train_word,
os.path.join(label_dir, 'train_word.mlf'),
mlf_file_train_word
)
## ======================= flat start monophones =======================
if flat_start:
timer_start = time.time()
print('==== flat start ====')
fh.make_new_directory(model_mono0_dir, existing_dir='leave')
chtk.flat_start(hcompv_scp_train, model_mono0_dir)
# make macros.
vFloors = os.path.join(model_mono0_dir, 'vFloors')
if os.path.exists(vFloors):
chtk.make_macros(vFloors)
# allocate mean & variance to all phones in the phone list
print('>>> allocating mean & variance to all phones in the phone list...')
chtk.make_hmmdefs(model_mono0_dir)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train model without short pause =======================
if train_monophone_without_sp:
print('==== train monophone without sp ====')
timer_start = time.time()
niter = chtk.re_estimation_until_saturated(
model_mono1_dir,
model_mono0_dir, improvement_threshold, hcompv_scp_train,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= adding sp to the model =======================
if add_sp:
print('==== adding sp to the model ====')
# reference:
# http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
timer_start = time.time()
# make model with sp.
print('>>> adding sp state to the last model in the previous step...')
fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
niter = chtk.get_niter_max(model_mono1_dir)
modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter))
modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')
chtk.add_sp(modeln_dir_pre, modeln_dir)
print('>>> re-estimation...')
niter = chtk.re_estimation_until_saturated(
model_mono1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train_with_sp,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
model_type='monophone_with_sp'
)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train model with re-aligned mlf =======================
if train_monophone_with_re_aligned_mlf:
print('==== traina monophone with re-aligned mlf ====')
timer_start = time.time()
print('>>> re-aligning the training data... ')
niter = chtk.get_niter_max(model_mono1sp_dir)
modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter))
chtk.make_aligned_label(
os.path.join(modeln_dir, 'macros'),
os.path.join(modeln_dir, 'hmmdefs'),
mlf_file_train_aligned,
mlf_file_train_word,
hcompv_scp_train)
chtk.fix_mlf(mlf_file_train_aligned)
print('>>> updating the script file... ')
chtk.update_script_file(
mlf_file_train_aligned,
mlf_file_train_with_sp,
hcompv_scp_train,
hcompv_scp_train_updated)
print('>>> re-estimation... ')
timer_start = time.time()
fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
niter = chtk.get_niter_max(model_mono1sp_dir)
niter = chtk.re_estimation_until_saturated(
model_mono1sp2_dir,
os.path.join(model_mono1sp_dir, 'iter'+str(niter)),
improvement_threshold,
hcompv_scp_train_updated,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train_aligned,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
model_type='monophone_with_sp'
)
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= increase mixture =======================
if increase_mixture:
print('==== increase mixture ====')
timer_start = time.time()
for nmix in [2, 4, 8, 16]:
if nmix == 2:
modeln_dir_ = model_mono1sp2_dir
else:
modeln_dir_ = os.path.join(model_dir, 'mono'+str(nmix_))
modeln_dir = os.path.join(model_dir, 'mono'+str(nmix))
print('mixture: {}'.format(nmix))
fh.make_new_directory(modeln_dir, existing_dir='delete')
niter = chtk.get_niter_max(modeln_dir_)
chtk.increase_mixture(
os.path.join(modeln_dir_, 'iter'+str(niter), 'hmmdefs'),
nmix,
os.path.join(modeln_dir, 'iter0'),
model_type='monophone_with_sp')
shutil.copy2(os.path.join(modeln_dir_, 'iter'+str(niter), 'macros'),
os.path.join(modeln_dir, 'iter0', 'macros'))
#improvement_threshold = -10
niter = chtk.re_estimation_until_saturated(
modeln_dir,
os.path.join(modeln_dir_, 'iter0'),
improvement_threshold,
hcompv_scp_train_updated,
os.path.join(htk_stimmen_dir, 'mfc'),
'mfc',
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
mlf_file=mlf_file_train_aligned,
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
model_type='monophone_with_sp'
)
nmix_ = nmix
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train triphone =======================
print('>>> making triphone list... ')
chtk.make_triphonelist(
mlf_file_train_aligned,
triphone_mlf)
if train_triphone:
print('==== train triphone model ====')
timer_start = time.time()
print('>>> init triphone model... ')
niter = chtk.get_niter_max(model_mono1sp2_dir)
fh.make_new_directory(os.path.join(model_tri1_dir, 'iter0'), existing_dir='leave')
chtk.init_triphone(
os.path.join(model_mono1sp2_dir, 'iter'+str(niter)),
os.path.join(model_tri1_dir, 'iter0')
)
print('>>> re-estimation... ')
## I wanted to train until satulated:
#niter = chtk.re_estimation_until_saturated(
# model_tri1_dir,
# os.path.join(model_tri1_dir, 'iter0'),
# improvement_threshold,
# hcompv_scp_train_updated,
# os.path.join(htk_stimmen_dir, 'mfc'),
# 'mfc',
# os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
# mlf_file=triphone_mlf,
# lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
# model_type='triphone'
# )
#
# but because the data size is limited, some triphone cannot be trained and received the error:
# ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
# therefore only two times re-estimation is performed.
output_dir = model_tri1_dir
for niter in range(1, 4):
hmm_n = 'iter' + str(niter)
hmm_n_pre = 'iter' + str(niter-1)
_modeln_dir = os.path.join(output_dir, hmm_n)
_modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
fh.make_new_directory(_modeln_dir, 'leave')
chtk.re_estimation(
os.path.join(_modeln_dir_pre, 'hmmdefs'),
_modeln_dir,
hcompv_scp_train_updated,
mlf_file=triphone_mlf,
macros=os.path.join(_modeln_dir_pre, 'macros'),
model_type='triphone')
print("elapsed time: {}".format(time.time() - timer_start))
## ======================= train tied-state triphones =======================
if train_triphone_tied:
print('==== train tied-state triphones ====')
timer_start = time.time()
print('>>> making lexicon for triphone... ')
chtk.make_lexicon_triphone(phonelist_full_txt, lexicon_htk_triphone)
chtk.combine_phonelists(phonelist_full_txt)
print('>>> making a tree header... ')
fame_phonetics.make_quests_hed(quests_hed)
stats = os.path.join(r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\model\tri1\iter3', 'stats')
chtk.make_tree_header(tree_hed, quests_hed, stats, config_dir)
print('>>> init triphone model... ')
niter = chtk.get_niter_max(model_tri1_dir)
fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')
chtk.init_triphone(
os.path.join(model_tri1_dir, 'iter'+str(niter)),
os.path.join(model_tri1tied_dir, 'iter0'),
tied=True)
# I wanted to train until satulated:
#niter = chtk.re_estimation_until_saturated(
# model_tri1tied_dir,
# os.path.join(model_tri1tied_dir, 'iter0'),
# improvement_threshold,
# hcompv_scp_train_updated,
# os.path.join(htk_stimmen_dir, 'mfc'),
# 'mfc',
# os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
# mlf_file=triphone_mlf,
# lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
# model_type='triphone'
# )
#
# but because the data size is limited, some triphone cannot be trained and received the error:
# ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
# therefore only 3 times re-estimation is performed.
output_dir = model_tri1tied_dir
for niter in range(1, 4):
hmm_n = 'iter' + str(niter)
hmm_n_pre = 'iter' + str(niter-1)
_modeln_dir = os.path.join(output_dir, hmm_n)
_modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
fh.make_new_directory(_modeln_dir, 'leave')
chtk.re_estimation(
os.path.join(_modeln_dir_pre, 'hmmdefs'),
_modeln_dir,
hcompv_scp_train_updated,
mlf_file=triphone_mlf,
macros=os.path.join(_modeln_dir_pre, 'macros'),
model_type='triphone')
print("elapsed time: {}".format(time.time() - timer_start))

138
acoustic_model/fame_test.py Normal file
View File

@ -0,0 +1,138 @@
import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
from collections import Counter
import time
import re
import numpy as np
import pandas as pd
import fame_functions
import defaultfiles as default
sys.path.append(default.toolbox_dir)
from phoneset import fame_ipa, fame_asr
import convert_phoneset
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
## check if all the phones in lexicon.ipa are in fame_ipa.py.
#timer_start = time.time()
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
#phoneset_py = fame_ipa.phoneset
#print("phones which is in lexicon.ipa but not in fame_ipa.py:\n{}".format(
# set(phoneset_lex) - set(phoneset_py)))
#print("elapsed time: {}".format(time.time() - timer_start))
# check which word has the phone.
#timer_start = time.time()
#extracted = find_phone(lexicon_ipa, 'ⁿ')
#print("elapsed time: {}".format(time.time() - timer_start))
## get the correspondence between lex_ipa and lex_asr.
lex_asr = fame_functions.load_lexicon(lexicon_asr)
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
if 0:
timer_start = time.time()
translation_key_ipa2asr, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
print("elapsed time: {}".format(time.time() - timer_start))
np.save(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'), translation_key_ipa2asr)
np.save(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'), phone_unknown)
else:
translation_key_ipa2asr = np.load(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy')).item()
phone_unknown = np.load(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'))
phone_unknown = list(phone_unknown)
# manually check the correspondence for the phone in phone_unknown.
#p = phone_unknown[0]
#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
#for word in lex_ipa_['word']:
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
# if np.sum(lex_asr['word'] == word) > 0:
# asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
# ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
# asr_list = asr.split(' ')
# if p in ipa_list and (len(ipa_list) == len(asr_list)):
# print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
# for ipa_, asr_ in zip(ipa_list, asr_list):
# if ipa_ in phone_unknown:
# translation_key_ipa2asr[ipa_] = asr_
# phone_unknown.remove(ipa_)
translation_key_ipa2asr['ə:'] = 'ə'
translation_key_ipa2asr['r.'] = 'r'
translation_key_ipa2asr['r:'] = 'r'
# added for stimmen.
translation_key_ipa2asr['ɪ:'] = 'ɪ:'
translation_key_ipa2asr['y:'] = 'y'
np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
#timer_start = time.time()
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
#phoneset_lex.remove("")
#phoneset_asr = list(set(translation_key_ipa2asr.values()))
#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
# set(phoneset_lex) - set(phoneset_asr)))
#print("elapsed time: {}".format(time.time() - timer_start))
## check if all the phones in lexicon.htk are in fame_asr.py.
#timer_start = time.time()
#phoneset_htk = fame_asr.phoneset_htk
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
#phoneset_lex.remove('')
#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
# set(phoneset_htk) - set(phoneset_lex)))
#print("elapsed time: {}".format(time.time() - timer_start))
## statistics over the lexicon
#lex_htk = fame_functions.load_lexicon(lexicon_htk)
#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
#c = Counter(phones_all)
#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
## to_csv does not work with space seperator. therefore all tabs should manually be replaced.
##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
## check which letters are not coded in ascii.
#print('asr phones which cannot be coded in ascii:\n')
#for i in fame_asr.phoneset_short:
# try:
# i_encoded = i.encode("ascii")
# #print("{0} --> {1}".format(i, i.encode("ascii")))
# except UnicodeEncodeError:
# print(">>> {}".format(i))
#print("letters in the scripts which is not coded in ascii:\n")
#for dataset in ['train', 'devel', 'test']:
# timer_start = time.time()
# script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
# with open(script_list, "rt", encoding="utf-8") as fin:
# scripts = fin.read().split('\n')
# for line in scripts:
# sentence = ' '.join(line.split(' ')[1:])
# sentence_htk = fame_functions.word2htk(sentence)
# #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
# try:
# sentence_htk = bytes(sentence_htk, 'ascii')
# except UnicodeEncodeError:
# print(sentence)
# print(sentence_htk)

View File

@ -0,0 +1,42 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import pyhtk, convert_phone_set, scripts
reus_dir = r'c:\Users\Aki\source\repos\acoustic_model\reus-test'
wav_dir = reus_dir
wav_files = ['reus1008-reus.wav',
'reus1167-man.wav',
'reus3768-mantsje.wav']
word = 'reus'
pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə']
for wav_file in wav_files:
file_lab = os.path.join(reus_dir, wav_file.replace('.wav', '.lab'))
file_dic = os.path.join(reus_dir, wav_file.replace('.wav', '.dic'))
file_txt = os.path.join(reus_dir, wav_file.replace('.wav', '.txt'))
# output htk dict file
with open(file_dic, 'w', encoding="utf-8") as f:
for ipa in pronunciation_ipa:
cgn = convert_phone_set.ipa2cgn([ipa.replace(':', 'ː')])
barbara = convert_phone_set.cgn2barbara(cgn)
f.write(word.upper() + '\t' + barbara + '\n')
# output htk label file.
pyhtk._create_label_file(word, file_lab)
scripts.run_command([
'HVite','-T', '1',
'-a',
'-C', default.config_hvite,
'-H', default.acoustic_model,
'-m',
'-i', file_txt,
#'-S', script_file,
file_dic, default.phonelist_txt, os.path.join(wav_dir, wav_file)
])

View File

@ -0,0 +1,587 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
#import csv
#import subprocess
#from collections import Counter
#import re
import shutil
import glob
import numpy as np
import pandas as pd
from collections import Counter
#import matplotlib.pyplot as plt
#from sklearn.metrics import confusion_matrix
#import acoustic_model_functions as am_func
#import convert_xsampa2ipa
import defaultfiles as default
#from forced_alignment import pyhtk
#sys.path.append(default.forced_alignment_module_dir)
#from forced_alignment import convert_phone_set
#import acoustic_model_functions as am_func
import convert_xsampa2ipa
import stimmen_functions
import fame_functions
import convert_phoneset
from phoneset import fame_ipa, fame_asr
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
## ======================= user define =======================
#excel_file = os.path.join(default.experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
#data_dir = os.path.join(default.experiments_dir, 'stimmen', 'data')
#wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen' # 16k
#acoustic_model_dir = os.path.join(default.experiments_dir, 'friesian', 'acoustic_model', 'model')
#htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
#fa_dir = os.path.join(default.experiments_dir, 'stimmen', 'FA_44k')
#result_dir = os.path.join(default.experiments_dir, 'stimmen', 'result')
#kaldi_data_dir = os.path.join(default.kaldi_dir, 'data', 'alignme')
#kaldi_dict_dir = os.path.join(default.kaldi_dir, 'data', 'local', 'dict')
#lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')
# procedure
make_dic_file = 0
make_HTK_files = 0
extract_features = 0
#make_htk_dict_files = 0
#do_forced_alignment_htk = 0
#eval_forced_alignment_htk = 0
make_kaldi_files = 0
#make_kaldi_lexicon_txt = 0
#load_forced_alignment_kaldi = 1
#eval_forced_alignment_kaldi = 1
#sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
#from forced_alignment import convert_phone_set
#from forced_alignment import pyhtk
#sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
#from evaluation import plot_confusion_matrix
## HTK related files.
config_dir = os.path.join(default.htk_dir, 'config')
model_dir = os.path.join(default.htk_dir, 'model')
feature_dir = os.path.join(default.htk_dir, 'mfc', 'stimmen')
config_hcopy = os.path.join(config_dir, 'config.HCopy')
# files to be made.
lattice_file = os.path.join(config_dir, 'stimmen.ltc')
phonelist_txt = os.path.join(config_dir, 'phonelist.txt')
stimmen_dic = os.path.join(default.htk_dir, 'lexicon', 'stimmen_recognition.dic')
hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hcopy.scp')
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hvite.scp')
hresult_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_result.scp')
## Kaldi related files.
kaldi_data_dir = os.path.join(default.kaldi_dir, 'data')
# files to be made.
wav_scp = os.path.join(kaldi_data_dir, 'test', 'wav.scp')
text_file = os.path.join(kaldi_data_dir, 'test', 'text')
utt2spk = os.path.join(kaldi_data_dir, 'test', 'utt2spk')
corpus_txt = os.path.join(kaldi_data_dir, 'local', 'corpus.txt')
lexicon_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'lexicon.txt')
nonsilence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'nonsilence_phones.txt')
silence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'silence_phones.txt')
optional_silence_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'optional_silence.txt')
## ======================= load test data ======================
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df = stimmen_functions.add_row_asr(df)
df = stimmen_functions.add_row_htk(df)
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)
## ======================= make dic file to check pronunciation variants ======================
# dic file should be manually modified depends on the task - recognition / forced-alignemnt.
if make_dic_file:
# for HTK.
with open(stimmen_dic, mode='wb') as f:
for word in word_list:
df_ = df[df['word']==word]
pronunciations = list(np.unique(df_['htk']))
pronunciations_ = [word.upper() + ' sil ' + ' '.join(convert_phoneset.split_word(
htk, fame_asr.multi_character_phones_htk)) + ' sil'
for htk in pronunciations]
f.write(bytes('\n'.join(pronunciations_) + '\n', 'ascii'))
f.write(bytes('SILENCE sil\n', 'ascii'))
# for Kaldi.
fh.make_new_directory(os.path.join(kaldi_data_dir, 'local', 'dict'))
with open(lexicon_txt, mode='wb') as f:
f.write(bytes('!SIL sil\n', 'utf-8'))
f.write(bytes('<UNK> spn\n', 'utf-8'))
for word in word_list:
df_ = df[df['word']==word]
pronunciations = list(np.unique(df_['asr']))
pronunciations_ = [word.lower() + ' ' + ' '.join(convert_phoneset.split_word(
asr, fame_asr.multi_character_phones))
for asr in pronunciations]
f.write(bytes('\n'.join(pronunciations_) + '\n', 'utf-8'))
## ======================= test data for recognition ======================
# only target pronunciation variants.
df_rec = pd.DataFrame(index=[], columns=list(df.keys()))
for word in word_list:
variants = [htk.replace(' ', '')
for htk in stimmen_functions.load_pronunciations(word.upper(), stimmen_dic)]
df_ = df[df['word'] == word]
for index, row in df_.iterrows():
if row['htk'] in variants:
df_rec = df_rec.append(row, ignore_index=True)
## ======================= make files required for HTK ======================
if make_HTK_files:
# make a word lattice file.
pyhtk.create_word_lattice_file(
os.path.join(config_dir, 'stimmen.net'),
lattice_file)
# extract features.
with open(hcopy_scp, 'wb') as f:
filelist = [os.path.join(stimmen_test_dir, filename) + '\t'
+ os.path.join(feature_dir, os.path.basename(filename).replace('.wav', '.mfc'))
for filename in df['filename']]
f.write(bytes('\n'.join(filelist), 'ascii'))
pyhtk.wav2mfc(config_hcopy, hcopy_scp)
# make label files.
for index, row in df.iterrows():
filename = row['filename'].replace('.wav', '.lab')
label_file = os.path.join(feature_dir, filename)
with open(label_file, 'wb') as f:
label_string = 'SILENCE\n' + row['word'].upper() + '\nSILENCE\n'
f.write(bytes(label_string, 'ascii'))
## ======================= make files required for Kaldi =======================
if make_kaldi_files:
fh.make_new_directory(os.path.join(kaldi_data_dir, 'test'))
fh.make_new_directory(os.path.join(kaldi_data_dir, 'test', 'local'))
fh.make_new_directory(os.path.join(kaldi_data_dir, 'conf'))
# remove previous files.
if os.path.exists(wav_scp):
os.remove(wav_scp)
if os.path.exists(text_file):
os.remove(text_file)
if os.path.exists(utt2spk):
os.remove(utt2spk)
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
# make wav.scp, text, and utt2spk files.
for i, row in df_rec.iterrows():
filename = row['filename']
print('=== {0}: {1} ==='.format(i, filename))
wav_file = os.path.join(stimmen_test_dir, filename)
#if os.path.exists(wav_file):
speaker_id = 'speaker_' + str(i).zfill(4)
utterance_id = filename.replace('.wav', '')
utterance_id = utterance_id.replace(' ', '_')
utterance_id = speaker_id + '-' + utterance_id
# output
f_wav_scp.write('{0} {1}\n'.format(
utterance_id,
wav_file.replace('c:/', '/mnt/c/').replace('\\', '/'))) # convert path to unix format.
f_text_file.write('{0}\t{1}\n'.format(utterance_id, df_rec['word'][i].lower()))
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
f_wav_scp.close()
f_text_file.close()
f_utt2spk.close()
with open(corpus_txt, 'wb') as f:
f.write(bytes('\n'.join([word.lower() for word in word_list]) + '\n', 'utf-8'))
with open(nonsilence_phones_txt, 'wb') as f:
f.write(bytes('\n'.join(fame_asr.phoneset_short) + '\n', 'utf-8'))
with open(silence_phones_txt, 'wb') as f:
f.write(bytes('sil\nspn\n', 'utf-8'))
with open(optional_silence_txt, 'wb') as f:
f.write(bytes('sil\n', 'utf-8'))
with open(os.path.join(kaldi_data_dir, 'conf', 'decode.config'), 'wb') as f:
f.write(bytes('first_beam=10.0\n', 'utf-8'))
f.write(bytes('beam=13.0\n', 'utf-8'))
f.write(bytes('lattice_beam=6.0\n', 'utf-8'))
with open(os.path.join(kaldi_data_dir, 'conf', 'mfcc.conf'), 'wb') as f:
f.write(bytes('--use-energy=false', 'utf-8'))
## ======================= recognition ======================
listdir = glob.glob(os.path.join(feature_dir, '*.mfc'))
with open(hvite_scp, 'wb') as f:
f.write(bytes('\n'.join(listdir), 'ascii'))
with open(hresult_scp, 'wb') as f:
f.write(bytes('\n'.join(listdir).replace('.mfc', '.rec'), 'ascii'))
# calculate result
performance = np.zeros((1, 2))
for niter in range(50, 60):
output = pyhtk.recognition(
os.path.join(config_dir, 'config.rec'),
lattice_file,
os.path.join(default.htk_dir, 'model', 'hmm1', 'iter' + str(niter), 'hmmdefs'),
stimmen_dic, phonelist_txt, hvite_scp)
output = pyhtk.calc_recognition_performance(
stimmen_dic, hresult_scp)
per_sentence, per_word = pyhtk.load_recognition_output_all(output)
performance_ = np.array([niter, per_sentence['accuracy']]).reshape(1, 2)
performance = np.r_[performance, performance_]
print('{0}: {1}[%]'.format(niter, per_sentence['accuracy']))
#output = run_command_with_output([
# 'HVite', '-T', '1',
# '-C', config_rec,
# '-w', lattice_file,
# '-H', hmm,
# dictionary_file, phonelist_txt,
# '-S', HVite_scp
#])
## ======================= forced alignment using HTK =======================
if do_forced_alignment_htk:
#for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
for hmm_num in [256, 512, 1024]:
hmm_num_str = str(hmm_num)
acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')
predictions = pd.DataFrame({'filename': [''],
'word': [''],
'xsampa': [''],
'ipa': [''],
'famehtk': [''],
'prediction': ['']})
for i, filename in enumerate(df['filename']):
print('=== {0}/{1} ==='.format(i, len(df)))
if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)):
wav_file = os.path.join(wav_dir, filename)
if os.path.exists(wav_file):
word = df['word'][i]
WORD = word.upper()
fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
#if not os.path.exists(fa_file):
# make label file.
label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
with open(label_file, 'w') as f:
lines = f.write(WORD)
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite,
default.phonelist, acoustic_model)
os.remove(label_file)
prediction = am_func.read_fileFA(fa_file)
print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
else:
prediction = ''
print('!!!!! file not found.')
line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i)
predictions = predictions.append(line)
else:
prediction = ''
print('!!!!! invalid entry.')
predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl'))
## ======================= make lexicon txt which is used by Kaldi =======================
if make_kaldi_lexicon_txt:
option_num = 6
# remove previous file.
if os.path.exists(lexicon_txt):
os.remove(lexicon_txt)
lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt')
if os.path.exists(lexiconp_txt):
os.remove(lexiconp_txt)
# output lexicon.txt
f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
pronvar_list_all = []
for word in word_list:
# pronunciation variant of the target word.
pronunciation_variants = df['ipa'][df['word'].str.match(word)]
c = Counter(pronunciation_variants)
total_num = sum(c.values())
#with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f:
# for key in c.keys():
# f.write("{0},{1}\n".format(key,c[key]))
for key, value in c.most_common(option_num):
# make possible pronunciation variant list.
pronvar_list = am_func.fame_pronunciation_variant(key)
for pronvar_ in pronvar_list:
split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
pronvar_out = ' '.join(split_ipa)
pronvar_list_all.append([word, pronvar_out])
pronvar_list_all = np.array(pronvar_list_all)
pronvar_list_all = np.unique(pronvar_list_all, axis=0)
# output
f_lexicon_txt.write('<UNK>\tSPN\n')
for line in pronvar_list_all:
f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
f_lexicon_txt.close()
## ======================= load kaldi forced alignment result =======================
if load_forced_alignment_kaldi:
phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt')
merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt')
#filenames = np.load(data_dir + '\\filenames.npy')
#words = np.load(data_dir + '\\words.npy')
#pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
#pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
#word_list = np.unique(words)
# load the mapping between phones and ids.
with open(phones_txt, 'r', encoding="utf-8") as f:
mapping_phone2id = f.read().split('\n')
phones = []
phone_ids = [] # ID of phones
for m in mapping_phone2id:
m = m.split(' ')
if len(m) > 1:
phones.append(m[0])
phone_ids.append(int(m[1]))
# load the result of FA.
with open(merged_alignment_txt, 'r') as f:
lines = f.read()
lines = lines.split('\n')
predictions = pd.DataFrame({'filename': [''],
'word': [''],
'xsampa': [''],
'ipa': [''],
'famehtk': [''],
'prediction': ['']})
#fa_filenames = []
#fa_pronunciations = []
utterance_id_ = ''
pronunciation = []
for line in lines:
line = line.split(' ')
if len(line) == 5:
utterance_id = line[0]
if utterance_id == utterance_id_:
phone_id = int(line[4])
#if not phone_id == 1:
phone_ = phones[phone_ids.index(phone_id)]
phone = re.sub(r'_[A-Z]', '', phone_)
if not phone == 'SIL':
pronunciation.append(phone)
else:
filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_)
prediction = ''.join(pronunciation)
df_ = df[df['filename'].str.match(filename)]
df_idx = df_.index[0]
prediction_ = pd.Series([#filename,
#df_['word'][df_idx],
#df_['xsampa'][df_idx],
#df_['ipa'][df_idx],
#df_['famehtk'][df_idx],
df_.iloc[0,1],
df_.iloc[0,3],
df_.iloc[0,4],
df_.iloc[0,2],
df_.iloc[0,0],
prediction],
index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'],
name=df_idx)
predictions = predictions.append(prediction_)
#fa_filenames.append()
#fa_pronunciations.append(' '.join(pronunciation))
pronunciation = []
utterance_id_ = utterance_id
predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl'))
## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment_htk:
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
compare_hmm_num = 1
if compare_hmm_num:
f_result = open(os.path.join(result_dir, 'result.csv'), 'w')
f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n")
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
#for hmm_num in [256]:
hmm_num_str = str(hmm_num)
if compare_hmm_num:
f_result.write("{},".format(hmm_num_str))
#match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
#prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy'))
#prediction = pd.Series(prediction, index=df.index, name='prediction')
#result = pd.concat([df, prediction], axis=1)
result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl'))
# load pronunciation variants
for word in word_list:
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
with open(htk_dict_file, 'r') as f:
lines = f.read().split('\n')[:-1]
pronunciation_variants = [line.split('\t')[1] for line in lines]
# see only words which appears in top 3.
result_ = result[result['word'].str.match(word)]
result_ = result_[result_['famehtk'].isin(pronunciation_variants)]
match_num = sum(result_['famehtk'] == result_['prediction'])
total_num = len(result_)
print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100))
if compare_hmm_num:
f_result.write("{0},{1},".format(match_num, total_num))
else:
# output confusion matrix
cm = confusion_matrix(result_['famehtk'], result_['prediction'])
plt.figure()
plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False)
plt.savefig(result_dir + '\\cm_' + word + '.png')
if compare_hmm_num:
f_result.write('\n')
if compare_hmm_num:
f_result.close()
## ======================= evaluate the result of forced alignment of kaldi =======================
if eval_forced_alignment_kaldi:
result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl'))
f_result = open(os.path.join(result_dir, 'result.csv'), 'w')
f_result.write("word,total,valid,match,[%]\n")
# load pronunciation variants
with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f:
lines = f.read().split('\n')[:-1]
pronunciation_variants_all = [line.split('\t') for line in lines]
word_list = np.delete(word_list, [0], 0) # remove 'Oog'
for word in word_list:
# load pronunciation variant of the word.
pronunciation_variants = []
for line in pronunciation_variants_all:
if line[0] == word.lower():
pronunciation_variants.append(line[1].replace(' ', ''))
# see only words which appears in top 3.
result_ = result[result['word'].str.match(word)]
result_tolerant = pd.DataFrame({
'filename': [''],
'word': [''],
'xsampa': [''],
'ipa': [''],
'prediction': [''],
'match': ['']})
for i in range(0, len(result_)):
line = result_.iloc[i]
# make a list of all possible pronunciation variants of ipa description.
# i.e. possible answers from forced alignment.
ipa = line['ipa']
pronvar_list = [ipa]
pronvar_list_ = am_func.fame_pronunciation_variant(ipa)
if not pronvar_list_ is None:
pronvar_list += list(pronvar_list_)
# only focus on pronunciations which can be estimated from ipa.
if len(set(pronvar_list) & set(pronunciation_variants)) > 0:
if line['prediction'] in pronvar_list:
ismatch = True
else:
ismatch = False
line_df = pd.DataFrame(result_.iloc[i]).T
df_idx = line_df.index[0]
result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'],
line_df.loc[df_idx, 'word'],
line_df.loc[df_idx, 'xsampa'],
line_df.loc[df_idx, 'ipa'],
line_df.loc[df_idx, 'prediction'],
ismatch],
index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'],
name=df_idx)
result_tolerant = result_tolerant.append(result_tolerant_)
# remove the first entry (dummy)
result_tolerant = result_tolerant.drop(0, axis=0)
total_num = len(result_)
valid_num = len(result_tolerant)
match_num = np.sum(result_tolerant['match'])
print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num))
f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100))
f_result.close()
## output confusion matrix
#cm = confusion_matrix(result_['ipa'], result_['prediction'])
#plt.figure()
#plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False)
#plt.savefig(result_dir + '\\cm_' + word + '.png')

View File

@ -0,0 +1,65 @@
#
# forced alignment using novo-api.
#
# *** IMPORTANT ***
# This file should be treated as confidencial.
# This file should not be copied or uploaded to public sites.
#
# NOTES:
# The usage of novo api: https://bitbucket.org/novolanguage/python-novo-api
# I couldn't make it work as I described in the mail to Martijn Bartelds on
# 2018/12/03.
# As per the advice from him, I modified testgrammer.py and made it a function.
#
# In order to run on Python 3.6, the following points are changed in novo-api.
# (1) backend/__init__.py
# - #import session
# from . import session
# (2) backend/session.py
# - #except Exception, e:
# except Exception as e:
# - #print self.last_message
# print(self.last_message)
# (3) asr/segment/praat.py
# - def print_tier(output, title, begin, end, segs, (format, formatter))
# def print_tier(output, title, begin, end, segs, format, formatter):
# (4) asr/spraaklab/__init.py
# - #import session
# from . import session
# (5) asr/spraaklab/schema.py
# - #print data, "validated not OK", e.message
# print("{0} validated not OK {1}".format(data, e.message))
# - #print data, "validated OK"
# print("{} validated OK".format(data))
# - #if isinstance(object, basestring):
# if isinstance(object, str)
#
# Aki Kunikoshi
# 428968@gmail.com
#
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import argparse
import json
from novoapi.backend import session
import novoapi_functions
import defaultfiles as default
# username / password cannot be passed as artuments...
p = argparse.ArgumentParser()
#p.add_argument("--user", default=None)
#p.add_argument("--password", default=None)
p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxxx')
args = p.parse_args()
#wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
# list of the pronunciation for each words
word = 'pauw'
pronunciation_ipa = ['pau', 'pɑu']
result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)

View File

@ -0,0 +1,220 @@
## this script should be used only by Aki Kunikoshi.
import os
import numpy as np
import pandas as pd
import argparse
import json
from novoapi.backend import session
import defaultfiles as default
import convert_phoneset
def load_novo70_phoneset():
#phonelist_novo70_ = pd.ExcelFile(default.phonelist_novo70_xlsx)
#df = pd.read_excel(phonelist_novo70_, 'list')
## *_simple includes columns which has only one phone in.
#for ipa, novo70 in zip(df['IPA_simple'], df['novo70_simple']):
# if not pd.isnull(ipa):
# print('{0}:{1}'.format(ipa, novo70))
# translation_key[ipa] = novo70
#phonelist_novo70 = np.unique(list(df['novo70_simple']))
novo70_phoneset = pd.read_csv(default.novo70_phoneset, delimiter='\t', header=None, encoding="utf-8")
novo70_phoneset.rename(columns={0: 'novo70', 1: 'ipa', 2: 'description'}, inplace=True)
#phoneset_ipa = []
#phoneset_novo70 = []
#with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
# lines = fin.read()
# lines = lines.split('\n')
# for line in lines:
# words = line.split('\t')
# if len(words) > 1:
# novo70 = words[0]
# ipa = words[1]
# phoneset_ipa.append(ipa)
# phoneset_novo70.append(novo70)
# translation_key_ipa2novo70[ipa] = novo70
# translation_key_novo702ipa[novo70] = ipa
# As per Nederlandse phoneset_aki.xlsx recieved from David
# [ɔː] oh / ohr # from ipa->novo70, only oh is used.
# [ɪː] ih / ihr # from ipa->novo70, only ih is used.
# [iː] iy
# [œː] uh
# [ɛː] eh
# [w] wv in IPA written as ʋ.
extra_ipa = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'ʋ']
extra_novo70 = ['oh', 'ih', 'iy', 'uh', 'eh', 'wv']
phoneset_ipa = list(novo70_phoneset['ipa'])
phoneset_ipa.extend(extra_ipa)
phoneset_ipa = [i.replace('ː', ':') for i in phoneset_ipa]
phoneset_novo70 = list(novo70_phoneset['novo70'])
phoneset_novo70.extend(extra_novo70)
phoneset_novo70 = [i.replace('ː', ':') for i in phoneset_novo70]
translation_key_ipa2novo70 = dict()
translation_key_novo702ipa = dict()
for ipa, novo70 in zip(phoneset_ipa, phoneset_novo70):
#phoneset_ipa.append(ipa)
#phoneset_novo70.append(novo70)
translation_key_ipa2novo70[ipa] = novo70
translation_key_novo702ipa[novo70] = ipa
translation_key_novo702ipa['ohr'] = 'ɔ:'
translation_key_novo702ipa['ihr'] = 'ɪ:'
phoneset_ipa = np.unique(phoneset_ipa)
phoneset_novo70 = np.unique(phoneset_novo70)
return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa
def split_ipa(line):
"""
Split a line by IPA phones.
If nasalized sound (such as ɛ̃ː) is included, it will give error.
:param string line: one line written in IPA.
:return string lineSeperated: the line splitted in IPA phone.
"""
phoneset_ipa, _, _, _ = load_novo70_phoneset()
#multi_character_phones = [i for i in phoneset_ipa if len(i) > 1]
#multi_character_phones.sort(key=len, reverse=True)
#multi_character_phones = [
# # IPAs in CGN.
# u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
# ]
#return [phone for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
return convert_phoneset.split_word(line, phoneset_ipa)
def split_novo70(line):
"""
Split a line by novo70 phones.
:param string line: one line written in novo70.
:return string lineSeperated: the line splitted by novo70 phones.
"""
_, phoneset_novo70, _, _ = load_novo70_phoneset()
#multi_character_phones = [p for p in phoneset_novo70 if len(p) > 1]
#multi_character_phones = sorted(multi_character_phones, key=len, reverse=True)
multi_character_phones = convert_phoneset.extract_multi_character_phones(phoneset_novo70)
return ['sp' if phone == ' ' else phone
for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
def novo702ipa(line):
#pronunciation = []
_, _, _, translation_key = load_novo70_phoneset()
#for phone in split_novo70(tokens):
# pronunciation.append(translation_key.get(phone, phone))
#return ' '.join(pronunciation)
return ' '.join(convert_phoneset.convert_phoneset(split_novo70(line), translation_key))
# numbering of novo70 should be checked.
def ipa2novo70(line):
#pronunciation = []
_, _, translation_key, _ = load_novo70_phoneset()
#for phone in split_ipa(tokens):
# pronunciation.append(translation_key.get(phone, phone))
#return ' '.join(pronunciation)
return ' '.join(convert_phoneset.convert_phoneset(split_ipa(line), translation_key))
def make_grammar(word, pronunciation_ipa):
"""
Args:
words
pronunciation_ipa: list of pronunciation variants.
"""
#word = 'pauw'
#pronunciation_ipa = ['pau', 'pɑu']
grammer_data_elements0_pronunciation = []
for id, ipa in enumerate(pronunciation_ipa):
novo70 = ipa2novo70(ipa)
grammer_data_elements0_pronunciation.append({
"phones": novo70.split(),
"id": id
})
grammar_data = {
"kind": 'sequence',
"elements": [{
"kind": "word",
"pronunciation": grammer_data_elements0_pronunciation,
"label": word
}]
}
grammar = {
"type": "confusion_network",
"version": "1.0",
"data": grammar_data,
"return_objects": ["grammar"],
"phoneset": "novo70"
}
return grammar
def forced_alignment(wav_file, word, pronunciation_ipa):
### IMPORTANT ###
# because of this function, this script should not be uploaded / shared.
# username / password cannot be passed as artuments...
p = argparse.ArgumentParser()
p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxxx')
args = p.parse_args()
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
grammar = make_grammar(word, pronunciation_ipa)
result = rec.setgrammar(grammar)
#print "Set grammar result", res
result = rec.recognize_wav(wav_file)
return result.export()
def result2pronunciation(result, word):
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
llh = result_[0]['llh']
phones = result_[0]['phones']
pronunciation_novo70 = [phone['label'] for phone in phones]
pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
return pronunciation_ipa, pronunciation_novo70, llh
def phones_not_in_novo70(ipa):
""" extract phones which is not in novo70 phoneset. """
phoneset_ipa, _, _, _ = load_novo70_phoneset()
# As per Nederlandse phoneset_aki.xlsx recieved from David
# [ɔː] oh / ohr
# [ɪː] ih / ihr
# [iː] iy
# [œː] uh
# [ɛː] eh
# [w] wv in IPA written as ʋ.
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
return [phone for phone in split_ipa(ipa)
if not phone in phoneset_ipa and not phone in david_suggestion]
if __name__ == 'main':
pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə']
#grammar = make_grammar('reus', pronunciation_ipa)
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = load_novo70_phoneset()

View File

@ -1,437 +0,0 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import csv
import subprocess
from collections import Counter
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.metrics import confusion_matrix
import acoustic_model_functions as am_func
import convert_xsampa2ipa
import defaultfiles as default
## ======================= user define =======================
#curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
#config_ini = 'config.ini'
#repo_dir = r'C:\Users\Aki\source\repos'
#forced_alignment_module = repo_dir + '\\forced_alignment'
#forced_alignment_module_old = repo_dir + '\\aki_tools'
#ipa_xsampa_converter_dir = repo_dir + '\\ipa-xsama-converter'
#accent_classification_dir = repo_dir + '\\accent_classification\accent_classification'
excel_file = os.path.join(default.experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
#experiments_dir = r'C:\OneDrive\Research\rug\experiments'
data_dir = os.path.join(default.experiments_dir, 'stimmen', 'data')
#csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
wav_dir = os.path.join(default.experiments_dir, 'stimmen', 'wav')
acoustic_model_dir = os.path.join(default.experiments_dir, 'friesian', 'acoustic_model', 'model')
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
fa_dir = os.path.join(default.experiments_dir, 'stimmen', 'FA')
#cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')
# procedure
make_dic_files = 0
do_forced_alignment_htk = 1
make_kaldi_data_files = 0
make_kaldi_lexicon_txt = 0
load_forced_alignment_kaldi = 0
eval_forced_alignment = 0
## ======================= add paths =======================
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
from forced_alignment import convert_phone_set
from forced_alignment import pyhtk
sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
#import pyHTK
from evaluation import plot_confusion_matrix
## ======================= convert phones ======================
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
xls = pd.ExcelFile(excel_file)
## check conversion
#df = pd.read_excel(xls, 'frequency')
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
## check phones included in FAME!
# the phones used in the lexicon.
#phonelist = am_func.get_phonelist(lex_asr)
# the lines which include a specific phone.
#lines = am_func.find_phone(lex_asr, 'x')
# Filename, Word, Self Xsampa
df = pd.read_excel(xls, 'original')
ipas = []
famehtks = []
for xsampa in df['Self Xsampa']:
if not isinstance(xsampa, float): # 'NaN'
# typo?
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
xsampa = xsampa.replace(';', ':')
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
ipa = ipa.replace('ː', ':')
ipa = ipa.replace(' ', '')
ipas.append(ipa)
famehtk = convert_phone_set.ipa2famehtk(ipa)
famehtks.append(famehtk)
else:
ipas.append('')
famehtks.append('')
# extract interesting cols.
df = pd.DataFrame({'filename': df['Filename'],
'word': df['Word'],
'xsampa': df['Self Xsampa'],
'ipa': pd.Series(ipas),
'famehtk': pd.Series(famehtks)})
# cleansing.
df = df[~df['famehtk'].isin(['/', ''])]
## ======================= make dict files used for HTK. ======================
if make_dic_files:
word_list = np.unique(df['word'])
output_type = 3
for word in word_list:
htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
# pronunciation variant of the target word.
pronvar_ = df['famehtk'][df['word'].str.match(word)]
# make dic file.
am_func.make_dic(word, pronvar_, htk_dict_file, output_type)
## ======================= forced alignment using HTK =======================
if do_forced_alignment_htk:
#hmm_num = 2
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
hmm_num_str = str(hmm_num)
acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')
predictions = []
for i, filename in enumerate(df['filename']):
print('=== {0}/{1} ==='.format(i, len(df)))
wav_file = os.path.join(wav_dir, filename)
if os.path.exists(wav_file) and i in df['filename'].keys():
word = df['word'][i]
WORD = word.upper()
# make label file.
label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
with open(label_file, 'w') as f:
lines = f.write(WORD)
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, default.phonelist, acoustic_model)
prediction = am_func.read_fileFA(fa_file)
predictions.append(prediction)
os.remove(label_file)
print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
else:
predictions.append('')
print('!!!!! file not found.')
predictions = np.array(predictions)
#match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
np.save(os.path.join(data_dir, 'predictions_hmm' + hmm_num_str + '.npy'), predictions)
## ======================= make files which is used for forced alignment by Kaldi =======================
if make_kaldi_data_files:
wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
kaldi_data_dir = os.path.join(kaldi_work_dir, 'data', 'alignme')
kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
htk_dict_dir = os.path.join(experiments_dir, 'stimmen', 'dic_top3')
wav_scp = os.path.join(kaldi_data_dir, 'wav.scp')
text_file = os.path.join(kaldi_data_dir, 'text')
utt2spk = os.path.join(kaldi_data_dir, 'utt2spk')
lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
predictions = []
file_num_max = len(filenames)
# remove previous files.
if os.path.exists(wav_scp):
os.remove(wav_scp)
if os.path.exists(text_file):
os.remove(text_file)
if os.path.exists(utt2spk):
os.remove(utt2spk)
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
# make wav.scp, text, and utt2spk files.
for i in range(0, file_num_max):
#for i in range(400, 410):
print('=== {0}/{1} ==='.format(i+1, file_num_max))
filename = filenames[i]
wav_file = wav_dir + '\\' + filename
if os.path.exists(wav_file):
speaker_id = 'speaker_' + str(i).zfill(4)
utterance_id = filename.replace('.wav', '')
utterance_id = utterance_id.replace(' ', '_')
utterance_id = speaker_id + '-' + utterance_id
# wav.scp file
wav_file_unix = wav_file.replace('\\', '/')
wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/')
f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix))
# text file
word = words[i].lower()
f_text_file.write('{0}\t{1}\n'.format(utterance_id, word))
# utt2spk
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
f_wav_scp.close()
f_text_file.close()
f_utt2spk.close()
## ======================= make lexicon txt which is used by Kaldi =======================
if make_kaldi_lexicon_txt:
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
option_num = 5
# remove previous file.
if os.path.exists(lexicon_txt):
os.remove(lexicon_txt)
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
with open(csvfile, encoding="utf-8") as fin:
lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
next(lines, None) # skip the headers
filenames = []
words = []
pronunciations = []
p = []
for line in lines:
if line[1] is not '' and len(line) > 5:
filenames.append(line[0])
words.append(line[1])
pron_xsampa = line[3]
pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
pron_ipa = pron_ipa.replace('ː', ':')
# adjust to phones used in the acoustic model.
pronunciations.append(pron_ipa)
# check if all phones are in the phonelist of the acoustic model.
#'y', 'b', 'ɾ', 'u', 'ɔ:', 'ø', 't', 'œ', 'n', 'ɒ', 'ɐ', 'f', 'o', 'k', 'x', 'ɡ', 'v', 's', 'ɛ:', 'ɪ:', 'ɑ', 'ɛ', 'a', 'd', 'z', 'ɪ', 'ɔ', 'l', 'i:', 'm', 'p', 'a:', 'i', 'e', 'j', 'o:', 'ʁ', 'h', ':', 'e:', 'ə', 'æ', 'χ', 'w', 'r', 'ə:', 'sp', 'ʊ', 'u:', 'ŋ'
filenames = np.array(filenames)
words = np.array(words)
wordlist = np.unique(words)
pronunciations = np.array(pronunciations)
# output lexicon.txt
#f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
pronvar_list_all = []
for word in word_list:
# pronunciation variant of the target word.
pronvar_ = pronunciations[words == word]
# remove ''
pronvar_ = np.delete(pronvar_, np.where(pronvar_==''))
c = Counter(pronvar_)
total_num = sum(c.values())
for key, value in c.most_common(option_num):
#print('{0}\t{1}\t{2}\t{3}'.format(word, key, value, total_num))
key = key.replace('æ', 'ɛ')
key = key.replace('ɐ', 'a')
key = key.replace('ɑ', 'a')
key = key.replace('ɾ', 'r')
key = key.replace('ʁ', 'r')
key = key.replace('ʊ', 'u')
key = key.replace('χ', 'x')
#print('-->{0}\t{1}\t{2}\t{3}\n'.format(word, key, value, total_num))
# make possible pronounciation variant list.
pronvar_list = [key]
while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
pronvar_list_ = []
for p in pronvar_list:
if 'ø:' in p:
pronvar_list_.append(p.replace('ø:', 'ö'))
pronvar_list_.append(p.replace('ø:', 'ö:'))
if 'œ' in p:
pronvar_list_.append(p.replace('œ', 'ɔ̈'))
pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
if 'ɒ' in p:
pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
pronvar_list = np.unique(pronvar_list_)
for pronvar_ in pronvar_list:
split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
pronvar_out = ' '.join(split_ipa)
pronvar_list_all.append([word, pronvar_out])
# output
pronvar_list_all = np.array(pronvar_list_all)
pronvar_list_all = np.unique(pronvar_list_all, axis=0)
#f_lexicon_txt.write('<UNK>\tSPN\n')
#for line in pronvar_list_all:
# f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
#f_lexicon_txt.close()
## ======================= load kaldi forced alignment result =======================
if load_forced_alignment_kaldi:
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
phones_txt = kaldi_work_dir + '\\data\\lang\\phones.txt'
merged_alignment_txt = kaldi_work_dir + '\\exp\\tri1_alignme\\merged_alignment.txt'
filenames = np.load(data_dir + '\\filenames.npy')
words = np.load(data_dir + '\\words.npy')
pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
word_list = np.unique(words)
# load the mapping between phones and ids.
with open(phones_txt, 'r', encoding="utf-8") as f:
mappings = f.read().split('\n')
phones = []
phone_ids = []
for m in mappings:
m = m.split(' ')
if len(m) > 1:
phones.append(m[0])
phone_ids.append(int(m[1]))
with open(merged_alignment_txt, 'r') as f:
lines = f.read()
lines = lines.split('\n')
fa_filenames = []
fa_pronunciations = []
filename_ = ''
pron = []
for line in lines:
line = line.split(' ')
if len(line) == 5:
filename = line[0]
if filename == filename_:
phone_id = int(line[4])
#if not phone_id == 1:
phone = phones[phone_ids.index(phone_id)]
pron_ = re.sub(r'_[A-Z]', '', phone)
if not pron_ == 'SIL':
pron.append(pron_)
else:
fa_filenames.append(re.sub(r'speaker_[0-9]{4}-', '', filename))
fa_pronunciations.append(' '.join(pron))
pron = []
filename_ = filename
# correct or not.
#for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):
## ======================= evaluate the result of forced alignment =======================
if eval_forced_alignment:
match_num = []
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
#hmm_num = 256
hmm_num_str = str(hmm_num)
match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
# use dic_short?
if 1:
pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
for word in word_list:
fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
# see only words which appears in top 3.
match_short = []
for line in match:
word = line[0]
WORD = word.upper()
pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
if line[1] in pronvar:
match_short.append(line)
match_short = np.array(match_short)
match = np.copy(match_short)
# number of match
total_match = sum(match[:, 1] == match[:, 2])
print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
match_num.append([hmm_num, total_match, match.shape[0]])
# number of mixtures vs accuracy
match_num = np.array(match_num)
plt.xscale("log")
plt.plot(match_num[:, 0], match_num[:, 1]/match_num[0, 2], 'o-')
plt.xlabel('number of mixtures', fontsize=14, fontweight='bold')
plt.ylabel('accuracy', fontsize=14, fontweight='bold')
plt.show()
# confusion matrix
#dir_out = r'C:\OneDrive\Research\rug\experiments\stimmen\result'
#word_list = np.unique(match[:, 0])
#for word in word_list:
# match_ = match[match[:, 0] == word, :]
# cm = confusion_matrix(match_[:, 1], match_[:, 2])
# pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
# plt.figure()
# plot_confusion_matrix(cm, classes=pronvar, normalize=True)
# plt.savefig(dir_out + '\\cm_' + word + '.png')

View File

@ -0,0 +1,154 @@
""" definition of the phones to be used. """
# phonese in {FAME}/lexicon/lex.asr
phoneset = [
# vowels
'a',
'a:',
'e',
'e:',
'i',
'i:',
'',
'o',
'o:',
'ö',
'ö:',
'u',
'u:',
'ü',
'ü:',
#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
'',
'y',
'ɔ',
'ɔ:',
'ɔ̈',
'ɔ̈:',
'ə',
'ɛ',
'ɛ:',
'ɪ',
'ɪ:',
# plosives
'p',
'b',
't',
'd',
'k',
'g',
'ɡ', # = 'g'
# nasals
'm',
'n',
'ŋ',
# fricatives
'f',
'v',
's',
's:',
'z',
'x',
'h',
# tap and flip
'r',
'r:',
# approximant
'j',
'l'
]
## reduce the number of phones.
# the phones which seldom occur are replaced with another more popular phones.
# replacements are based on the advice from Martijn Wieling.
reduction_key = {
'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g',
# aki added because this is used in stimmen_project.
'ɔ̈:':'ɔ:'
}
# already removed beforehand in phoneset. Just to be sure.
phones_to_be_removed = ['ú', 's:']
def phone_reduction(phones):
"""
Args:
phones (list): list of phones.
"""
if sum([phone in phones for phone in phones_to_be_removed]) != 0:
print('input includes phone(s) which is not defined in fame_asr.')
print('those phone(s) are removed.')
return [reduction_key.get(i, i) for i in phones
if i not in phones_to_be_removed]
phoneset_short = list(set(phone_reduction(phoneset)))
phoneset_short.sort()
## translation_key to htk format (ascii).
# phones which gives UnicodeEncodeError when phone.encode("ascii")
# are replaced with other characters.
translation_key_asr2htk = {
'': 'i_',
'': 'u_',
# on the analogy of German umlaut, 'e' is used.
'ö': 'oe', 'ö:': 'oe:', ''
'ü': 'ue', 'ü:': 'ue:',
# on the analogy of Chinese...
'ŋ': 'ng',
# refer to Xsampa.
'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
#'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen.
'ɛ': 'E', 'ɛ:': 'E:',
'ɪ': 'I', 'ɪ:': 'I:',
# it is @ in Xsampa, but that is not handy on HTK.
'ə': 'A'
}
phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
#not_in_ascii = [
# '\'',
# 'â', 'ê', 'ô', 'û', 'č',
# 'à', 'í', 'é', 'è', 'ú', 'ć',
# 'ä', 'ë', 'ï', 'ö', 'ü'
#]
translation_key_word2htk = {
#'\'': '\\\'',
'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
'à':'a2', 'è':'e2',
'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
'č':'c4',
'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
}
#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
#Stop: p, b, t, d, k, g
#Nasal: m, n, ng(ŋ)
#Fricative: s, z, f, v, h, x
#Liquid: l, r
#Vowel: a, a:, e:, i, i:, i_(i̯), o, o:, u, u:, u_(ṷ), oe(ö), oe:(ö:), ue(ü), ue:(ü:), O(ɔ), O:(ɔ:), Oe(ɔ̈), A(ə), E(ɛ), E:(ɛ:), I(ɪ), I:(ɪ:)
## the list of multi character phones.
# for example, the length of 'a:' is 3, but in the codes it is treated as one letter.
# original.
multi_character_phones = [i for i in phoneset if len(i) > 1]
multi_character_phones.sort(key=len, reverse=True)
# phonset reduced.
multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
multi_character_phones_short.sort(key=len, reverse=True)
# htk compatible.
multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
multi_character_phones_htk.sort(key=len, reverse=True)

View File

@ -0,0 +1,138 @@
""" definition of the phones to be used. """
phoneset = [
# vowels
'',
'i̯ⁿ',
'y',
'y:', # not included in lex.ipa, but in stimmen.
'i',
'i.',
'iⁿ',
'i:',
'i:ⁿ',
'ɪ',
'ɪⁿ',
'ɪ.',
'ɪ:', # not included in lex.ipa, but in stimmen.
'ɪ:ⁿ',
'e',
'e:',
'e:ⁿ',
'ə',
'əⁿ',
'ə:',
'ɛ',
'ɛ.',
'ɛⁿ',
'ɛ:',
'ɛ:ⁿ',
'a',
'aⁿ',
'a.',
'a:',
'a:ⁿ',
'',
'ṷ.',
'ṷⁿ',
#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Fries may be mistakes so I removed this phone.
'u',
'uⁿ',
'u.',
'u:',
'u:ⁿ',
'ü',
'ü.',
'üⁿ',
'ü:',
'ü:ⁿ',
'o',
'oⁿ',
'o.',
'o:',
'o:ⁿ',
'ö',
'ö.',
'öⁿ',
'ö:',
'ö:ⁿ',
'ɔ',
'ɔ.',
'ɔⁿ',
'ɔ:',
'ɔ:ⁿ',
'ɔ̈', # not included in lex.ipa
'ɔ̈.',
'ɔ̈:',
# plosives
'p',
'b',
't',
'tⁿ',
'd',
'k',
'g',
'ɡ', # = 'g'
# nasals
'm',
'n',
'ŋ',
# fricatives
'f',
'v',
's',
's:',
'z',
'zⁿ',
'x',
'h',
# tap and flip
'r',
'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
# approximant
'j',
'j.',
'l'
]
## reduce the number of phones.
# the phones which are used in stimmen transcription but not in FAME corpus.
# replacements are based on the advice from Jelske Dijkstra on 2018/06/21.
stimmen_replacement = {
'æ': 'ɛ',
'ø': 'ö', # or 'ö:'
'ø:': 'ö:', # Aki added.
'œ': 'ɔ̈', # or 'ɔ̈:'
'œ:': 'ɔ̈:', # Aki added.
'ɐ': 'a', # or 'a:'
'ɐ:': 'a:', # Aki added.
'ɑ': 'a', # or 'a:'
'ɑ:': 'a:', # Aki added
'ɒ': 'ɔ', # or 'ɔ:'
'ɒ:': 'ɔ:', # Aki added.
'ɾ': 'r',
'ʁ': 'r',
'ʊ': 'u',
'χ': 'x',
# aki guessed.
'ʀ': 'r',
'ɹ': 'r',
'w': 'ö'
}
phoneset.extend(list(stimmen_replacement.keys()))
def phone_reduction(phones):
return [stimmen_replacement.get(i, i) for i in phones]
## the list of multi character phones.
# for example, the length of 'i̯ⁿ' is 3, but in the codes it is treated as one letter.
multi_character_phones = [i for i in phoneset if len(i) > 1]
multi_character_phones.sort(key=len, reverse=True)

Binary file not shown.

View File

@ -0,0 +1,197 @@
import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import fame_functions
from phoneset import fame_ipa, fame_asr
import convert_phoneset
## general
stop = 'p, b, t, d, k, g'
nasal = 'm, n, ŋ'
fricative = 's, z, f, v, h, x, j'
liquid = 'l, r'
vowel = 'a, a:, e:, i, i:, i̯, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈, ə, ɛ, ɛ:, ɪ, ɪ:'
## consonant
c_front = 'p, b, m, f, v'
c_central = 't, d, n, s, z, l, r'
c_back = 'k, g, ŋ, h, x, j'
fortis = 'p, t, k, f, s'
lenis = 'b, d, g, v, z, j'
neither_fortis_nor_lenis = 'm, n, ŋ, h, l, r, x'
coronal = 't, d, n, s, z, l, r, j'
non_coronal = 'p, b, m, k, g, ŋ, f, v, h, x'
anterior = 'p, b, m, t, d, n, f, v, s, z, l'
non_anterior = 'k, g, ŋ, h, x, j, r'
continuent = 'm, n, ŋ, f, v, s, z, h, l, r'
non_continuent = 'p, b, t, d, k, g, x, j'
strident = 's, z, j'
non_strident = 'f, v, h'
unstrident = 'p, b, t, d, m, n, ŋ, k, g, r, x'
glide = 'h, l, r'
syllabic = 'm, l, ŋ'
unvoiced = 'p, t, k, s, f, x, h'
voiced = 'b, d, g, z, v, m, n, ŋ, l, r, j'
#affricate: ???
non_affricate = 's, z, f, v'
voiced_stop = 'b, d, g'
unvoiced_stop = 'p, t, k'
front_stop = 'p, b'
central_stop = 't, d'
back_stop = 'k, g'
voiced_fricative = 'z, v'
unvoiced_fricative = 's, f'
front_fricative = 'f, v'
central_fricative = 's, z'
back_fricative = 'j'
## vowel
v_front = 'i, i:, i̯, ɪ, ɪ:, e:, ə, ɛ, ɛ:, a, a:'
v_central = 'ə, ɛ, ɛ:, a, a:'
v_back = 'u, u:, ü, ü:, ṷ, ɔ, ɔ:, ɔ̈, ö, ö:, o, o:'
long = 'a:, e:, i:, o:, u:, ö:, ü:, ɔ:, ɛ:, ɪ:'
short = 'a, i, i̯, o, u, ṷ, ö, ü, ɔ, ɔ̈, ə, ɛ, ɪ'
#Dipthong: ???
#Front-Start: ???
#Fronting: ???
high = 'i, i:, i̯, ɪ, ɪ: u, u:, ṷ, ə, e:, o, o:, ö, ö:, ü, ü:'
medium = 'e:, ə, ɛ, ɛ:, ɔ, ɔ:, ɔ̈, o, o:, ö, ö:'
low = 'a, a:, ɛ, ɛ:, ɔ, ɔ:, ɔ̈'
rounded = 'a, a:, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈'
unrounded = 'i, i:, i̯, e:, ə, ɛ, ɛ:, ɪ, ɪ:'
i_vowel = 'i, i:, i̯, ɪ, ɪ:'
e_vowel = 'e:,ə, ɛ, ɛ:'
a_vowel = 'a, a:'
o_vowel = 'o, o:, ö, ö:, ɔ, ɔ:, ɔ̈'
u_vowel = 'u, u:, ṷ, ü, ü:'
## htk phoneset
phoneset = fame_asr.phoneset_htk
## convert ipa group to htk format for quests.hed.
def _ipa2quest(R_or_L, ipa_text):
assert R_or_L in ['R', 'L'], print('the first argument should be either R or L.')
ipa_list = ipa_text.replace(' ', '').split(',')
if R_or_L == 'R':
quests_list = ['*+' + fame_functions.ipa2htk(ipa) for ipa in ipa_list]
else:
quests_list = [fame_functions.ipa2htk(ipa) + '-*' for ipa in ipa_list]
return ','.join(quests_list)
def make_quests_hed(quest_hed):
def _add_quests_item(R_or_L, item_name_, ipa_text):
assert R_or_L in ['R', 'L'], print('the first argument should be either R or L.')
item_name = R_or_L + '_' + item_name_
with open(quest_hed, 'ab') as f:
f.write(bytes('QS "' + item_name + '"\t{ ' + _ipa2quest(R_or_L, ipa_text) + ' }\n', 'ascii'))
if os.path.exists(quest_hed):
os.remove(quest_hed)
for R_or_L in ['R', 'L']:
_add_quests_item(R_or_L, 'NonBoundary', '*')
_add_quests_item(R_or_L, 'Silence', 'sil')
_add_quests_item(R_or_L, 'Stop', stop)
_add_quests_item(R_or_L, 'Nasal', nasal)
_add_quests_item(R_or_L, 'Fricative', fricative)
_add_quests_item(R_or_L, 'Liquid', liquid)
_add_quests_item(R_or_L, 'Vowel', vowel)
_add_quests_item(R_or_L, 'C-Front', c_front)
_add_quests_item(R_or_L, 'C-Central', c_central)
_add_quests_item(R_or_L, 'C-Back', c_back)
_add_quests_item(R_or_L, 'V-Front', v_front)
_add_quests_item(R_or_L, 'V-Central', v_central)
_add_quests_item(R_or_L, 'V-Back', v_back)
_add_quests_item(R_or_L, 'Front', c_front + v_front)
_add_quests_item(R_or_L, 'Central', c_central + v_central)
_add_quests_item(R_or_L, 'Back', c_front + v_back)
_add_quests_item(R_or_L, 'Fortis', fortis)
_add_quests_item(R_or_L, 'Lenis', lenis)
_add_quests_item(R_or_L, 'UnFortLenis', neither_fortis_nor_lenis)
_add_quests_item(R_or_L, 'Coronal', coronal)
_add_quests_item(R_or_L, 'NonCoronal', non_coronal)
_add_quests_item(R_or_L, 'Anterior', anterior)
_add_quests_item(R_or_L, 'NonAnterior', non_anterior)
_add_quests_item(R_or_L, 'Continuent', continuent)
_add_quests_item(R_or_L, 'NonContinuent', non_continuent)
_add_quests_item(R_or_L, 'Strident', strident)
_add_quests_item(R_or_L, 'NonStrident', non_strident)
_add_quests_item(R_or_L, 'UnStrident', unstrident)
_add_quests_item(R_or_L, 'Glide', glide)
_add_quests_item(R_or_L, 'Syllabic', syllabic)
_add_quests_item(R_or_L, 'Unvoiced-Cons', unvoiced)
_add_quests_item(R_or_L, 'Voiced-Cons', voiced)
_add_quests_item(R_or_L, 'Unvoiced-All', unvoiced + ', sil')
_add_quests_item(R_or_L, 'Long', long)
_add_quests_item(R_or_L, 'Short', short)
#_add_quests_item(R_or_L, 'Dipthong', xxx)
#_add_quests_item(R_or_L, 'Front-Start', xxx)
#_add_quests_item(R_or_L, 'Fronting', xxx)
_add_quests_item(R_or_L, 'High', high)
_add_quests_item(R_or_L, 'Medium', medium)
_add_quests_item(R_or_L, 'Low', low)
_add_quests_item(R_or_L, 'Rounded', rounded)
_add_quests_item(R_or_L, 'UnRounded', unrounded)
#_add_quests_item(R_or_L, 'Affricative', rounded)
_add_quests_item(R_or_L, 'NonAffricative', non_affricate)
_add_quests_item(R_or_L, 'IVowel', i_vowel)
_add_quests_item(R_or_L, 'EVowel', e_vowel)
_add_quests_item(R_or_L, 'AVowel', a_vowel)
_add_quests_item(R_or_L, 'OVowel', o_vowel)
_add_quests_item(R_or_L, 'UVowel', u_vowel)
_add_quests_item(R_or_L, 'Voiced-Stop', voiced_stop)
_add_quests_item(R_or_L, 'UnVoiced-Stop', unvoiced_stop)
_add_quests_item(R_or_L, 'Front-Stop', front_stop)
_add_quests_item(R_or_L, 'Central-Stop', central_stop)
_add_quests_item(R_or_L, 'Back-Stop', back_stop)
_add_quests_item(R_or_L, 'Voiced-Fric', voiced_fricative)
_add_quests_item(R_or_L, 'UnVoiced-Fric', unvoiced_fricative)
_add_quests_item(R_or_L, 'Front-Fric', front_fricative)
_add_quests_item(R_or_L, 'Central-Fric', central_fricative)
_add_quests_item(R_or_L, 'Back-Fric', back_fricative)
for p in phoneset:
_add_quests_item(R_or_L, p, p)
return

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,119 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import glob
import pandas as pd
import convert_xsampa2ipa
import defaultfiles as default
import fame_functions
import novoapi_functions
def _load_transcriptions():
stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
df = pd.read_excel(stimmen_transcription, 'original')
# mapping from ipa to xsampa
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
# if not ipa_converted == ipa:
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
ipas = []
for xsampa in df['Self Xsampa']:
if not isinstance(xsampa, float): # 'NaN'
# typo?
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t').replace(';', ':')
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
ipa = ipa.replace('ː', ':').replace(' ', '')
ipas.append(ipa)
else:
ipas.append('')
df_ = pd.DataFrame({'filename': df['Filename'],
'word': df['Word'],
'xsampa': df['Self Xsampa'],
'ipa': pd.Series(ipas)})
# not valid inputs, but seperator.
df_ = df_[~df_['ipa'].str.contains('/')]
return df_.dropna()
def load_transcriptions():
""" in default.stimmen_transcription_xlsx
rows of which wav files can be easily found"""
df = _load_transcriptions()
df_ = pd.DataFrame(index=[], columns=list(df.keys()))
for index, row in df.iterrows():
filename = row['filename']
if isinstance(filename, str):
wav_file = os.path.join(default.stimmen_wav_dir, filename)
if os.path.exists(wav_file):
df_ = df_.append(row, ignore_index=True)
return df_
def load_transcriptions_clean(clean_wav_dir):
df = _load_transcriptions()
wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))
df_clean = pd.DataFrame(index=[], columns=list(df.keys()))
for wav_file in wav_file_list:
filename = os.path.basename(wav_file)
df_ = df[df['filename'].str.match(filename)]
df_clean = pd.concat([df_clean, df_])
return df_clean
def load_transcriptions_novo70(clean_wav_dir):
""" extract rows of which ipa is written in novo70 phonset. """
df = load_transcriptions_clean(clean_wav_dir)
df_novo70 = pd.DataFrame(index=[], columns=list(df.keys()))
for index, row in df.iterrows():
not_in_novo70 = novoapi_functions.phones_not_in_novo70(row['ipa'])
if len(not_in_novo70) == 0:
df_novo70 = df_novo70.append(row, ignore_index=True)
return df_novo70
def add_row_htk(df):
""" df['htk'] is made from df['ipa'] and added. """
htk = []
for index, row in df.iterrows():
htk.append(fame_functions.ipa2htk(row['ipa']))
return df.assign(htk=htk)
def add_row_asr(df):
""" df['asr'] is made from df['ipa'] and added. """
asr = []
for index, row in df.iterrows():
asr.append(fame_functions.ipa2asr(row['ipa']))
return df.assign(asr=asr)
def load_pronunciations(WORD, htk_dic):
""" load pronunciation variants from HTK dic file.
Args:
WORD (str): word in capital letters.
htk_dic (path): HTK dict file.
Returns:
(pronunciations) (list): pronunciation variants of WORD.
Notes:
Because this function loads all contents from htk_dic file,
it is not recommended to use for large lexicon.
"""
with open(htk_dic) as f:
lines = f.read().replace(' sil', '')
lines = lines.split('\n')
return [' '.join(line.split(' ')[1:])
for line in lines if line.split(' ')[0]==WORD]

View File

@ -0,0 +1,93 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import sys
import shutil
from collections import Counter
import numpy as np
import pandas as pd
import defaultfiles as default
import convert_xsampa2ipa
import stimmen_functions
import fame_functions
import convert_phoneset
from phoneset import fame_ipa, fame_asr
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
## ======================= user define =======================
## ======================= make test data ======================
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
## copy wav files which is in the stimmen data.
df = stimmen_functions.load_transcriptions()
#for index, row in df.iterrows():
# filename = row['filename']
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
# after manually removed files which has too much noise and multiple words...
# update the info.
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
# count how many files are removed due to the quality.
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)
for word in word_list:
df_ = df[df['word']==word]
df_clean_ = df_clean[df_clean['word']==word]
print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100))
## check phones included in stimmen but not in FAME!
splitted_ipas = [' '.join(
convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
for ipa in df['ipa']]
stimmen_phones = set(' '.join(splitted_ipas))
stimmen_phones = list(stimmen_phones)
fame_phones = fame_ipa.phoneset
stimmen_phones.sort()
fame_phones.sort()
print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
set(stimmen_phones) - set(fame_phones)
))
for ipa in df['ipa']:
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
if ':' in ipa_splitted:
print(ipa_splitted)
## check pronunciation variants
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
df_clean = stimmen_functions.add_row_asr(df_clean)
df_clean = stimmen_functions.add_row_htk(df_clean)
for word in word_list:
#word = word_list[1]
df_ = df_clean[df_clean['word']==word]
c = Counter(df_['htk'])
pronunciations = dict()
for key, value in zip(c.keys(), c.values()):
if value > 3:
pronunciations[key] = value
print(pronunciations)
monophone_mlf = os.path.join(default.htk_dir, 'label', 'train_phone_aligned.mlf')
triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')
def filenames_in_mlf(file_mlf):
with open(file_mlf) as f:
lines_ = f.read().split('\n')
lines = [line for line in lines_ if len(line.split(' ')) == 1 and line != '.']
filenames = [line.replace('"', '').replace('*/', '') for line in lines[1:-1]]
return filenames
filenames_mono = filenames_in_mlf(monophone_mlf)
filenames_tri = filenames_in_mlf(triphone_mlf)

View File

@ -0,0 +1,5 @@
#!/usr/bin/env python
__version__ = "0.2"
import backend

View File

@ -0,0 +1,6 @@
#!/usr/bin/env python
#import segments
#import spraaklab
from . import segments
from . import spraaklab

View File

@ -0,0 +1,4 @@
#!/usr/bin/env python
from .segments import Segmentation
from .praat import seg2tg

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python
# (c) 2015--2018 NovoLanguage, author: David A. van Leeuwen
import codecs
def print_header(output, begin, end, nr_tiers):
print >> output, 'File type = "ooTextFile"'
print >> output, 'Object class = "TextGrid"'
print >> output, ''
print >> output, 'xmin = %s' % begin
print >> output, 'xmax = %s' % end
print >> output, 'tiers? <exists>'
print >> output, 'size = %d' % nr_tiers
print >> output, 'item []:'
def print_info_tier(output, title, begin, end, label):
print >> output, '\titem [%d]:' % 0
print >> output, '\t\tclass = "IntervalTier"'
print >> output, '\t\tname = "%s"' % title
print >> output, '\t\txmin = %s' % begin
print >> output, '\t\txmax = %s' % end
print >> output, '\t\tintervals: size = %d' % 1
print >> output, '\t\tintervals [1]:'
print >> output, '\t\t\txmin = %s' % begin
print >> output, '\t\t\txmax = %s' % end
print >> output, '\t\t\ttext = "%s"' % label
def print_tier(output, title, begin, end, segs, format, formatter):
print >> output, '\titem [%d]:' % 0
print >> output, '\t\tclass = "IntervalTier"'
print >> output, '\t\tname = "%s"' % title
print >> output, '\t\txmin = %s' % begin
print >> output, '\t\txmax = %s' % end
print >> output, '\t\tintervals: size = %d' % len(segs)
count = 1
for seg in segs:
#print seg
print >> output, '\t\tintervals [%d]:' % count
print >> output, '\t\t\txmin = %s' % repr(int(seg['begin']) / 100.0)
print >> output, '\t\t\txmax = %s' % repr(int(seg['end']) / 100.0)
string = '\t\t\ttext = "' + format + '"'
print >> output, string % formatter(seg['label'])
count += 1
def seg2tg(fname, segments):
if not segments:
return
output = codecs.open(fname, "w", encoding="utf-8")
confidences = []
word_labels = []
phones = []
for s in segments:
conf = s.llh if hasattr(s, "llh") else s.score
confidences.append({'begin': s.begin, 'end': s.end, 'label': conf})
word_labels.append({'begin': s.begin, 'end': s.end, 'label': s.label})
for p in s.phones:
phones.append({'begin': p.begin, 'end': p.end, 'label': p.label})
begin = repr(int(segments[0].begin) / 100.0)
end = repr(int(segments[-1].end) / 100.0)
nr_tiers = 3
print_header(output, begin, end, nr_tiers)
#print_tier(output, "confidence", begin, end, confidences, ('%.3f', lambda x: x))
#print_tier(output, "words", begin, end, word_labels, ('%s', lambda x: x))
#print_tier(output, "phones", begin, end, phones, ('%s', lambda x: x))
print_tier(output, "confidence", begin, end, confidences, '%.3f', lambda x: x)
print_tier(output, "words", begin, end, word_labels, '%s', lambda x: x)
print_tier(output, "phones", begin, end, phones, '%s', lambda x: x)
output.close()

View File

@ -0,0 +1,99 @@
#!/usr/bin/env python
# (c) 2015--2018 NovoLanguage, author: David A. van Leeuwen
## These classes can be initialized with dictionaries, as they are returned by the python spraaklab recognition system.
class Segment(object):
def __init__(self, segment):
self.begin = segment["begin"]
self.end = segment["end"]
self.begintime = segment.get("beginTime", self.begin / 100.0)
self.endtime = segment.get("endTime", self.end / 100.0)
self.label = segment["label"]
self.score = segment["score"]
if "llh" in segment:
self.llh = segment["llh"]
if "phones" in segment:
self.type = "word"
self.phones = Segmentation(segment["phones"], ["sil"])
if hasattr(self.phones[0], "llh"):
self.minllh = min([s.llh for s in self.phones]) ## the current word llh for error detection
else:
self.type = "phone"
def __repr__(self):
res = "%8.3f -- %8.3f score %8.3f " % (self.begintime, self.endtime, self.score)
if hasattr(self, "llh"):
res += "llh %8.3f " % self.llh
res += self.label.encode("utf8")
return res
def export(self):
r = {"begin": self.begin, "end": self.end, "label": self.label, "score": self.score, "type": self.type}
if hasattr(self, "llh"):
r["llh"] = self.llh
if hasattr(self, "phones"):
r["phones"] = self.phones.export()
return r
class Segmentation(object):
def __init__(self, segments, sils=["<s>", "</s>", "!sil"]):
"""Create a segmentation from a spraaklab recognition structure.
segments: an array of words (or phones), represented by a dict with
"begin", "end", "label", "score", and "llh" keys. Words can also have
"phones" which is another array of segments."""
self.segments = [Segment(s) for s in segments]
if self.segments:
self.type = self.segments[0].type
else:
self.type = None
self.sils = sils
self.orig = segments ## in case we want to have access to the original recognition structure
def __getitem__(self, item):
return self.segments[item]
def __repr__(self):
ns = len(self.segments)
res = "Segmentation with %d %s%s" % (ns, self.type, "" if ns==1 else "s")
for seg in self.segments:
res += "\n " + repr(seg)
return res
def __len__(self):
return len(self.segments)
def score(self, skip=None):
if not skip:
skip = self.sils
s = 0.0
for seg in self.segments:
if seg.label not in skip:
s += seg.score
return s
def llhs(self, skip=None):
if not skip:
skip = self.sils
return [seg.llh for seg in self.segments if hasattr(seg, "llh") and seg.label not in skip]
def llh(self, skip=None):
return sum(self.llhs(skip))
def minllh(self, skip=None):
llhs = self.llhs(skip)
if llhs:
return min(llhs)
else:
return None
def labels(self, skip=None):
if not skip:
skip = self.sils
return [seg.label for seg in self.segments if seg.label not in skip]
def sentence(self, skip=None):
return " ".join(self.labels(skip))
def export(self):
return [seg.export() for seg in self.segments]

View File

@ -0,0 +1,4 @@
#!/usr/bin/env python
#import schema
from . import schema

View File

@ -0,0 +1,273 @@
#!/usr/bin/env python
## (c) 2017 NovoLanguage, author: David A. van Leeuwen
## The purpose of this to define the grammar structure in a json schema, so that it can be validated,
## (de)serialized, and perhaps even automatically converted to a Python class structure.
import json
import jsonschema
grammar_schema_v10 = {
"$schema": "http://json-schema.org/schema#",
"title": "NovoLanguage grammar",
"description": "A grammar specification for the NovoLanguage Automatic Speech Recognition",
"$ref": "#/definitions/group",
"definitions": {
"phones": {
"type": "array",
"items": {
"type": "string"
},
"minItems": 1
},
"pronunciation": {
"type": "object",
"properties": {
"phones": {
"$ref": "#/definitions/phones"
},
"syllables": {
"type": "array",
"items": {
"$ref": "#/definitions/syllable"
},
"minItems": 1
},
"id": {
"type": "integer",
"description": "ID to distinguish this pronunciation from other variants"
},
"meta": {
"type": "object"
}
},
"required": ["phones"]
},
"syllable": {
"type": "object",
"properties": {
"begin": {
"type": "integer",
"minimum": 0
},
"end": {
"type": "integer",
"minimum": 0
},
"stress": {
"type": "integer",
"minimum": 0
},
"tone": {
"type": "integer",
"minimum": 0
}
},
"required": ["begin", "end"]
},
"word": {
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["word"]
},
"label": {
"type": "string"
},
"pronunciation": {
"anyOf": [
{
"$ref": "#/definitions/pronunciation"
},
{
"type": "array",
"items": {
"anyOf": [
{
"$ref": "#/definitions/pronunciation"
},
{
"$ref": "#/definitions/phones"
}
]
},
"minItems": 1
},
{
"$ref": "#/definitions/phones"
}
]
},
"syllables": {
"type": "array",
"items": {
"$ref": "#/definitions/syllable"
}
},
"graphemes": {
"type": "array",
"items": {
"type": "string"
}
},
"id": {
"type": "integer",
"description": "ID to distinguish this word from other words (with possibly the same label)"
},
"meta": {
"type": "object"
}
},
"required": ["label"]
},
"element": {
"title": "element",
"oneOf": [
{
"$ref": "#/definitions/word"
},
{
"$ref": "#/definitions/group"
},
{
"type": ["string", "null"]
}
]
},
"group": {
"title": "element group",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["sequence", "alternatives", "order"]
},
"elements": {
"type": "array",
"items": {
"$ref": "#/definitions/element"
},
"minItems": 1,
},
"meta": {
"type": "object"
}
},
"required": ["kind", "elements"]
}
}
}
grammar_schema_v01 = {
"$schema": "http://json-schema.org/schema#",
"title": "NovoLanguage grammar v0.1",
"description": "A grammar specification for the NovoLanguage Automatic Speech Recognition",
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["multiple_choice", "word_order"]
},
"parts": {
"type": "array",
"minItems": 1,
"maxItems": 5,
"items": {
"type": ["string", "array"],
"items": {
"type": ["string"]
}
}
}
}
}
grammar_rpc_schema = {
"$schema": "http://json-schema.org/schema#",
"title": "NovoLanguage RPC grammar",
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["confusion_network"]
},
"version": {
"type": "string",
"default": "v0.1"
},
"data": {
"type": "object"
},
"return_dict": {
"type": "boolean"
},
"return_objects": {
"type": "array",
"items": {
"type": "string",
"enum": ["dict", "grammar"]
}
},
"phoneset": {
"type": "string",
"enum": ["cmu69", "novo70", "mdbg115"]
},
"parallel_silence": {
"type": "boolean"
}
},
"required": ["type", "data"]
}
def validate(object, schema=grammar_schema_v10):
#if isinstance(object, basestring):
if isinstance(object, str):
object = json.loads(object)
if not isinstance(object, dict):
raise TypeError("Expected dict or json string")
try:
jsonschema.validate(object, schema)
except jsonschema.ValidationError:
return False
except Exception:
raise
else:
return True
def validate_rpc_grammar(message):
"""validate an rpc grammar message"""
if not validate(message, grammar_rpc_schema):
raise ValueError("Not a valid RPC grammar")
version = message.get("version", "0.1")
data = message["data"]
if version == "0.1":
if not validate(data, grammar_schema_v01):
raise ValueError("Not a valid grammar v0.1")
elif version == "1.0":
if not validate(data, grammar_schema_v10):
raise ValueError("Not a valid grammar v1.0")
else:
raise ValueError("Unsupported schema version")
## test
def test(data=None):
if not data:
data = {"kind": "sequence", "elements": [
{"kind": "alternatives", "elements": ["a plain string", "an alternative string"]},
{"kind": "word", "label": "a word", "pronunciation": {"phones": ["ah", "w", "er", "d"]}},
{"kind": "order", "elements": [{"kind": "word", "label": "another word", "visible": False}, "last word"]}]}
try:
jsonschema.validate(data, schema)
except jsonschema.ValidationError as e:
#print data, "validated not OK", e.message
print("{0} validated not OK {1}".format(data, e.message))
else:
#print data, "validated OK"
print("{0} validated OK".format(data))
if __name__ == "__main__":
test()

View File

@ -0,0 +1,4 @@
#!/usr/bin/env python
#import session
from . import session

View File

@ -0,0 +1,255 @@
#!/usr/bin/env python
# (c) 2015--2018 NovoLanguage, author: David A. van Leeuwen
## Recognition interface for actual backend. Adapted from player.asr.debug.
import json
import sys
import wave
import requests
import websocket
import logging
import collections
import time
from .. import asr
logger = logging.getLogger(__name__)
## turn off annoying warnings
requests.packages.urllib3.disable_warnings()
logging.getLogger("requests.packages.urllib3.connectionpool").setLevel(logging.WARN)
buffer_size = 4096
gm = "gm.novolanguage.com" ## dev
protocol = "https"
port = 443
apiversion = 0
sessions = collections.Counter()
def segmentation(result):
"""converts a raw backend recognition result to a segment of novo.asr.segments class Segmentation"""
for w in result:
w["score"] = w["confidence"]["prob"]
w["llh"] = w["confidence"]["llr"]
w["label"] = w["label"]["raw"]
w["begin"] /= 10
w["end"] /= 10
for p in w["phones"]:
p["score"] = p["confidence"]["prob"]
p["llh"] = p["confidence"]["llr"]
p["begin"] /= 10
p["end"] /= 10
return asr.segments.Segmentation(result)
class rpcid:
id = 0
@staticmethod
def next():
rpcid.id += 1
return rpcid.id
class Recognizer(object):
def __init__(self, lang="en", gm=gm, grammar_version="0.1", user=None, password=None, snodeid=None, keepopen=False):
self.lang = lang
self.keepopen = keepopen
self.api_url = "%s://%s:%d/v%d" % (protocol, gm, port, apiversion)
self.verify = False
self.headers = {"Content-Type": "application/json"}
self.login_user(user, password)
data = {"l2": lang, "local": False, "skipupload": True}
if snodeid:
data["snodeid"] = snodeid
self.conn = None
self.init_session(data)
self.grammar_version = grammar_version
self.last_message = None
def login_user(self, username, password):
# obtain authentication token of user
logger.info('obtain auth token at %s', self.api_url)
data = {
'username': username,
'password': password
}
try:
r = requests.post(self.api_url + '/publishers/1/login', headers=self.headers, data=json.dumps(data), verify=self.verify)
except Exception as e:
logger.error("Cannot post request to GM API for user login: %s", e.message)
sys.exit(-1)
assert r.ok, r.reason
result = r.json()
if "errors" in result["response"]:
logger.info("Error in logging in: %s", result["response"]["errors"])
sys.exit(-1)
user_auth_token = result['response']['user']['authentication_token']
logger.info("User auth token is: %s", user_auth_token)
# set auth token in header
self.headers['Authentication-Token'] = user_auth_token
def init_session(self, data, direct=False, use_ip=False):
logger.info('Request new session: %s', data)
r = requests.post(self.api_url + '/sessions', headers=self.headers, data=json.dumps(data), verify=self.verify)
if not r.ok:
logger.error("New session request failed: %s", r.text)
return
status_url = r.headers.get("location")
if status_url:
## we got a redirect
status = {}
while True:
logger.debug("Checking %s", status_url)
s = requests.get(status_url, verify=self.verify)
if not s.ok:
logger.error('Checking Failed: %s', s.text)
return
status = s.json()
if status['status'] == 'PENDING':
logger.debug("Status: %s", status['status'])
time.sleep(1)
else:
break
session = status['result'][0] ## [1] is another status code...
if "error" in session:
logger.error("Error in getting a snode: %s", session["error"])
raise Exception
else:
session = r.json()
try:
logger.info("Session: %r", session)
if direct:
snode_ip = session["snode"]["ip"]
proxy_url = snode_ip
snode_port = session["port"]
ws_url = "%s://%s:%d/" % ("ws", snode_ip, snode_port)
else:
field = "ip" if use_ip else "hostname"
proxy_url = session['snode']['datacentre']['proxy'][field]
ws_url = 'wss://' + proxy_url + '/' + session['uuid']
logger.info("Connecting to websocket: %s", ws_url)
conn = websocket.create_connection(ws_url, sslopt={"check_hostname": self.verify})
logger.info("Connected.")
#except Exception, e:
except Exception as e:
logger.error("Unable to connect to websocket: %s", e.message)
raise e
self.session_id = session['id']
self.proxy_url = proxy_url
self.conn = conn
self.session = session
sessions[session["uuid"]] += 1
def setgrammar(self, grammar): ## backend grammar object: {"data": {...}, "type": "confusion_network"}
data = {"jsonrpc": "2.0",
'type': 'jsonrpc',
'method': 'set_grammar',
'params': grammar,
"id": rpcid.next()}
asr.spraaklab.schema.validate_rpc_grammar(grammar)
self.conn.send(json.dumps(data))
result = json.loads(self.conn.recv())
if result.get("error"):
logger.error("Exercise validation error: %s", result)
return result
def set_alternatives_grammar(self, *args, **kwargs):
if not "version" in kwargs:
kwargs["version"] = self.grammar_version
return self.setgrammar(alternatives_grammar(*args, **kwargs))
def recognize_wav(self, wavf):
w = wave.open(wavf, 'r')
nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
if nchannels > 1:
logging.error("Please use .wav with only 1 channel, found %d channels in %s", nchannels, wavf)
return
if (sampwidth != 2):
logging.error("Please use .wav with 2-byte PCM data, found %d bytes in %s", sampwidth, wavf)
return
if (framerate != 16000.0):
logging.error("Please use .wav sampled at 16000 Hz, found %1.0f in %s", framerate, wavf)
return
if (comptype != 'NONE'):
logging.error("Please use .wav with uncompressed data, found %s in %s", compname, wavf)
return
buf = w.readframes(nframes)
w.close()
return self.recognize_data(buf)
def recognize_data(self, buf):
nbytes_sent = 0
start = time.time()
for j in range(0, len(buf), buffer_size):
#audio_packet = str(buf[j:j + buffer_size])
audio_packet = buf[j:j + buffer_size]
nbytes_sent += len(audio_packet)
self.conn.send_binary(audio_packet)
self.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
logger.info("Waiting for recognition result...")
self.last_message = self.conn.recv() ## keep result for the interested applications
message = json.loads(self.last_message)
dur = time.time() - start
logger.info("Recognition took %5.3f seconds", dur)
if "error" in message:
raise RuntimeError("Error from recognition backend: %r" % message.get("error"))
return segmentation(message["result"]["words"])
def recognize_url(self, url):
start = time.time()
data = json.dumps({"jsonrpc": "2.0", "method": "send_audio", "id": rpcid.next(), "params": {"type": "url", "data": url, "details": ["word", "utterance"]}})
self.conn.send(data)
logger.info("Waiting for recognition result...")
self.last_message = self.conn.recv() ## keep result for the interested applications
#print self.last_message
print(self.last_message)
message = json.loads(self.last_message)
dur = time.time() - start
logger.info("Recognition took %5.3f seconds", dur)
if "error" in message:
raise RuntimeError("Error from recognition backend: %r" % message.get("error"))
return segmentation(message["result"]["words"])
def __del__(self):
sessions[self.session["uuid"]] -= 1
if self.conn and sessions[self.session["uuid"]] <= 0:
self.conn.close()
url = self.api_url + '/sessions/%d' % self.session_id
if self.keepopen:
logger.info("Keeping session open...")
else:
logger.info("Closing session: %s", url)
r = requests.delete(url, headers=self.headers, verify=self.verify)
assert r.ok, r.reason
def alternatives_grammar(parts, version="0.1", ret=None):
"""Make a grammar of alternatives, as array(sequence)-of-array(alternatives)-of-strings"""
r = {"type": "confusion_network", "version": version}
if version=="0.1":
r["data"] = {"type": "multiple_choice", "parts": parts}
if isinstance(ret, list) and "dict" in ret:
r["return_dict"] = True
elif version=="1.0":
seqels = []
for part in parts:
altels = []
for alt in part:
words = alt.split(" ")
if len(words) > 1:
alt = {"kind": "sequence", "elements": words}
altels.append(alt)
seqels.append({"kind": "alternatives", "elements": altels})
r["data"] = {"kind": "sequence", "elements": seqels}
if isinstance(ret, list):
r["return_objects"] = ret
else:
raise ValueError("Unsupported version: %s" % version)
asr.spraaklab.schema.validate_rpc_grammar(r)
return r

View File

@ -0,0 +1,64 @@
novoapi( https://bitbucket.org/novolanguage/python-novo-api ) is written in Python 2.7.
To install it on Python 3.x the following points should be modified.
- basestring --> str
- print xxx --> print({}.format(xxx)).
- import xxx --> from . import xxx
- except Exception, e --> except Exception as e
- remove tuples from input arguments of a function.
Concretely...
=== novoapi\backend\__init__.py
#import session
from . import session
=== novoapi\backend\session.py
#except Exception, e:
except Exception as e:
#print self.last_message
print(self.last_message)
=== novoapi\asr\__init__.py
#import segments
#import spraaklab
from . import segments
from . import spraaklab
=== novoapi\asr\segments\praat.py
#print_tier(output, "confidence", begin, end, confidences, ('%.3f', lambda x: x))
#print_tier(output, "words", begin, end, word_labels, ('%s', lambda x: x))
#print_tier(output, "phones", begin, end, phones, ('%s', lambda x: x))
print_tier(output, "confidence", begin, end, confidences, '%.3f', lambda x: x)
print_tier(output, "words", begin, end, word_labels, '%s', lambda x: x)
print_tier(output, "phones", begin, end, phones, '%s', lambda x: x)
=== novoapi\asr\spraaklab\__init__.py ===
#import schema
from . import schema
=== novoapi\asr\spraaklab\schema.py ===
#if isinstance(object, basestring):
if isinstance(object, str):
except jsonschema.ValidationError as e:
#print data, "validated not OK", e.message
print("{0} validated not OK {1}".format(data, e.message))
else:
#print data, "validated OK"
print("{0} validated OK".format(data))
Then to make it correctly work, few more modification is needed.
When the wav file is read using the wave module, the output (named buf) is a string of bytes on Python 2.7 while buf is a byte object on Python 3.6.
Therefore...
=== novoapi\backend\session.py
#audio_packet = str(buf[j:j + buffer_size])
audio_packet = buf[j:j + buffer_size]
Also, because of this difference, Segment.__repr__ (novoapi\asr\segments\segments.py) does not work.

View File

@ -0,0 +1,25 @@
#!/usr/bin/env python
## from https://stackoverflow.com/questions/1447287/format-floats-with-standard-json-module
class PrettyFloat(float):
def __repr__(self):
return '%.15g' % self
def pretty_floats(obj):
if isinstance(obj, float):
return PrettyFloat(obj)
elif isinstance(obj, dict):
return dict((k, pretty_floats(v)) for k, v in obj.items())
elif isinstance(obj, (list, tuple)):
return map(pretty_floats, obj)
return obj
def rounded_floats(obj, ndigits=15):
if isinstance(obj, float):
return PrettyFloat(round(obj, ndigits))
elif isinstance(obj, dict):
return dict((k, rounded_floats(v, ndigits)) for k, v in obj.items())
elif isinstance(obj, (list, tuple)):
return map(lambda o: rounded_floats(o, ndigits), obj)
return obj

BIN
reus-test/check_novoapi.zip Normal file

Binary file not shown.

119
reus-test/reus-test.py Normal file
View File

@ -0,0 +1,119 @@
#!/usr/bin/env python
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import argparse
import json
from novoapi.backend import session
p = argparse.ArgumentParser()
p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxx')
args = p.parse_args()
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True)
grammar = {
"type": "confusion_network",
"version": "1.0",
"data": {
"kind": "sequence",
"elements": [
{
"kind": "word",
"pronunciation": [
{
"phones": [
"r",
"eu0",
"s"
],
"id": 0
}
,
{
"phones": [
"m",
"a0",
"n"
],
"id": 1
}
,
{
"phones": [
"m",
"a0",
"n",
"t",
"s",
"y",
"ax"
],
"id": 2
}
],
"label": "reus"
}
]
},
"return_objects": [
"grammar"
],
"phoneset": "novo70"
}
res = rec.setgrammar(grammar)
#print "Set grammar result", res
## === novoapi/backend/session.py ===
#import wave
#import time
#from novoapi.backend.session import rpcid, segmentation
#wavf = "reus1008-reus.wav"
#w = wave.open(wavf, 'r')
#nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
#buf = w.readframes(nframes)
#w.close()
#buffer_size = 4096
#nbytes_sent = 0
#start = time.time()
#for j in range(0, len(buf), buffer_size):
# audio_packet = buf[j:j + buffer_size]
# nbytes_sent += len(audio_packet)
# rec.conn.send_binary(audio_packet)
#rec.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
#print(rpcid.next())
#rec.last_message = rec.conn.recv()
#message = json.loads(rec.last_message)
#result = session.segmentation(message["result"]["words"])
#result.export()
## ====================================
def result2pronunciation(result, word):
#result_ = res.export()[1]
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
llh = result_[0]['llh']
phones = result_[0]['phones']
pronunciation = [phone['label'] for phone in phones]
return pronunciation, llh
res = rec.recognize_wav("reus1008-reus.wav")
#print "\n\n\nThe pronounced word in reus1008-reus.wav is: REUS\n\n"
#print "Recognition result:", json.dumps(res.export(), indent=4)
result2pronunciation(res.export(), 'reus')
#print "\n\n\nThe pronounced word in reus1167-man.wav is: MAN\n\n"
res2 = rec.recognize_wav("reus1167-man.wav")
#print "Recognition result:", json.dumps(res2.export(), indent=4)
result2pronunciation(res2.export(), 'reus')
#print "\n\n\nThe pronounced word in reus3768-mantsje.wav is: MANTSJE\n\n"
res3 = rec.recognize_wav("reus3768-mantsje.wav")
#print "Recognition result:", json.dumps(res3.export(), indent=4)
result2pronunciation(res3.export(), 'reus')

View File

@ -0,0 +1,3 @@
REUS r eu s
REUS m ac n
REUS m ac n t s j @

View File

@ -0,0 +1 @@
REUS

View File

@ -0,0 +1,6 @@
#!MLF!#
"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus1008-reus.rec"
0 9700000 r -12463.852539 REUS
9700000 12800000 eu -3622.108887
12800000 26250001 s -17303.216797
.

BIN
reus-test/reus1008-reus.wav Normal file

Binary file not shown.

View File

@ -0,0 +1,3 @@
REUS r eu s
REUS m ac n
REUS m ac n t s j @

View File

@ -0,0 +1 @@
REUS

View File

@ -0,0 +1,10 @@
#!MLF!#
"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus1167-man.rec"
0 150000 m -230.057571 REUS
150000 300000 ac -250.994858
300000 450000 n -202.377716
450000 4600000 t -5128.984375
4600000 5050000 s -711.338501
5050000 5450000 j -564.730591
5450000 16049999 @ -13249.787109
.

BIN
reus-test/reus1167-man.wav Normal file

Binary file not shown.

View File

@ -0,0 +1,3 @@
REUS r eu s
REUS m ac n
REUS m ac n t s j @

View File

@ -0,0 +1 @@
REUS

View File

@ -0,0 +1,10 @@
#!MLF!#
"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus3768-mantsje.rec"
0 150000 m -217.347229 REUS
150000 1150000 ac -1266.293579
1150000 1650000 n -583.382568
1650000 11100000 t -11259.270508
11100000 11250000 s -247.939255
11250000 11550000 j -445.511444
11550000 24150000 @ -16769.048828
.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

119
rozen-test/rozen-test.py Normal file
View File

@ -0,0 +1,119 @@
#!/usr/bin/env python
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
import argparse
import json
from novoapi.backend import session
p = argparse.ArgumentParser()
p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxx')
args = p.parse_args()
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True)
grammar = {
"type": "confusion_network",
"version": "1.0",
"data": {
"kind": "sequence",
"elements": [
{
"kind": "word",
"pronunciation": [
{
"phones": [
"r",
"eu0",
"s"
],
"id": 0
}
,
{
"phones": [
"m",
"a0",
"n"
],
"id": 1
}
,
{
"phones": [
"m",
"a0",
"n",
"t",
"s",
"y",
"ax"
],
"id": 2
}
],
"label": "reus"
}
]
},
"return_objects": [
"grammar"
],
"phoneset": "novo70"
}
res = rec.setgrammar(grammar)
#print "Set grammar result", res
## === novoapi/backend/session.py ===
#import wave
#import time
#from novoapi.backend.session import rpcid, segmentation
#wavf = "reus1008-reus.wav"
#w = wave.open(wavf, 'r')
#nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
#buf = w.readframes(nframes)
#w.close()
#buffer_size = 4096
#nbytes_sent = 0
#start = time.time()
#for j in range(0, len(buf), buffer_size):
# audio_packet = buf[j:j + buffer_size]
# nbytes_sent += len(audio_packet)
# rec.conn.send_binary(audio_packet)
#rec.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
#print(rpcid.next())
#rec.last_message = rec.conn.recv()
#message = json.loads(rec.last_message)
#result = session.segmentation(message["result"]["words"])
#result.export()
## ====================================
def result2pronunciation(result, word):
#result_ = res.export()[1]
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
llh = result_[0]['llh']
phones = result_[0]['phones']
pronunciation = [phone['label'] for phone in phones]
return pronunciation, llh
res = rec.recognize_wav("reus1008-reus.wav")
#print "\n\n\nThe pronounced word in reus1008-reus.wav is: REUS\n\n"
#print "Recognition result:", json.dumps(res.export(), indent=4)
result2pronunciation(res.export(), 'reus')
#print "\n\n\nThe pronounced word in reus1167-man.wav is: MAN\n\n"
res2 = rec.recognize_wav("reus1167-man.wav")
#print "Recognition result:", json.dumps(res2.export(), indent=4)
result2pronunciation(res2.export(), 'reus')
#print "\n\n\nThe pronounced word in reus3768-mantsje.wav is: MANTSJE\n\n"
res3 = rec.recognize_wav("reus3768-mantsje.wav")
#print "Recognition result:", json.dumps(res3.export(), indent=4)
result2pronunciation(res3.export(), 'reus')