Compare commits
44 commits: 3a98e184fe ... master

SHA1
97486e5599
2004399179
b444b70af9
bf586fcde5
fdd165ce6a
fa81b70b27
41d4fa5ff9
b1b1942fa0
c185072d5b
8f89f60538
f6e563ecd3
da0242b0e1
ab3887c6ca
f6e7c8eefa
322a8a0079
22cccfb61d
dc6b7b84b6
8cda93de75
87abbbb95a
813f013d7a
7844a56281
04a862b2fd
24ac56ac0e
82a8e2302f
de5c9cecb9
8efb091715
05e8a671c1
6edde06a4f
1622655542
d6d5543d03
d6e005b1cb
dd9e3d820b
e5cf182a18
a77ed9d4dd
df046ffc26
beff33fdf9
af785e51cf
9ec7c3c50b
f04bb2a080
b771cbb7d6
3500a8cdf0
b87a81eb9d
0777735979
ea30b5c503
.gitignore (vendored, new file, 265 lines)
@@ -0,0 +1,265 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.

## important ##
.acoustic_model/forced_alignment_novo.py
.acoustic_model/novoapi_functions.py

# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/

# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUNIT
*.VisualState.xml
TestResult.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# DNX
project.lock.json
project.fragment.lock.json
artifacts/

*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# JustCode is a .NET coding add-in
.JustCode

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
#*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignoreable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
node_modules/
orleans.codegen.cs

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
*.mdf
*.ldf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# JetBrains Rider
.idea/
*.sln.iml

# CodeRush
.cr/

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
Binary file not shown.
HCompV.scp (13152 lines): file diff suppressed because it is too large.
@@ -10,17 +10,20 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
 ..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
 ..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
 ..\toolbox\evaluation.py = ..\toolbox\evaluation.py
-..\forced_alignment\forced_alignment\forced_alignment.pyproj = ..\forced_alignment\forced_alignment\forced_alignment.pyproj
+..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
 ..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
 ..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
 ..\forced_alignment\forced_alignment\pronunciations.py = ..\forced_alignment\forced_alignment\pronunciations.py
 ..\toolbox\pyHTK.py = ..\toolbox\pyHTK.py
 ..\forced_alignment\forced_alignment\pyhtk.py = ..\forced_alignment\forced_alignment\pyhtk.py
 ..\forced_alignment\forced_alignment\scripts.py = ..\forced_alignment\forced_alignment\scripts.py
+..\..\..\..\..\Python36-32\Lib\site-packages\novoapi\backend\session.py = ..\..\..\..\..\Python36-32\Lib\site-packages\novoapi\backend\session.py
 ..\forced_alignment\forced_alignment\tempfilename.py = ..\forced_alignment\forced_alignment\tempfilename.py
 ..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py
 EndProjectSection
 EndProject
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "toolbox", "..\toolbox\toolbox.pyproj", "{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}"
+EndProject
 Global
 GlobalSection(SolutionConfigurationPlatforms) = preSolution
 Debug|Any CPU = Debug|Any CPU
@@ -29,6 +32,8 @@ Global
 GlobalSection(ProjectConfigurationPlatforms) = postSolution
 {4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 {4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU
+{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Release|Any CPU.ActiveCfg = Release|Any CPU
 EndGlobalSection
 GlobalSection(SolutionProperties) = preSolution
 HideSolutionNode = FALSE
Binary file not shown.
Binary file not shown.
@@ -1,319 +0,0 @@
import os
import sys
import tempfile
import configparser
import subprocess
from collections import Counter

import numpy as np
import pandas as pd


## ======================= user define =======================
repo_dir = 'C:\\Users\\Aki\\source\\repos\\acoustic_model'
curr_dir = repo_dir + '\\acoustic_model'
config_ini = curr_dir + '\\config.ini'
output_dir = 'C:\\OneDrive\\Research\\rug\\experiments\\friesian\\acoustic_model'
forced_alignment_module = 'C:\\Users\\Aki\\source\\repos\\forced_alignment'

dataset_list = ['devel', 'test', 'train']

# procedure
extract_features = 0
make_feature_list = 0
conv_lexicon = 0
check_lexicon = 0
make_mlf = 0
combine_files = 0
flat_start = 0
train_model = 1


sys.path.append(os.path.join(os.path.dirname(sys.path[0]), curr_dir))
sys.path.append(forced_alignment_module)
from forced_alignment import convert_phone_set

import acoustic_model_functions as am_func


## ======================= load variables =======================
config = configparser.ConfigParser()
config.sections()
config.read(config_ini)

config_hcopy = config['Settings']['config_hcopy']
config_train = config['Settings']['config_train']
mkhmmdefs_pl = config['Settings']['mkhmmdefs_pl']
FAME_dir = config['Settings']['FAME_dir']

lex_asr = FAME_dir + '\\lexicon\\lex.asr'
lex_asr_htk = FAME_dir + '\\lexicon\\lex.asr_htk'
lex_oov = FAME_dir + '\\lexicon\\lex.oov'
lex_oov_htk = FAME_dir + '\\lexicon\\lex.oov_htk'
#lex_ipa = FAME_dir + '\\lexicon\\lex.ipa'
#lex_ipa_ = FAME_dir + '\\lexicon\\lex.ipa_'
#lex_ipa_htk = FAME_dir + '\\lexicon\\lex.ipa_htk'
lex_htk = FAME_dir + '\\lexicon\\lex_original.htk'
lex_htk_ = FAME_dir + '\\lexicon\\lex.htk'

hcompv_scp = output_dir + '\\scp\\combined.scp'
combined_mlf = output_dir + '\\label\\combined.mlf'

model_dir = output_dir + '\\model'
model0_dir = model_dir + '\\hmm0'
proto_init = model_dir + '\\proto38'
proto_name = 'proto'
phonelist = output_dir + '\\config\\phonelist_friesian.txt'
hmmdefs_name = 'hmmdefs'


## ======================= extract features =======================
if extract_features:
    print("==== extract features ====\n")

    for dataset in dataset_list:
        print(dataset)

        # a script file for HCopy
        hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
        hcopy_scp.close()

        # get a list of features (hcopy.scp) from the filelist in FAME! corpus
        feature_dir = output_dir + '\\mfc\\' + dataset
        am_func.make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp.name)

        # extract features
        subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
        subprocess.call(subprocessStr, shell=True)


## ======================= make a list of features =======================
if make_feature_list:
    print("==== make a list of features ====\n")

    for dataset in dataset_list:
        print(dataset)

        feature_dir = output_dir + '\\mfc\\' + dataset
        hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'

        am_func.make_filelist(feature_dir, hcompv_scp)


## ======================= convert lexicon from ipa to fame_htk =======================
if conv_lexicon:
    print('==== convert lexicon from ipa 2 fame ====\n')

    # lex.asr is Kaldi compatible version of lex.ipa.
    # to check...
    #lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
    #with open(lex_ipa_, "w", encoding="utf-8") as fout:
    #    for word, pronunciation in zip(lexicon_ipa['word'], lexicon_ipa['pronunciation']):
    #        # ignore nasalization and '.'
    #        pronunciation_ = pronunciation.replace(u'ⁿ', '')
    #        pronunciation_ = pronunciation_.replace('.', '')
    #        pronunciation_split = convert_phone_set.split_ipa_fame(pronunciation_)
    #        fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))

    # convert each lexicon from ipa description to fame_htk phoneset.
    am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
    am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)

    # combine lexicon
    # pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
    # therefore there is no overlap between lex_asr and lex_oov.
    am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)


## ======================= check if all the phones are successfully converted =======================
if check_lexicon:
    print("==== check if all the phones are successfully converted. ====\n")

    # the phones used in the lexicon.
    phonelist_asr = am_func.get_phonelist(lex_asr)
    phonelist_oov = am_func.get_phonelist(lex_oov)
    phonelist_htk = am_func.get_phonelist(lex_htk)

    phonelist = phonelist_asr.union(phonelist_oov)

    # the lines which include a specific phone.
    lines = am_func.find_phone(lex_asr, 'g')

    # statistics over the lexicon
    lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])
    pronunciation = lexicon_htk['pronunciation']
    phones_all = []
    for word in pronunciation:
        phones_all = phones_all + word.split()
    c = Counter(phones_all)


## =======================
## manually make changes to the pronunciation dictionary and save it as lex.htk
## =======================
# (1) Replace all tabs with single space;
# (2) Put a '\' before any dictionary entry beginning with single quote
#http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html


## ======================= make label file =======================
if make_mlf:
    print("==== make mlf ====\n")

    print("generating word level transcription...\n")
    for dataset in dataset_list:
        hcompv_scp = output_dir + '\\scp\\' + dataset + '.scp'
        hcompv_scp2 = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'
        script_list = FAME_dir + '\\data\\' + dataset + '\\text'
        mlf_word = output_dir + '\\label\\' + dataset + '_word.mlf'
        mlf_phone = output_dir + '\\label\\' + dataset + '_phone.mlf'

        # lexicon
        lexicon_htk = pd.read_table(lex_htk, names=['word', 'pronunciation'])

        # list of features
        with open(hcompv_scp) as fin:
            features = fin.read()
            features = features.split('\n')

        # list of scripts
        with open(script_list, "rt", encoding="utf-8") as fin:
            scripts = fin.read()
            scripts = pd.Series(scripts.split('\n'))

        i = 0
        missing_words = []
        fscp = open(hcompv_scp2, 'wt')
        fmlf = open(mlf_word, "wt", encoding="utf-8")
        fmlf.write("#!MLF!#\n")
        feature_nr = 1
        for feature in features:
            sys.stdout.write("\r%d/%d" % (feature_nr, len(features)))
            sys.stdout.flush()
            feature_nr += 1
            file_basename = os.path.basename(feature).replace('.mfc', '')

            # get words from scripts.
            try:
                script = scripts[scripts.str.contains(file_basename)]
            except IndexError:
                script = []

            if len(script) != 0:
                script_id = script.index[0]
                script_txt = script.get(script_id)
                script_words = script_txt.split(' ')
                del script_words[0]

                # check if all words can be found in the lexicon.
                SCRIPT_WORDS = []
                script_prons = []
                is_in_lexicon = 1
                for word in script_words:
                    WORD = word.upper()
                    SCRIPT_WORDS.append(WORD)
                    extracted = lexicon_htk[lexicon_htk['word']==WORD]
                    if len(extracted) == 0:
                        missing_words.append(word)
                    script_prons.append(extracted)
                    is_in_lexicon *= len(extracted)

                # if all pronunciations are found in the lexicon, update scp and mlf files.
                if is_in_lexicon:
                    # add the feature filename into the .scp file.
                    fscp.write("{}\n".format(feature))
                    i += 1

                    # add the words to the mlf file.
                    fmlf.write('\"*/{}.lab\"\n'.format(file_basename))
                    #fmlf.write('{}'.format('\n'.join(SCRIPT_WORDS)))
                    for word_ in SCRIPT_WORDS:
                        if word_[0] == '\'':
                            word_ = '\\' + word_
                        fmlf.write('{}\n'.format(word_))
                    fmlf.write('.\n')
        print("\n{0} has {1} samples.\n".format(dataset, i))
        np.save(output_dir + '\\missing_words' + '_' + dataset + '.npy', missing_words)

        fscp.close()
        fmlf.close()


    ## generate phone level transcription
    print("generating phone level transcription...\n")
    mkphones = output_dir + '\\label\\mkphones0.txt'
    subprocessStr = r"HLEd -l * -d " + lex_htk_ + ' -i ' + mlf_phone + ' ' + mkphones + ' ' + mlf_word
    subprocess.call(subprocessStr, shell=True)


## ======================= combined scps and mlfs =======================
if combine_files:
    print("==== combine scps and mlfs ====\n")

    fscp = open(hcompv_scp, 'wt')
    fmlf = open(combined_mlf, 'wt')

    for dataset in dataset_list:
        fmlf.write("#!MLF!#\n")
    for dataset in dataset_list:
        each_mlf = output_dir + '\\label\\' + dataset + '_phone.mlf'
        each_scp = output_dir + '\\scp\\' + dataset + '_all_words_in_lexicon.scp'

        with open(each_mlf, 'r') as fin:
            lines = fin.read()
            lines = lines.split('\n')
        fmlf.write('\n'.join(lines[1:]))

        with open(each_scp, 'r') as fin:
            lines = fin.read()
        fscp.write(lines)

    fscp.close()
    fmlf.close()


## ======================= flat start monophones =======================
if flat_start:
    subprocessStr = 'HCompV -T 1 -C ' + config_train + ' -m -v 0.01 -S ' + hcompv_scp + ' -M ' + model0_dir + ' ' + proto_init
    subprocess.call(subprocessStr, shell=True)

    # allocate mean & variance to all phones in the phone list
    subprocessStr = 'perl ' + mkhmmdefs_pl + ' ' + model0_dir + '\\proto38' + ' ' + phonelist + ' > ' + model0_dir + '\\' + hmmdefs_name
    subprocess.call(subprocessStr, shell=True)


## ======================= estimate monophones =======================
if train_model:
    iter_num_max = 3
    for mix_num in [128, 256, 512, 1024]:
        for iter_num in range(1, iter_num_max+1):
            print("===== mix{}, iter{} =====".format(mix_num, iter_num))
            iter_num_pre = iter_num - 1
            modelN_dir = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num)
            if not os.path.exists(modelN_dir):
                os.makedirs(modelN_dir)

            if iter_num == 1 and mix_num == 1:
                modelN_dir_pre = model0_dir
            else:
                modelN_dir_pre = model_dir + '\\hmm' + str(mix_num) + '-' + str(iter_num_pre)

            ## re-estimation
            subprocessStr = 'HERest -T 1 -C ' + config_train + ' -v 0.01 -I ' + combined_mlf + ' -H ' + modelN_dir_pre + '\\' + hmmdefs_name + ' -M ' + modelN_dir + ' ' + phonelist + ' -S ' + hcompv_scp
            subprocess.call(subprocessStr, shell=True)

        mix_num_next = mix_num * 2
        modelN_dir_next = model_dir + '\\hmm' + str(mix_num_next) + '-0'
        if not os.path.exists(modelN_dir_next):
            os.makedirs(modelN_dir_next)

        header_file = modelN_dir + '\\mix' + str(mix_num_next) + '.hed'
        with open(header_file, 'w') as fout:
            fout.write("MU %d {*.state[2-4].mix}" % (mix_num_next))

        subprocessStr = 'HHEd -T 1 -H ' + modelN_dir + '\\' + hmmdefs_name + ' -M ' + modelN_dir_next + ' ' + header_file + ' ' + phonelist

        subprocess.call(subprocessStr, shell=True)
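The removed script above drives the HTK pipeline by concatenating command strings and running them through the shell. Purely as an illustrative sketch (the paths below are placeholders, not the repository's real configuration), the same HERest re-estimation step can be issued with an argument list, which avoids shell quoting problems with spaces in Windows paths:

import os
import subprocess

# Hypothetical paths, for illustration only.
config_train = r'c:\path\to\config.train'
combined_mlf = r'c:\path\to\label\combined.mlf'
hcompv_scp = r'c:\path\to\scp\combined.scp'
model_dir_pre = r'c:\path\to\model\hmm128-0'
model_dir_new = r'c:\path\to\model\hmm128-1'
phonelist = r'c:\path\to\config\phonelist_friesian.txt'

# Same HERest call as in the removed script, passed as a list instead of one shell string.
cmd = ['HERest', '-T', '1', '-C', config_train, '-v', '0.01',
       '-I', combined_mlf,
       '-H', os.path.join(model_dir_pre, 'hmmdefs'),
       '-M', model_dir_new,
       '-S', hcompv_scp,
       phonelist]
subprocess.call(cmd)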
@@ -4,7 +4,7 @@
 <SchemaVersion>2.0</SchemaVersion>
 <ProjectGuid>4d8c8573-32f0-4a62-9e62-3ce5cc680390</ProjectGuid>
 <ProjectHome>.</ProjectHome>
-<StartupFile>performance_check.py</StartupFile>
+<StartupFile>check_novoapi.py</StartupFile>
 <SearchPath>
 </SearchPath>
 <WorkingDirectory>.</WorkingDirectory>
@@ -21,8 +21,8 @@
 <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
 </PropertyGroup>
 <ItemGroup>
-<Compile Include="acoustic_model.py" />
+<Compile Include="check_novoapi.py" />
-<Compile Include="acoustic_model_functions.py">
+<Compile Include="convert_phoneset.py">
 <SubType>Code</SubType>
 </Compile>
 <Compile Include="convert_xsampa2ipa.py">
@@ -31,15 +31,43 @@
 <Compile Include="defaultfiles.py">
 <SubType>Code</SubType>
 </Compile>
+<Compile Include="fame_test.py">
+<SubType>Code</SubType>
+</Compile>
 <Compile Include="fa_test.py">
 <SubType>Code</SubType>
 </Compile>
-<Compile Include="performance_check.py">
+<Compile Include="fame_functions.py" />
+<Compile Include="forced_aligner_comparison.py" />
+<Compile Include="novoapi_forced_alignment.py">
 <SubType>Code</SubType>
 </Compile>
+<Compile Include="htk_vs_kaldi.py">
+<SubType>Code</SubType>
+</Compile>
+<Compile Include="novoapi_functions.py">
+<SubType>Code</SubType>
+</Compile>
+<Compile Include="fame_hmm.py" />
+<Compile Include="phoneset\fame_asr.py" />
+<Compile Include="phoneset\fame_ipa.py" />
+<Compile Include="phoneset\fame_phonetics.py">
+<SubType>Code</SubType>
+</Compile>
+<Compile Include="stimmen_functions.py" />
+<Compile Include="stimmen_test.py" />
 </ItemGroup>
 <ItemGroup>
 <Content Include="config.ini" />
+<Content Include="phoneset\fame_ipa2asr.npy" />
+<Content Include="phoneset\output_get_translation_key_phone_unknown.npy" />
+<Content Include="phoneset\output_get_translation_key_translation_key.npy" />
+<Content Include="phoneset\__pycache__\fame_asr.cpython-36.pyc" />
+<Content Include="phoneset\__pycache__\fame_ipa.cpython-36.pyc" />
+</ItemGroup>
+<ItemGroup>
+<Folder Include="phoneset\" />
+<Folder Include="phoneset\__pycache__\" />
 </ItemGroup>
 <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
 <!-- Uncomment the CoreCompile target to enable the Build command in
@@ -1,151 +0,0 @@
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd

import defaultfiles as default

sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import convert_phone_set


def make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp):
    """ Make a script file for HCopy using the filelist in FAME! corpus. """
    filelist_txt = FAME_dir + '\\fame\\filelists\\' + dataset + 'list.txt'
    with open(filelist_txt) as fin:
        filelist = fin.read()
        filelist = filelist.split('\n')

    with open(hcopy_scp, 'w') as fout:
        for filename_ in filelist:
            filename = filename_.replace('.TextGrid', '')

            if len(filename) > 3: # remove '.', '..' and ''
                wav_file = FAME_dir + '\\fame\\wav\\' + dataset + '\\' + filename + '.wav'
                mfc_file = feature_dir + '\\' + filename + '.mfc'

                fout.write(wav_file + '\t' + mfc_file + '\n')


def make_filelist(input_dir, output_txt):
    """ Make a list of files in the input_dir. """
    filenames = os.listdir(input_dir)

    with open(output_txt, 'w') as fout:
        for filename in filenames:
            fout.write(input_dir + '\\' + filename + '\n')


def make_dic(word, pronvar_, fileDic, output_type):
    """
    make dict files which can be used for HTK.
    param word: target word.
    param pronvar_: pronunciation variant. nx2 (WORD /t pronunciation) ndarray.
    param fileDic: output dic file.
    param output_type: 0:full, 1:statistics, 2:frequency <2% entries are removed. 3:top 3.
    """
    #assert(output_type < 4 and output_type >= 0, 'output_type should be an integer between 0 and 3.')
    WORD = word.upper()

    if output_type == 0: # full
        pronvar = np.unique(pronvar_)

        with open(fileDic, 'w') as f:
            for pvar in pronvar:
                f.write('{0}\t{1}\n'.format(WORD, pvar))
    else:
        c = Counter(pronvar_)
        total_num = sum(c.values())
        with open(fileDic, 'w') as f:
            if output_type == 3:
                for key, value in c.most_common(3):
                    f.write('{0}\t{1}\n'.format(WORD, key))
            else:
                for key, value in c.items():
                    percentage = value/total_num*100

                    if output_type == 1: # all
                        f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
                    elif output_type == 2: # less than 2 percent
                        if percentage < 2:
                            f.write('{0}\t{1}\n'.format(WORD, key))


def get_phonelist(lexicon_file):
    """ Make a list of phones which appears in the lexicon. """

    with open(lexicon_file, "rt", encoding="utf-8") as fin:
        lines = fin.read()
        lines = lines.split('\n')
    phonelist = set([])
    for line in lines:
        line = line.split('\t')
        if len(line) > 1:
            pronunciation = set(line[1].split())
            phonelist = phonelist | pronunciation
    return phonelist


def find_phone(lexicon_file, phone):
    """ Search where the phone is used in the lexicon. """
    with open(lexicon_file, "rt", encoding="utf-8") as fin:
        lines = fin.read()
        lines = lines.split('\n')

    extracted = []
    for line in lines:
        line = line.split('\t')
        if len(line) > 1:
            pron = line[1]
            if phone in pron:
                extracted.append(line)
    return extracted


def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
    """ Convert a lexicon file from IPA to HTK format for FAME! corpus. """

    lexicon_in = pd.read_table(lexicon_file_in, names=['word', 'pronunciation'])
    with open(lexicon_file_out, "w", encoding="utf-8") as fout:
        for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
            pronunciation_no_space = pronunciation.replace(' ', '')
            pronunciation_famehtk = convert_phone_set.ipa2famehtk(pronunciation_no_space)
            if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
                fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))


def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
    """ Combine two lexicon files and sort by words. """

    with open(lexicon_file1, "rt", encoding="utf-8") as fin:
        lines1 = fin.read()
        lines1 = lines1.split('\n')
    with open(lexicon_file2, "rt", encoding="utf-8") as fin:
        lines2 = fin.read()
        lines2 = lines2.split('\n')

    lex1 = pd.read_table(lexicon_file1, names=['word', 'pronunciation'])
    lex2 = pd.read_table(lexicon_file2, names=['word', 'pronunciation'])
    lex = pd.concat([lex1, lex2])
    lex = lex.sort_values(by='word', ascending=True)
    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')


def read_fileFA(fileFA):
    """
    read the result file of HTK forced alignment.
    this function only works when input is one word.
    """
    with open(fileFA, 'r') as f:
        lines = f.read()
        lines = lines.split('\n')

    phones = []
    for line in lines:
        line_split = line.split()
        if len(line_split) > 1:
            phones.append(line_split[2])

    return ' '.join(phones)
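As an illustrative usage sketch of the helpers defined in the removed module above (the lexicon path is hypothetical), one would typically collect the phone inventory of a lexicon and then inspect the entries that contain one particular phone:

import acoustic_model_functions as am_func

lex_asr = r'c:\path\to\lexicon\lex.asr'      # hypothetical path
phones = am_func.get_phonelist(lex_asr)      # set of phones appearing in the lexicon
print(len(phones), 'phones found')

entries_with_g = am_func.find_phone(lex_asr, 'g')  # lexicon lines whose pronunciation contains 'g'
print(entries_with_g[:5])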
acoustic_model/check_novoapi.py (new file, 207 lines)
@@ -0,0 +1,207 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import sys
import csv
from collections import Counter
import random
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import novoapi

import defaultfiles as default
sys.path.append(default.forced_alignment_module_dir)
from forced_alignment import convert_phone_set
#import acoustic_model_functions as am_func
import convert_xsampa2ipa
import novoapi_functions
import stimmen_functions
sys.path.append(default.accent_classification_dir)
import output_confusion_matrix

## procedure
forced_alignment_novo70 = True


## ===== load novo phoneset =====
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = novoapi_functions.load_novo70_phoneset()


## ===== extract pronunciations written in novo70 only (not_in_novo70) =====


## read pronunciation variants.
#stimmen_transcription_ = pd.ExcelFile(default.stimmen_transcription_xlsx)
#df = pd.read_excel(stimmen_transcription_, 'frequency')
#transcription_ipa = list(df['IPA'])

stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
df = stimmen_functions.load_transcriptions_novo70(stimmen_test_dir)


## transcription mistake?
#transcription_ipa = [ipa.replace(';', 'ː') for ipa in transcription_ipa if not ipa=='pypɪl' and not pd.isnull(ipa)]
#transcription_ipa = [ipa.replace('ˑ', '') for ipa in transcription_ipa] # only one case.

#not_in_novo70 = []
#all_in_novo70 = []
#for ipa in transcription_ipa:
#    ipa = ipa.replace(':', 'ː')
#    ipa = convert_phone_set.split_ipa(ipa)

#    # list of phones not in novo70 phoneset.
#    not_in_novo70_ = [phone for phone in ipa
#                      if not phone in phoneset_ipa and not phone in david_suggestion]
#    not_in_novo70_ = [phone.replace('sp', '') for phone in not_in_novo70_]
#    not_in_novo70_ = [phone.replace(':', '') for phone in not_in_novo70_]
#    not_in_novo70_ = [phone.replace('ː', '') for phone in not_in_novo70_]

#    if len(not_in_novo70_) == 0:
#        all_in_novo70.append(''.join(ipa))

#    #translation_key.get(phone, phone)
#    not_in_novo70.extend(not_in_novo70_)
#not_in_novo70_list = list(set(not_in_novo70))


## check which phones used in stimmen but not in novo70
# 'ʀ', 'ʁ',
# 'ɒ', 'ɐ',
# 'o', 'a' (o:, a:?)
# [e] 'nyːver mɑntsjə' (1)
# [ɾ] 'ɪːɾ'(1)
# [ɹ] 'iːjəɹ' (1), 'ɪ:ɹ' (1)
# [ø] 'gʀøtəpi:r'(1), 'grøtəpi:r'(1)
# [æ] 'røːzəʀæt'(2), 'røːzəræt'(1)
# [ʊ] 'ʊ'(1) --> can be ʏ (uh)??
# [χ] --> can be x??

#def search_phone_ipa(x, phone_list):
#    x_in_item = []
#    for ipa in phone_list:
#        ipa_original = ipa
#        ipa = ipa.replace(':', 'ː')
#        ipa = convert_phone_set.split_ipa(ipa)
#        if x in ipa and not x+':' in ipa:
#            x_in_item.append(ipa_original)
#    return x_in_item
#search_phone_ipa('ø', transcription_ipa)


## ===== load all transcriptions (df) =====
#df = stimmen_functions.load_transcriptions()
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
word_list = sorted(word_list)


## check frequency of each pronunciation variants
#cols = ['word', 'ipa', 'frequency']
#df_samples = pd.DataFrame(index=[], columns=cols)
#for ipa in all_in_novo70:
#    ipa = ipa.replace('ː', ':')
#    samples = df[df['ipa'] == ipa]
#    word = list(set(samples['word']))[0]
#    samples_Series = pd.Series([word, ipa, len(samples)], index=df_samples.columns)
#    df_samples = df_samples.append(samples_Series, ignore_index=True)

# each word
#df_per_word = pd.DataFrame(index=[], columns=df_samples.keys())

#for word in word_list:
word = word_list[2]
df_ = df[df['word']==word]
np.unique(list(df_['ipa']))
#df_samples_ = df_samples_[df_samples_['frequency']>2]
#df_per_word = df_per_word.append(df_samples_, ignore_index=True)
#df_per_word.to_excel(os.path.join(default.stimmen_dir, 'pronunciation_variants_novo70.xlsx'), encoding="utf-8")


## ===== forced alignment =====
rozen_dir = r'c:\Users\Aki\source\repos\acoustic_model\rozen-test'
if forced_alignment_novo70:
    Results = pd.DataFrame(index=[],
        columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])
    #for word in word_list:
    for word in ['Rozen']:
        # pronunciation variants top 3
        df_per_word_ = df_per_word[df_per_word['word']==word]
        df_per_word_ = df_per_word_.sort_values('frequency', ascending=False)
        if len(df_per_word_) < 3: # pauw, rozen
            pronunciation_ipa = list(df_per_word_['ipa'])
        elif word=='Reuzenrad':
            pronunciation_ipa = [
                df_per_word_.iloc[0]['ipa'],
                df_per_word_.iloc[1]['ipa'],
                df_per_word_.iloc[2]['ipa'],
                df_per_word_.iloc[3]['ipa']]
        else:
            # oog, oor, reus, roeiboot
            pronunciation_ipa = [
                df_per_word_.iloc[0]['ipa'],
                df_per_word_.iloc[1]['ipa'],
                df_per_word_.iloc[2]['ipa']]
        #print("{0}: {1}".format(word, pronunciation_ipa))

        # samples for the word
        df_ = df[df['word']==word]

        # samples in which all pronunciations are written in novo70.
        samples = df_.query("ipa in @pronunciation_ipa")

        results = pd.DataFrame(index=[],
            columns=['filename', 'word', 'xsampa', 'ipa', 'result_ipa', 'result_novo70', 'llh'])

        for i in range(0, len(samples)):
            sample = samples.iloc[i]
            filename = sample['filename']
            wav_file = os.path.join(default.stimmen_wav_dir, filename)
            if os.path.exists(wav_file):
                # for Martijn
                shutil.copy(wav_file, os.path.join(rozen_dir, filename))

                # pronunciation_ipa_ = [ipa.replace(':', 'ː') for ipa in pronunciation_ipa]
                # result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa_)
                # result_ipa, result_novo70, llh = novoapi_functions.result2pronunciation(result, word)
                # result_ = pd.Series([
                #     sample['filename'],
                #     sample['word'],
                #     sample['xsampa'],
                #     sample['ipa'],
                #     ' '.join(result_ipa),
                #     ' '.join(result_novo70),
                #     llh
                #     ], index=results.columns)
                # results = results.append(result_, ignore_index = True)
                # print('{0}/{1}: answer {2} - prediction {3}'.format(
                #     i+1, len(samples), result_['ipa'], result_['result_ipa']))
                # #results.to_excel(os.path.join(default.stimmen_dir, 'results.xlsx'), encoding="utf-8")
        #if len(results) > 0:
        #    Results = Results.append(results, ignore_index = True)
        #Results.to_excel(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
else:
    Results_xlsx = pd.ExcelFile(os.path.join(default.stimmen_result_novoapi_dir, 'Results.xlsx'), encoding="utf-8")
    Results = pd.read_excel(Results_xlsx, 'Sheet1')


## ===== analysis =====
#for word in word_list:
#    if not word == 'Oog':
#        Results_ = Results[Results['word'] == word]
#        y_true = list(Results_['ipa'])
#        y_pred_ = [ipa.replace(' ', '') for ipa in list(Results_['result_ipa'])]
#        y_pred = [ipa.replace('ː', ':') for ipa in y_pred_]
#        pronunciation_variants = list(set(y_true))
#        cm = confusion_matrix(y_true, y_pred, labels=pronunciation_variants)

#        plt.figure()
#        output_confusion_matrix.plot_confusion_matrix(cm, pronunciation_variants, normalize=False)
#        #plt.show()
#        plt.savefig(os.path.join(default.stimmen_result_novoapi_dir, word + '.png'))
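The commented-out analysis block above compares the transcribed variant with the variant chosen by forced alignment, per word, through a confusion matrix. A minimal sketch of that evaluation idea, with made-up labels rather than data from this repository:

from sklearn.metrics import accuracy_score, confusion_matrix

y_true = ['oːx', 'oːx', 'oːɣ', 'oːx']   # hand transcription (hypothetical)
y_pred = ['oːx', 'oːɣ', 'oːɣ', 'oːx']   # variant selected by the aligner (hypothetical)

labels = sorted(set(y_true) | set(y_pred))
print(accuracy_score(y_true, y_pred))                   # 0.75
print(confusion_matrix(y_true, y_pred, labels=labels))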
acoustic_model/convert_phoneset.py (new file, 58 lines)
@@ -0,0 +1,58 @@
"""Module to convert phonemes."""

def multi_character_tokenize(line, multi_character_tokens):
    """Tries to match one of the tokens in multi_character_tokens at each position of line, starting at position 0,
    if so tokenizes and eats that token. Otherwise tokenizes a single character"""
    while line != '':
        for token in multi_character_tokens:
            if line.startswith(token) and len(token) > 0:
                yield token
                line = line[len(token):]
                break
        else:
            yield line[:1]
            line = line[1:]


def split_word(word, phoneset):
    """
    split a line by given phoneset.

    Args:
        word (str): a word written in given phoneset.
        #multi_character_phones (list): the list of multicharacter phones which is considered as one phone. this can be obtained with phoneset definition such as fame_ipa.py.
        phoneset (list): the list of phones.

    Returns:
        (word_seperated) (list): the word splitted in given phoneset.

    """
    multi_character_phones = extract_multi_character_phones(phoneset)
    return [phone
            for phone in multi_character_tokenize(word.strip(), multi_character_phones)
            ]


def convert_phoneset(word_list, translation_key):
    """
    Args:
        word_list (str): a list of phones written in given phoneset.
        translation_key (dict):
    """
    return [translation_key.get(phone, phone) for phone in word_list]


def phone_reduction(phones, reduction_key):
    multi_character_tokenize(wo.strip(), multi_character_phones)
    return [reduction_key.get(i, i) for i in phones
            if not i in phones_to_be_removed]


def extract_multi_character_phones(phoneset):
    """
    Args:
        phoneset (list):
    """
    multi_character_phones = [i for i in phoneset if len(i) > 1]
    multi_character_phones.sort(key=len, reverse=True)
    return multi_character_phones
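A small usage sketch of split_word and convert_phoneset from the new module above; the phone set and translation key here are invented for the example (real ones come from phoneset/fame_ipa.py and phoneset/fame_asr.py). Note also that phone_reduction, as committed, refers to names (wo, phones_to_be_removed) that are not defined in this file.

from convert_phoneset import split_word, convert_phoneset

# Hypothetical phone set and translation key, for illustration only.
phoneset = ['a', 'aː', 'o', 'oː', 'x', 'r']
translation_key = {'aː': 'a:', 'oː': 'o:'}

phones = split_word('oːxr', phoneset)              # ['oː', 'x', 'r']
print(convert_phoneset(phones, translation_key))   # ['o:', 'x', 'r']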
@@ -1,35 +1,42 @@
 import os
+# add path of the parent directory
+#os.path.dirname(os.path.realpath(__file__))
-#default_hvite_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'htk', 'config.HVite')
+# repos
-cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
-#config_hcopy = os.path.join(cygwin_dir, 'config', 'config.HCopy')
-#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
-config_hvite = os.path.join(cygwin_dir, 'config', 'config.HVite')
-#mkhmmdefs_pl = os.path.join(cygwin_dir, 'src', 'acoustic_model', 'mkhmmdefs.pl')
-
-#dbLexicon = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\lexicon.accdb
-#scriptBarbara = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\pronvars_barbara.perl
-#exeG2P = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\string2phon.exe
-
-#[pyHTK]
-#configHVite = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\config.HVite
-#filePhoneList = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\phonelist_barbara.txt
-#AcousticModel = C:\\Users\\Aki\\source\\repos\\rug_VS\\forced_alignment\\config\\hmmdefs_16-2_barbara.compo
-
-#dbLexicon = config['cLexicon']['dbLexicon']
-#scriptBarbara = config['cLexicon']['scriptBarbara']
-#exeG2P = config['cLexicon']['exeG2P']
-
-#configHVite = config['pyHTK']['configHVite']
-#filePhoneList = config['pyHTK']['filePhoneList']
-#AcousticModel = config['pyHTK']['AcousticModel']
-
 repo_dir = r'C:\Users\Aki\source\repos'
 ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter')
 forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment')
+accent_classification_dir = os.path.join(repo_dir, 'accent_classification', 'accent_classification')
+toolbox_dir = os.path.join(repo_dir, 'toolbox')
+
+WSL_dir = r'C:\OneDrive\WSL'
+novo_api_dir = os.path.join(WSL_dir, 'python-novo-api', 'novoapi')
+#novo_api_dir = r'c:\Python36-32\Lib\site-packages\novoapi'
+
+# working directories
+rug_dir = r'c:\OneDrive\Research\rug'
+experiments_dir = os.path.join(rug_dir, 'experiments')
+htk_dir = os.path.join(experiments_dir, 'acoustic_model', 'fame', 'htk')
+kaldi_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', '_stimmen')
+stimmen_dir = os.path.join(experiments_dir, 'stimmen')
+
+# data
+fame_dir = os.path.join(rug_dir, '_data', 'FAME')
+#fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
+# 44.1 kHz
+#stimmen_wav_dir = os.path.join(stimmen_dir, 'wav')
+# 16 kHz
+stimmen_wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
+stimmen_transcription_xlsx = os.path.join(stimmen_dir, 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
+phonelist_friesian_txt = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
+novo70_phoneset = os.path.join(novo_api_dir, 'asr', 'phoneset', 'nl', 'novo70.phoneset')
+
+#phonelist_txt = os.path.join(htk_dir, 'config', 'phonelist.txt')
+#fame_s5_dir = os.path.join(fame_dir, 's5')
+#fame_corpus_dir = os.path.join(fame_dir, 'corpus')
+#stimmen_result_novoapi_dir = os.path.join(stimmen_dir, 'result', 'novoapi')
+# novoapi_functions
-fame_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus'
-experiments_dir = r'c:\OneDrive\Research\rug\experiments'
-
-phonelist = os.path.join(experiments_dir, 'friesian', 'acoustic_model', 'config', 'phonelist_friesian.txt')
406
acoustic_model/fame_functions.py
Normal file
406
acoustic_model/fame_functions.py
Normal file
@ -0,0 +1,406 @@
|
|||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import defaultfiles as default
|
||||||
|
import convert_phoneset
|
||||||
|
from phoneset import fame_ipa, fame_asr
|
||||||
|
|
||||||
|
sys.path.append(default.toolbox_dir)
|
||||||
|
from htk import pyhtk
|
||||||
|
|
||||||
|
|
||||||
|
#def read_fileFA(fileFA):
|
||||||
|
# """
|
||||||
|
# read the result file of HTK forced alignment.
|
||||||
|
# this function only works when input is one word.
|
||||||
|
# """
|
||||||
|
# with open(fileFA, 'r') as f:
|
||||||
|
# lines = f.read()
|
||||||
|
# lines = lines.split('\n')
|
||||||
|
|
||||||
|
# phones = []
|
||||||
|
# for line in lines:
|
||||||
|
# line_split = line.split()
|
||||||
|
# if len(line_split) > 1:
|
||||||
|
# phones.append(line_split[2])
|
||||||
|
|
||||||
|
# return ' '.join(phones)
|
||||||
|
|
||||||
|
|
||||||
|
#def fame_pronunciation_variant(ipa):
|
||||||
|
# ipa = ipa.replace('æ', 'ɛ')
|
||||||
|
# ipa = ipa.replace('ɐ', 'a')
|
||||||
|
# ipa = ipa.replace('ɑ', 'a')
|
||||||
|
# ipa = ipa.replace('ɾ', 'r')
|
||||||
|
# ipa = ipa.replace('ɹ', 'r') # ???
|
||||||
|
# ipa = ipa.replace('ʁ', 'r')
|
||||||
|
# ipa = ipa.replace('ʀ', 'r') # ???
|
||||||
|
# ipa = ipa.replace('ʊ', 'u')
|
||||||
|
# ipa = ipa.replace('χ', 'x')
|
||||||
|
|
||||||
|
# pronvar_list = [ipa]
|
||||||
|
# while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
|
||||||
|
# pronvar_list_ = []
|
||||||
|
# for p in pronvar_list:
|
||||||
|
# if 'ø:' in p:
|
||||||
|
# pronvar_list_.append(p.replace('ø:', 'ö'))
|
||||||
|
# pronvar_list_.append(p.replace('ø:', 'ö:'))
|
||||||
|
# if 'œ' in p:
|
||||||
|
# pronvar_list_.append(p.replace('œ', 'ɔ̈'))
|
||||||
|
# pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
|
||||||
|
# if 'ɒ' in p:
|
||||||
|
# pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
|
||||||
|
# pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
|
||||||
|
# pronvar_list = np.unique(pronvar_list_)
|
||||||
|
# return pronvar_list
|
||||||
|
|
||||||
|
|
||||||
|
#def make_fame2ipa_variants(fame):
|
||||||
|
# fame = 'rɛös'
|
||||||
|
# ipa = [fame]
|
||||||
|
# ipa.append(fame.replace('ɛ', 'æ'))
|
||||||
|
# ipa.append(fame.replace('a', 'ɐ'))
|
||||||
|
# ipa.append(fame.replace('a', 'ɑ'))
|
||||||
|
# ipa.append(fame.replace('r', 'ɾ'))
|
||||||
|
# ipa.append(fame.replace('r', 'ɹ'))
|
||||||
|
# ipa.append(fame.replace('r', 'ʁ'))
|
||||||
|
# ipa.append(fame.replace('r', 'ʀ'))
|
||||||
|
# ipa.append(fame.replace('u', 'ʊ'))
|
||||||
|
# ipa.append(fame.replace('x', 'χ'))
|
||||||
|
|
||||||
|
# ipa.append(fame.replace('ö', 'ø:'))
|
||||||
|
# ipa.append(fame.replace('ö:', 'ø:'))
|
||||||
|
# ipa.append(fame.replace('ɔ̈', 'œ'))
|
||||||
|
# ipa.append(fame.replace('ɔ̈:', 'œ'))
|
||||||
|
# ipa.append(fame.replace('ɔ̈', 'ɒ'))
|
||||||
|
# ipa.append(fame.replace('ɔ̈:', 'ɒ'))
|
||||||
|
|
||||||
|
# return ipa
|
||||||
|
|
||||||
|
|
||||||
|
#def make_htk_dict(word, pronvar_, fileDic, output_type):
|
||||||
|
# """
|
||||||
|
# make dict files which can be used for HTK.
|
||||||
|
# param word: target word.
|
||||||
|
# param pronvar_: pronunciation variant. nx2 (WORD /t pronunciation) ndarray.
|
||||||
|
# param fileDic: output dic file.
|
||||||
|
# param output_type: 0:full, 1:statistics, 2:frequency <2% entries are removed. 3:top 3.
|
||||||
|
# """
|
||||||
|
# #assert(output_type < 4 and output_type >= 0, 'output_type should be an integer between 0 and 3.')
|
||||||
|
# WORD = word.upper()
|
||||||
|
|
||||||
|
# if output_type == 0: # full
|
||||||
|
# pronvar = np.unique(pronvar_)
|
||||||
|
|
||||||
|
# with open(fileDic, 'w') as f:
|
||||||
|
# for pvar in pronvar:
|
||||||
|
# f.write('{0}\t{1}\n'.format(WORD, pvar))
|
||||||
|
# else:
|
||||||
|
# c = Counter(pronvar_)
|
||||||
|
# total_num = sum(c.values())
|
||||||
|
# with open(fileDic, 'w') as f:
|
||||||
|
# if output_type == 3:
|
||||||
|
# for key, value in c.most_common(3):
|
||||||
|
# f.write('{0}\t{1}\n'.format(WORD, key))
|
||||||
|
# else:
|
||||||
|
# for key, value in c.items():
|
||||||
|
# percentage = value/total_num*100
|
||||||
|
|
||||||
|
# if output_type == 1: # all
|
||||||
|
# f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(value, percentage, WORD, key))
|
||||||
|
# elif output_type == 2: # less than 2 percent
|
||||||
|
# if percentage < 2:
|
||||||
|
# f.write('{0}\t{1}\n'.format(WORD, key))
|
||||||
|
|
||||||
|
|
||||||
|
def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
    """ Make a script file for HCopy using the filelist in the FAME! corpus.

    Args:
        fame_dir (path): the directory of the FAME! corpus.
        dataset (str): 'devel', 'test' or 'train'.
        feature_dir (path): the directory where the features will be stored.
        hcopy_scp (path): the script file for HCopy to be made.

    """
    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
    with open(filelist_txt) as fin:
        filelist = fin.read()
        filelist = filelist.split('\n')

    with open(hcopy_scp, 'w') as fout:
        for filename_ in filelist:
            filename = filename_.replace('.TextGrid', '')

            if len(filename) > 3:  # remove '.', '..' and ''
                wav_file = os.path.join(fame_dir, 'fame', 'wav', dataset, filename + '.wav')
                mfc_file = os.path.join(feature_dir, filename + '.mfc')

                fout.write(wav_file + '\t' + mfc_file + '\n')

    return

def load_lexicon(lexicon_file):
    """ load a lexicon file as a data frame.

    Args:
        lexicon_file (path): lexicon in the format 'word'\t'pronunciation' (tab-delimited).

    Returns:
        lex (df): the lexicon as a DataFrame with the columns 'word' and 'pronunciation'.

    """
    lex = pd.read_csv(lexicon_file, delimiter='\t', header=None, encoding="utf-8")
    lex.rename(columns={0: 'word', 1: 'pronunciation'}, inplace=True)
    return lex

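# A minimal usage sketch (not part of the original module); the path mirrors the
# lexicon_asr definition in fame_hmm.py and assumes `import defaultfiles as default`.
#   lex = load_lexicon(os.path.join(default.fame_dir, 'lexicon', 'lex.asr'))
#   print(lex.shape)           # (number of entries, 2)
#   print(lex['word'].head())  # first few orthographic entries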
def get_phoneset_from_lexicon(lexicon_file, phoneset_name='asr'):
    """ Make a list of phones which appear in the lexicon.

    Args:
        lexicon_file (path): lexicon in the format 'word'\t'pronunciation' (tab-delimited).
        phoneset_name (str): the name of the phoneset with which lexicon_file is written. 'asr' (default) or 'ipa'.

    Returns:
        list_of_phones (set): the set of phones included in the lexicon_file.

    """
    assert phoneset_name in ['asr', 'ipa'], 'phoneset_name should be \'asr\' or \'ipa\''

    lex = load_lexicon(lexicon_file)
    if phoneset_name == 'asr':
        return set(' '.join(lex['pronunciation']).split(' '))
    elif phoneset_name == 'ipa':
        join_pronunciations = ''.join(lex['pronunciation'])
        return set(convert_phone_set.split_word(join_pronunciations, fame_ipa.multi_character_phones))

    return

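# A minimal usage sketch (not part of the original module), mirroring the commented
# check in fame_test.py: list phones that occur in lex.ipa but not in fame_ipa.phoneset.
# It assumes `import defaultfiles as default` as in fame_test.py.
#   lexicon_ipa = os.path.join(default.fame_dir, 'lexicon', 'lex.ipa')
#   phoneset_lex = get_phoneset_from_lexicon(lexicon_ipa, phoneset_name='ipa')
#   print(set(phoneset_lex) - set(fame_ipa.phoneset))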
def extract_unknown_phones(ipa, known_phones):
    """extract unknown phones from a pronunciation written in IPA.

    Args:
        ipa (str): a pronunciation written in IPA.
        known_phones (list): the list of phones that are already known.

    Returns:
        list_of_phones (list): the phones which are not included in 'known_phones'.

    """
    ipa_split = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
    return [i for i in ipa_split if i not in known_phones]

def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
    """ get the correspondence between lexicon_file_ipa and lexicon_file_asr.

    Args:
        lexicon_file_ipa (path): lexicon in the format 'word'\t'pronunciation (IPA)'.
        lexicon_file_asr (path): lexicon in the format 'word'\t'pronunciation (asr)'.
            each phone in 'pronunciation' should be delimited by ' '.

    Returns:
        translation_key (dict): translation key from ipa to asr.
        phone_unknown (list): the list of IPA phones which do not appear in lexicon_file_asr.

    """
    lex_ipa = load_lexicon(lexicon_file_ipa)
    lex_asr = load_lexicon(lexicon_file_asr)
    phone_unknown = fame_ipa.phoneset[:]
    translation_key = dict()
    for word in lex_ipa['word']:
        if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
            ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
            asr = lex_asr[lex_asr['word'] == word].iat[0, 1]

            ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
            asr_list = asr.split(' ')

            # if there are phones which are not in phone_unknown
            #if len([True for i in asr_list if i in phone_unknown]) > 0:
            if len(ipa_list) == len(asr_list):
                print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
                for ipa_, asr_ in zip(ipa_list, asr_list):
                    if ipa_ in phone_unknown:
                        translation_key[ipa_] = asr_
                        phone_unknown.remove(ipa_)
    return translation_key, list(phone_unknown)

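# A minimal usage sketch (not part of the original module), following fame_test.py:
# derive the ipa-to-asr key from the two FAME! lexica and store it for ipa2asr/ipa2htk.
#   lexicon_ipa = os.path.join(default.fame_dir, 'lexicon', 'lex.ipa')
#   lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
#   translation_key_ipa2asr, phone_unknown = get_translation_key(lexicon_ipa, lexicon_asr)
#   np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)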
def find_phone(lexicon_file, phone, phoneset_name='ipa'):
    """ extract the rows in which the phone is used in the lexicon_file.

    Args:
        lexicon_file (path): lexicon in the format 'word'\t'pronunciation' (tab-delimited).
        phone (str): the phone to be searched.
        phoneset_name (str): the name of the phoneset with which lexicon_file is written. 'asr' or 'ipa' (default).

    Returns:
        extracted (df): the rows in which the phone is used.

    ToDo:
        * develop the case in which phoneset_name == 'asr'.

    """
    assert phoneset_name in ['asr', 'ipa'], 'phoneset_name should be \'asr\' or \'ipa\''

    lex = load_lexicon(lexicon_file)

    # to reduce the calculation time, only target rows which include 'phone' at least once.
    lex_ = lex[lex['pronunciation'].str.count(phone) > 0]

    extracted = pd.DataFrame(index=[], columns=['word', 'pronunciation'])
    for index, row in lex_.iterrows():
        if phoneset_name == 'ipa':
            pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_ipa.multi_character_phones)
            if phone in pronunciation:
                extracted_ = pd.Series([row['word'], pronunciation], index=extracted.columns)
                extracted = extracted.append(extracted_, ignore_index=True)
    return extracted

def asr2htk_space_delimited(pronunciation):
    """convert a pronunciation from the asr phoneset to the htk phoneset.

    Args:
        pronunciation (str): space-delimited asr phones.

    Returns:
        (str): space-delimited htk phones (ascii).

    """
    pronunciation_short = [fame_asr.reduction_key.get(i, i) for i in pronunciation.split(' ')
                           if i not in fame_asr.phones_to_be_removed]
    return ' '.join(convert_phoneset.convert_phoneset(
        pronunciation_short, fame_asr.translation_key_asr2htk))

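# A minimal usage sketch (not part of the original module); the input is a hypothetical
# space-delimited asr pronunciation, not an entry taken from the actual lexicon.
#   print(asr2htk_space_delimited('m ɑ n'))  # phones mapped via fame_asr.translation_key_asr2htk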
def lexicon_asr2htk(lexicon_file_asr, lexicon_file_htk):
    """ Convert a lexicon file from the asr phoneset to the htk phoneset (ascii).

    Args:
        lexicon_file_asr (path): a lexicon file written in the asr phoneset, e.g. fame/lex.asr.
        lexicon_file_htk (path): the lexicon file to be written in the htk phoneset (ascii).

    """
    lex_asr = load_lexicon(lexicon_file_asr)

    def word2htk_(row):
        return word2htk(row['word'])

    def asr2htk_space_delimited_(row):
        return asr2htk_space_delimited(row['pronunciation'])

    lex_htk = pd.DataFrame({
        'word': lex_asr.apply(word2htk_, axis=1).str.upper(),
        'pronunciation': lex_asr.apply(asr2htk_space_delimited_, axis=1)
    })
    # keep the column order as 'word', 'pronunciation'.
    lex_htk = lex_htk.loc[:, ['word', 'pronunciation']]
    lex_htk.to_csv(lexicon_file_htk, header=None, index=None, sep='\t', encoding='utf-8')
    return

def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
    """ Combine two lexicon files and sort the entries by word.

    Args:
        lexicon_file1, lexicon_file2 (path): input lexicon files.
        lexicon_out (path): the output lexicon file, in which lexicon_file1 and lexicon_file2 are combined and sorted.

    """
    lex1 = load_lexicon(lexicon_file1)
    lex2 = load_lexicon(lexicon_file2)
    lex = pd.concat([lex1, lex2])
    lex = lex.sort_values(by='word', ascending=True)
    lex.to_csv(lexicon_out, index=False, header=False, sep='\t', encoding='utf-8')

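# A minimal pipeline sketch (not part of the original module), following the
# make_lexicon step in fame_hmm.py; the path variables are assumed to be defined as there.
#   lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
#   lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)
#   combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
#   fix_lexicon(lexicon_htk)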
def fix_lexicon(lexicon_file):
    """ fix the lexicon so that it can be used by HTK.
    - add '\' before a single quote at the beginning of a word.
    - convert special characters to ascii compatible characters.
    - add silence.

    Args:
        lexicon_file (path): lexicon file, which will be overwritten.

    """
    lex = load_lexicon(lexicon_file)
    lex = lex.dropna()  # remove N/A.

    # add 'sil'.
    row = pd.Series(['SILENCE', 'sil'], index=lex.columns)
    lex = lex.append(row, ignore_index=True)
    lex = lex.sort_values(by='word', ascending=True)

    # escape a single quote at the beginning of a word.
    for i in lex[lex['word'].str.startswith('\'')].index.values:
        lex.at[i, 'word'] = lex.at[i, 'word'].replace('\'', '\\\'')

    # to_csv does not work with a space separator, therefore all tabs should be replaced manually.
    #lex.to_csv(lexicon_file, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
    lex.to_csv(lexicon_file, index=False, header=False, sep='\t', encoding='utf-8')
    return

def word2htk(word):
    """convert a word to its htk spelling by mapping each character through fame_asr.translation_key_word2htk."""
    return ''.join([fame_asr.translation_key_word2htk.get(i, i) for i in word])

def ipa2asr(ipa):
    """convert an IPA pronunciation to the asr phoneset."""
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
    #ipa_ = fame_asr.phone_reduction(ipa)
    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
    asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
    asr_splitted = fame_asr.phone_reduction(asr_splitted)
    return ''.join(asr_splitted)

def ipa2htk(ipa):
    """convert an IPA pronunciation to the htk phoneset, via the asr phoneset."""
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    translation_key_ipa2asr = np.load(os.path.join(curr_dir, 'phoneset', 'fame_ipa2asr.npy')).item(0)
    #translation_key_ipa2asr = np.load(r'c:\Users\Aki\source\repos\acoustic_model\acoustic_model\phoneset\fame_ipa2asr.npy').item(0)

    ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
    ipa_splitted = fame_ipa.phone_reduction(ipa_splitted)
    asr_splitted = convert_phoneset.convert_phoneset(ipa_splitted, translation_key_ipa2asr)
    asr_splitted = fame_asr.phone_reduction(asr_splitted)
    htk_splitted = convert_phoneset.convert_phoneset(asr_splitted, fame_asr.translation_key_asr2htk)
    return ''.join(htk_splitted)

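# A minimal usage sketch (not part of the original module); 'rø:s' is one of the IPA
# pronunciations listed in forced_aligner_comparison.py.
#   print(ipa2htk('rø:s'))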
def performance_on_stimmen(config_dir, stimmen_dir, hmmdefs):
    """run word recognition on the stimmen test set and return the sentence-level accuracy."""
    lattice_file = os.path.join(stimmen_dir, 'word_lattice.ltc')
    hvite_scp = os.path.join(stimmen_dir, 'hvite.scp')
    #fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hvite_scp, file_type='mfc')
    hresult_scp = os.path.join(stimmen_dir, 'hresult.scp')
    #fh.make_filelist(os.path.join(stimmen_dir, 'mfc'), hresult_scp, file_type='rec')
    lexicon_file = os.path.join(stimmen_dir, 'lexicon_recognition.dic')

    # get feature_size from hmmdefs.
    with open(hmmdefs) as f:
        line = f.readline()
        line = f.readline().strip()
        feature_size = int(line.split(' ')[2])

    chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_file, feature_size)

    result = chtk.recognition(
        lattice_file,
        hmmdefs,
        hvite_scp
    )
    per_sentence, per_word = chtk.calc_recognition_performance(hresult_scp)

    return per_sentence['accuracy']
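# A minimal usage sketch (not part of the original module); the directories follow
# fame_hmm.py and the hmmdefs path below is a hypothetical trained model.
#   accuracy = performance_on_stimmen(
#       os.path.join(default.htk_dir, 'config'),
#       os.path.join(default.htk_dir, 'stimmen'),
#       os.path.join(default.htk_dir, 'model', 'mono1', 'iter10', 'hmmdefs'))
#   print(accuracy)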
566
acoustic_model/fame_hmm.py
Normal file
@ -0,0 +1,566 @@
import sys
|
||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
import glob
|
||||||
|
import time
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import fame_functions
|
||||||
|
from phoneset import fame_ipa, fame_asr, fame_phonetics
|
||||||
|
import defaultfiles as default
|
||||||
|
sys.path.append(default.toolbox_dir)
|
||||||
|
import file_handling as fh
|
||||||
|
from htk import pyhtk
|
||||||
|
#from scripts import run_command
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= user define =======================
|
||||||
|
# procedure
|
||||||
|
combine_all = 1
|
||||||
|
|
||||||
|
make_lexicon = 0
|
||||||
|
make_label = 0 # it takes roughly 4800 sec on Surface pro 2.
|
||||||
|
make_mlf = 0
|
||||||
|
extract_features = 0
|
||||||
|
flat_start = 1
|
||||||
|
train_monophone_without_sp = 1
|
||||||
|
add_sp = 1
|
||||||
|
train_monophone_with_re_aligned_mlf = 1
|
||||||
|
increase_mixture = 1
|
||||||
|
train_triphone = 0
|
||||||
|
train_triphone_tied = 0
|
||||||
|
|
||||||
|
|
||||||
|
# pre-defined values.
|
||||||
|
dataset_list = ['devel', 'test', 'train']
|
||||||
|
feature_size = 30
|
||||||
|
improvement_threshold = 0.3
|
||||||
|
|
||||||
|
lexicon_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
|
||||||
|
lexicon_oov = os.path.join(default.fame_dir, 'lexicon', 'lex.oov')
|
||||||
|
|
||||||
|
config_dir = os.path.join(default.htk_dir, 'config')
|
||||||
|
phonelist_full_txt = os.path.join(config_dir, 'phonelist_full.txt')
|
||||||
|
tree_hed = os.path.join(config_dir, 'tree.hed')
|
||||||
|
quests_hed = os.path.join(config_dir, 'quests.hed')
|
||||||
|
|
||||||
|
model_dir = os.path.join(default.htk_dir, 'model')
|
||||||
|
model_mono0_dir = os.path.join(model_dir, 'mono0')
|
||||||
|
model_mono1_dir = os.path.join(model_dir, 'mono1')
|
||||||
|
model_mono1sp_dir = os.path.join(model_dir, 'mono1sp')
|
||||||
|
model_mono1sp2_dir = os.path.join(model_dir, 'mono1sp2')
|
||||||
|
model_tri1_dir = os.path.join(model_dir, 'tri1')
|
||||||
|
model_tri1tied_dir = os.path.join(model_dir, 'tri1tied')
|
||||||
|
|
||||||
|
# directories / files to be made.
|
||||||
|
lexicon_dir = os.path.join(default.htk_dir, 'lexicon')
|
||||||
|
lexicon_htk_asr = os.path.join(lexicon_dir, 'lex.htk_asr')
|
||||||
|
lexicon_htk_oov = os.path.join(lexicon_dir, 'lex.htk_oov')
|
||||||
|
lexicon_htk = os.path.join(lexicon_dir, 'lex.htk')
|
||||||
|
lexicon_htk_with_sp = os.path.join(lexicon_dir, 'lex_with_sp.htk')
|
||||||
|
lexicon_htk_triphone = os.path.join(lexicon_dir, 'lex_triphone.htk')
|
||||||
|
|
||||||
|
feature_dir = os.path.join(default.htk_dir, 'mfc')
|
||||||
|
fh.make_new_directory(feature_dir, existing_dir='leave')
|
||||||
|
tmp_dir = os.path.join(default.htk_dir, 'tmp')
|
||||||
|
fh.make_new_directory(tmp_dir, existing_dir='leave')
|
||||||
|
label_dir = os.path.join(default.htk_dir, 'label')
|
||||||
|
fh.make_new_directory(label_dir, existing_dir='leave')
|
||||||
|
|
||||||
|
|
||||||
|
## training
|
||||||
|
if combine_all:
|
||||||
|
hcompv_scp_train = os.path.join(tmp_dir, 'all.scp')
|
||||||
|
mlf_file_train = os.path.join(label_dir, 'all_phone.mlf')
|
||||||
|
mlf_file_train_word = os.path.join(label_dir, 'all_word.mlf')
|
||||||
|
mlf_file_train_with_sp = os.path.join(label_dir, 'all_phone_with_sp.mlf')
|
||||||
|
mlf_file_train_aligned = os.path.join(label_dir, 'all_phone_aligned.mlf')
|
||||||
|
triphone_mlf = os.path.join(label_dir, 'all_triphone.mlf')
|
||||||
|
else:
|
||||||
|
hcompv_scp_train = os.path.join(tmp_dir, 'train.scp')
|
||||||
|
mlf_file_train = os.path.join(label_dir, 'train_phone.mlf')
|
||||||
|
mlf_file_train_word = os.path.join(label_dir, 'train_word.mlf')
|
||||||
|
mlf_file_train_with_sp = os.path.join(label_dir, 'train_phone_with_sp.mlf')
|
||||||
|
mlf_file_train_aligned = os.path.join(label_dir, 'train_phone_aligned.mlf')
|
||||||
|
triphone_mlf = os.path.join(label_dir, 'train_triphone.mlf')
|
||||||
|
hcompv_scp_train_updated = hcompv_scp_train.replace('.scp', '_updated.scp')
|
||||||
|
|
||||||
|
## testing
|
||||||
|
htk_stimmen_dir = os.path.join(default.htk_dir, 'stimmen')
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make lexicon for HTK =======================
|
||||||
|
if make_lexicon:
|
||||||
|
timer_start = time.time()
|
||||||
|
print('==== making lexicon for HTK ====')
|
||||||
|
|
||||||
|
# convert each lexicon from fame_asr phoneset to fame_htk phoneset.
|
||||||
|
print('>>> converting each lexicon from fame_asr phoneset to fame_htk phoneset...')
|
||||||
|
fame_functions.lexicon_asr2htk(lexicon_asr, lexicon_htk_asr)
|
||||||
|
fame_functions.lexicon_asr2htk(lexicon_oov, lexicon_htk_oov)
|
||||||
|
|
||||||
|
# combine lexicon
|
||||||
|
print('>>> combining lexicon files into one lexicon...')
|
||||||
|
# pronunciations which are not found in lex.asr are generated using G2P and listed in lex.oov.
|
||||||
|
# therefore there is no overlap between lex_asr and lex_oov.
|
||||||
|
fame_functions.combine_lexicon(lexicon_htk_asr, lexicon_htk_oov, lexicon_htk)
|
||||||
|
|
||||||
|
## fixing the lexicon for HTK.
|
||||||
|
# (1) Replace all tabs with single space;
|
||||||
|
# (2) Put a '\' before any dictionary entry beginning with single quote
|
||||||
|
# http://electroblaze.blogspot.nl/2013/03/understanding-htk-error-messages.html
|
||||||
|
print('>>> fixing the lexicon...')
|
||||||
|
fame_functions.fix_lexicon(lexicon_htk)
|
||||||
|
|
||||||
|
## adding sp to the lexicon for HTK.
|
||||||
|
print('>>> adding sp to the lexicon...')
|
||||||
|
with open(lexicon_htk) as f:
|
||||||
|
lines = f.read().split('\n')
|
||||||
|
with open(lexicon_htk_with_sp, 'wb') as f:
|
||||||
|
f.write(bytes(' sp\n'.join(lines), 'ascii'))
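# Note (sketch, not part of the original script): joining the lines with ' sp\n'
# appends the short-pause phone to every pronunciation, e.g. a hypothetical entry
#   WOLK  w o l k
# becomes
#   WOLK  w o l k sp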
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## initialize the instance for HTK.
|
||||||
|
chtk = pyhtk.HTK(config_dir, fame_asr.phoneset_htk, lexicon_htk_with_sp, feature_size)
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make label files =======================
|
||||||
|
if make_label:
|
||||||
|
for dataset in dataset_list:
|
||||||
|
timer_start = time.time()
|
||||||
|
print("==== making label files on dataset {}".format(dataset))
|
||||||
|
|
||||||
|
script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
|
||||||
|
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
|
||||||
|
label_dir_ = os.path.join(label_dir, dataset)
|
||||||
|
dictionary_file = os.path.join(label_dir_, 'temp.dic')
|
||||||
|
fh.make_new_directory(label_dir_, existing_dir='leave')
|
||||||
|
|
||||||
|
# list of scripts
|
||||||
|
with open(script_list, "rt", encoding="utf-8") as fin:
|
||||||
|
scripts = fin.read().split('\n')
|
||||||
|
|
||||||
|
for line in scripts:
|
||||||
|
# sample line:
|
||||||
|
# sp0457m_test_1968_plakkenfryslanterhorne_2168 en dan begjinne je natuerlik
|
||||||
|
filename_ = line.split(' ')[0]
|
||||||
|
filename = '_'.join(filename_.split('_')[1:])
|
||||||
|
sentence = ' '.join(line.split(' ')[1:])
|
||||||
|
sentence_htk = fame_functions.word2htk(sentence)
|
||||||
|
|
||||||
|
wav_file = os.path.join(wav_dir_, filename + '.wav')
|
||||||
|
if os.path.exists(wav_file) and chtk.can_be_ascii(sentence_htk) == 0:
|
||||||
|
if chtk.get_number_of_missing_words(
|
||||||
|
sentence_htk, dictionary_file) == 0:
|
||||||
|
# when the file name is too long, the HDMan command does not work.
# therefore a temporary dictionary_file is made first, then renamed.
|
||||||
|
shutil.move(dictionary_file, os.path.join(label_dir_, filename + '.dic'))
|
||||||
|
|
||||||
|
label_file = os.path.join(label_dir_, filename + '.lab')
|
||||||
|
chtk.make_label_file(sentence_htk, label_file)
|
||||||
|
else:
|
||||||
|
os.remove(dictionary_file)
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make master label files =======================
|
||||||
|
if make_mlf:
|
||||||
|
timer_start = time.time()
|
||||||
|
print("==== making master label files ====")
|
||||||
|
|
||||||
|
# train_2002_gongfansaken_10347.lab is empty and should be removed.
|
||||||
|
empty_lab_file = os.path.join(label_dir, 'train', 'train_2002_gongfansaken_10347.lab')
|
||||||
|
empty_dic_file = empty_lab_file.replace('.lab', '.dic')
|
||||||
|
|
||||||
|
if os.path.exists(empty_lab_file):
|
||||||
|
os.remove(empty_lab_file)
|
||||||
|
if os.path.exists(empty_dic_file):
|
||||||
|
os.remove(empty_dic_file)
|
||||||
|
|
||||||
|
for dataset in dataset_list:
|
||||||
|
feature_dir_ = os.path.join(feature_dir, dataset)
|
||||||
|
label_dir_ = os.path.join(label_dir, dataset)
|
||||||
|
mlf_word = os.path.join(label_dir, dataset + '_word.mlf')
|
||||||
|
mlf_phone = os.path.join(label_dir, dataset + '_phone.mlf')
|
||||||
|
mlf_phone_with_sp = os.path.join(label_dir, dataset + '_phone_with_sp.mlf')
|
||||||
|
|
||||||
|
print(">>> generating a word level mlf file for {}...".format(dataset))
|
||||||
|
chtk.label2mlf(label_dir_, mlf_word)
|
||||||
|
|
||||||
|
print(">>> generating a phone level mlf file for {}...".format(dataset))
|
||||||
|
chtk.mlf_word2phone(mlf_phone, mlf_word, with_sp=False)
|
||||||
|
chtk.mlf_word2phone(mlf_phone_with_sp, mlf_word, with_sp=True)
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= extract features =======================
|
||||||
|
if extract_features:
|
||||||
|
for dataset in dataset_list:
|
||||||
|
timer_start = time.time()
|
||||||
|
print('==== extract features on dataset {} ===='.format(dataset))
|
||||||
|
|
||||||
|
wav_dir_ = os.path.join(default.fame_dir, 'fame', 'wav', dataset)
|
||||||
|
label_dir_ = os.path.join(label_dir, dataset)
|
||||||
|
feature_dir_ = os.path.join(feature_dir, dataset)
|
||||||
|
fh.make_new_directory(feature_dir_, existing_dir='delete')
|
||||||
|
|
||||||
|
# a script file for HCopy
|
||||||
|
print(">>> making a script file for HCopy...")
|
||||||
|
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
|
||||||
|
hcopy_scp.close()
|
||||||
|
|
||||||
|
# get a list of features (hcopy.scp)
|
||||||
|
# from the filelist in FAME! corpus.
|
||||||
|
#fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
|
||||||
|
# from the list of label files.
|
||||||
|
lab_list = glob.glob(os.path.join(label_dir_, '*.lab'))
|
||||||
|
feature_list = [
|
||||||
|
os.path.join(wav_dir_, os.path.basename(lab_file).replace('.lab', '.wav')) + '\t'
|
||||||
|
+ os.path.join(feature_dir_, os.path.basename(lab_file).replace('.lab', '.mfc'))
|
||||||
|
for lab_file in lab_list]
|
||||||
|
|
||||||
|
#if os.path.exists(empty_mfc_file):
|
||||||
|
# os.remove(empty_mfc_file)
|
||||||
|
with open(hcopy_scp.name, 'wb') as f:
|
||||||
|
f.write(bytes('\n'.join(feature_list), 'ascii'))
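# Each line of the HCopy script pairs a source wav file with the feature file to be
# written, separated by a tab (the same format as make_hcopy_scp_from_filelist_in_fame
# in fame_functions.py), e.g. <wav_dir_>/<name>.wav<TAB><feature_dir_>/<name>.mfc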
|
||||||
|
|
||||||
|
# extract features.
|
||||||
|
print(">>> extracting features on {}...".format(dataset))
|
||||||
|
chtk.wav2mfc(hcopy_scp.name)
|
||||||
|
os.remove(hcopy_scp.name)
|
||||||
|
|
||||||
|
# make hcompv.scp.
|
||||||
|
print(">>> making a script file for {}...".format(dataset))
|
||||||
|
listdir = glob.glob(os.path.join(label_dir_, '*.dic'))
|
||||||
|
mfc_list = [filename.replace(label_dir_, feature_dir_).replace('.dic', '.mfc') for filename in listdir]
|
||||||
|
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
|
||||||
|
with open(hcompv_scp, 'wb') as f:
|
||||||
|
f.write(bytes('\n'.join(mfc_list) + '\n', 'ascii'))
|
||||||
|
|
||||||
|
print(">>> extracting features on stimmen...")
|
||||||
|
chtk.wav2mfc(os.path.join(htk_stimmen_dir, 'hcopy.scp'))
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= flat start monophones =======================
|
||||||
|
if combine_all:
|
||||||
|
# script files.
|
||||||
|
fh.concatenate(
|
||||||
|
os.path.join(tmp_dir, 'devel.scp'),
|
||||||
|
os.path.join(tmp_dir, 'test.scp'),
|
||||||
|
hcompv_scp_train
|
||||||
|
)
|
||||||
|
fh.concatenate(
|
||||||
|
hcompv_scp_train,
|
||||||
|
os.path.join(tmp_dir, 'train.scp'),
|
||||||
|
hcompv_scp_train
|
||||||
|
)
|
||||||
|
|
||||||
|
# phone level mlfs.
|
||||||
|
fh.concatenate(
|
||||||
|
os.path.join(label_dir, 'devel_phone.mlf'),
|
||||||
|
os.path.join(label_dir, 'test_phone.mlf'),
|
||||||
|
mlf_file_train
|
||||||
|
)
|
||||||
|
fh.concatenate(
|
||||||
|
mlf_file_train,
|
||||||
|
os.path.join(label_dir, 'train_phone.mlf'),
|
||||||
|
mlf_file_train
|
||||||
|
)
|
||||||
|
|
||||||
|
# phone level mlfs with sp.
|
||||||
|
fh.concatenate(
|
||||||
|
os.path.join(label_dir, 'devel_phone_with_sp.mlf'),
|
||||||
|
os.path.join(label_dir, 'test_phone_with_sp.mlf'),
|
||||||
|
mlf_file_train_with_sp
|
||||||
|
)
|
||||||
|
fh.concatenate(
|
||||||
|
mlf_file_train_with_sp,
|
||||||
|
os.path.join(label_dir, 'train_phone_with_sp.mlf'),
|
||||||
|
mlf_file_train_with_sp
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# word level mlfs.
|
||||||
|
fh.concatenate(
|
||||||
|
os.path.join(label_dir, 'devel_word.mlf'),
|
||||||
|
os.path.join(label_dir, 'test_word.mlf'),
|
||||||
|
mlf_file_train_word
|
||||||
|
)
|
||||||
|
fh.concatenate(
|
||||||
|
mlf_file_train_word,
|
||||||
|
os.path.join(label_dir, 'train_word.mlf'),
|
||||||
|
mlf_file_train_word
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= flat start monophones =======================
|
||||||
|
if flat_start:
|
||||||
|
timer_start = time.time()
|
||||||
|
print('==== flat start ====')
|
||||||
|
fh.make_new_directory(model_mono0_dir, existing_dir='leave')
|
||||||
|
|
||||||
|
chtk.flat_start(hcompv_scp_train, model_mono0_dir)
|
||||||
|
|
||||||
|
# make macros.
|
||||||
|
vFloors = os.path.join(model_mono0_dir, 'vFloors')
|
||||||
|
if os.path.exists(vFloors):
|
||||||
|
chtk.make_macros(vFloors)
|
||||||
|
|
||||||
|
# allocate mean & variance to all phones in the phone list
|
||||||
|
print('>>> allocating mean & variance to all phones in the phone list...')
|
||||||
|
chtk.make_hmmdefs(model_mono0_dir)
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= train model without short pause =======================
|
||||||
|
if train_monophone_without_sp:
|
||||||
|
print('==== train monophone without sp ====')
|
||||||
|
|
||||||
|
timer_start = time.time()
|
||||||
|
niter = chtk.re_estimation_until_saturated(
|
||||||
|
model_mono1_dir,
|
||||||
|
model_mono0_dir, improvement_threshold, hcompv_scp_train,
|
||||||
|
os.path.join(htk_stimmen_dir, 'mfc'),
|
||||||
|
'mfc',
|
||||||
|
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
|
||||||
|
mlf_file=mlf_file_train,
|
||||||
|
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic')
|
||||||
|
)
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= adding sp to the model =======================
|
||||||
|
if add_sp:
|
||||||
|
print('==== adding sp to the model ====')
|
||||||
|
# reference:
|
||||||
|
# http://www.f.waseda.jp/yusukekondo/htk.html#flat_start_estimation
|
||||||
|
timer_start = time.time()
|
||||||
|
|
||||||
|
# make model with sp.
|
||||||
|
print('>>> adding sp state to the last model in the previous step...')
|
||||||
|
fh.make_new_directory(model_mono1sp_dir, existing_dir='leave')
|
||||||
|
niter = chtk.get_niter_max(model_mono1_dir)
|
||||||
|
modeln_dir_pre = os.path.join(model_mono1_dir, 'iter'+str(niter))
|
||||||
|
modeln_dir = os.path.join(model_mono1sp_dir, 'iter0')
|
||||||
|
|
||||||
|
chtk.add_sp(modeln_dir_pre, modeln_dir)
|
||||||
|
|
||||||
|
print('>>> re-estimation...')
|
||||||
|
niter = chtk.re_estimation_until_saturated(
|
||||||
|
model_mono1sp_dir, modeln_dir, improvement_threshold, hcompv_scp_train,
|
||||||
|
os.path.join(htk_stimmen_dir, 'mfc'),
|
||||||
|
'mfc',
|
||||||
|
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
|
||||||
|
mlf_file=mlf_file_train_with_sp,
|
||||||
|
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
|
||||||
|
model_type='monophone_with_sp'
|
||||||
|
)
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= train model with re-aligned mlf =======================
|
||||||
|
if train_monophone_with_re_aligned_mlf:
|
||||||
|
print('==== train monophone with re-aligned mlf ====')
|
||||||
|
timer_start = time.time()
|
||||||
|
|
||||||
|
print('>>> re-aligning the training data... ')
|
||||||
|
niter = chtk.get_niter_max(model_mono1sp_dir)
|
||||||
|
modeln_dir = os.path.join(model_mono1sp_dir, 'iter'+str(niter))
|
||||||
|
chtk.make_aligned_label(
|
||||||
|
os.path.join(modeln_dir, 'macros'),
|
||||||
|
os.path.join(modeln_dir, 'hmmdefs'),
|
||||||
|
mlf_file_train_aligned,
|
||||||
|
mlf_file_train_word,
|
||||||
|
hcompv_scp_train)
|
||||||
|
chtk.fix_mlf(mlf_file_train_aligned)
|
||||||
|
|
||||||
|
print('>>> updating the script file... ')
|
||||||
|
chtk.update_script_file(
|
||||||
|
mlf_file_train_aligned,
|
||||||
|
mlf_file_train_with_sp,
|
||||||
|
hcompv_scp_train,
|
||||||
|
hcompv_scp_train_updated)
|
||||||
|
|
||||||
|
print('>>> re-estimation... ')
|
||||||
|
timer_start = time.time()
|
||||||
|
fh.make_new_directory(model_mono1sp2_dir, existing_dir='leave')
|
||||||
|
niter = chtk.get_niter_max(model_mono1sp_dir)
|
||||||
|
niter = chtk.re_estimation_until_saturated(
|
||||||
|
model_mono1sp2_dir,
|
||||||
|
os.path.join(model_mono1sp_dir, 'iter'+str(niter)),
|
||||||
|
improvement_threshold,
|
||||||
|
hcompv_scp_train_updated,
|
||||||
|
os.path.join(htk_stimmen_dir, 'mfc'),
|
||||||
|
'mfc',
|
||||||
|
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
|
||||||
|
mlf_file=mlf_file_train_aligned,
|
||||||
|
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
|
||||||
|
model_type='monophone_with_sp'
|
||||||
|
)
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= increase mixture =======================
|
||||||
|
if increase_mixture:
|
||||||
|
print('==== increase mixture ====')
|
||||||
|
timer_start = time.time()
|
||||||
|
for nmix in [2, 4, 8, 16]:
|
||||||
|
if nmix == 2:
|
||||||
|
modeln_dir_ = model_mono1sp2_dir
|
||||||
|
else:
|
||||||
|
modeln_dir_ = os.path.join(model_dir, 'mono'+str(nmix_))
|
||||||
|
modeln_dir = os.path.join(model_dir, 'mono'+str(nmix))
|
||||||
|
|
||||||
|
print('mixture: {}'.format(nmix))
|
||||||
|
fh.make_new_directory(modeln_dir, existing_dir='delete')
|
||||||
|
niter = chtk.get_niter_max(modeln_dir_)
|
||||||
|
chtk.increase_mixture(
|
||||||
|
os.path.join(modeln_dir_, 'iter'+str(niter), 'hmmdefs'),
|
||||||
|
nmix,
|
||||||
|
os.path.join(modeln_dir, 'iter0'),
|
||||||
|
model_type='monophone_with_sp')
|
||||||
|
shutil.copy2(os.path.join(modeln_dir_, 'iter'+str(niter), 'macros'),
|
||||||
|
os.path.join(modeln_dir, 'iter0', 'macros'))
|
||||||
|
|
||||||
|
#improvement_threshold = -10
|
||||||
|
niter = chtk.re_estimation_until_saturated(
|
||||||
|
modeln_dir,
|
||||||
|
os.path.join(modeln_dir, 'iter0'),  # start from the mixture-increased model written above.
|
||||||
|
improvement_threshold,
|
||||||
|
hcompv_scp_train_updated,
|
||||||
|
os.path.join(htk_stimmen_dir, 'mfc'),
|
||||||
|
'mfc',
|
||||||
|
os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
|
||||||
|
mlf_file=mlf_file_train_aligned,
|
||||||
|
lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
|
||||||
|
model_type='monophone_with_sp'
|
||||||
|
)
|
||||||
|
nmix_ = nmix
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= train triphone =======================
|
||||||
|
print('>>> making triphone list... ')
|
||||||
|
chtk.make_triphonelist(
|
||||||
|
mlf_file_train_aligned,
|
||||||
|
triphone_mlf)
|
||||||
|
|
||||||
|
if train_triphone:
|
||||||
|
print('==== train triphone model ====')
|
||||||
|
timer_start = time.time()
|
||||||
|
|
||||||
|
print('>>> init triphone model... ')
|
||||||
|
niter = chtk.get_niter_max(model_mono1sp2_dir)
|
||||||
|
fh.make_new_directory(os.path.join(model_tri1_dir, 'iter0'), existing_dir='leave')
|
||||||
|
chtk.init_triphone(
|
||||||
|
os.path.join(model_mono1sp2_dir, 'iter'+str(niter)),
|
||||||
|
os.path.join(model_tri1_dir, 'iter0')
|
||||||
|
)
|
||||||
|
|
||||||
|
print('>>> re-estimation... ')
|
||||||
|
## I wanted to train until saturated:
|
||||||
|
#niter = chtk.re_estimation_until_saturated(
|
||||||
|
# model_tri1_dir,
|
||||||
|
# os.path.join(model_tri1_dir, 'iter0'),
|
||||||
|
# improvement_threshold,
|
||||||
|
# hcompv_scp_train_updated,
|
||||||
|
# os.path.join(htk_stimmen_dir, 'mfc'),
|
||||||
|
# 'mfc',
|
||||||
|
# os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
|
||||||
|
# mlf_file=triphone_mlf,
|
||||||
|
# lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
|
||||||
|
# model_type='triphone'
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# but because the data size is limited, some triphones cannot be trained and the following error is raised:
# ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
# therefore re-estimation is only performed three times.
|
||||||
|
output_dir = model_tri1_dir
|
||||||
|
for niter in range(1, 4):
|
||||||
|
hmm_n = 'iter' + str(niter)
|
||||||
|
hmm_n_pre = 'iter' + str(niter-1)
|
||||||
|
_modeln_dir = os.path.join(output_dir, hmm_n)
|
||||||
|
_modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
|
||||||
|
|
||||||
|
fh.make_new_directory(_modeln_dir, 'leave')
|
||||||
|
chtk.re_estimation(
|
||||||
|
os.path.join(_modeln_dir_pre, 'hmmdefs'),
|
||||||
|
_modeln_dir,
|
||||||
|
hcompv_scp_train_updated,
|
||||||
|
mlf_file=triphone_mlf,
|
||||||
|
macros=os.path.join(_modeln_dir_pre, 'macros'),
|
||||||
|
model_type='triphone')
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= train tied-state triphones =======================
|
||||||
|
if train_triphone_tied:
|
||||||
|
print('==== train tied-state triphones ====')
|
||||||
|
timer_start = time.time()
|
||||||
|
|
||||||
|
print('>>> making lexicon for triphone... ')
|
||||||
|
chtk.make_lexicon_triphone(phonelist_full_txt, lexicon_htk_triphone)
|
||||||
|
chtk.combine_phonelists(phonelist_full_txt)
|
||||||
|
|
||||||
|
print('>>> making a tree header... ')
|
||||||
|
fame_phonetics.make_quests_hed(quests_hed)
|
||||||
|
stats = os.path.join(r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\model\tri1\iter3', 'stats')
|
||||||
|
chtk.make_tree_header(tree_hed, quests_hed, stats, config_dir)
|
||||||
|
|
||||||
|
print('>>> init triphone model... ')
|
||||||
|
niter = chtk.get_niter_max(model_tri1_dir)
|
||||||
|
fh.make_new_directory(os.path.join(model_tri1tied_dir, 'iter0'), existing_dir='leave')
|
||||||
|
chtk.init_triphone(
|
||||||
|
os.path.join(model_tri1_dir, 'iter'+str(niter)),
|
||||||
|
os.path.join(model_tri1tied_dir, 'iter0'),
|
||||||
|
tied=True)
|
||||||
|
|
||||||
|
# I wanted to train until saturated:
|
||||||
|
#niter = chtk.re_estimation_until_saturated(
|
||||||
|
# model_tri1tied_dir,
|
||||||
|
# os.path.join(model_tri1tied_dir, 'iter0'),
|
||||||
|
# improvement_threshold,
|
||||||
|
# hcompv_scp_train_updated,
|
||||||
|
# os.path.join(htk_stimmen_dir, 'mfc'),
|
||||||
|
# 'mfc',
|
||||||
|
# os.path.join(htk_stimmen_dir, 'word_lattice.ltc'),
|
||||||
|
# mlf_file=triphone_mlf,
|
||||||
|
# lexicon=os.path.join(htk_stimmen_dir, 'lexicon_recognition.dic'),
|
||||||
|
# model_type='triphone'
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# but because the data size is limited, some triphones cannot be trained and the following error is raised:
# ERROR [+8231] GetHCIModel: Cannot find hmm [i:-]r[+???]
# therefore re-estimation is only performed three times.
|
||||||
|
output_dir = model_tri1tied_dir
|
||||||
|
for niter in range(1, 4):
|
||||||
|
hmm_n = 'iter' + str(niter)
|
||||||
|
hmm_n_pre = 'iter' + str(niter-1)
|
||||||
|
_modeln_dir = os.path.join(output_dir, hmm_n)
|
||||||
|
_modeln_dir_pre = os.path.join(output_dir, hmm_n_pre)
|
||||||
|
|
||||||
|
fh.make_new_directory(_modeln_dir, 'leave')
|
||||||
|
chtk.re_estimation(
|
||||||
|
os.path.join(_modeln_dir_pre, 'hmmdefs'),
|
||||||
|
_modeln_dir,
|
||||||
|
hcompv_scp_train_updated,
|
||||||
|
mlf_file=triphone_mlf,
|
||||||
|
macros=os.path.join(_modeln_dir_pre, 'macros'),
|
||||||
|
model_type='triphone')
|
||||||
|
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
138
acoustic_model/fame_test.py
Normal file
@ -0,0 +1,138 @@
import sys
|
||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
from collections import Counter
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import fame_functions
|
||||||
|
import defaultfiles as default
|
||||||
|
sys.path.append(default.toolbox_dir)
|
||||||
|
from phoneset import fame_ipa, fame_asr
|
||||||
|
import convert_phoneset
|
||||||
|
|
||||||
|
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
|
||||||
|
lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
|
||||||
|
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
|
||||||
|
lexicon_htk = os.path.join(default.htk_dir, 'lexicon', 'lex.htk')
|
||||||
|
|
||||||
|
## check if all the phones in lexicon.ipa are in fame_ipa.py.
|
||||||
|
#timer_start = time.time()
|
||||||
|
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
|
||||||
|
#phoneset_py = fame_ipa.phoneset
|
||||||
|
#print("phones which is in lexicon.ipa but not in fame_ipa.py:\n{}".format(
|
||||||
|
# set(phoneset_lex) - set(phoneset_py)))
|
||||||
|
#print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
# check which word has the phone.
|
||||||
|
#timer_start = time.time()
|
||||||
|
#extracted = find_phone(lexicon_ipa, 'ⁿ')
|
||||||
|
#print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## get the correspondence between lex_ipa and lex_asr.
|
||||||
|
lex_asr = fame_functions.load_lexicon(lexicon_asr)
|
||||||
|
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
|
||||||
|
if 0:
|
||||||
|
timer_start = time.time()
|
||||||
|
translation_key_ipa2asr, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
|
||||||
|
print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
np.save(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy'), translation_key_ipa2asr)
|
||||||
|
np.save(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'), phone_unknown)
|
||||||
|
else:
|
||||||
|
translation_key_ipa2asr = np.load(os.path.join('phoneset', 'output_get_translation_key_translation_key.npy')).item()
|
||||||
|
phone_unknown = np.load(os.path.join('phoneset', 'output_get_translation_key_phone_unknown.npy'))
|
||||||
|
phone_unknown = list(phone_unknown)
|
||||||
|
|
||||||
|
# manually check the correspondence for the phone in phone_unknown.
|
||||||
|
#p = phone_unknown[0]
|
||||||
|
#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
|
||||||
|
|
||||||
|
#for word in lex_ipa_['word']:
|
||||||
|
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
|
||||||
|
# if np.sum(lex_asr['word'] == word) > 0:
|
||||||
|
# asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
|
||||||
|
|
||||||
|
# ipa_list = convert_phone_set.split_word(ipa, fame_ipa.multi_character_phones)
|
||||||
|
# asr_list = asr.split(' ')
|
||||||
|
# if p in ipa_list and (len(ipa_list) == len(asr_list)):
|
||||||
|
# print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
|
||||||
|
# for ipa_, asr_ in zip(ipa_list, asr_list):
|
||||||
|
# if ipa_ in phone_unknown:
|
||||||
|
# translation_key_ipa2asr[ipa_] = asr_
|
||||||
|
# phone_unknown.remove(ipa_)
|
||||||
|
|
||||||
|
translation_key_ipa2asr['ə:'] = 'ə'
|
||||||
|
translation_key_ipa2asr['r.'] = 'r'
|
||||||
|
translation_key_ipa2asr['r:'] = 'r'
|
||||||
|
# added for stimmen.
|
||||||
|
translation_key_ipa2asr['ɪ:'] = 'ɪ:'
|
||||||
|
translation_key_ipa2asr['y:'] = 'y'
|
||||||
|
|
||||||
|
np.save(os.path.join('phoneset', 'fame_ipa2asr.npy'), translation_key_ipa2asr)
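# A minimal check sketch (not part of the original script): the saved key is the one that
# fame_functions.ipa2asr loads from phoneset/fame_ipa2asr.npy, so it can be exercised as
#   print(fame_functions.ipa2asr('rø:s'))  # 'rø:s' is an IPA pronunciation used elsewhere in this repo.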
|
||||||
|
|
||||||
|
|
||||||
|
## check if all the phones in lexicon.asr are in translation_key_ipa2asr.
|
||||||
|
#timer_start = time.time()
|
||||||
|
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_asr, phoneset='asr')
|
||||||
|
#phoneset_lex.remove("")
|
||||||
|
#phoneset_asr = list(set(translation_key_ipa2asr.values()))
|
||||||
|
#print("phones which is in lexicon.asr but not in the translation_key_ipa2asr:\n{}".format(
|
||||||
|
# set(phoneset_lex) - set(phoneset_asr)))
|
||||||
|
#print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
|
||||||
|
## check if all the phones in lexicon.htk are in fame_asr.py.
|
||||||
|
#timer_start = time.time()
|
||||||
|
#phoneset_htk = fame_asr.phoneset_htk
|
||||||
|
#phoneset_lex = fame_functions.get_phoneset_from_lexicon(lexicon_htk)
|
||||||
|
#phoneset_lex.remove('')
|
||||||
|
#print("phones which is in lexicon.htk but not in the fame_asr.py are:\n{}".format(
|
||||||
|
# set(phoneset_htk) - set(phoneset_lex)))
|
||||||
|
#print("elapsed time: {}".format(time.time() - timer_start))
|
||||||
|
|
||||||
|
## statistics over the lexicon
|
||||||
|
#lex_htk = fame_functions.load_lexicon(lexicon_htk)
|
||||||
|
#phones_all = (' '.join(lex_htk['pronunciation'])).split(' ')
|
||||||
|
#c = Counter(phones_all)
|
||||||
|
|
||||||
|
#lexicon_out = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\lexicon\lex.htk2'
|
||||||
|
#for i in lex_htk[lex_htk['word'].str.startswith('\'')].index.values:
|
||||||
|
# lex_htk.iat[i, 0] = lex_htk.iat[i, 0].replace('\'', '\\\'')
|
||||||
|
## to_csv does not work with a space separator, therefore all tabs should be replaced manually.
|
||||||
|
##lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep=' ', quoting=csv.QUOTE_NONE, escapechar='\\')
|
||||||
|
#lex_htk.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
|
||||||
|
|
||||||
|
|
||||||
|
## check which letters are not coded in ascii.
|
||||||
|
#print('asr phones which cannot be coded in ascii:\n')
|
||||||
|
#for i in fame_asr.phoneset_short:
|
||||||
|
# try:
|
||||||
|
# i_encoded = i.encode("ascii")
|
||||||
|
# #print("{0} --> {1}".format(i, i.encode("ascii")))
|
||||||
|
# except UnicodeEncodeError:
|
||||||
|
# print(">>> {}".format(i))
|
||||||
|
|
||||||
|
#print("letters in the scripts which is not coded in ascii:\n")
|
||||||
|
#for dataset in ['train', 'devel', 'test']:
|
||||||
|
# timer_start = time.time()
|
||||||
|
|
||||||
|
# script_list = os.path.join(default.fame_dir, 'data', dataset, 'text')
|
||||||
|
# with open(script_list, "rt", encoding="utf-8") as fin:
|
||||||
|
# scripts = fin.read().split('\n')
|
||||||
|
|
||||||
|
# for line in scripts:
|
||||||
|
# sentence = ' '.join(line.split(' ')[1:])
|
||||||
|
# sentence_htk = fame_functions.word2htk(sentence)
|
||||||
|
|
||||||
|
# #if len(re.findall(r'[âêôûč\'àéèúćäëïöü]', sentence))==0:
|
||||||
|
# try:
|
||||||
|
# sentence_htk = bytes(sentence_htk, 'ascii')
|
||||||
|
# except UnicodeEncodeError:
|
||||||
|
# print(sentence)
|
||||||
|
# print(sentence_htk)
|
||||||
|
|
42
acoustic_model/forced_aligner_comparison.py
Normal file
@ -0,0 +1,42 @@
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import defaultfiles as default
|
||||||
|
sys.path.append(default.forced_alignment_module_dir)
|
||||||
|
from forced_alignment import pyhtk, convert_phone_set, scripts
|
||||||
|
|
||||||
|
reus_dir = r'c:\Users\Aki\source\repos\acoustic_model\reus-test'
|
||||||
|
wav_dir = reus_dir
|
||||||
|
wav_files = ['reus1008-reus.wav',
|
||||||
|
'reus1167-man.wav',
|
||||||
|
'reus3768-mantsje.wav']
|
||||||
|
|
||||||
|
word = 'reus'
|
||||||
|
pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə']
|
||||||
|
|
||||||
|
for wav_file in wav_files:
|
||||||
|
file_lab = os.path.join(reus_dir, wav_file.replace('.wav', '.lab'))
|
||||||
|
file_dic = os.path.join(reus_dir, wav_file.replace('.wav', '.dic'))
|
||||||
|
file_txt = os.path.join(reus_dir, wav_file.replace('.wav', '.txt'))
|
||||||
|
|
||||||
|
# output htk dict file
|
||||||
|
with open(file_dic, 'w', encoding="utf-8") as f:
|
||||||
|
for ipa in pronunciation_ipa:
|
||||||
|
cgn = convert_phone_set.ipa2cgn([ipa.replace(':', 'ː')])
|
||||||
|
barbara = convert_phone_set.cgn2barbara(cgn)
|
||||||
|
f.write(word.upper() + '\t' + barbara + '\n')
|
||||||
|
|
||||||
|
# output htk label file.
|
||||||
|
pyhtk._create_label_file(word, file_lab)
|
||||||
|
|
||||||
|
scripts.run_command([
|
||||||
|
'HVite','-T', '1',
|
||||||
|
'-a',
|
||||||
|
'-C', default.config_hvite,
|
||||||
|
'-H', default.acoustic_model,
|
||||||
|
'-m',
|
||||||
|
'-i', file_txt,
|
||||||
|
#'-S', script_file,
|
||||||
|
file_dic, default.phonelist_txt, os.path.join(wav_dir, wav_file)
|
||||||
|
])
|
587
acoustic_model/htk_vs_kaldi.py
Normal file
@ -0,0 +1,587 @@
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
import sys
|
||||||
|
|
||||||
|
#import csv
|
||||||
|
#import subprocess
|
||||||
|
#from collections import Counter
|
||||||
|
#import re
|
||||||
|
import shutil
|
||||||
|
import glob
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from collections import Counter
|
||||||
|
#import matplotlib.pyplot as plt
|
||||||
|
#from sklearn.metrics import confusion_matrix
|
||||||
|
|
||||||
|
#import acoustic_model_functions as am_func
|
||||||
|
#import convert_xsampa2ipa
|
||||||
|
import defaultfiles as default
|
||||||
|
|
||||||
|
#from forced_alignment import pyhtk
|
||||||
|
#sys.path.append(default.forced_alignment_module_dir)
|
||||||
|
#from forced_alignment import convert_phone_set
|
||||||
|
#import acoustic_model_functions as am_func
|
||||||
|
import convert_xsampa2ipa
|
||||||
|
import stimmen_functions
|
||||||
|
import fame_functions
|
||||||
|
import convert_phoneset
|
||||||
|
from phoneset import fame_ipa, fame_asr
|
||||||
|
sys.path.append(default.toolbox_dir)
|
||||||
|
import file_handling as fh
|
||||||
|
from htk import pyhtk
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= user define =======================
|
||||||
|
#excel_file = os.path.join(default.experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
|
||||||
|
#data_dir = os.path.join(default.experiments_dir, 'stimmen', 'data')
|
||||||
|
|
||||||
|
#wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen' # 16k
|
||||||
|
|
||||||
|
#acoustic_model_dir = os.path.join(default.experiments_dir, 'friesian', 'acoustic_model', 'model')
|
||||||
|
#htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
|
||||||
|
#fa_dir = os.path.join(default.experiments_dir, 'stimmen', 'FA_44k')
|
||||||
|
#result_dir = os.path.join(default.experiments_dir, 'stimmen', 'result')
|
||||||
|
|
||||||
|
#kaldi_data_dir = os.path.join(default.kaldi_dir, 'data', 'alignme')
|
||||||
|
#kaldi_dict_dir = os.path.join(default.kaldi_dir, 'data', 'local', 'dict')
|
||||||
|
#lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
|
||||||
|
|
||||||
|
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
|
||||||
|
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')
|
||||||
|
|
||||||
|
# procedure
|
||||||
|
make_dic_file = 0
|
||||||
|
make_HTK_files = 0
|
||||||
|
extract_features = 0
|
||||||
|
#make_htk_dict_files = 0
|
||||||
|
do_forced_alignment_htk = 0
|
||||||
|
#eval_forced_alignment_htk = 0
|
||||||
|
make_kaldi_files = 0
|
||||||
|
#make_kaldi_lexicon_txt = 0
|
||||||
|
#load_forced_alignment_kaldi = 1
|
||||||
|
#eval_forced_alignment_kaldi = 1
|
||||||
|
|
||||||
|
#sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
|
||||||
|
#from forced_alignment import convert_phone_set
|
||||||
|
#from forced_alignment import pyhtk
|
||||||
|
|
||||||
|
#sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
|
||||||
|
#from evaluation import plot_confusion_matrix
|
||||||
|
|
||||||
|
## HTK related files.
|
||||||
|
config_dir = os.path.join(default.htk_dir, 'config')
|
||||||
|
model_dir = os.path.join(default.htk_dir, 'model')
|
||||||
|
feature_dir = os.path.join(default.htk_dir, 'mfc', 'stimmen')
|
||||||
|
|
||||||
|
config_hcopy = os.path.join(config_dir, 'config.HCopy')
|
||||||
|
|
||||||
|
# files to be made.
|
||||||
|
lattice_file = os.path.join(config_dir, 'stimmen.ltc')
|
||||||
|
phonelist_txt = os.path.join(config_dir, 'phonelist.txt')
|
||||||
|
stimmen_dic = os.path.join(default.htk_dir, 'lexicon', 'stimmen_recognition.dic')
|
||||||
|
hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hcopy.scp')
|
||||||
|
hvite_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_hvite.scp')
|
||||||
|
hresult_scp = os.path.join(default.htk_dir, 'tmp', 'stimmen_test_result.scp')
|
||||||
|
|
||||||
|
|
||||||
|
## Kaldi related files.
|
||||||
|
kaldi_data_dir = os.path.join(default.kaldi_dir, 'data')
|
||||||
|
|
||||||
|
# files to be made.
|
||||||
|
wav_scp = os.path.join(kaldi_data_dir, 'test', 'wav.scp')
|
||||||
|
text_file = os.path.join(kaldi_data_dir, 'test', 'text')
|
||||||
|
utt2spk = os.path.join(kaldi_data_dir, 'test', 'utt2spk')
|
||||||
|
corpus_txt = os.path.join(kaldi_data_dir, 'local', 'corpus.txt')
|
||||||
|
lexicon_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'lexicon.txt')
|
||||||
|
nonsilence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'nonsilence_phones.txt')
|
||||||
|
silence_phones_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'silence_phones.txt')
|
||||||
|
optional_silence_txt = os.path.join(kaldi_data_dir, 'local', 'dict', 'optional_silence.txt')
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= load test data ======================
|
||||||
|
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
|
||||||
|
|
||||||
|
df = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
|
||||||
|
df = stimmen_functions.add_row_asr(df)
|
||||||
|
df = stimmen_functions.add_row_htk(df)
|
||||||
|
|
||||||
|
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
|
||||||
|
word_list = sorted(word_list)
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make dic file to check pronunciation variants ======================
|
||||||
|
# the dic file should be manually modified depending on the task - recognition / forced-alignment.
|
||||||
|
if make_dic_file:
|
||||||
|
# for HTK.
|
||||||
|
with open(stimmen_dic, mode='wb') as f:
|
||||||
|
for word in word_list:
|
||||||
|
df_ = df[df['word']==word]
|
||||||
|
pronunciations = list(np.unique(df_['htk']))
|
||||||
|
pronunciations_ = [word.upper() + ' sil ' + ' '.join(convert_phoneset.split_word(
|
||||||
|
htk, fame_asr.multi_character_phones_htk)) + ' sil'
|
||||||
|
for htk in pronunciations]
|
||||||
|
f.write(bytes('\n'.join(pronunciations_) + '\n', 'ascii'))
|
||||||
|
f.write(bytes('SILENCE sil\n', 'ascii'))
|
||||||
|
|
||||||
|
# for Kaldi.
|
||||||
|
fh.make_new_directory(os.path.join(kaldi_data_dir, 'local', 'dict'))
|
||||||
|
with open(lexicon_txt, mode='wb') as f:
|
||||||
|
f.write(bytes('!SIL sil\n', 'utf-8'))
|
||||||
|
f.write(bytes('<UNK> spn\n', 'utf-8'))
|
||||||
|
for word in word_list:
|
||||||
|
df_ = df[df['word']==word]
|
||||||
|
pronunciations = list(np.unique(df_['asr']))
|
||||||
|
pronunciations_ = [word.lower() + ' ' + ' '.join(convert_phoneset.split_word(
|
||||||
|
asr, fame_asr.multi_character_phones))
|
||||||
|
for asr in pronunciations]
|
||||||
|
f.write(bytes('\n'.join(pronunciations_) + '\n', 'utf-8'))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= test data for recognition ======================
|
||||||
|
# only target pronunciation variants.
|
||||||
|
df_rec = pd.DataFrame(index=[], columns=list(df.keys()))
|
||||||
|
for word in word_list:
|
||||||
|
variants = [htk.replace(' ', '')
|
||||||
|
for htk in stimmen_functions.load_pronunciations(word.upper(), stimmen_dic)]
|
||||||
|
df_ = df[df['word'] == word]
|
||||||
|
for index, row in df_.iterrows():
|
||||||
|
if row['htk'] in variants:
|
||||||
|
df_rec = df_rec.append(row, ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make files required for HTK ======================
|
||||||
|
if make_HTK_files:
|
||||||
|
# make a word lattice file.
|
||||||
|
pyhtk.create_word_lattice_file(
|
||||||
|
os.path.join(config_dir, 'stimmen.net'),
|
||||||
|
lattice_file)
|
||||||
|
|
||||||
|
# extract features.
|
||||||
|
with open(hcopy_scp, 'wb') as f:
|
||||||
|
filelist = [os.path.join(stimmen_test_dir, filename) + '\t'
|
||||||
|
+ os.path.join(feature_dir, os.path.basename(filename).replace('.wav', '.mfc'))
|
||||||
|
for filename in df['filename']]
|
||||||
|
f.write(bytes('\n'.join(filelist), 'ascii'))
|
||||||
|
pyhtk.wav2mfc(config_hcopy, hcopy_scp)
|
||||||
|
|
||||||
|
# make label files.
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
filename = row['filename'].replace('.wav', '.lab')
|
||||||
|
label_file = os.path.join(feature_dir, filename)
|
||||||
|
with open(label_file, 'wb') as f:
|
||||||
|
label_string = 'SILENCE\n' + row['word'].upper() + '\nSILENCE\n'
|
||||||
|
f.write(bytes(label_string, 'ascii'))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make files required for Kaldi =======================
|
||||||
|
if make_kaldi_files:
|
||||||
|
fh.make_new_directory(os.path.join(kaldi_data_dir, 'test'))
|
||||||
|
fh.make_new_directory(os.path.join(kaldi_data_dir, 'test', 'local'))
|
||||||
|
fh.make_new_directory(os.path.join(kaldi_data_dir, 'conf'))
|
||||||
|
|
||||||
|
# remove previous files.
|
||||||
|
if os.path.exists(wav_scp):
|
||||||
|
os.remove(wav_scp)
|
||||||
|
if os.path.exists(text_file):
|
||||||
|
os.remove(text_file)
|
||||||
|
if os.path.exists(utt2spk):
|
||||||
|
os.remove(utt2spk)
|
||||||
|
|
||||||
|
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
|
||||||
|
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
|
||||||
|
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
|
||||||
|
|
||||||
|
# make wav.scp, text, and utt2spk files.
|
||||||
|
for i, row in df_rec.iterrows():
|
||||||
|
filename = row['filename']
|
||||||
|
print('=== {0}: {1} ==='.format(i, filename))
|
||||||
|
|
||||||
|
wav_file = os.path.join(stimmen_test_dir, filename)
|
||||||
|
#if os.path.exists(wav_file):
|
||||||
|
speaker_id = 'speaker_' + str(i).zfill(4)
|
||||||
|
utterance_id = filename.replace('.wav', '')
|
||||||
|
utterance_id = utterance_id.replace(' ', '_')
|
||||||
|
utterance_id = speaker_id + '-' + utterance_id
|
||||||
|
|
||||||
|
# output
|
||||||
|
f_wav_scp.write('{0} {1}\n'.format(
|
||||||
|
utterance_id,
|
||||||
|
wav_file.replace('c:/', '/mnt/c/').replace('\\', '/'))) # convert path to unix format.
|
||||||
|
f_text_file.write('{0}\t{1}\n'.format(utterance_id, df_rec['word'][i].lower()))
|
||||||
|
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
|
||||||
|
|
||||||
|
f_wav_scp.close()
|
||||||
|
f_text_file.close()
|
||||||
|
f_utt2spk.close()
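# The three Kaldi data files written above contain one utterance per line, matching
# the write calls in the loop (values shown are hypothetical):
#   wav.scp : speaker_0000-<utterance> /mnt/c/<...>/<utterance>.wav
#   text    : speaker_0000-<utterance><TAB><word>
#   utt2spk : speaker_0000-<utterance> speaker_0000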
|
||||||
|
|
||||||
|
with open(corpus_txt, 'wb') as f:
|
||||||
|
f.write(bytes('\n'.join([word.lower() for word in word_list]) + '\n', 'utf-8'))
|
||||||
|
|
||||||
|
with open(nonsilence_phones_txt, 'wb') as f:
|
||||||
|
f.write(bytes('\n'.join(fame_asr.phoneset_short) + '\n', 'utf-8'))
|
||||||
|
|
||||||
|
with open(silence_phones_txt, 'wb') as f:
|
||||||
|
f.write(bytes('sil\nspn\n', 'utf-8'))
|
||||||
|
|
||||||
|
with open(optional_silence_txt, 'wb') as f:
|
||||||
|
f.write(bytes('sil\n', 'utf-8'))
|
||||||
|
|
||||||
|
with open(os.path.join(kaldi_data_dir, 'conf', 'decode.config'), 'wb') as f:
|
||||||
|
f.write(bytes('first_beam=10.0\n', 'utf-8'))
|
||||||
|
f.write(bytes('beam=13.0\n', 'utf-8'))
|
||||||
|
f.write(bytes('lattice_beam=6.0\n', 'utf-8'))
|
||||||
|
|
||||||
|
with open(os.path.join(kaldi_data_dir, 'conf', 'mfcc.conf'), 'wb') as f:
|
||||||
|
f.write(bytes('--use-energy=false', 'utf-8'))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= recognition ======================
|
||||||
|
|
||||||
|
listdir = glob.glob(os.path.join(feature_dir, '*.mfc'))
|
||||||
|
with open(hvite_scp, 'wb') as f:
|
||||||
|
f.write(bytes('\n'.join(listdir), 'ascii'))
|
||||||
|
|
||||||
|
with open(hresult_scp, 'wb') as f:
|
||||||
|
f.write(bytes('\n'.join(listdir).replace('.mfc', '.rec'), 'ascii'))
|
||||||
|
|
||||||
|
|
||||||
|
# calculate result
|
||||||
|
performance = np.zeros((1, 2))
|
||||||
|
for niter in range(50, 60):
|
||||||
|
output = pyhtk.recognition(
|
||||||
|
os.path.join(config_dir, 'config.rec'),
|
||||||
|
lattice_file,
|
||||||
|
os.path.join(default.htk_dir, 'model', 'hmm1', 'iter' + str(niter), 'hmmdefs'),
|
||||||
|
stimmen_dic, phonelist_txt, hvite_scp)
|
||||||
|
|
||||||
|
output = pyhtk.calc_recognition_performance(
|
||||||
|
stimmen_dic, hresult_scp)
|
||||||
|
per_sentence, per_word = pyhtk.load_recognition_output_all(output)
|
||||||
|
performance_ = np.array([niter, per_sentence['accuracy']]).reshape(1, 2)
|
||||||
|
performance = np.r_[performance, performance_]
|
||||||
|
print('{0}: {1}[%]'.format(niter, per_sentence['accuracy']))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#output = run_command_with_output([
|
||||||
|
# 'HVite', '-T', '1',
|
||||||
|
# '-C', config_rec,
|
||||||
|
# '-w', lattice_file,
|
||||||
|
# '-H', hmm,
|
||||||
|
# dictionary_file, phonelist_txt,
|
||||||
|
# '-S', HVite_scp
|
||||||
|
#])
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= forced alignment using HTK =======================
|
||||||
|
if do_forced_alignment_htk:
|
||||||
|
|
||||||
|
#for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
|
||||||
|
for hmm_num in [256, 512, 1024]:
|
||||||
|
hmm_num_str = str(hmm_num)
|
||||||
|
acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')
|
||||||
|
|
||||||
|
predictions = pd.DataFrame({'filename': [''],
|
||||||
|
'word': [''],
|
||||||
|
'xsampa': [''],
|
||||||
|
'ipa': [''],
|
||||||
|
'famehtk': [''],
|
||||||
|
'prediction': ['']})
|
||||||
|
for i, filename in enumerate(df['filename']):
|
||||||
|
print('=== {0}/{1} ==='.format(i, len(df)))
|
||||||
|
if (i in df['filename'].keys()) and (isinstance(df['filename'][i], str)):
|
||||||
|
wav_file = os.path.join(wav_dir, filename)
|
||||||
|
if os.path.exists(wav_file):
|
||||||
|
word = df['word'][i]
|
||||||
|
WORD = word.upper()
|
||||||
|
fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
|
||||||
|
|
||||||
|
#if not os.path.exists(fa_file):
|
||||||
|
# make label file.
|
||||||
|
label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
|
||||||
|
with open(label_file, 'w') as f:
|
||||||
|
lines = f.write(WORD)
|
||||||
|
|
||||||
|
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
|
||||||
|
|
||||||
|
pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite,
|
||||||
|
default.phonelist, acoustic_model)
|
||||||
|
os.remove(label_file)
|
||||||
|
|
||||||
|
prediction = am_func.read_fileFA(fa_file)
|
||||||
|
|
||||||
|
print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
|
||||||
|
else:
|
||||||
|
prediction = ''
|
||||||
|
print('!!!!! file not found.')
|
||||||
|
|
||||||
|
line = pd.Series([df['filename'][i], df['word'][i], df['xsampa'][i], df['ipa'][i], df['famehtk'][i], prediction], index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'], name=i)
|
||||||
|
predictions = predictions.append(line)
|
||||||
|
else:
|
||||||
|
prediction = ''
|
||||||
|
print('!!!!! invalid entry.')
|
||||||
|
|
||||||
|
predictions.to_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make lexicon txt which is used by Kaldi =======================
|
||||||
|
if make_kaldi_lexicon_txt:
|
||||||
|
option_num = 6
|
||||||
|
|
||||||
|
# remove previous file.
|
||||||
|
if os.path.exists(lexicon_txt):
|
||||||
|
os.remove(lexicon_txt)
|
||||||
|
lexiconp_txt = lexicon_txt.replace('lexicon.txt', 'lexiconp.txt')
|
||||||
|
if os.path.exists(lexiconp_txt):
|
||||||
|
os.remove(lexiconp_txt)
|
||||||
|
|
||||||
|
# output lexicon.txt
|
||||||
|
f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
|
||||||
|
pronvar_list_all = []
|
||||||
|
for word in word_list:
|
||||||
|
|
||||||
|
# pronunciation variant of the target word.
|
||||||
|
pronunciation_variants = df['ipa'][df['word'].str.match(word)]
|
||||||
|
|
||||||
|
c = Counter(pronunciation_variants)
|
||||||
|
total_num = sum(c.values())
|
||||||
|
|
||||||
|
#with open(result_dir + '\\' + word + '.csv', 'a', encoding="utf-8", newline='\n') as f:
|
||||||
|
# for key in c.keys():
|
||||||
|
# f.write("{0},{1}\n".format(key,c[key]))
|
||||||
|
|
||||||
|
for key, value in c.most_common(option_num):
|
||||||
|
# make possible pronunciation variant list.
|
||||||
|
pronvar_list = am_func.fame_pronunciation_variant(key)
|
||||||
|
|
||||||
|
for pronvar_ in pronvar_list:
|
||||||
|
split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
|
||||||
|
pronvar_out = ' '.join(split_ipa)
|
||||||
|
pronvar_list_all.append([word, pronvar_out])
|
||||||
|
|
||||||
|
pronvar_list_all = np.array(pronvar_list_all)
|
||||||
|
pronvar_list_all = np.unique(pronvar_list_all, axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
# output
|
||||||
|
f_lexicon_txt.write('<UNK>\tSPN\n')
|
||||||
|
for line in pronvar_list_all:
|
||||||
|
f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
|
||||||
|
|
||||||
|
f_lexicon_txt.close()
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= load kaldi forced alignment result =======================
|
||||||
|
if load_forced_alignment_kaldi:
|
||||||
|
phones_txt = os.path.join(default.kaldi_dir, 'data', 'lang', 'phones.txt')
|
||||||
|
merged_alignment_txt = os.path.join(default.kaldi_dir, 'exp', 'tri1_alignme', 'merged_alignment.txt')
|
||||||
|
|
||||||
|
#filenames = np.load(data_dir + '\\filenames.npy')
|
||||||
|
#words = np.load(data_dir + '\\words.npy')
|
||||||
|
#pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
|
||||||
|
#pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
|
||||||
|
#word_list = np.unique(words)
|
||||||
|
|
||||||
|
# load the mapping between phones and ids.
|
||||||
|
with open(phones_txt, 'r', encoding="utf-8") as f:
|
||||||
|
mapping_phone2id = f.read().split('\n')
|
||||||
|
|
||||||
|
phones = []
|
||||||
|
phone_ids = [] # ID of phones
|
||||||
|
for m in mapping_phone2id:
|
||||||
|
m = m.split(' ')
|
||||||
|
if len(m) > 1:
|
||||||
|
phones.append(m[0])
|
||||||
|
phone_ids.append(int(m[1]))
|
||||||
|
|
||||||
|
|
||||||
|
# load the result of FA.
|
||||||
|
with open(merged_alignment_txt, 'r') as f:
|
||||||
|
lines = f.read()
|
||||||
|
lines = lines.split('\n')
|
||||||
|
|
||||||
|
predictions = pd.DataFrame({'filename': [''],
|
||||||
|
'word': [''],
|
||||||
|
'xsampa': [''],
|
||||||
|
'ipa': [''],
|
||||||
|
'famehtk': [''],
|
||||||
|
'prediction': ['']})
|
||||||
|
#fa_filenames = []
|
||||||
|
#fa_pronunciations = []
|
||||||
|
utterance_id_ = ''
|
||||||
|
pronunciation = []
|
||||||
|
for line in lines:
|
||||||
|
line = line.split(' ')
|
||||||
|
if len(line) == 5:
|
||||||
|
utterance_id = line[0]
|
||||||
|
if utterance_id == utterance_id_:
|
||||||
|
phone_id = int(line[4])
|
||||||
|
#if not phone_id == 1:
|
||||||
|
phone_ = phones[phone_ids.index(phone_id)]
|
||||||
|
phone = re.sub(r'_[A-Z]', '', phone_)
|
||||||
|
if not phone == 'SIL':
|
||||||
|
pronunciation.append(phone)
|
||||||
|
else:
|
||||||
|
filename = re.sub(r'speaker_[0-9]{4}-', '', utterance_id_)
|
||||||
|
prediction = ''.join(pronunciation)
|
||||||
|
df_ = df[df['filename'].str.match(filename)]
|
||||||
|
df_idx = df_.index[0]
|
||||||
|
prediction_ = pd.Series([#filename,
|
||||||
|
#df_['word'][df_idx],
|
||||||
|
#df_['xsampa'][df_idx],
|
||||||
|
#df_['ipa'][df_idx],
|
||||||
|
#df_['famehtk'][df_idx],
|
||||||
|
df_.iloc[0,1],
|
||||||
|
df_.iloc[0,3],
|
||||||
|
df_.iloc[0,4],
|
||||||
|
df_.iloc[0,2],
|
||||||
|
df_.iloc[0,0],
|
||||||
|
prediction],
|
||||||
|
index=['filename', 'word', 'xsampa', 'ipa', 'famehtk', 'prediction'],
|
||||||
|
name=df_idx)
|
||||||
|
predictions = predictions.append(prediction_)
|
||||||
|
#fa_filenames.append()
|
||||||
|
#fa_pronunciations.append(' '.join(pronunciation))
|
||||||
|
pronunciation = []
|
||||||
|
|
||||||
|
utterance_id_ = utterance_id
|
||||||
|
predictions.to_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl'))
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= evaluate the result of forced alignment =======================
|
||||||
|
if eval_forced_alignment_htk:
|
||||||
|
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
|
||||||
|
|
||||||
|
compare_hmm_num = 1
|
||||||
|
|
||||||
|
if compare_hmm_num:
|
||||||
|
f_result = open(os.path.join(result_dir, 'result.csv'), 'w')
|
||||||
|
f_result.write("nmix,Oog,Oog,Oor,Oor,Pauw,Pauw,Reus,Reus,Reuzenrad,Reuzenrad,Roeiboot,Roeiboot,Rozen,Rozen\n")
|
||||||
|
|
||||||
|
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
|
||||||
|
#for hmm_num in [256]:
|
||||||
|
hmm_num_str = str(hmm_num)
|
||||||
|
if compare_hmm_num:
|
||||||
|
f_result.write("{},".format(hmm_num_str))
|
||||||
|
|
||||||
|
#match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
|
||||||
|
#prediction = np.load(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.npy'))
|
||||||
|
#prediction = pd.Series(prediction, index=df.index, name='prediction')
|
||||||
|
#result = pd.concat([df, prediction], axis=1)
|
||||||
|
result = pd.read_pickle(os.path.join(result_dir, 'htk', 'predictions_hmm' + hmm_num_str + '.pkl'))
|
||||||
|
|
||||||
|
|
||||||
|
# load pronunciation variants
|
||||||
|
for word in word_list:
|
||||||
|
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
|
||||||
|
with open(htk_dict_file, 'r') as f:
|
||||||
|
lines = f.read().split('\n')[:-1]
|
||||||
|
pronunciation_variants = [line.split('\t')[1] for line in lines]
|
||||||
|
|
||||||
|
# keep only entries whose pronunciation appears among the listed (top-3) variants.
|
||||||
|
result_ = result[result['word'].str.match(word)]
|
||||||
|
result_ = result_[result_['famehtk'].isin(pronunciation_variants)]
|
||||||
|
|
||||||
|
match_num = sum(result_['famehtk'] == result_['prediction'])
|
||||||
|
total_num = len(result_)
|
||||||
|
|
||||||
|
print("word '{0}': {1}/{2} ({3:.2f} %)".format(word, match_num, total_num, match_num/total_num*100))
|
||||||
|
if compare_hmm_num:
|
||||||
|
f_result.write("{0},{1},".format(match_num, total_num))
|
||||||
|
else:
|
||||||
|
# output confusion matrix
|
||||||
|
cm = confusion_matrix(result_['famehtk'], result_['prediction'])
|
||||||
|
|
||||||
|
plt.figure()
|
||||||
|
plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False)
|
||||||
|
plt.savefig(result_dir + '\\cm_' + word + '.png')
|
||||||
|
|
||||||
|
if compare_hmm_num:
|
||||||
|
f_result.write('\n')
|
||||||
|
|
||||||
|
if compare_hmm_num:
|
||||||
|
f_result.close()
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= evaluate the result of forced alignment of kaldi =======================
|
||||||
|
if eval_forced_alignment_kaldi:
|
||||||
|
result = pd.read_pickle(os.path.join(result_dir, 'kaldi', 'predictions.pkl'))
|
||||||
|
|
||||||
|
f_result = open(os.path.join(result_dir, 'result.csv'), 'w')
|
||||||
|
f_result.write("word,total,valid,match,[%]\n")
|
||||||
|
|
||||||
|
# load pronunciation variants
|
||||||
|
with open(lexicon_txt, 'r', encoding="utf-8", newline='\n') as f:
|
||||||
|
lines = f.read().split('\n')[:-1]
|
||||||
|
pronunciation_variants_all = [line.split('\t') for line in lines]
|
||||||
|
|
||||||
|
word_list = np.delete(word_list, [0], 0) # remove 'Oog'
|
||||||
|
for word in word_list:
|
||||||
|
|
||||||
|
# load pronunciation variant of the word.
|
||||||
|
pronunciation_variants = []
|
||||||
|
for line in pronunciation_variants_all:
|
||||||
|
if line[0] == word.lower():
|
||||||
|
pronunciation_variants.append(line[1].replace(' ', ''))
|
||||||
|
|
||||||
|
# keep only entries whose pronunciation appears among the listed (top-3) variants.
|
||||||
|
result_ = result[result['word'].str.match(word)]
|
||||||
|
result_tolerant = pd.DataFrame({
|
||||||
|
'filename': [''],
|
||||||
|
'word': [''],
|
||||||
|
'xsampa': [''],
|
||||||
|
'ipa': [''],
|
||||||
|
'prediction': [''],
|
||||||
|
'match': ['']})
|
||||||
|
|
||||||
|
for i in range(0, len(result_)):
|
||||||
|
line = result_.iloc[i]
|
||||||
|
|
||||||
|
# make a list of all possible pronunciation variants of ipa description.
|
||||||
|
# i.e. possible answers from forced alignment.
|
||||||
|
ipa = line['ipa']
|
||||||
|
pronvar_list = [ipa]
|
||||||
|
pronvar_list_ = am_func.fame_pronunciation_variant(ipa)
|
||||||
|
if not pronvar_list_ is None:
|
||||||
|
pronvar_list += list(pronvar_list_)
|
||||||
|
|
||||||
|
# only focus on pronunciations which can be estimated from ipa.
|
||||||
|
if len(set(pronvar_list) & set(pronunciation_variants)) > 0:
|
||||||
|
if line['prediction'] in pronvar_list:
|
||||||
|
ismatch = True
|
||||||
|
else:
|
||||||
|
ismatch = False
|
||||||
|
|
||||||
|
line_df = pd.DataFrame(result_.iloc[i]).T
|
||||||
|
df_idx = line_df.index[0]
|
||||||
|
result_tolerant_ = pd.Series([line_df.loc[df_idx, 'filename'],
|
||||||
|
line_df.loc[df_idx, 'word'],
|
||||||
|
line_df.loc[df_idx, 'xsampa'],
|
||||||
|
line_df.loc[df_idx, 'ipa'],
|
||||||
|
line_df.loc[df_idx, 'prediction'],
|
||||||
|
ismatch],
|
||||||
|
index=['filename', 'word', 'xsampa', 'ipa', 'prediction', 'match'],
|
||||||
|
name=df_idx)
|
||||||
|
result_tolerant = result_tolerant.append(result_tolerant_)
|
||||||
|
# remove the first entry (dummy)
|
||||||
|
result_tolerant = result_tolerant.drop(0, axis=0)
|
||||||
|
|
||||||
|
total_num = len(result_)
|
||||||
|
valid_num = len(result_tolerant)
|
||||||
|
match_num = np.sum(result_tolerant['match'])
|
||||||
|
|
||||||
|
print("word '{0}': {1}/{2} ({3:.2f} %) originally {4}".format(word, match_num, valid_num, match_num/valid_num*100, total_num))
|
||||||
|
f_result.write("{0},{1},{2},{3},{4}\n".format(word, total_num, valid_num, match_num, match_num/valid_num*100))
|
||||||
|
|
||||||
|
f_result.close()
|
||||||
|
## output confusion matrix
|
||||||
|
#cm = confusion_matrix(result_['ipa'], result_['prediction'])
|
||||||
|
|
||||||
|
#plt.figure()
|
||||||
|
#plot_confusion_matrix(cm, classes=pronunciation_variants, normalize=False)
|
||||||
|
#plt.savefig(result_dir + '\\cm_' + word + '.png')
|
65
acoustic_model/novoapi_forced_alignment.py
Normal file
@@ -0,0 +1,65 @@
#
# forced alignment using novo-api.
#
# *** IMPORTANT ***
# This file should be treated as confidential.
# This file should not be copied or uploaded to public sites.
#
# NOTES:
# The usage of novo api: https://bitbucket.org/novolanguage/python-novo-api
# I couldn't make it work as I described in the mail to Martijn Bartelds on
# 2018/12/03.
# As per his advice, I modified testgrammer.py and made it a function.
#
# In order to run on Python 3.6, the following points are changed in novo-api.
# (1) backend/__init__.py
# - #import session
#   from . import session
# (2) backend/session.py
# - #except Exception, e:
#   except Exception as e:
# - #print self.last_message
#   print(self.last_message)
# (3) asr/segment/praat.py
# - def print_tier(output, title, begin, end, segs, (format, formatter))
#   def print_tier(output, title, begin, end, segs, format, formatter):
# (4) asr/spraaklab/__init.py
# - #import session
#   from . import session
# (5) asr/spraaklab/schema.py
# - #print data, "validated not OK", e.message
#   print("{0} validated not OK {1}".format(data, e.message))
# - #print data, "validated OK"
#   print("{} validated OK".format(data))
# - #if isinstance(object, basestring):
#   if isinstance(object, str):
#
# Aki Kunikoshi
# 428968@gmail.com
#
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')

import argparse
import json

from novoapi.backend import session
import novoapi_functions
import defaultfiles as default

# username / password cannot be passed as arguments...
p = argparse.ArgumentParser()
#p.add_argument("--user", default=None)
#p.add_argument("--password", default=None)
p.add_argument("--user", default='martijn.wieling')
p.add_argument("--password", default='xxxxxx')
args = p.parse_args()

#wav_file = 'c:\\OneDrive\\WSL\\test\\onetwothree.wav'
wav_file = os.path.join(default.stimmen_wav_dir, 'pg_pauw_2206_0fjd8.wav')
# list of the pronunciation variants for the word
word = 'pauw'
pronunciation_ipa = ['pau', 'pɑu']

result = novoapi_functions.forced_alignment(wav_file, word, pronunciation_ipa)
pronunciation_ipa, pronunciation_novo70, llh = novoapi_functions.result2pronunciation(result, word)
220
acoustic_model/novoapi_functions.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
## this script should be used only by Aki Kunikoshi.
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
|
||||||
|
from novoapi.backend import session
|
||||||
|
|
||||||
|
import defaultfiles as default
|
||||||
|
import convert_phoneset
|
||||||
|
|
||||||
|
|
||||||
|
def load_novo70_phoneset():
|
||||||
|
#phonelist_novo70_ = pd.ExcelFile(default.phonelist_novo70_xlsx)
|
||||||
|
#df = pd.read_excel(phonelist_novo70_, 'list')
|
||||||
|
## *_simple includes columns which has only one phone in.
|
||||||
|
#for ipa, novo70 in zip(df['IPA_simple'], df['novo70_simple']):
|
||||||
|
# if not pd.isnull(ipa):
|
||||||
|
# print('{0}:{1}'.format(ipa, novo70))
|
||||||
|
# translation_key[ipa] = novo70
|
||||||
|
#phonelist_novo70 = np.unique(list(df['novo70_simple']))
|
||||||
|
novo70_phoneset = pd.read_csv(default.novo70_phoneset, delimiter='\t', header=None, encoding="utf-8")
|
||||||
|
novo70_phoneset.rename(columns={0: 'novo70', 1: 'ipa', 2: 'description'}, inplace=True)
|
||||||
|
|
||||||
|
#phoneset_ipa = []
|
||||||
|
#phoneset_novo70 = []
|
||||||
|
#with open(default.novo70_phoneset, "rt", encoding="utf-8") as fin:
|
||||||
|
# lines = fin.read()
|
||||||
|
# lines = lines.split('\n')
|
||||||
|
# for line in lines:
|
||||||
|
# words = line.split('\t')
|
||||||
|
# if len(words) > 1:
|
||||||
|
# novo70 = words[0]
|
||||||
|
# ipa = words[1]
|
||||||
|
# phoneset_ipa.append(ipa)
|
||||||
|
# phoneset_novo70.append(novo70)
|
||||||
|
# translation_key_ipa2novo70[ipa] = novo70
|
||||||
|
# translation_key_novo702ipa[novo70] = ipa
|
||||||
|
|
||||||
|
# As per Nederlandse phoneset_aki.xlsx received from David
|
||||||
|
# [ɔː] oh / ohr # from ipa->novo70, only oh is used.
|
||||||
|
# [ɪː] ih / ihr # from ipa->novo70, only ih is used.
|
||||||
|
# [iː] iy
|
||||||
|
# [œː] uh
|
||||||
|
# [ɛː] eh
|
||||||
|
# [w] wv in IPA written as ʋ.
|
||||||
|
extra_ipa = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'ʋ']
|
||||||
|
extra_novo70 = ['oh', 'ih', 'iy', 'uh', 'eh', 'wv']
|
||||||
|
|
||||||
|
phoneset_ipa = list(novo70_phoneset['ipa'])
|
||||||
|
phoneset_ipa.extend(extra_ipa)
|
||||||
|
phoneset_ipa = [i.replace('ː', ':') for i in phoneset_ipa]
|
||||||
|
|
||||||
|
phoneset_novo70 = list(novo70_phoneset['novo70'])
|
||||||
|
phoneset_novo70.extend(extra_novo70)
|
||||||
|
phoneset_novo70 = [i.replace('ː', ':') for i in phoneset_novo70]
|
||||||
|
|
||||||
|
translation_key_ipa2novo70 = dict()
|
||||||
|
translation_key_novo702ipa = dict()
|
||||||
|
for ipa, novo70 in zip(phoneset_ipa, phoneset_novo70):
|
||||||
|
#phoneset_ipa.append(ipa)
|
||||||
|
#phoneset_novo70.append(novo70)
|
||||||
|
translation_key_ipa2novo70[ipa] = novo70
|
||||||
|
translation_key_novo702ipa[novo70] = ipa
|
||||||
|
|
||||||
|
translation_key_novo702ipa['ohr'] = 'ɔ:'
|
||||||
|
translation_key_novo702ipa['ihr'] = 'ɪ:'
|
||||||
|
|
||||||
|
phoneset_ipa = np.unique(phoneset_ipa)
|
||||||
|
phoneset_novo70 = np.unique(phoneset_novo70)
|
||||||
|
|
||||||
|
return phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa
|
||||||
|
|
||||||
|
|
||||||
|
def split_ipa(line):
|
||||||
|
"""
|
||||||
|
Split a line by IPA phones.
|
||||||
|
If a nasalized sound (such as ɛ̃ː) is included, it will raise an error.
|
||||||
|
:param string line: one line written in IPA.
|
||||||
|
:return string lineSeperated: the line split into IPA phones.
|
||||||
|
"""
|
||||||
|
phoneset_ipa, _, _, _ = load_novo70_phoneset()
|
||||||
|
#multi_character_phones = [i for i in phoneset_ipa if len(i) > 1]
|
||||||
|
#multi_character_phones.sort(key=len, reverse=True)
|
||||||
|
#multi_character_phones = [
|
||||||
|
# # IPAs in CGN.
|
||||||
|
# u'ʌu', u'ɛi', u'œy', u'aː', u'eː', u'iː', u'oː', u'øː', u'ɛː', u'œː', u'ɔː', u'ɛ̃ː', u'ɑ̃ː', u'ɔ̃ː', u'œ̃', u'ɪː'
|
||||||
|
# ]
|
||||||
|
#return [phone for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
|
||||||
|
|
||||||
|
return convert_phoneset.split_word(line, phoneset_ipa)
|
||||||
|
|
||||||
|
|
||||||
|
def split_novo70(line):
|
||||||
|
"""
|
||||||
|
Split a line by novo70 phones.
|
||||||
|
:param string line: one line written in novo70.
|
||||||
|
:return string lineSeperated: the line split into novo70 phones.
|
||||||
|
"""
|
||||||
|
_, phoneset_novo70, _, _ = load_novo70_phoneset()
|
||||||
|
#multi_character_phones = [p for p in phoneset_novo70 if len(p) > 1]
|
||||||
|
#multi_character_phones = sorted(multi_character_phones, key=len, reverse=True)
|
||||||
|
multi_character_phones = convert_phoneset.extract_multi_character_phones(phoneset_novo70)
|
||||||
|
|
||||||
|
return ['sp' if phone == ' ' else phone
|
||||||
|
for phone in multi_character_tokenize(line.strip(), multi_character_phones)]
|
||||||
|
|
||||||
|
|
||||||
|
def novo702ipa(line):
|
||||||
|
#pronunciation = []
|
||||||
|
_, _, _, translation_key = load_novo70_phoneset()
|
||||||
|
#for phone in split_novo70(tokens):
|
||||||
|
# pronunciation.append(translation_key.get(phone, phone))
|
||||||
|
#return ' '.join(pronunciation)
|
||||||
|
return ' '.join(convert_phoneset.convert_phoneset(split_novo70(line), translation_key))
|
||||||
|
|
||||||
|
|
||||||
|
# numbering of novo70 should be checked.
|
||||||
|
def ipa2novo70(line):
|
||||||
|
#pronunciation = []
|
||||||
|
_, _, translation_key, _ = load_novo70_phoneset()
|
||||||
|
#for phone in split_ipa(tokens):
|
||||||
|
# pronunciation.append(translation_key.get(phone, phone))
|
||||||
|
#return ' '.join(pronunciation)
|
||||||
|
return ' '.join(convert_phoneset.convert_phoneset(split_ipa(line), translation_key))
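# Minimal usage sketch (illustrative; the exact output strings depend on the
# contents of default.novo70_phoneset):
#
#   novo70_pron = ipa2novo70('pɑu')      # space-separated novo70 phones
#   ipa_pron = novo702ipa(novo70_pron)   # back to space-separated IPA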
|
||||||
|
|
||||||
|
|
||||||
|
def make_grammar(word, pronunciation_ipa):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
word: the target word (used as the label).
|
||||||
|
pronunciation_ipa: list of pronunciation variants.
|
||||||
|
"""
|
||||||
|
#word = 'pauw'
|
||||||
|
#pronunciation_ipa = ['pau', 'pɑu']
|
||||||
|
|
||||||
|
grammer_data_elements0_pronunciation = []
|
||||||
|
for id, ipa in enumerate(pronunciation_ipa):
|
||||||
|
novo70 = ipa2novo70(ipa)
|
||||||
|
grammer_data_elements0_pronunciation.append({
|
||||||
|
"phones": novo70.split(),
|
||||||
|
"id": id
|
||||||
|
})
|
||||||
|
|
||||||
|
grammar_data = {
|
||||||
|
"kind": 'sequence',
|
||||||
|
"elements": [{
|
||||||
|
"kind": "word",
|
||||||
|
"pronunciation": grammer_data_elements0_pronunciation,
|
||||||
|
"label": word
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
|
||||||
|
grammar = {
|
||||||
|
"type": "confusion_network",
|
||||||
|
"version": "1.0",
|
||||||
|
"data": grammar_data,
|
||||||
|
"return_objects": ["grammar"],
|
||||||
|
"phoneset": "novo70"
|
||||||
|
}
|
||||||
|
|
||||||
|
return grammar
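# For reference, make_grammar('pauw', ['pau', 'pɑu']) returns a dict of this
# shape (the "phones" lists are whatever ipa2novo70 produces for each variant):
#
# {"type": "confusion_network",
#  "version": "1.0",
#  "data": {"kind": "sequence",
#           "elements": [{"kind": "word",
#                         "pronunciation": [{"phones": [...], "id": 0},
#                                           {"phones": [...], "id": 1}],
#                         "label": "pauw"}]},
#  "return_objects": ["grammar"],
#  "phoneset": "novo70"}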
|
||||||
|
|
||||||
|
|
||||||
|
def forced_alignment(wav_file, word, pronunciation_ipa):
|
||||||
|
### IMPORTANT ###
|
||||||
|
# because of this function, this script should not be uploaded / shared.
|
||||||
|
|
||||||
|
# username / password cannot be passed as arguments...
|
||||||
|
p = argparse.ArgumentParser()
|
||||||
|
p.add_argument("--user", default='martijn.wieling')
|
||||||
|
p.add_argument("--password", default='xxxxxx')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
args = p.parse_args()
|
||||||
|
|
||||||
|
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True) # , modeldir=modeldir)
|
||||||
|
|
||||||
|
grammar = make_grammar(word, pronunciation_ipa)
|
||||||
|
result = rec.setgrammar(grammar)
|
||||||
|
#print "Set grammar result", res
|
||||||
|
result = rec.recognize_wav(wav_file)
|
||||||
|
return result.export()
|
||||||
|
|
||||||
|
|
||||||
|
def result2pronunciation(result, word):
|
||||||
|
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
|
||||||
|
llh = result_[0]['llh']
|
||||||
|
phones = result_[0]['phones']
|
||||||
|
pronunciation_novo70 = [phone['label'] for phone in phones]
|
||||||
|
pronunciation_ipa = [novo702ipa(phone) for phone in pronunciation_novo70]
|
||||||
|
return pronunciation_ipa, pronunciation_novo70, llh
|
||||||
|
|
||||||
|
|
||||||
|
def phones_not_in_novo70(ipa):
|
||||||
|
""" extract phones which is not in novo70 phoneset. """
|
||||||
|
|
||||||
|
phoneset_ipa, _, _, _ = load_novo70_phoneset()
|
||||||
|
|
||||||
|
# As per Nederlandse phoneset_aki.xlsx received from David
|
||||||
|
# [ɔː] oh / ohr
|
||||||
|
# [ɪː] ih / ihr
|
||||||
|
# [iː] iy
|
||||||
|
# [œː] uh
|
||||||
|
# [ɛː] eh
|
||||||
|
# [w] wv in IPA written as ʋ.
|
||||||
|
david_suggestion = ['ɔː', 'ɪː', 'iː', 'œː', 'ɛː', 'w']
|
||||||
|
|
||||||
|
return [phone for phone in split_ipa(ipa)
|
||||||
|
if not phone in phoneset_ipa and not phone in david_suggestion]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
pronunciation_ipa = ['rø:s', 'mɑn', 'mɑntsjə']
|
||||||
|
#grammar = make_grammar('reus', pronunciation_ipa)
|
||||||
|
phoneset_ipa, phoneset_novo70, translation_key_ipa2novo70, translation_key_novo702ipa = load_novo70_phoneset()
|
@@ -1,437 +0,0 @@
import os
|
|
||||||
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import csv
|
|
||||||
import subprocess
|
|
||||||
from collections import Counter
|
|
||||||
import re
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
#from sklearn.metrics import confusion_matrix
|
|
||||||
|
|
||||||
import acoustic_model_functions as am_func
|
|
||||||
import convert_xsampa2ipa
|
|
||||||
import defaultfiles as default
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= user define =======================
|
|
||||||
#curr_dir = r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model'
|
|
||||||
#config_ini = 'config.ini'
|
|
||||||
#repo_dir = r'C:\Users\Aki\source\repos'
|
|
||||||
#forced_alignment_module = repo_dir + '\\forced_alignment'
|
|
||||||
#forced_alignment_module_old = repo_dir + '\\aki_tools'
|
|
||||||
#ipa_xsampa_converter_dir = repo_dir + '\\ipa-xsama-converter'
|
|
||||||
#accent_classification_dir = repo_dir + '\\accent_classification\accent_classification'
|
|
||||||
excel_file = os.path.join(default.experiments_dir, 'stimmen', 'data', 'Frisian Variants Picture Task Stimmen.xlsx')
|
|
||||||
|
|
||||||
|
|
||||||
#experiments_dir = r'C:\OneDrive\Research\rug\experiments'
|
|
||||||
data_dir = os.path.join(default.experiments_dir, 'stimmen', 'data')
|
|
||||||
#csvfile = data_dir + '\\Frisian Variants Picture Task Stimmen.csv'
|
|
||||||
wav_dir = os.path.join(default.experiments_dir, 'stimmen', 'wav')
|
|
||||||
acoustic_model_dir = os.path.join(default.experiments_dir, 'friesian', 'acoustic_model', 'model')
|
|
||||||
htk_dict_dir = os.path.join(default.experiments_dir, 'stimmen', 'dic_short')
|
|
||||||
fa_dir = os.path.join(default.experiments_dir, 'stimmen', 'FA')
|
|
||||||
|
|
||||||
#cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
|
|
||||||
#lex_asr = os.path.join(default.fame_dir, 'lexicon', 'lex.asr')
|
|
||||||
#lex_asr_htk = os.path.join(default.fame_dir, 'lexicon', 'lex.asr_htk')
|
|
||||||
|
|
||||||
|
|
||||||
# procedure
|
|
||||||
make_dic_files = 0
|
|
||||||
do_forced_alignment_htk = 1
|
|
||||||
make_kaldi_data_files = 0
|
|
||||||
make_kaldi_lexicon_txt = 0
|
|
||||||
load_forced_alignment_kaldi = 0
|
|
||||||
eval_forced_alignment = 0
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= add paths =======================
|
|
||||||
|
|
||||||
sys.path.append(os.path.join(default.repo_dir, 'forced_alignment'))
|
|
||||||
from forced_alignment import convert_phone_set
|
|
||||||
from forced_alignment import pyhtk
|
|
||||||
|
|
||||||
sys.path.append(os.path.join(default.repo_dir, 'toolbox'))
|
|
||||||
#import pyHTK
|
|
||||||
from evaluation import plot_confusion_matrix
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= convert phones ======================
|
|
||||||
|
|
||||||
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
|
|
||||||
|
|
||||||
xls = pd.ExcelFile(excel_file)
|
|
||||||
|
|
||||||
## check conversion
|
|
||||||
#df = pd.read_excel(xls, 'frequency')
|
|
||||||
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
|
|
||||||
# #ipa_converted = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, xsampa_)
|
|
||||||
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
|
|
||||||
# if not ipa_converted == ipa:
|
|
||||||
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
|
|
||||||
|
|
||||||
|
|
||||||
## check phones included in FAME!
|
|
||||||
# the phones used in the lexicon.
|
|
||||||
#phonelist = am_func.get_phonelist(lex_asr)
|
|
||||||
|
|
||||||
# the lines which include a specific phone.
|
|
||||||
#lines = am_func.find_phone(lex_asr, 'x')
|
|
||||||
|
|
||||||
|
|
||||||
# Filename, Word, Self Xsampa
|
|
||||||
df = pd.read_excel(xls, 'original')
|
|
||||||
|
|
||||||
ipas = []
|
|
||||||
famehtks = []
|
|
||||||
for xsampa in df['Self Xsampa']:
|
|
||||||
if not isinstance(xsampa, float): # 'NaN'
|
|
||||||
# typo?
|
|
||||||
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t')
|
|
||||||
xsampa = xsampa.replace(';', ':')
|
|
||||||
|
|
||||||
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
|
|
||||||
ipa = ipa.replace('ː', ':')
|
|
||||||
ipa = ipa.replace(' ', '')
|
|
||||||
ipas.append(ipa)
|
|
||||||
famehtk = convert_phone_set.ipa2famehtk(ipa)
|
|
||||||
famehtks.append(famehtk)
|
|
||||||
else:
|
|
||||||
ipas.append('')
|
|
||||||
famehtks.append('')
|
|
||||||
|
|
||||||
# extract interesting cols.
|
|
||||||
df = pd.DataFrame({'filename': df['Filename'],
|
|
||||||
'word': df['Word'],
|
|
||||||
'xsampa': df['Self Xsampa'],
|
|
||||||
'ipa': pd.Series(ipas),
|
|
||||||
'famehtk': pd.Series(famehtks)})
|
|
||||||
# cleansing.
|
|
||||||
df = df[~df['famehtk'].isin(['/', ''])]
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= make dict files used for HTK. ======================
|
|
||||||
if make_dic_files:
|
|
||||||
word_list = np.unique(df['word'])
|
|
||||||
|
|
||||||
output_type = 3
|
|
||||||
|
|
||||||
for word in word_list:
|
|
||||||
htk_dict_file = htk_dict_dir + '\\' + word + '.dic'
|
|
||||||
|
|
||||||
# pronunciation variant of the target word.
|
|
||||||
pronvar_ = df['famehtk'][df['word'].str.match(word)]
|
|
||||||
|
|
||||||
# make dic file.
|
|
||||||
am_func.make_dic(word, pronvar_, htk_dict_file, output_type)
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= forced alignment using HTK =======================
|
|
||||||
if do_forced_alignment_htk:
|
|
||||||
|
|
||||||
#hmm_num = 2
|
|
||||||
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
|
|
||||||
|
|
||||||
hmm_num_str = str(hmm_num)
|
|
||||||
acoustic_model = os.path.join(acoustic_model_dir, 'hmm' + hmm_num_str + r'-2\hmmdefs')
|
|
||||||
|
|
||||||
predictions = []
|
|
||||||
for i, filename in enumerate(df['filename']):
|
|
||||||
print('=== {0}/{1} ==='.format(i, len(df)))
|
|
||||||
wav_file = os.path.join(wav_dir, filename)
|
|
||||||
|
|
||||||
if os.path.exists(wav_file) and i in df['filename'].keys():
|
|
||||||
word = df['word'][i]
|
|
||||||
WORD = word.upper()
|
|
||||||
|
|
||||||
# make label file.
|
|
||||||
label_file = os.path.join(wav_dir, filename.replace('.wav', '.lab'))
|
|
||||||
with open(label_file, 'w') as f:
|
|
||||||
lines = f.write(WORD)
|
|
||||||
|
|
||||||
htk_dict_file = os.path.join(htk_dict_dir, word + '.dic')
|
|
||||||
fa_file = os.path.join(fa_dir, filename.replace('.wav', '.txt') + hmm_num_str)
|
|
||||||
pyhtk.doHVite(wav_file, label_file, htk_dict_file, fa_file, default.config_hvite, default.phonelist, acoustic_model)
|
|
||||||
|
|
||||||
prediction = am_func.read_fileFA(fa_file)
|
|
||||||
predictions.append(prediction)
|
|
||||||
|
|
||||||
os.remove(label_file)
|
|
||||||
print('{0}: {1} -> {2}'.format(WORD, df['famehtk'][i], prediction))
|
|
||||||
else:
|
|
||||||
predictions.append('')
|
|
||||||
print('!!!!! file not found.')
|
|
||||||
|
|
||||||
predictions = np.array(predictions)
|
|
||||||
#match = np.c_[words[predictions != ''], pronunciations[predictions != ''], predictions[predictions != '']]
|
|
||||||
np.save(os.path.join(data_dir, 'predictions_hmm' + hmm_num_str + '.npy'), predictions)
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= make files which is used for forced alignment by Kaldi =======================
|
|
||||||
if make_kaldi_data_files:
|
|
||||||
wav_dir = r'c:\OneDrive\WSL\kaldi-trunk\egs\fame\s5\corpus\stimmen'
|
|
||||||
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
|
|
||||||
kaldi_data_dir = os.path.join(kaldi_work_dir, 'data', 'alignme')
|
|
||||||
kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
|
|
||||||
htk_dict_dir = os.path.join(experiments_dir, 'stimmen', 'dic_top3')
|
|
||||||
|
|
||||||
wav_scp = os.path.join(kaldi_data_dir, 'wav.scp')
|
|
||||||
text_file = os.path.join(kaldi_data_dir, 'text')
|
|
||||||
utt2spk = os.path.join(kaldi_data_dir, 'utt2spk')
|
|
||||||
|
|
||||||
lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
|
|
||||||
|
|
||||||
predictions = []
|
|
||||||
file_num_max = len(filenames)
|
|
||||||
|
|
||||||
# remove previous files.
|
|
||||||
if os.path.exists(wav_scp):
|
|
||||||
os.remove(wav_scp)
|
|
||||||
if os.path.exists(text_file):
|
|
||||||
os.remove(text_file)
|
|
||||||
if os.path.exists(utt2spk):
|
|
||||||
os.remove(utt2spk)
|
|
||||||
|
|
||||||
f_wav_scp = open(wav_scp, 'a', encoding="utf-8", newline='\n')
|
|
||||||
f_text_file = open(text_file, 'a', encoding="utf-8", newline='\n')
|
|
||||||
f_utt2spk = open(utt2spk, 'a', encoding="utf-8", newline='\n')
|
|
||||||
|
|
||||||
# make wav.scp, text, and utt2spk files.
|
|
||||||
for i in range(0, file_num_max):
|
|
||||||
#for i in range(400, 410):
|
|
||||||
print('=== {0}/{1} ==='.format(i+1, file_num_max))
|
|
||||||
filename = filenames[i]
|
|
||||||
wav_file = wav_dir + '\\' + filename
|
|
||||||
|
|
||||||
if os.path.exists(wav_file):
|
|
||||||
speaker_id = 'speaker_' + str(i).zfill(4)
|
|
||||||
utterance_id = filename.replace('.wav', '')
|
|
||||||
utterance_id = utterance_id.replace(' ', '_')
|
|
||||||
utterance_id = speaker_id + '-' + utterance_id
|
|
||||||
|
|
||||||
# wav.scp file
|
|
||||||
wav_file_unix = wav_file.replace('\\', '/')
|
|
||||||
wav_file_unix = wav_file_unix.replace('c:/', '/mnt/c/')
|
|
||||||
|
|
||||||
f_wav_scp.write('{0} {1}\n'.format(utterance_id, wav_file_unix))
|
|
||||||
|
|
||||||
# text file
|
|
||||||
word = words[i].lower()
|
|
||||||
f_text_file.write('{0}\t{1}\n'.format(utterance_id, word))
|
|
||||||
|
|
||||||
# utt2spk
|
|
||||||
f_utt2spk.write('{0} {1}\n'.format(utterance_id, speaker_id))
|
|
||||||
|
|
||||||
f_wav_scp.close()
|
|
||||||
f_text_file.close()
|
|
||||||
f_utt2spk.close()
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= make lexicon txt which is used by Kaldi =======================
|
|
||||||
if make_kaldi_lexicon_txt:
|
|
||||||
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
|
|
||||||
kaldi_dict_dir = os.path.join(kaldi_work_dir, 'data', 'local', 'dict')
|
|
||||||
lexicon_txt = os.path.join(kaldi_dict_dir, 'lexicon.txt')
|
|
||||||
option_num = 5
|
|
||||||
|
|
||||||
# remove previous file.
|
|
||||||
if os.path.exists(lexicon_txt):
|
|
||||||
os.remove(lexicon_txt)
|
|
||||||
|
|
||||||
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', ipa_xsampa_converter_dir)
|
|
||||||
with open(csvfile, encoding="utf-8") as fin:
|
|
||||||
lines = csv.reader(fin, delimiter=';', lineterminator="\n", skipinitialspace=True)
|
|
||||||
next(lines, None) # skip the headers
|
|
||||||
|
|
||||||
filenames = []
|
|
||||||
words = []
|
|
||||||
pronunciations = []
|
|
||||||
p = []
|
|
||||||
for line in lines:
|
|
||||||
if line[1] is not '' and len(line) > 5:
|
|
||||||
filenames.append(line[0])
|
|
||||||
words.append(line[1])
|
|
||||||
pron_xsampa = line[3]
|
|
||||||
pron_ipa = convert_xsampa2ipa.conversion('xsampa', 'ipa', mapping, pron_xsampa)
|
|
||||||
pron_ipa = pron_ipa.replace('ː', ':')
|
|
||||||
|
|
||||||
# adjust to phones used in the acoustic model.
|
|
||||||
pronunciations.append(pron_ipa)
|
|
||||||
|
|
||||||
# check if all phones are in the phonelist of the acoustic model.
|
|
||||||
#'y', 'b', 'ɾ', 'u', 'ɔ:', 'ø', 't', 'œ', 'n', 'ɒ', 'ɐ', 'f', 'o', 'k', 'x', 'ɡ', 'v', 's', 'ɛ:', 'ɪ:', 'ɑ', 'ɛ', 'a', 'd', 'z', 'ɪ', 'ɔ', 'l', 'i:', 'm', 'p', 'a:', 'i', 'e', 'j', 'o:', 'ʁ', 'h', ':', 'e:', 'ə', 'æ', 'χ', 'w', 'r', 'ə:', 'sp', 'ʊ', 'u:', 'ŋ'
|
|
||||||
|
|
||||||
filenames = np.array(filenames)
|
|
||||||
words = np.array(words)
|
|
||||||
wordlist = np.unique(words)
|
|
||||||
pronunciations = np.array(pronunciations)
|
|
||||||
|
|
||||||
# output lexicon.txt
|
|
||||||
#f_lexicon_txt = open(lexicon_txt, 'a', encoding="utf-8", newline='\n')
|
|
||||||
pronvar_list_all = []
|
|
||||||
for word in word_list:
|
|
||||||
|
|
||||||
# pronunciation variant of the target word.
|
|
||||||
pronvar_ = pronunciations[words == word]
|
|
||||||
# remove ''
|
|
||||||
pronvar_ = np.delete(pronvar_, np.where(pronvar_==''))
|
|
||||||
|
|
||||||
c = Counter(pronvar_)
|
|
||||||
total_num = sum(c.values())
|
|
||||||
|
|
||||||
for key, value in c.most_common(option_num):
|
|
||||||
#print('{0}\t{1}\t{2}\t{3}'.format(word, key, value, total_num))
|
|
||||||
key = key.replace('æ', 'ɛ')
|
|
||||||
key = key.replace('ɐ', 'a')
|
|
||||||
key = key.replace('ɑ', 'a')
|
|
||||||
key = key.replace('ɾ', 'r')
|
|
||||||
key = key.replace('ʁ', 'r')
|
|
||||||
key = key.replace('ʊ', 'u')
|
|
||||||
key = key.replace('χ', 'x')
|
|
||||||
#print('-->{0}\t{1}\t{2}\t{3}\n'.format(word, key, value, total_num))
|
|
||||||
|
|
||||||
# make possible pronunciation variant list.
|
|
||||||
pronvar_list = [key]
|
|
||||||
while 'ø:' in ' '.join(pronvar_list) or 'œ' in ' '.join(pronvar_list) or 'ɒ' in ' '.join(pronvar_list):
|
|
||||||
pronvar_list_ = []
|
|
||||||
for p in pronvar_list:
|
|
||||||
if 'ø:' in p:
|
|
||||||
pronvar_list_.append(p.replace('ø:', 'ö'))
|
|
||||||
pronvar_list_.append(p.replace('ø:', 'ö:'))
|
|
||||||
if 'œ' in p:
|
|
||||||
pronvar_list_.append(p.replace('œ', 'ɔ̈'))
|
|
||||||
pronvar_list_.append(p.replace('œ', 'ɔ̈:'))
|
|
||||||
if 'ɒ' in p:
|
|
||||||
pronvar_list_.append(p.replace('ɒ', 'ɔ̈'))
|
|
||||||
pronvar_list_.append(p.replace('ɒ', 'ɔ̈:'))
|
|
||||||
pronvar_list = np.unique(pronvar_list_)
|
|
||||||
|
|
||||||
for pronvar_ in pronvar_list:
|
|
||||||
split_ipa = convert_phone_set.split_fame_ipa(pronvar_)
|
|
||||||
pronvar_out = ' '.join(split_ipa)
|
|
||||||
pronvar_list_all.append([word, pronvar_out])
|
|
||||||
|
|
||||||
# output
|
|
||||||
pronvar_list_all = np.array(pronvar_list_all)
|
|
||||||
pronvar_list_all = np.unique(pronvar_list_all, axis=0)
|
|
||||||
#f_lexicon_txt.write('<UNK>\tSPN\n')
|
|
||||||
#for line in pronvar_list_all:
|
|
||||||
# f_lexicon_txt.write('{0}\t{1}\n'.format(line[0].lower(), line[1]))
|
|
||||||
|
|
||||||
#f_lexicon_txt.close()
|
|
||||||
|
|
||||||
## ======================= load kaldi forced alignment result =======================
|
|
||||||
if load_forced_alignment_kaldi:
|
|
||||||
kaldi_work_dir = r'C:\OneDrive\WSL\kaldi-trunk\egs\fame\s5'
|
|
||||||
phones_txt = kaldi_work_dir + '\\data\\lang\\phones.txt'
|
|
||||||
merged_alignment_txt = kaldi_work_dir + '\\exp\\tri1_alignme\\merged_alignment.txt'
|
|
||||||
|
|
||||||
filenames = np.load(data_dir + '\\filenames.npy')
|
|
||||||
words = np.load(data_dir + '\\words.npy')
|
|
||||||
pronunciations = np.load(data_dir + '\\pronunciations_ipa.npy')
|
|
||||||
pronvar_list_all = np.load(data_dir + '\\pronvar_list_all.npy')
|
|
||||||
word_list = np.unique(words)
|
|
||||||
|
|
||||||
# load the mapping between phones and ids.
|
|
||||||
with open(phones_txt, 'r', encoding="utf-8") as f:
|
|
||||||
mappings = f.read().split('\n')
|
|
||||||
|
|
||||||
phones = []
|
|
||||||
phone_ids = []
|
|
||||||
for m in mappings:
|
|
||||||
m = m.split(' ')
|
|
||||||
if len(m) > 1:
|
|
||||||
phones.append(m[0])
|
|
||||||
phone_ids.append(int(m[1]))
|
|
||||||
|
|
||||||
with open(merged_alignment_txt, 'r') as f:
|
|
||||||
lines = f.read()
|
|
||||||
lines = lines.split('\n')
|
|
||||||
|
|
||||||
fa_filenames = []
|
|
||||||
fa_pronunciations = []
|
|
||||||
filename_ = ''
|
|
||||||
pron = []
|
|
||||||
for line in lines:
|
|
||||||
line = line.split(' ')
|
|
||||||
if len(line) == 5:
|
|
||||||
filename = line[0]
|
|
||||||
if filename == filename_:
|
|
||||||
phone_id = int(line[4])
|
|
||||||
#if not phone_id == 1:
|
|
||||||
phone = phones[phone_ids.index(phone_id)]
|
|
||||||
pron_ = re.sub(r'_[A-Z]', '', phone)
|
|
||||||
if not pron_ == 'SIL':
|
|
||||||
pron.append(pron_)
|
|
||||||
else:
|
|
||||||
fa_filenames.append(re.sub(r'speaker_[0-9]{4}-', '', filename))
|
|
||||||
fa_pronunciations.append(' '.join(pron))
|
|
||||||
pron = []
|
|
||||||
|
|
||||||
filename_ = filename
|
|
||||||
|
|
||||||
# correct or not.
|
|
||||||
#for filename, fa_pronunciation in zip(fa_filenames, fa_pronunciations):
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## ======================= evaluate the result of forced alignment =======================
|
|
||||||
if eval_forced_alignment:
|
|
||||||
match_num = []
|
|
||||||
for hmm_num in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
|
|
||||||
#hmm_num = 256
|
|
||||||
hmm_num_str = str(hmm_num)
|
|
||||||
match = np.load(data_dir + '\\match_hmm' + hmm_num_str + '.npy')
|
|
||||||
|
|
||||||
# use dic_short?
|
|
||||||
if 1:
|
|
||||||
pronunciation_variants = np.array(['WORD', 'pronunciation']).reshape(1, 2)
|
|
||||||
for word in word_list:
|
|
||||||
fileDic = experiments_dir + r'\stimmen\dic_top3' + '\\' + word + '.dic'
|
|
||||||
pronunciation_variants = np.r_[pronunciation_variants, pyHTK.loadHTKdic(fileDic)]
|
|
||||||
|
|
||||||
# see only words which appears in top 3.
|
|
||||||
match_short = []
|
|
||||||
for line in match:
|
|
||||||
word = line[0]
|
|
||||||
WORD = word.upper()
|
|
||||||
pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
|
|
||||||
|
|
||||||
if line[1] in pronvar:
|
|
||||||
match_short.append(line)
|
|
||||||
|
|
||||||
match_short = np.array(match_short)
|
|
||||||
match = np.copy(match_short)
|
|
||||||
|
|
||||||
# number of match
|
|
||||||
total_match = sum(match[:, 1] == match[:, 2])
|
|
||||||
print("{}: {}/{}".format(hmm_num_str, total_match, match.shape[0]))
|
|
||||||
match_num.append([hmm_num, total_match, match.shape[0]])
|
|
||||||
|
|
||||||
|
|
||||||
# number of mixtures vs accuracy
|
|
||||||
match_num = np.array(match_num)
|
|
||||||
plt.xscale("log")
|
|
||||||
plt.plot(match_num[:, 0], match_num[:, 1]/match_num[0, 2], 'o-')
|
|
||||||
plt.xlabel('number of mixtures', fontsize=14, fontweight='bold')
|
|
||||||
plt.ylabel('accuracy', fontsize=14, fontweight='bold')
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
# confusion matrix
|
|
||||||
#dir_out = r'C:\OneDrive\Research\rug\experiments\stimmen\result'
|
|
||||||
#word_list = np.unique(match[:, 0])
|
|
||||||
|
|
||||||
#for word in word_list:
|
|
||||||
# match_ = match[match[:, 0] == word, :]
|
|
||||||
# cm = confusion_matrix(match_[:, 1], match_[:, 2])
|
|
||||||
# pronvar = pronunciation_variants[pronunciation_variants[:, 0] == word.upper(), 1]
|
|
||||||
|
|
||||||
# plt.figure()
|
|
||||||
# plot_confusion_matrix(cm, classes=pronvar, normalize=True)
|
|
||||||
# plt.savefig(dir_out + '\\cm_' + word + '.png')
|
|
154
acoustic_model/phoneset/fame_asr.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
""" definition of the phones to be used. """
|
||||||
|
|
||||||
|
# phones in {FAME}/lexicon/lex.asr
|
||||||
|
phoneset = [
|
||||||
|
# vowels
|
||||||
|
'a',
|
||||||
|
'a:',
|
||||||
|
'e',
|
||||||
|
'e:',
|
||||||
|
'i',
|
||||||
|
'i:',
|
||||||
|
'i̯',
|
||||||
|
'o',
|
||||||
|
'o:',
|
||||||
|
'ö',
|
||||||
|
'ö:',
|
||||||
|
'u',
|
||||||
|
'u:',
|
||||||
|
'ü',
|
||||||
|
'ü:',
|
||||||
|
#'ú', # only appears in the words 'feeste'(út) and 'gaste'(út), which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Frisian may be a mistake, so this phone is removed.
|
||||||
|
'ṷ',
|
||||||
|
'y',
|
||||||
|
'ɔ',
|
||||||
|
'ɔ:',
|
||||||
|
'ɔ̈',
|
||||||
|
'ɔ̈:',
|
||||||
|
'ə',
|
||||||
|
'ɛ',
|
||||||
|
'ɛ:',
|
||||||
|
'ɪ',
|
||||||
|
'ɪ:',
|
||||||
|
|
||||||
|
# plosives
|
||||||
|
'p',
|
||||||
|
'b',
|
||||||
|
't',
|
||||||
|
'd',
|
||||||
|
'k',
|
||||||
|
'g',
|
||||||
|
'ɡ', # = 'g'
|
||||||
|
|
||||||
|
# nasals
|
||||||
|
'm',
|
||||||
|
'n',
|
||||||
|
'ŋ',
|
||||||
|
|
||||||
|
# fricatives
|
||||||
|
'f',
|
||||||
|
'v',
|
||||||
|
's',
|
||||||
|
's:',
|
||||||
|
'z',
|
||||||
|
'x',
|
||||||
|
'h',
|
||||||
|
|
||||||
|
# tap and flip
|
||||||
|
'r',
|
||||||
|
'r:',
|
||||||
|
|
||||||
|
# approximant
|
||||||
|
'j',
|
||||||
|
'l'
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
## reduce the number of phones.
|
||||||
|
# phones which seldom occur are replaced with other, more common phones.
|
||||||
|
# replacements are based on the advice from Martijn Wieling.
|
||||||
|
reduction_key = {
|
||||||
|
'y':'i:', 'e':'e:', 'ə:':'ɛ:', 'r:':'r', 'ɡ':'g',
|
||||||
|
# aki added because this is used in stimmen_project.
|
||||||
|
'ɔ̈:':'ɔ:'
|
||||||
|
}
|
||||||
|
# already removed beforehand in phoneset. Just to be sure.
|
||||||
|
phones_to_be_removed = ['ú', 's:']
|
||||||
|
|
||||||
|
def phone_reduction(phones):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
phones (list): list of phones.
|
||||||
|
"""
|
||||||
|
if sum([phone in phones for phone in phones_to_be_removed]) != 0:
|
||||||
|
print('input includes phone(s) which are not defined in fame_asr.')
|
||||||
|
print('those phone(s) are removed.')
|
||||||
|
return [reduction_key.get(i, i) for i in phones
|
||||||
|
if i not in phones_to_be_removed]
|
||||||
|
|
||||||
|
phoneset_short = list(set(phone_reduction(phoneset)))
|
||||||
|
phoneset_short.sort()
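# Minimal usage sketch of phone_reduction (illustrative input):
#
#   phone_reduction(['r:', 'e', 'u', 's'])  ->  ['r', 'e:', 'u', 's']
#
# 'r:' and 'e' are mapped through reduction_key; any phone listed in
# phones_to_be_removed would be dropped (with a warning printed).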
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## translation_key to htk format (ascii).
|
||||||
|
# phones which gives UnicodeEncodeError when phone.encode("ascii")
|
||||||
|
# are replaced with other characters.
|
||||||
|
translation_key_asr2htk = {
|
||||||
|
'i̯': 'i_',
|
||||||
|
'ṷ': 'u_',
|
||||||
|
|
||||||
|
# on the analogy of German umlaut, 'e' is used.
|
||||||
|
'ö': 'oe', 'ö:': 'oe:',
|
||||||
|
'ü': 'ue', 'ü:': 'ue:',
|
||||||
|
|
||||||
|
# on the analogy of Chinese...
|
||||||
|
'ŋ': 'ng',
|
||||||
|
|
||||||
|
# refer to Xsampa.
|
||||||
|
'ɔ': 'O', 'ɔ:': 'O:', 'ɔ̈': 'Oe',
|
||||||
|
#'ɔ̈:': 'O:', # does not appear in FAME, but used in stimmen.
|
||||||
|
'ɛ': 'E', 'ɛ:': 'E:',
|
||||||
|
'ɪ': 'I', 'ɪ:': 'I:',
|
||||||
|
|
||||||
|
# it is @ in Xsampa, but that is not handy on HTK.
|
||||||
|
'ə': 'A'
|
||||||
|
}
|
||||||
|
phoneset_htk = [translation_key_asr2htk.get(i, i) for i in phoneset_short]
|
||||||
|
|
||||||
|
#not_in_ascii = [
|
||||||
|
# '\'',
|
||||||
|
# 'â', 'ê', 'ô', 'û', 'č',
|
||||||
|
# 'à', 'í', 'é', 'è', 'ú', 'ć',
|
||||||
|
# 'ä', 'ë', 'ï', 'ö', 'ü'
|
||||||
|
#]
|
||||||
|
translation_key_word2htk = {
|
||||||
|
#'\'': '\\\'',
|
||||||
|
'í':'i1', 'é':'e1', 'ú':'u1', 'ć':'c1',
|
||||||
|
'à':'a2', 'è':'e2',
|
||||||
|
'â':'a3', 'ê':'e3', 'ô':'o3', 'û':'u3',
|
||||||
|
'č':'c4',
|
||||||
|
'ä': 'ao', 'ë': 'ee', 'ï': 'ie', 'ö': 'oe', 'ü': 'ue',
|
||||||
|
}
|
||||||
|
#[translation_key_word2htk.get(i, i) for i in not_in_ascii]
|
||||||
|
#Stop: p, b, t, d, k, g
|
||||||
|
#Nasal: m, n, ng(ŋ)
|
||||||
|
#Fricative: s, z, f, v, h, x
|
||||||
|
#Liquid: l, r
|
||||||
|
#Vowel: a, a:, e:, i, i:, i_(i̯), o, o:, u, u:, u_(ṷ), oe(ö), oe:(ö:), ue(ü), ue:(ü:), O(ɔ), O:(ɔ:), Oe(ɔ̈), A(ə), E(ɛ), E:(ɛ:), I(ɪ), I:(ɪ:)
|
||||||
|
|
||||||
|
|
||||||
|
## the list of multi character phones.
|
||||||
|
# for example, 'a:' consists of more than one character, but in the code it is treated as a single phone.
|
||||||
|
|
||||||
|
# original.
|
||||||
|
multi_character_phones = [i for i in phoneset if len(i) > 1]
|
||||||
|
multi_character_phones.sort(key=len, reverse=True)
|
||||||
|
|
||||||
|
# phonset reduced.
|
||||||
|
multi_character_phones_short = [i for i in phoneset_short if len(i) > 1]
|
||||||
|
multi_character_phones_short.sort(key=len, reverse=True)
|
||||||
|
|
||||||
|
# htk compatible.
|
||||||
|
multi_character_phones_htk = [i for i in phoneset_htk if len(i) > 1]
|
||||||
|
multi_character_phones_htk.sort(key=len, reverse=True)
|
138
acoustic_model/phoneset/fame_ipa.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
""" definition of the phones to be used. """
|
||||||
|
|
||||||
|
phoneset = [
|
||||||
|
# vowels
|
||||||
|
'i̯',
|
||||||
|
'i̯ⁿ',
|
||||||
|
'y',
|
||||||
|
'y:', # not included in lex.ipa, but in stimmen.
|
||||||
|
'i',
|
||||||
|
'i.',
|
||||||
|
'iⁿ',
|
||||||
|
'i:',
|
||||||
|
'i:ⁿ',
|
||||||
|
'ɪ',
|
||||||
|
'ɪⁿ',
|
||||||
|
'ɪ.',
|
||||||
|
'ɪ:', # not included in lex.ipa, but in stimmen.
|
||||||
|
'ɪ:ⁿ',
|
||||||
|
'e',
|
||||||
|
'e:',
|
||||||
|
'e:ⁿ',
|
||||||
|
'ə',
|
||||||
|
'əⁿ',
|
||||||
|
'ə:',
|
||||||
|
'ɛ',
|
||||||
|
'ɛ.',
|
||||||
|
'ɛⁿ',
|
||||||
|
'ɛ:',
|
||||||
|
'ɛ:ⁿ',
|
||||||
|
'a',
|
||||||
|
'aⁿ',
|
||||||
|
'a.',
|
||||||
|
'a:',
|
||||||
|
'a:ⁿ',
|
||||||
|
'ṷ',
|
||||||
|
'ṷ.',
|
||||||
|
'ṷⁿ',
|
||||||
|
#'ú', # only appears in the words 'feeste'(út) and 'gaste'(út), which are 'f e: s t ə' and 'yn' in lex_asr. The pronunciation in Frisian may be a mistake, so this phone is removed.
|
||||||
|
'u',
|
||||||
|
'uⁿ',
|
||||||
|
'u.',
|
||||||
|
'u:',
|
||||||
|
'u:ⁿ',
|
||||||
|
'ü',
|
||||||
|
'ü.',
|
||||||
|
'üⁿ',
|
||||||
|
'ü:',
|
||||||
|
'ü:ⁿ',
|
||||||
|
'o',
|
||||||
|
'oⁿ',
|
||||||
|
'o.',
|
||||||
|
'o:',
|
||||||
|
'o:ⁿ',
|
||||||
|
'ö',
|
||||||
|
'ö.',
|
||||||
|
'öⁿ',
|
||||||
|
'ö:',
|
||||||
|
'ö:ⁿ',
|
||||||
|
'ɔ',
|
||||||
|
'ɔ.',
|
||||||
|
'ɔⁿ',
|
||||||
|
'ɔ:',
|
||||||
|
'ɔ:ⁿ',
|
||||||
|
'ɔ̈', # not included in lex.ipa
|
||||||
|
'ɔ̈.',
|
||||||
|
'ɔ̈:',
|
||||||
|
|
||||||
|
# plosives
|
||||||
|
'p',
|
||||||
|
'b',
|
||||||
|
't',
|
||||||
|
'tⁿ',
|
||||||
|
'd',
|
||||||
|
'k',
|
||||||
|
'g',
|
||||||
|
'ɡ', # = 'g'
|
||||||
|
|
||||||
|
# nasals
|
||||||
|
'm',
|
||||||
|
'n',
|
||||||
|
'ŋ',
|
||||||
|
|
||||||
|
# fricatives
|
||||||
|
'f',
|
||||||
|
'v',
|
||||||
|
's',
|
||||||
|
's:',
|
||||||
|
'z',
|
||||||
|
'zⁿ',
|
||||||
|
'x',
|
||||||
|
'h',
|
||||||
|
|
||||||
|
# tap and flip
|
||||||
|
'r',
|
||||||
|
'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
|
||||||
|
'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
|
||||||
|
|
||||||
|
# approximant
|
||||||
|
'j',
|
||||||
|
'j.',
|
||||||
|
'l'
|
||||||
|
]
|
||||||
|
|
||||||
|
## reduce the number of phones.
|
||||||
|
# phones which are used in the stimmen transcriptions but not in the FAME corpus.
|
||||||
|
# replacements are based on the advice from Jelske Dijkstra on 2018/06/21.
|
||||||
|
stimmen_replacement = {
|
||||||
|
'æ': 'ɛ',
|
||||||
|
'ø': 'ö', # or 'ö:'
|
||||||
|
'ø:': 'ö:', # Aki added.
|
||||||
|
'œ': 'ɔ̈', # or 'ɔ̈:'
|
||||||
|
'œ:': 'ɔ̈:', # Aki added.
|
||||||
|
'ɐ': 'a', # or 'a:'
|
||||||
|
'ɐ:': 'a:', # Aki added.
|
||||||
|
'ɑ': 'a', # or 'a:'
|
||||||
|
'ɑ:': 'a:', # Aki added
|
||||||
|
'ɒ': 'ɔ', # or 'ɔ:'
|
||||||
|
'ɒ:': 'ɔ:', # Aki added.
|
||||||
|
'ɾ': 'r',
|
||||||
|
'ʁ': 'r',
|
||||||
|
'ʊ': 'u',
|
||||||
|
'χ': 'x',
|
||||||
|
|
||||||
|
# aki guessed.
|
||||||
|
'ʀ': 'r',
|
||||||
|
'ɹ': 'r',
|
||||||
|
'w': 'ö'
|
||||||
|
}
|
||||||
|
phoneset.extend(list(stimmen_replacement.keys()))
|
||||||
|
|
||||||
|
def phone_reduction(phones):
|
||||||
|
return [stimmen_replacement.get(i, i) for i in phones]
|
||||||
|
|
||||||
|
|
||||||
|
## the list of multi character phones.
|
||||||
|
# for example, the length of 'i̯ⁿ' is 3, but in the code it is treated as a single phone.
|
||||||
|
multi_character_phones = [i for i in phoneset if len(i) > 1]
|
||||||
|
multi_character_phones.sort(key=len, reverse=True)
|
BIN
acoustic_model/phoneset/fame_ipa2asr.npy
Normal file
Binary file not shown.
197
acoustic_model/phoneset/fame_phonetics.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
import sys
|
||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
|
||||||
|
import fame_functions
|
||||||
|
from phoneset import fame_ipa, fame_asr
|
||||||
|
import convert_phoneset
|
||||||
|
|
||||||
|
|
||||||
|
## general
|
||||||
|
stop = 'p, b, t, d, k, g'
|
||||||
|
nasal = 'm, n, ŋ'
|
||||||
|
fricative = 's, z, f, v, h, x, j'
|
||||||
|
liquid = 'l, r'
|
||||||
|
vowel = 'a, a:, e:, i, i:, i̯, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈, ə, ɛ, ɛ:, ɪ, ɪ:'
|
||||||
|
|
||||||
|
## consonant
|
||||||
|
c_front = 'p, b, m, f, v'
|
||||||
|
c_central = 't, d, n, s, z, l, r'
|
||||||
|
c_back = 'k, g, ŋ, h, x, j'
|
||||||
|
|
||||||
|
fortis = 'p, t, k, f, s'
|
||||||
|
lenis = 'b, d, g, v, z, j'
|
||||||
|
neither_fortis_nor_lenis = 'm, n, ŋ, h, l, r, x'
|
||||||
|
|
||||||
|
coronal = 't, d, n, s, z, l, r, j'
|
||||||
|
non_coronal = 'p, b, m, k, g, ŋ, f, v, h, x'
|
||||||
|
|
||||||
|
anterior = 'p, b, m, t, d, n, f, v, s, z, l'
|
||||||
|
non_anterior = 'k, g, ŋ, h, x, j, r'
|
||||||
|
|
||||||
|
continuent = 'm, n, ŋ, f, v, s, z, h, l, r'
|
||||||
|
non_continuent = 'p, b, t, d, k, g, x, j'
|
||||||
|
|
||||||
|
strident = 's, z, j'
|
||||||
|
non_strident = 'f, v, h'
|
||||||
|
unstrident = 'p, b, t, d, m, n, ŋ, k, g, r, x'
|
||||||
|
|
||||||
|
glide = 'h, l, r'
|
||||||
|
syllabic = 'm, l, ŋ'
|
||||||
|
|
||||||
|
unvoiced = 'p, t, k, s, f, x, h'
|
||||||
|
voiced = 'b, d, g, z, v, m, n, ŋ, l, r, j'
|
||||||
|
|
||||||
|
#affricate: ???
|
||||||
|
non_affricate = 's, z, f, v'
|
||||||
|
|
||||||
|
voiced_stop = 'b, d, g'
|
||||||
|
unvoiced_stop = 'p, t, k'
|
||||||
|
front_stop = 'p, b'
|
||||||
|
central_stop = 't, d'
|
||||||
|
back_stop = 'k, g'
|
||||||
|
|
||||||
|
voiced_fricative = 'z, v'
|
||||||
|
unvoiced_fricative = 's, f'
|
||||||
|
front_fricative = 'f, v'
|
||||||
|
central_fricative = 's, z'
|
||||||
|
back_fricative = 'j'
|
||||||
|
|
||||||
|
|
||||||
|
## vowel
|
||||||
|
v_front = 'i, i:, i̯, ɪ, ɪ:, e:, ə, ɛ, ɛ:, a, a:'
|
||||||
|
v_central = 'ə, ɛ, ɛ:, a, a:'
|
||||||
|
v_back = 'u, u:, ü, ü:, ṷ, ɔ, ɔ:, ɔ̈, ö, ö:, o, o:'
|
||||||
|
|
||||||
|
long = 'a:, e:, i:, o:, u:, ö:, ü:, ɔ:, ɛ:, ɪ:'
|
||||||
|
short = 'a, i, i̯, o, u, ṷ, ö, ü, ɔ, ɔ̈, ə, ɛ, ɪ'
|
||||||
|
|
||||||
|
#Dipthong: ???
|
||||||
|
#Front-Start: ???
|
||||||
|
#Fronting: ???
|
||||||
|
|
||||||
|
high = 'i, i:, i̯, ɪ, ɪ:, u, u:, ṷ, ə, e:, o, o:, ö, ö:, ü, ü:'
|
||||||
|
medium = 'e:, ə, ɛ, ɛ:, ɔ, ɔ:, ɔ̈, o, o:, ö, ö:'
|
||||||
|
low = 'a, a:, ɛ, ɛ:, ɔ, ɔ:, ɔ̈'
|
||||||
|
|
||||||
|
rounded = 'a, a:, o, o:, u, u:, ṷ, ö, ö:, ü, ü:, ɔ, ɔ:, ɔ̈'
|
||||||
|
unrounded = 'i, i:, i̯, e:, ə, ɛ, ɛ:, ɪ, ɪ:'
|
||||||
|
|
||||||
|
i_vowel = 'i, i:, i̯, ɪ, ɪ:'
|
||||||
|
e_vowel = 'e:, ə, ɛ, ɛ:'
|
||||||
|
a_vowel = 'a, a:'
|
||||||
|
o_vowel = 'o, o:, ö, ö:, ɔ, ɔ:, ɔ̈'
|
||||||
|
u_vowel = 'u, u:, ṷ, ü, ü:'
|
||||||
|
|
||||||
|
## htk phoneset
|
||||||
|
phoneset = fame_asr.phoneset_htk
|
||||||
|
|
||||||
|
## convert ipa group to htk format for quests.hed.
|
||||||
|
def _ipa2quest(R_or_L, ipa_text):
|
||||||
|
assert R_or_L in ['R', 'L'], 'the first argument should be either R or L.'
|
||||||
|
ipa_list = ipa_text.replace(' ', '').split(',')
|
||||||
|
if R_or_L == 'R':
|
||||||
|
quests_list = ['*+' + fame_functions.ipa2htk(ipa) for ipa in ipa_list]
|
||||||
|
else:
|
||||||
|
quests_list = [fame_functions.ipa2htk(ipa) + '-*' for ipa in ipa_list]
|
||||||
|
return ','.join(quests_list)
|
||||||
|
|
||||||
|
|
||||||
|
def make_quests_hed(quest_hed):
|
||||||
|
def _add_quests_item(R_or_L, item_name_, ipa_text):
|
||||||
|
assert R_or_L in ['R', 'L'], 'the first argument should be either R or L.'
|
||||||
|
item_name = R_or_L + '_' + item_name_
|
||||||
|
with open(quest_hed, 'ab') as f:
|
||||||
|
f.write(bytes('QS "' + item_name + '"\t{ ' + _ipa2quest(R_or_L, ipa_text) + ' }\n', 'ascii'))
|
||||||
|
|
||||||
|
if os.path.exists(quest_hed):
|
||||||
|
os.remove(quest_hed)
|
||||||
|
|
||||||
|
for R_or_L in ['R', 'L']:
|
||||||
|
_add_quests_item(R_or_L, 'NonBoundary', '*')
|
||||||
|
_add_quests_item(R_or_L, 'Silence', 'sil')
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Stop', stop)
|
||||||
|
_add_quests_item(R_or_L, 'Nasal', nasal)
|
||||||
|
_add_quests_item(R_or_L, 'Fricative', fricative)
|
||||||
|
_add_quests_item(R_or_L, 'Liquid', liquid)
|
||||||
|
_add_quests_item(R_or_L, 'Vowel', vowel)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'C-Front', c_front)
|
||||||
|
_add_quests_item(R_or_L, 'C-Central', c_central)
|
||||||
|
_add_quests_item(R_or_L, 'C-Back', c_back)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'V-Front', v_front)
|
||||||
|
_add_quests_item(R_or_L, 'V-Central', v_central)
|
||||||
|
_add_quests_item(R_or_L, 'V-Back', v_back)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Front', c_front + ', ' + v_front)
|
||||||
|
_add_quests_item(R_or_L, 'Central', c_central + ', ' + v_central)
|
||||||
|
_add_quests_item(R_or_L, 'Back', c_back + ', ' + v_back)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Fortis', fortis)
|
||||||
|
_add_quests_item(R_or_L, 'Lenis', lenis)
|
||||||
|
_add_quests_item(R_or_L, 'UnFortLenis', neither_fortis_nor_lenis)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Coronal', coronal)
|
||||||
|
_add_quests_item(R_or_L, 'NonCoronal', non_coronal)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Anterior', anterior)
|
||||||
|
_add_quests_item(R_or_L, 'NonAnterior', non_anterior)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Continuent', continuent)
|
||||||
|
_add_quests_item(R_or_L, 'NonContinuent', non_continuent)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Strident', strident)
|
||||||
|
_add_quests_item(R_or_L, 'NonStrident', non_strident)
|
||||||
|
_add_quests_item(R_or_L, 'UnStrident', unstrident)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Glide', glide)
|
||||||
|
_add_quests_item(R_or_L, 'Syllabic', syllabic)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Unvoiced-Cons', unvoiced)
|
||||||
|
_add_quests_item(R_or_L, 'Voiced-Cons', voiced)
|
||||||
|
_add_quests_item(R_or_L, 'Unvoiced-All', unvoiced + ', sil')
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Long', long)
|
||||||
|
_add_quests_item(R_or_L, 'Short', short)
|
||||||
|
|
||||||
|
#_add_quests_item(R_or_L, 'Dipthong', xxx)
|
||||||
|
#_add_quests_item(R_or_L, 'Front-Start', xxx)
|
||||||
|
#_add_quests_item(R_or_L, 'Fronting', xxx)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'High', high)
|
||||||
|
_add_quests_item(R_or_L, 'Medium', medium)
|
||||||
|
_add_quests_item(R_or_L, 'Low', low)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Rounded', rounded)
|
||||||
|
_add_quests_item(R_or_L, 'UnRounded', unrounded)
|
||||||
|
|
||||||
|
#_add_quests_item(R_or_L, 'Affricative', rounded)
|
||||||
|
_add_quests_item(R_or_L, 'NonAffricative', non_affricate)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'IVowel', i_vowel)
|
||||||
|
_add_quests_item(R_or_L, 'EVowel', e_vowel)
|
||||||
|
_add_quests_item(R_or_L, 'AVowel', a_vowel)
|
||||||
|
_add_quests_item(R_or_L, 'OVowel', o_vowel)
|
||||||
|
_add_quests_item(R_or_L, 'UVowel', u_vowel)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Voiced-Stop', voiced_stop)
|
||||||
|
_add_quests_item(R_or_L, 'UnVoiced-Stop', unvoiced_stop)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Front-Stop', front_stop)
|
||||||
|
_add_quests_item(R_or_L, 'Central-Stop', central_stop)
|
||||||
|
_add_quests_item(R_or_L, 'Back-Stop', back_stop)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Voiced-Fric', voiced_fricative)
|
||||||
|
_add_quests_item(R_or_L, 'UnVoiced-Fric', unvoiced_fricative)
|
||||||
|
|
||||||
|
_add_quests_item(R_or_L, 'Front-Fric', front_fricative)
|
||||||
|
_add_quests_item(R_or_L, 'Central-Fric', central_fricative)
|
||||||
|
_add_quests_item(R_or_L, 'Back-Fric', back_fricative)
|
||||||
|
|
||||||
|
for p in phoneset:
|
||||||
|
_add_quests_item(R_or_L, p, p)
|
||||||
|
|
||||||
|
return
|
||||||
|
|
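For reference, the QS entries that make_quests_hed writes to quests.hed look roughly as follows. The HTK-encoded phone names are placeholders here (e.g. NG standing in for ŋ), since the actual encoding is done by fame_functions.ipa2htk:

QS "R_Nasal"	{ *+m,*+n,*+NG }
QS "L_Nasal"	{ m-*,n-*,NG-* }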
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
119
acoustic_model/stimmen_functions.py
Normal file
119
acoustic_model/stimmen_functions.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
import glob
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import convert_xsampa2ipa
|
||||||
|
import defaultfiles as default
|
||||||
|
import fame_functions
|
||||||
|
import novoapi_functions
|
||||||
|
|
||||||
|
|
||||||
|
def _load_transcriptions():
|
||||||
|
stimmen_transcription = pd.ExcelFile(default.stimmen_transcription_xlsx)
|
||||||
|
df = pd.read_excel(stimmen_transcription, 'original')
|
||||||
|
|
||||||
|
# mapping from ipa to xsampa
|
||||||
|
mapping = convert_xsampa2ipa.load_converter('xsampa', 'ipa', default.ipa_xsampa_converter_dir)
|
||||||
|
#for xsampa, ipa in zip(df['X-SAMPA'], df['IPA']):
|
||||||
|
# ipa_converted = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
|
||||||
|
# if not ipa_converted == ipa:
|
||||||
|
# print('{0}: {1} - {2}'.format(xsampa, ipa_converted, ipa))
|
||||||
|
|
||||||
|
ipas = []
|
||||||
|
for xsampa in df['Self Xsampa']:
|
||||||
|
if not isinstance(xsampa, float): # 'NaN'
|
||||||
|
# typo?
|
||||||
|
xsampa = xsampa.replace('r2:z@rA:\\t', 'r2:z@rA:t').replace(';', ':')
|
||||||
|
|
||||||
|
ipa = convert_xsampa2ipa.xsampa2ipa(mapping, xsampa)
|
||||||
|
ipa = ipa.replace('ː', ':').replace(' ', '')
|
||||||
|
ipas.append(ipa)
|
||||||
|
else:
|
||||||
|
ipas.append('')
|
||||||
|
|
||||||
|
df_ = pd.DataFrame({'filename': df['Filename'],
|
||||||
|
'word': df['Word'],
|
||||||
|
'xsampa': df['Self Xsampa'],
|
||||||
|
'ipa': pd.Series(ipas)})
|
||||||
|
|
||||||
|
# not valid inputs, but separators.
|
||||||
|
df_ = df_[~df_['ipa'].str.contains('/')]
|
||||||
|
return df_.dropna()
|
||||||
|
|
||||||
|
|
||||||
|
def load_transcriptions():
|
||||||
|
""" in default.stimmen_transcription_xlsx
|
||||||
|
for which the wav files can be found. """
|
||||||
|
df = _load_transcriptions()
|
||||||
|
df_ = pd.DataFrame(index=[], columns=list(df.keys()))
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
filename = row['filename']
|
||||||
|
if isinstance(filename, str):
|
||||||
|
wav_file = os.path.join(default.stimmen_wav_dir, filename)
|
||||||
|
if os.path.exists(wav_file):
|
||||||
|
df_ = df_.append(row, ignore_index=True)
|
||||||
|
return df_
|
||||||
|
|
||||||
|
|
||||||
|
def load_transcriptions_clean(clean_wav_dir):
|
||||||
|
df = _load_transcriptions()
|
||||||
|
wav_file_list = glob.glob(os.path.join(clean_wav_dir, '*.wav'))
|
||||||
|
df_clean = pd.DataFrame(index=[], columns=list(df.keys()))
|
||||||
|
for wav_file in wav_file_list:
|
||||||
|
filename = os.path.basename(wav_file)
|
||||||
|
df_ = df[df['filename'].str.match(filename)]
|
||||||
|
df_clean = pd.concat([df_clean, df_])
|
||||||
|
return df_clean
|
||||||
|
|
||||||
|
|
||||||
|
def load_transcriptions_novo70(clean_wav_dir):
|
||||||
|
""" extract rows of which ipa is written in novo70 phonset. """
|
||||||
|
df = load_transcriptions_clean(clean_wav_dir)
|
||||||
|
|
||||||
|
df_novo70 = pd.DataFrame(index=[], columns=list(df.keys()))
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
not_in_novo70 = novoapi_functions.phones_not_in_novo70(row['ipa'])
|
||||||
|
if len(not_in_novo70) == 0:
|
||||||
|
df_novo70 = df_novo70.append(row, ignore_index=True)
|
||||||
|
|
||||||
|
return df_novo70
|
||||||
|
|
||||||
|
|
||||||
|
def add_row_htk(df):
|
||||||
|
""" df['htk'] is made from df['ipa'] and added. """
|
||||||
|
htk = []
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
htk.append(fame_functions.ipa2htk(row['ipa']))
|
||||||
|
return df.assign(htk=htk)
|
||||||
|
|
||||||
|
|
||||||
|
def add_row_asr(df):
|
||||||
|
""" df['asr'] is made from df['ipa'] and added. """
|
||||||
|
asr = []
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
asr.append(fame_functions.ipa2asr(row['ipa']))
|
||||||
|
return df.assign(asr=asr)
|
||||||
|
|
||||||
|
|
||||||
|
def load_pronunciations(WORD, htk_dic):
|
||||||
|
""" load pronunciation variants from HTK dic file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
WORD (str): word in capital letters.
|
||||||
|
htk_dic (path): HTK dict file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(pronunciations) (list): pronunciation variants of WORD.
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
Because this function loads all contents from htk_dic file,
|
||||||
|
it is not recommended for large lexicons.
|
||||||
|
|
||||||
|
"""
|
||||||
|
with open(htk_dic) as f:
|
||||||
|
lines = f.read().replace(' sil', '')
|
||||||
|
lines = lines.split('\n')
|
||||||
|
return [' '.join(line.split(' ')[1:])
|
||||||
|
for line in lines if line.split(' ')[0]==WORD]
|
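A hypothetical usage sketch of load_pronunciations (the dictionary path is made up; the expected output corresponds to the reus-test .dic files further below):

# all pronunciation variants of the word REUS in an HTK lexicon
variants = load_pronunciations('REUS', r'c:\path\to\lexicon.htk.dic')
# e.g. ['r eu s', 'm ac n', 'm ac n t s j @']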
93
acoustic_model/stimmen_test.py
Normal file
93
acoustic_model/stimmen_test.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import defaultfiles as default
|
||||||
|
import convert_xsampa2ipa
|
||||||
|
import stimmen_functions
|
||||||
|
import fame_functions
|
||||||
|
import convert_phoneset
|
||||||
|
from phoneset import fame_ipa, fame_asr
|
||||||
|
sys.path.append(default.toolbox_dir)
|
||||||
|
import file_handling as fh
|
||||||
|
from htk import pyhtk
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= user define =======================
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## ======================= make test data ======================
|
||||||
|
stimmen_test_dir = r'c:\OneDrive\Research\rug\_data\stimmen_test'
|
||||||
|
|
||||||
|
## copy wav files which are in the stimmen data.
|
||||||
|
df = stimmen_functions.load_transcriptions()
|
||||||
|
#for index, row in df.iterrows():
|
||||||
|
# filename = row['filename']
|
||||||
|
# wav_file = os.path.join(default.stimmen_wav_dir, filename)
|
||||||
|
# shutil.copy(wav_file, os.path.join(stimmen_test_dir, filename))
|
||||||
|
|
||||||
|
# after manually removing files which have too much noise or contain multiple words...
|
||||||
|
# update the info.
|
||||||
|
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
|
||||||
|
|
||||||
|
# count how many files were removed due to poor quality.
|
||||||
|
word_list = [i for i in list(set(df['word'])) if not pd.isnull(i)]
|
||||||
|
word_list = sorted(word_list)
|
||||||
|
for word in word_list:
|
||||||
|
df_ = df[df['word']==word]
|
||||||
|
df_clean_ = df_clean[df_clean['word']==word]
|
||||||
|
print('word {0} has {1} clean files among {2} files ({3:.2f} [%]).'.format(
|
||||||
|
word, len(df_clean_), len(df_), len(df_clean_)/len(df_)*100))
|
||||||
|
|
||||||
|
|
||||||
|
## check phones included in stimmen but not in FAME!
|
||||||
|
splitted_ipas = [' '.join(
|
||||||
|
convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones))
|
||||||
|
for ipa in df['ipa']]
|
||||||
|
stimmen_phones = set(' '.join(splitted_ipas))
|
||||||
|
stimmen_phones = list(stimmen_phones)
|
||||||
|
fame_phones = fame_ipa.phoneset
|
||||||
|
stimmen_phones.sort()
|
||||||
|
fame_phones.sort()
|
||||||
|
print('phones which are used in stimmen transcription but not in FAME corpus are:\n{}'.format(
|
||||||
|
set(stimmen_phones) - set(fame_phones)
|
||||||
|
))
|
||||||
|
for ipa in df['ipa']:
|
||||||
|
ipa_splitted = convert_phoneset.split_word(ipa, fame_ipa.multi_character_phones)
|
||||||
|
if ':' in ipa_splitted:
|
||||||
|
print(ipa_splitted)
|
||||||
|
|
||||||
|
|
||||||
|
## check pronunciation variants
|
||||||
|
df_clean = stimmen_functions.load_transcriptions_clean(stimmen_test_dir)
|
||||||
|
df_clean = stimmen_functions.add_row_asr(df_clean)
|
||||||
|
df_clean = stimmen_functions.add_row_htk(df_clean)
|
||||||
|
|
||||||
|
for word in word_list:
|
||||||
|
#word = word_list[1]
|
||||||
|
df_ = df_clean[df_clean['word']==word]
|
||||||
|
c = Counter(df_['htk'])
|
||||||
|
pronunciations = dict()
|
||||||
|
for key, value in zip(c.keys(), c.values()):
|
||||||
|
if value > 3:
|
||||||
|
pronunciations[key] = value
|
||||||
|
print(pronunciations)
|
||||||
|
|
||||||
|
|
||||||
|
monophone_mlf = os.path.join(default.htk_dir, 'label', 'train_phone_aligned.mlf')
|
||||||
|
triphone_mlf = os.path.join(default.htk_dir, 'label', 'train_triphone.mlf')
|
||||||
|
def filenames_in_mlf(file_mlf):
|
||||||
|
with open(file_mlf) as f:
|
||||||
|
lines_ = f.read().split('\n')
|
||||||
|
lines = [line for line in lines_ if len(line.split(' ')) == 1 and line != '.']
|
||||||
|
filenames = [line.replace('"', '').replace('*/', '') for line in lines[1:-1]]
|
||||||
|
return filenames
|
||||||
|
filenames_mono = filenames_in_mlf(monophone_mlf)
|
||||||
|
filenames_tri = filenames_in_mlf(triphone_mlf)
|
||||||
|
|
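For context, the MLFs parsed by filenames_in_mlf follow the HTK master label file layout; a shortened example in the same format as reus1008-reus.txt shown further below (the filename here is invented):

#!MLF!#
"*/fy001.rec"
0 9700000 r -12463.852539 REUS
9700000 12800000 eu -3622.108887
.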
5
novoapi_for_python3x/__init__.py
Normal file
5
novoapi_for_python3x/__init__.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__version__ = "0.2"
|
||||||
|
|
||||||
|
import backend
|
6
novoapi_for_python3x/asr/__init__.py
Normal file
6
novoapi_for_python3x/asr/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
#import segments
|
||||||
|
#import spraaklab
|
||||||
|
from . import segments
|
||||||
|
from . import spraaklab
|
4
novoapi_for_python3x/asr/segments/__init__.py
Normal file
4
novoapi_for_python3x/asr/segments/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from .segments import Segmentation
|
||||||
|
from .praat import seg2tg
|
79
novoapi_for_python3x/asr/segments/praat.py
Normal file
79
novoapi_for_python3x/asr/segments/praat.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# (c) 2015--2018 NovoLanguage, author: David A. van Leeuwen
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
def print_header(output, begin, end, nr_tiers):
|
||||||
|
print >> output, 'File type = "ooTextFile"'
|
||||||
|
print >> output, 'Object class = "TextGrid"'
|
||||||
|
print >> output, ''
|
||||||
|
print >> output, 'xmin = %s' % begin
|
||||||
|
print >> output, 'xmax = %s' % end
|
||||||
|
print >> output, 'tiers? <exists>'
|
||||||
|
print >> output, 'size = %d' % nr_tiers
|
||||||
|
print >> output, 'item []:'
|
||||||
|
|
||||||
|
|
||||||
|
def print_info_tier(output, title, begin, end, label):
|
||||||
|
print >> output, '\titem [%d]:' % 0
|
||||||
|
print >> output, '\t\tclass = "IntervalTier"'
|
||||||
|
print >> output, '\t\tname = "%s"' % title
|
||||||
|
print >> output, '\t\txmin = %s' % begin
|
||||||
|
print >> output, '\t\txmax = %s' % end
|
||||||
|
print >> output, '\t\tintervals: size = %d' % 1
|
||||||
|
|
||||||
|
print >> output, '\t\tintervals [1]:'
|
||||||
|
print >> output, '\t\t\txmin = %s' % begin
|
||||||
|
print >> output, '\t\t\txmax = %s' % end
|
||||||
|
print >> output, '\t\t\ttext = "%s"' % label
|
||||||
|
|
||||||
|
|
||||||
|
def print_tier(output, title, begin, end, segs, format, formatter):
|
||||||
|
print >> output, '\titem [%d]:' % 0
|
||||||
|
print >> output, '\t\tclass = "IntervalTier"'
|
||||||
|
print >> output, '\t\tname = "%s"' % title
|
||||||
|
print >> output, '\t\txmin = %s' % begin
|
||||||
|
print >> output, '\t\txmax = %s' % end
|
||||||
|
print >> output, '\t\tintervals: size = %d' % len(segs)
|
||||||
|
|
||||||
|
count = 1
|
||||||
|
for seg in segs:
|
||||||
|
#print seg
|
||||||
|
print >> output, '\t\tintervals [%d]:' % count
|
||||||
|
print >> output, '\t\t\txmin = %s' % repr(int(seg['begin']) / 100.0)
|
||||||
|
print >> output, '\t\t\txmax = %s' % repr(int(seg['end']) / 100.0)
|
||||||
|
string = '\t\t\ttext = "' + format + '"'
|
||||||
|
print >> output, string % formatter(seg['label'])
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
|
||||||
|
def seg2tg(fname, segments):
|
||||||
|
if not segments:
|
||||||
|
return
|
||||||
|
output = codecs.open(fname, "w", encoding="utf-8")
|
||||||
|
|
||||||
|
confidences = []
|
||||||
|
word_labels = []
|
||||||
|
phones = []
|
||||||
|
|
||||||
|
for s in segments:
|
||||||
|
conf = s.llh if hasattr(s, "llh") else s.score
|
||||||
|
confidences.append({'begin': s.begin, 'end': s.end, 'label': conf})
|
||||||
|
word_labels.append({'begin': s.begin, 'end': s.end, 'label': s.label})
|
||||||
|
for p in s.phones:
|
||||||
|
phones.append({'begin': p.begin, 'end': p.end, 'label': p.label})
|
||||||
|
|
||||||
|
|
||||||
|
begin = repr(int(segments[0].begin) / 100.0)
|
||||||
|
end = repr(int(segments[-1].end) / 100.0)
|
||||||
|
|
||||||
|
nr_tiers = 3
|
||||||
|
print_header(output, begin, end, nr_tiers)
|
||||||
|
#print_tier(output, "confidence", begin, end, confidences, ('%.3f', lambda x: x))
|
||||||
|
#print_tier(output, "words", begin, end, word_labels, ('%s', lambda x: x))
|
||||||
|
#print_tier(output, "phones", begin, end, phones, ('%s', lambda x: x))
|
||||||
|
print_tier(output, "confidence", begin, end, confidences, '%.3f', lambda x: x)
|
||||||
|
print_tier(output, "words", begin, end, word_labels, '%s', lambda x: x)
|
||||||
|
print_tier(output, "phones", begin, end, phones, '%s', lambda x: x)
|
||||||
|
|
||||||
|
output.close()
|
99
novoapi_for_python3x/asr/segments/segments.py
Normal file
99
novoapi_for_python3x/asr/segments/segments.py
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# (c) 2015--2018 NovoLanguage, author: David A. van Leeuwen
|
||||||
|
|
||||||
|
## These classes can be initialized with dictionaries, as they are returned by the python spraaklab recognition system.
|
||||||
|
|
||||||
|
class Segment(object):
|
||||||
|
def __init__(self, segment):
|
||||||
|
self.begin = segment["begin"]
|
||||||
|
self.end = segment["end"]
|
||||||
|
self.begintime = segment.get("beginTime", self.begin / 100.0)
|
||||||
|
self.endtime = segment.get("endTime", self.end / 100.0)
|
||||||
|
self.label = segment["label"]
|
||||||
|
self.score = segment["score"]
|
||||||
|
if "llh" in segment:
|
||||||
|
self.llh = segment["llh"]
|
||||||
|
if "phones" in segment:
|
||||||
|
self.type = "word"
|
||||||
|
self.phones = Segmentation(segment["phones"], ["sil"])
|
||||||
|
if hasattr(self.phones[0], "llh"):
|
||||||
|
self.minllh = min([s.llh for s in self.phones]) ## minimum phone llh of the current word, used for error detection
|
||||||
|
else:
|
||||||
|
self.type = "phone"
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
res = "%8.3f -- %8.3f score %8.3f " % (self.begintime, self.endtime, self.score)
|
||||||
|
if hasattr(self, "llh"):
|
||||||
|
res += "llh %8.3f " % self.llh
|
||||||
|
res += self.label.encode("utf8")
|
||||||
|
return res
|
||||||
|
|
||||||
|
def export(self):
|
||||||
|
r = {"begin": self.begin, "end": self.end, "label": self.label, "score": self.score, "type": self.type}
|
||||||
|
if hasattr(self, "llh"):
|
||||||
|
r["llh"] = self.llh
|
||||||
|
if hasattr(self, "phones"):
|
||||||
|
r["phones"] = self.phones.export()
|
||||||
|
return r
|
||||||
|
|
||||||
|
class Segmentation(object):
|
||||||
|
def __init__(self, segments, sils=["<s>", "</s>", "!sil"]):
|
||||||
|
"""Create a segmentation from a spraaklab recognition structure.
|
||||||
|
segments: an array of words (or phones), represented by a dict with
|
||||||
|
"begin", "end", "label", "score", and "llh" keys. Words can also have
|
||||||
|
"phones" which is another array of segments."""
|
||||||
|
self.segments = [Segment(s) for s in segments]
|
||||||
|
if self.segments:
|
||||||
|
self.type = self.segments[0].type
|
||||||
|
else:
|
||||||
|
self.type = None
|
||||||
|
self.sils = sils
|
||||||
|
self.orig = segments ## in case we want to have access to the original recognition structure
|
||||||
|
|
||||||
|
def __getitem__(self, item):
|
||||||
|
return self.segments[item]
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
ns = len(self.segments)
|
||||||
|
res = "Segmentation with %d %s%s" % (ns, self.type, "" if ns==1 else "s")
|
||||||
|
for seg in self.segments:
|
||||||
|
res += "\n " + repr(seg)
|
||||||
|
return res
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.segments)
|
||||||
|
|
||||||
|
def score(self, skip=None):
|
||||||
|
if not skip:
|
||||||
|
skip = self.sils
|
||||||
|
s = 0.0
|
||||||
|
for seg in self.segments:
|
||||||
|
if seg.label not in skip:
|
||||||
|
s += seg.score
|
||||||
|
return s
|
||||||
|
|
||||||
|
def llhs(self, skip=None):
|
||||||
|
if not skip:
|
||||||
|
skip = self.sils
|
||||||
|
return [seg.llh for seg in self.segments if hasattr(seg, "llh") and seg.label not in skip]
|
||||||
|
|
||||||
|
def llh(self, skip=None):
|
||||||
|
return sum(self.llhs(skip))
|
||||||
|
|
||||||
|
def minllh(self, skip=None):
|
||||||
|
llhs = self.llhs(skip)
|
||||||
|
if llhs:
|
||||||
|
return min(llhs)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def labels(self, skip=None):
|
||||||
|
if not skip:
|
||||||
|
skip = self.sils
|
||||||
|
return [seg.label for seg in self.segments if seg.label not in skip]
|
||||||
|
|
||||||
|
def sentence(self, skip=None):
|
||||||
|
return " ".join(self.labels(skip))
|
||||||
|
|
||||||
|
def export(self):
|
||||||
|
return [seg.export() for seg in self.segments]
|
4
novoapi_for_python3x/asr/spraaklab/__init__.py
Normal file
4
novoapi_for_python3x/asr/spraaklab/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
#import schema
|
||||||
|
from . import schema
|
273
novoapi_for_python3x/asr/spraaklab/schema.py
Normal file
273
novoapi_for_python3x/asr/spraaklab/schema.py
Normal file
@ -0,0 +1,273 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
## (c) 2017 NovoLanguage, author: David A. van Leeuwen
|
||||||
|
|
||||||
|
## The purpose of this to define the grammar structure in a json schema, so that it can be validated,
|
||||||
|
## (de)serialized, and perhaps even automatically converted to a Python class structure.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import jsonschema
|
||||||
|
|
||||||
|
grammar_schema_v10 = {
|
||||||
|
"$schema": "http://json-schema.org/schema#",
|
||||||
|
"title": "NovoLanguage grammar",
|
||||||
|
"description": "A grammar specification for the NovoLanguage Automatic Speech Recognition",
|
||||||
|
"$ref": "#/definitions/group",
|
||||||
|
"definitions": {
|
||||||
|
"phones": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
},
|
||||||
|
"pronunciation": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"phones": {
|
||||||
|
"$ref": "#/definitions/phones"
|
||||||
|
},
|
||||||
|
"syllables": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/definitions/syllable"
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "ID to distinguish this pronunciation from other variants"
|
||||||
|
},
|
||||||
|
"meta": {
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["phones"]
|
||||||
|
},
|
||||||
|
"syllable": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"begin": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"stress": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"tone": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["begin", "end"]
|
||||||
|
},
|
||||||
|
"word": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"kind": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["word"]
|
||||||
|
},
|
||||||
|
"label": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"pronunciation": {
|
||||||
|
"anyOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/definitions/pronunciation"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"anyOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/definitions/pronunciation"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/definitions/phones"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/definitions/phones"
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"syllables": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/definitions/syllable"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"graphemes": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "ID to distinguish this word from other words (with possibly the same label)"
|
||||||
|
},
|
||||||
|
"meta": {
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["label"]
|
||||||
|
},
|
||||||
|
"element": {
|
||||||
|
"title": "element",
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/definitions/word"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/definitions/group"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": ["string", "null"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"group": {
|
||||||
|
"title": "element group",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"kind": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["sequence", "alternatives", "order"]
|
||||||
|
},
|
||||||
|
"elements": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/definitions/element"
|
||||||
|
},
|
||||||
|
"minItems": 1,
|
||||||
|
},
|
||||||
|
"meta": {
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["kind", "elements"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
grammar_schema_v01 = {
|
||||||
|
"$schema": "http://json-schema.org/schema#",
|
||||||
|
"title": "NovoLanguage grammar v0.1",
|
||||||
|
"description": "A grammar specification for the NovoLanguage Automatic Speech Recognition",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["multiple_choice", "word_order"]
|
||||||
|
},
|
||||||
|
"parts": {
|
||||||
|
"type": "array",
|
||||||
|
"minItems": 1,
|
||||||
|
"maxItems": 5,
|
||||||
|
"items": {
|
||||||
|
"type": ["string", "array"],
|
||||||
|
"items": {
|
||||||
|
"type": ["string"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
grammar_rpc_schema = {
|
||||||
|
"$schema": "http://json-schema.org/schema#",
|
||||||
|
"title": "NovoLanguage RPC grammar",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["confusion_network"]
|
||||||
|
},
|
||||||
|
"version": {
|
||||||
|
"type": "string",
|
||||||
|
"default": "v0.1"
|
||||||
|
},
|
||||||
|
"data": {
|
||||||
|
"type": "object"
|
||||||
|
},
|
||||||
|
"return_dict": {
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"return_objects": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["dict", "grammar"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"phoneset": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["cmu69", "novo70", "mdbg115"]
|
||||||
|
},
|
||||||
|
"parallel_silence": {
|
||||||
|
"type": "boolean"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["type", "data"]
|
||||||
|
}
|
||||||
|
|
||||||
|
def validate(object, schema=grammar_schema_v10):
|
||||||
|
#if isinstance(object, basestring):
|
||||||
|
if isinstance(object, str):
|
||||||
|
object = json.loads(object)
|
||||||
|
if not isinstance(object, dict):
|
||||||
|
raise TypeError("Expected dict or json string")
|
||||||
|
try:
|
||||||
|
jsonschema.validate(object, schema)
|
||||||
|
except jsonschema.ValidationError:
|
||||||
|
return False
|
||||||
|
except Exception:
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
def validate_rpc_grammar(message):
|
||||||
|
"""validate an rpc grammar message"""
|
||||||
|
if not validate(message, grammar_rpc_schema):
|
||||||
|
raise ValueError("Not a valid RPC grammar")
|
||||||
|
version = message.get("version", "0.1")
|
||||||
|
data = message["data"]
|
||||||
|
if version == "0.1":
|
||||||
|
if not validate(data, grammar_schema_v01):
|
||||||
|
raise ValueError("Not a valid grammar v0.1")
|
||||||
|
elif version == "1.0":
|
||||||
|
if not validate(data, grammar_schema_v10):
|
||||||
|
raise ValueError("Not a valid grammar v1.0")
|
||||||
|
else:
|
||||||
|
raise ValueError("Unsupported schema version")
|
||||||
|
|
||||||
|
|
||||||
|
## test
|
||||||
|
def test(data=None):
|
||||||
|
if not data:
|
||||||
|
data = {"kind": "sequence", "elements": [
|
||||||
|
{"kind": "alternatives", "elements": ["a plain string", "an alternative string"]},
|
||||||
|
{"kind": "word", "label": "a word", "pronunciation": {"phones": ["ah", "w", "er", "d"]}},
|
||||||
|
{"kind": "order", "elements": [{"kind": "word", "label": "another word", "visible": False}, "last word"]}]}
|
||||||
|
try:
|
||||||
|
jsonschema.validate(data, grammar_schema_v10)
|
||||||
|
except jsonschema.ValidationError as e:
|
||||||
|
#print data, "validated not OK", e.message
|
||||||
|
print("{0} validated not OK {1}".format(data, e.message))
|
||||||
|
else:
|
||||||
|
#print data, "validated OK"
|
||||||
|
print("{0} validated OK".format(data))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test()
|
4
novoapi_for_python3x/backend/__init__.py
Normal file
4
novoapi_for_python3x/backend/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
#import session
|
||||||
|
from . import session
|
255
novoapi_for_python3x/backend/session.py
Normal file
255
novoapi_for_python3x/backend/session.py
Normal file
@ -0,0 +1,255 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# (c) 2015--2018 NovoLanguage, author: David A. van Leeuwen
|
||||||
|
|
||||||
|
## Recognition interface for actual backend. Adapted from player.asr.debug.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import wave
|
||||||
|
import requests
|
||||||
|
import websocket
|
||||||
|
import logging
|
||||||
|
import collections
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from .. import asr
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
## turn off annoying warnings
|
||||||
|
requests.packages.urllib3.disable_warnings()
|
||||||
|
logging.getLogger("requests.packages.urllib3.connectionpool").setLevel(logging.WARN)
|
||||||
|
|
||||||
|
buffer_size = 4096
|
||||||
|
gm = "gm.novolanguage.com" ## dev
|
||||||
|
protocol = "https"
|
||||||
|
port = 443
|
||||||
|
apiversion = 0
|
||||||
|
|
||||||
|
sessions = collections.Counter()
|
||||||
|
|
||||||
|
def segmentation(result):
|
||||||
|
"""converts a raw backend recognition result to a segment of novo.asr.segments class Segmentation"""
|
||||||
|
for w in result:
|
||||||
|
w["score"] = w["confidence"]["prob"]
|
||||||
|
w["llh"] = w["confidence"]["llr"]
|
||||||
|
w["label"] = w["label"]["raw"]
|
||||||
|
w["begin"] /= 10
|
||||||
|
w["end"] /= 10
|
||||||
|
for p in w["phones"]:
|
||||||
|
p["score"] = p["confidence"]["prob"]
|
||||||
|
p["llh"] = p["confidence"]["llr"]
|
||||||
|
p["begin"] /= 10
|
||||||
|
p["end"] /= 10
|
||||||
|
return asr.segments.Segmentation(result)
|
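## For orientation only: a hedged sketch of the per-word structure segmentation()
## expects from the backend. Field names are taken from the code above; the values
## and the word itself are invented.
##   {"label": {"raw": "reus"},
##    "confidence": {"prob": 0.93, "llr": -1.2},
##    "begin": 970, "end": 1280,            # backend time units, divided by 10 above
##    "phones": [{"label": "r",
##                "confidence": {"prob": 0.9, "llr": -0.8},
##                "begin": 970, "end": 1050}]}
## segmentation([...]) with such a dict returns an asr.segments.Segmentation of one word.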
||||||
|
|
||||||
|
class rpcid:
|
||||||
|
id = 0
|
||||||
|
@staticmethod
|
||||||
|
def next():
|
||||||
|
rpcid.id += 1
|
||||||
|
return rpcid.id
|
||||||
|
|
||||||
|
class Recognizer(object):
|
||||||
|
def __init__(self, lang="en", gm=gm, grammar_version="0.1", user=None, password=None, snodeid=None, keepopen=False):
|
||||||
|
self.lang = lang
|
||||||
|
self.keepopen = keepopen
|
||||||
|
self.api_url = "%s://%s:%d/v%d" % (protocol, gm, port, apiversion)
|
||||||
|
self.verify = False
|
||||||
|
self.headers = {"Content-Type": "application/json"}
|
||||||
|
self.login_user(user, password)
|
||||||
|
data = {"l2": lang, "local": False, "skipupload": True}
|
||||||
|
if snodeid:
|
||||||
|
data["snodeid"] = snodeid
|
||||||
|
self.conn = None
|
||||||
|
self.init_session(data)
|
||||||
|
self.grammar_version = grammar_version
|
||||||
|
self.last_message = None
|
||||||
|
|
||||||
|
def login_user(self, username, password):
|
||||||
|
# obtain authentication token of user
|
||||||
|
logger.info('obtain auth token at %s', self.api_url)
|
||||||
|
data = {
|
||||||
|
'username': username,
|
||||||
|
'password': password
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
r = requests.post(self.api_url + '/publishers/1/login', headers=self.headers, data=json.dumps(data), verify=self.verify)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Cannot post request to GM API for user login: %s", e.message)
|
||||||
|
sys.exit(-1)
|
||||||
|
assert r.ok, r.reason
|
||||||
|
result = r.json()
|
||||||
|
if "errors" in result["response"]:
|
||||||
|
logger.info("Error in logging in: %s", result["response"]["errors"])
|
||||||
|
sys.exit(-1)
|
||||||
|
|
||||||
|
user_auth_token = result['response']['user']['authentication_token']
|
||||||
|
logger.info("User auth token is: %s", user_auth_token)
|
||||||
|
|
||||||
|
# set auth token in header
|
||||||
|
self.headers['Authentication-Token'] = user_auth_token
|
||||||
|
|
||||||
|
def init_session(self, data, direct=False, use_ip=False):
|
||||||
|
logger.info('Request new session: %s', data)
|
||||||
|
r = requests.post(self.api_url + '/sessions', headers=self.headers, data=json.dumps(data), verify=self.verify)
|
||||||
|
if not r.ok:
|
||||||
|
logger.error("New session request failed: %s", r.text)
|
||||||
|
return
|
||||||
|
|
||||||
|
status_url = r.headers.get("location")
|
||||||
|
if status_url:
|
||||||
|
## we got a redirect
|
||||||
|
status = {}
|
||||||
|
while True:
|
||||||
|
logger.debug("Checking %s", status_url)
|
||||||
|
s = requests.get(status_url, verify=self.verify)
|
||||||
|
if not s.ok:
|
||||||
|
logger.error('Checking Failed: %s', s.text)
|
||||||
|
return
|
||||||
|
|
||||||
|
status = s.json()
|
||||||
|
if status['status'] == 'PENDING':
|
||||||
|
logger.debug("Status: %s", status['status'])
|
||||||
|
time.sleep(1)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
session = status['result'][0] ## [1] is another status code...
|
||||||
|
if "error" in session:
|
||||||
|
logger.error("Error in getting a snode: %s", session["error"])
|
||||||
|
raise Exception
|
||||||
|
else:
|
||||||
|
session = r.json()
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info("Session: %r", session)
|
||||||
|
if direct:
|
||||||
|
snode_ip = session["snode"]["ip"]
|
||||||
|
proxy_url = snode_ip
|
||||||
|
snode_port = session["port"]
|
||||||
|
ws_url = "%s://%s:%d/" % ("ws", snode_ip, snode_port)
|
||||||
|
else:
|
||||||
|
field = "ip" if use_ip else "hostname"
|
||||||
|
proxy_url = session['snode']['datacentre']['proxy'][field]
|
||||||
|
ws_url = 'wss://' + proxy_url + '/' + session['uuid']
|
||||||
|
logger.info("Connecting to websocket: %s", ws_url)
|
||||||
|
conn = websocket.create_connection(ws_url, sslopt={"check_hostname": self.verify})
|
||||||
|
logger.info("Connected.")
|
||||||
|
#except Exception, e:
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Unable to connect to websocket: %s", e.message)
|
||||||
|
raise e
|
||||||
|
|
||||||
|
self.session_id = session['id']
|
||||||
|
self.proxy_url = proxy_url
|
||||||
|
self.conn = conn
|
||||||
|
self.session = session
|
||||||
|
sessions[session["uuid"]] += 1
|
||||||
|
|
||||||
|
def setgrammar(self, grammar): ## backend grammar object: {"data": {...}, "type": "confusion_network"}
|
||||||
|
data = {"jsonrpc": "2.0",
|
||||||
|
'type': 'jsonrpc',
|
||||||
|
'method': 'set_grammar',
|
||||||
|
'params': grammar,
|
||||||
|
"id": rpcid.next()}
|
||||||
|
asr.spraaklab.schema.validate_rpc_grammar(grammar)
|
||||||
|
self.conn.send(json.dumps(data))
|
||||||
|
result = json.loads(self.conn.recv())
|
||||||
|
if result.get("error"):
|
||||||
|
logger.error("Exercise validation error: %s", result)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def set_alternatives_grammar(self, *args, **kwargs):
|
||||||
|
if not "version" in kwargs:
|
||||||
|
kwargs["version"] = self.grammar_version
|
||||||
|
return self.setgrammar(alternatives_grammar(*args, **kwargs))
|
||||||
|
|
||||||
|
def recognize_wav(self, wavf):
|
||||||
|
w = wave.open(wavf, 'r')
|
||||||
|
nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
|
||||||
|
if nchannels > 1:
|
||||||
|
logging.error("Please use .wav with only 1 channel, found %d channels in %s", nchannels, wavf)
|
||||||
|
return
|
||||||
|
if (sampwidth != 2):
|
||||||
|
logging.error("Please use .wav with 2-byte PCM data, found %d bytes in %s", sampwidth, wavf)
|
||||||
|
return
|
||||||
|
if (framerate != 16000.0):
|
||||||
|
logging.error("Please use .wav sampled at 16000 Hz, found %1.0f in %s", framerate, wavf)
|
||||||
|
return
|
||||||
|
if (comptype != 'NONE'):
|
||||||
|
logging.error("Please use .wav with uncompressed data, found %s in %s", compname, wavf)
|
||||||
|
return
|
||||||
|
buf = w.readframes(nframes)
|
||||||
|
w.close()
|
||||||
|
return self.recognize_data(buf)
|
||||||
|
|
||||||
|
def recognize_data(self, buf):
|
||||||
|
nbytes_sent = 0
|
||||||
|
start = time.time()
|
||||||
|
for j in range(0, len(buf), buffer_size):
|
||||||
|
#audio_packet = str(buf[j:j + buffer_size])
|
||||||
|
audio_packet = buf[j:j + buffer_size]
|
||||||
|
nbytes_sent += len(audio_packet)
|
||||||
|
self.conn.send_binary(audio_packet)
|
||||||
|
self.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
|
||||||
|
logger.info("Waiting for recognition result...")
|
||||||
|
self.last_message = self.conn.recv() ## keep result for the interested applications
|
||||||
|
message = json.loads(self.last_message)
|
||||||
|
dur = time.time() - start
|
||||||
|
logger.info("Recognition took %5.3f seconds", dur)
|
||||||
|
if "error" in message:
|
||||||
|
raise RuntimeError("Error from recognition backend: %r" % message.get("error"))
|
||||||
|
return segmentation(message["result"]["words"])
|
||||||
|
|
||||||
|
def recognize_url(self, url):
|
||||||
|
start = time.time()
|
||||||
|
data = json.dumps({"jsonrpc": "2.0", "method": "send_audio", "id": rpcid.next(), "params": {"type": "url", "data": url, "details": ["word", "utterance"]}})
|
||||||
|
self.conn.send(data)
|
||||||
|
logger.info("Waiting for recognition result...")
|
||||||
|
self.last_message = self.conn.recv() ## keep result for the interested applications
|
||||||
|
#print self.last_message
|
||||||
|
print(self.last_message)
|
||||||
|
message = json.loads(self.last_message)
|
||||||
|
dur = time.time() - start
|
||||||
|
logger.info("Recognition took %5.3f seconds", dur)
|
||||||
|
if "error" in message:
|
||||||
|
raise RuntimeError("Error from recognition backend: %r" % message.get("error"))
|
||||||
|
return segmentation(message["result"]["words"])
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
sessions[self.session["uuid"]] -= 1
|
||||||
|
if self.conn and sessions[self.session["uuid"]] <= 0:
|
||||||
|
self.conn.close()
|
||||||
|
url = self.api_url + '/sessions/%d' % self.session_id
|
||||||
|
if self.keepopen:
|
||||||
|
logger.info("Keeping session open...")
|
||||||
|
else:
|
||||||
|
logger.info("Closing session: %s", url)
|
||||||
|
r = requests.delete(url, headers=self.headers, verify=self.verify)
|
||||||
|
assert r.ok, r.reason
|
||||||
|
|
||||||
|
def alternatives_grammar(parts, version="0.1", ret=None):
|
||||||
|
"""Make a grammar of alternatives, as array(sequence)-of-array(alternatives)-of-strings"""
|
||||||
|
r = {"type": "confusion_network", "version": version}
|
||||||
|
if version=="0.1":
|
||||||
|
r["data"] = {"type": "multiple_choice", "parts": parts}
|
||||||
|
if isinstance(ret, list) and "dict" in ret:
|
||||||
|
r["return_dict"] = True
|
||||||
|
elif version=="1.0":
|
||||||
|
seqels = []
|
||||||
|
for part in parts:
|
||||||
|
altels = []
|
||||||
|
for alt in part:
|
||||||
|
words = alt.split(" ")
|
||||||
|
if len(words) > 1:
|
||||||
|
alt = {"kind": "sequence", "elements": words}
|
||||||
|
altels.append(alt)
|
||||||
|
seqels.append({"kind": "alternatives", "elements": altels})
|
||||||
|
r["data"] = {"kind": "sequence", "elements": seqels}
|
||||||
|
if isinstance(ret, list):
|
||||||
|
r["return_objects"] = ret
|
||||||
|
else:
|
||||||
|
raise ValueError("Unsupported version: %s" % version)
|
||||||
|
asr.spraaklab.schema.validate_rpc_grammar(r)
|
||||||
|
return r
|
64
novoapi_for_python3x/readme
Normal file
64
novoapi_for_python3x/readme
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
novoapi (https://bitbucket.org/novolanguage/python-novo-api) is written in Python 2.7.
|
||||||
|
To install it on Python 3.x, the following points should be modified.
|
||||||
|
- basestring --> str
|
||||||
|
- print xxx --> print('{}'.format(xxx)).
|
||||||
|
- import xxx --> from . import xxx
|
||||||
|
- except Exception, e --> except Exception as e
|
||||||
|
- remove tuple unpacking from the arguments of function definitions.
|
||||||
|
Concretely...
|
||||||
|
|
||||||
|
=== novoapi\backend\__init__.py
|
||||||
|
#import session
|
||||||
|
from . import session
|
||||||
|
|
||||||
|
|
||||||
|
=== novoapi\backend\session.py
|
||||||
|
#except Exception, e:
|
||||||
|
except Exception as e:
|
||||||
|
|
||||||
|
#print self.last_message
|
||||||
|
print(self.last_message)
|
||||||
|
|
||||||
|
|
||||||
|
=== novoapi\asr\__init__.py
|
||||||
|
#import segments
|
||||||
|
#import spraaklab
|
||||||
|
from . import segments
|
||||||
|
from . import spraaklab
|
||||||
|
|
||||||
|
|
||||||
|
=== novoapi\asr\segments\praat.py
|
||||||
|
#print_tier(output, "confidence", begin, end, confidences, ('%.3f', lambda x: x))
|
||||||
|
#print_tier(output, "words", begin, end, word_labels, ('%s', lambda x: x))
|
||||||
|
#print_tier(output, "phones", begin, end, phones, ('%s', lambda x: x))
|
||||||
|
print_tier(output, "confidence", begin, end, confidences, '%.3f', lambda x: x)
|
||||||
|
print_tier(output, "words", begin, end, word_labels, '%s', lambda x: x)
|
||||||
|
print_tier(output, "phones", begin, end, phones, '%s', lambda x: x)
|
||||||
|
|
||||||
|
|
||||||
|
=== novoapi\asr\spraaklab\__init__.py ===
|
||||||
|
#import schema
|
||||||
|
from . import schema
|
||||||
|
|
||||||
|
|
||||||
|
=== novoapi\asr\spraaklab\schema.py ===
|
||||||
|
#if isinstance(object, basestring):
|
||||||
|
if isinstance(object, str):
|
||||||
|
|
||||||
|
except jsonschema.ValidationError as e:
|
||||||
|
#print data, "validated not OK", e.message
|
||||||
|
print("{0} validated not OK {1}".format(data, e.message))
|
||||||
|
else:
|
||||||
|
#print data, "validated OK"
|
||||||
|
print("{0} validated OK".format(data))
|
||||||
|
|
||||||
|
|
||||||
|
Then, to make it work correctly, a few more modifications are needed.
|
||||||
|
When a wav file is read using the wave module, the output (named buf) is a str of bytes on Python 2.7, while it is a bytes object on Python 3.6.
|
||||||
|
Therefore...
|
||||||
|
|
||||||
|
=== novoapi\backend\session.py
|
||||||
|
#audio_packet = str(buf[j:j + buffer_size])
|
||||||
|
audio_packet = buf[j:j + buffer_size]
|
||||||
|
|
||||||
|
Also, because of this difference, Segment.__repr__ (novoapi\asr\segments\segments.py) does not work.
|
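A possible follow-up fix, not applied in this commit: under Python 3 self.label is already a str, so the encode step in Segment.__repr__ can simply be dropped (sketch):

=== novoapi\asr\segments\segments.py
#res += self.label.encode("utf8")
res += self.label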
25
novoapi_for_python3x/utils/json/__init__.py
Normal file
25
novoapi_for_python3x/utils/json/__init__.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
## from https://stackoverflow.com/questions/1447287/format-floats-with-standard-json-module
|
||||||
|
class PrettyFloat(float):
|
||||||
|
def __repr__(self):
|
||||||
|
return '%.15g' % self
|
||||||
|
|
||||||
|
def pretty_floats(obj):
|
||||||
|
if isinstance(obj, float):
|
||||||
|
return PrettyFloat(obj)
|
||||||
|
elif isinstance(obj, dict):
|
||||||
|
return dict((k, pretty_floats(v)) for k, v in obj.items())
|
||||||
|
elif isinstance(obj, (list, tuple)):
|
||||||
|
return list(map(pretty_floats, obj))
|
||||||
|
return obj
|
||||||
|
|
||||||
|
def rounded_floats(obj, ndigits=15):
|
||||||
|
if isinstance(obj, float):
|
||||||
|
return PrettyFloat(round(obj, ndigits))
|
||||||
|
elif isinstance(obj, dict):
|
||||||
|
return dict((k, rounded_floats(v, ndigits)) for k, v in obj.items())
|
||||||
|
elif isinstance(obj, (list, tuple)):
|
||||||
|
return list(map(lambda o: rounded_floats(o, ndigits), obj))
|
||||||
|
return obj
|
||||||
|
|
BIN
reus-test/check_novoapi.zip
Normal file
BIN
reus-test/check_novoapi.zip
Normal file
Binary file not shown.
119
reus-test/reus-test.py
Normal file
119
reus-test/reus-test.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
|
||||||
|
from novoapi.backend import session
|
||||||
|
|
||||||
|
p = argparse.ArgumentParser()
|
||||||
|
p.add_argument("--user", default='martijn.wieling')
|
||||||
|
p.add_argument("--password", default='xxxxx')
|
||||||
|
args = p.parse_args()
|
||||||
|
|
||||||
|
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True)
|
||||||
|
|
||||||
|
grammar = {
|
||||||
|
"type": "confusion_network",
|
||||||
|
"version": "1.0",
|
||||||
|
"data": {
|
||||||
|
"kind": "sequence",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"kind": "word",
|
||||||
|
"pronunciation": [
|
||||||
|
{
|
||||||
|
"phones": [
|
||||||
|
"r",
|
||||||
|
"eu0",
|
||||||
|
"s"
|
||||||
|
],
|
||||||
|
"id": 0
|
||||||
|
}
|
||||||
|
,
|
||||||
|
{
|
||||||
|
"phones": [
|
||||||
|
"m",
|
||||||
|
"a0",
|
||||||
|
"n"
|
||||||
|
],
|
||||||
|
"id": 1
|
||||||
|
}
|
||||||
|
,
|
||||||
|
{
|
||||||
|
"phones": [
|
||||||
|
"m",
|
||||||
|
"a0",
|
||||||
|
"n",
|
||||||
|
"t",
|
||||||
|
"s",
|
||||||
|
"y",
|
||||||
|
"ax"
|
||||||
|
],
|
||||||
|
"id": 2
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"label": "reus"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"return_objects": [
|
||||||
|
"grammar"
|
||||||
|
],
|
||||||
|
"phoneset": "novo70"
|
||||||
|
}
|
||||||
|
|
||||||
|
res = rec.setgrammar(grammar)
|
||||||
|
#print "Set grammar result", res
|
||||||
|
|
||||||
|
|
||||||
|
## === novoapi/backend/session.py ===
|
||||||
|
#import wave
|
||||||
|
#import time
|
||||||
|
#from novoapi.backend.session import rpcid, segmentation
|
||||||
|
|
||||||
|
#wavf = "reus1008-reus.wav"
|
||||||
|
#w = wave.open(wavf, 'r')
|
||||||
|
#nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
|
||||||
|
#buf = w.readframes(nframes)
|
||||||
|
#w.close()
|
||||||
|
|
||||||
|
#buffer_size = 4096
|
||||||
|
#nbytes_sent = 0
|
||||||
|
#start = time.time()
|
||||||
|
#for j in range(0, len(buf), buffer_size):
|
||||||
|
# audio_packet = buf[j:j + buffer_size]
|
||||||
|
# nbytes_sent += len(audio_packet)
|
||||||
|
# rec.conn.send_binary(audio_packet)
|
||||||
|
#rec.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
|
||||||
|
#print(rpcid.next())
|
||||||
|
#rec.last_message = rec.conn.recv()
|
||||||
|
#message = json.loads(rec.last_message)
|
||||||
|
#result = session.segmentation(message["result"]["words"])
|
||||||
|
#result.export()
|
||||||
|
## ====================================
|
||||||
|
|
||||||
|
def result2pronunciation(result, word):
|
||||||
|
#result_ = res.export()[1]
|
||||||
|
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
|
||||||
|
llh = result_[0]['llh']
|
||||||
|
phones = result_[0]['phones']
|
||||||
|
pronunciation = [phone['label'] for phone in phones]
|
||||||
|
return pronunciation, llh
|
||||||
|
|
||||||
|
|
||||||
|
res = rec.recognize_wav("reus1008-reus.wav")
|
||||||
|
#print "\n\n\nThe pronounced word in reus1008-reus.wav is: REUS\n\n"
|
||||||
|
#print "Recognition result:", json.dumps(res.export(), indent=4)
|
||||||
|
result2pronunciation(res.export(), 'reus')
|
||||||
|
|
||||||
|
#print "\n\n\nThe pronounced word in reus1167-man.wav is: MAN\n\n"
|
||||||
|
res2 = rec.recognize_wav("reus1167-man.wav")
|
||||||
|
#print "Recognition result:", json.dumps(res2.export(), indent=4)
|
||||||
|
result2pronunciation(res2.export(), 'reus')
|
||||||
|
|
||||||
|
#print "\n\n\nThe pronounced word in reus3768-mantsje.wav is: MANTSJE\n\n"
|
||||||
|
res3 = rec.recognize_wav("reus3768-mantsje.wav")
|
||||||
|
#print "Recognition result:", json.dumps(res3.export(), indent=4)
|
||||||
|
result2pronunciation(res3.export(), 'reus')
|
3
reus-test/reus1008-reus.dic
Normal file
3
reus-test/reus1008-reus.dic
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
REUS r eu s
|
||||||
|
REUS m ac n
|
||||||
|
REUS m ac n t s j @
|
1
reus-test/reus1008-reus.lab
Normal file
1
reus-test/reus1008-reus.lab
Normal file
@ -0,0 +1 @@
|
|||||||
|
REUS
|
6
reus-test/reus1008-reus.txt
Normal file
6
reus-test/reus1008-reus.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#!MLF!#
|
||||||
|
"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus1008-reus.rec"
|
||||||
|
0 9700000 r -12463.852539 REUS
|
||||||
|
9700000 12800000 eu -3622.108887
|
||||||
|
12800000 26250001 s -17303.216797
|
||||||
|
.
|
BIN
reus-test/reus1008-reus.wav
Normal file
BIN
reus-test/reus1008-reus.wav
Normal file
Binary file not shown.
3
reus-test/reus1167-man.dic
Normal file
3
reus-test/reus1167-man.dic
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
REUS r eu s
|
||||||
|
REUS m ac n
|
||||||
|
REUS m ac n t s j @
|
1
reus-test/reus1167-man.lab
Normal file
1
reus-test/reus1167-man.lab
Normal file
@ -0,0 +1 @@
|
|||||||
|
REUS
|
10
reus-test/reus1167-man.txt
Normal file
10
reus-test/reus1167-man.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#!MLF!#
|
||||||
|
"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus1167-man.rec"
|
||||||
|
0 150000 m -230.057571 REUS
|
||||||
|
150000 300000 ac -250.994858
|
||||||
|
300000 450000 n -202.377716
|
||||||
|
450000 4600000 t -5128.984375
|
||||||
|
4600000 5050000 s -711.338501
|
||||||
|
5050000 5450000 j -564.730591
|
||||||
|
5450000 16049999 @ -13249.787109
|
||||||
|
.
|
BIN
reus-test/reus1167-man.wav
Normal file
BIN
reus-test/reus1167-man.wav
Normal file
Binary file not shown.
3
reus-test/reus3768-mantsje.dic
Normal file
3
reus-test/reus3768-mantsje.dic
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
REUS r eu s
|
||||||
|
REUS m ac n
|
||||||
|
REUS m ac n t s j @
|
1
reus-test/reus3768-mantsje.lab
Normal file
1
reus-test/reus3768-mantsje.lab
Normal file
@ -0,0 +1 @@
|
|||||||
|
REUS
|
10
reus-test/reus3768-mantsje.txt
Normal file
10
reus-test/reus3768-mantsje.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#!MLF!#
|
||||||
|
"c:/Users/Aki/source/repos/acoustic_model/reus-test/reus3768-mantsje.rec"
|
||||||
|
0 150000 m -217.347229 REUS
|
||||||
|
150000 1150000 ac -1266.293579
|
||||||
|
1150000 1650000 n -583.382568
|
||||||
|
1650000 11100000 t -11259.270508
|
||||||
|
11100000 11250000 s -247.939255
|
||||||
|
11250000 11550000 j -445.511444
|
||||||
|
11550000 24150000 @ -16769.048828
|
||||||
|
.
|
BIN
reus-test/reus3768-mantsje.wav
Normal file
BIN
reus-test/reus3768-mantsje.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_100_jko5r.wav
Normal file
BIN
rozen-test/pg_rozen_100_jko5r.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_113_o9kzs.wav
Normal file
BIN
rozen-test/pg_rozen_113_o9kzs.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_1296_zbve2.wav
Normal file
BIN
rozen-test/pg_rozen_1296_zbve2.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_1709_kq9xr.wav
Normal file
BIN
rozen-test/pg_rozen_1709_kq9xr.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_241_bahqi.wav
Normal file
BIN
rozen-test/pg_rozen_241_bahqi.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_5502_q79fd.wav
Normal file
BIN
rozen-test/pg_rozen_5502_q79fd.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_632_2m04y.wav
Normal file
BIN
rozen-test/pg_rozen_632_2m04y.wav
Normal file
Binary file not shown.
BIN
rozen-test/pg_rozen_911_1zvda.wav
Normal file
BIN
rozen-test/pg_rozen_911_1zvda.wav
Normal file
Binary file not shown.
119
rozen-test/rozen-test.py
Normal file
119
rozen-test/rozen-test.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
import os
|
||||||
|
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
|
||||||
|
from novoapi.backend import session
|
||||||
|
|
||||||
|
p = argparse.ArgumentParser()
|
||||||
|
p.add_argument("--user", default='martijn.wieling')
|
||||||
|
p.add_argument("--password", default='xxxxx')
|
||||||
|
args = p.parse_args()
|
||||||
|
|
||||||
|
rec = session.Recognizer(grammar_version="1.0", lang="nl", snodeid=101, user=args.user, password=args.password, keepopen=True)
|
||||||
|
|
||||||
|
grammar = {
|
||||||
|
"type": "confusion_network",
|
||||||
|
"version": "1.0",
|
||||||
|
"data": {
|
||||||
|
"kind": "sequence",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"kind": "word",
|
||||||
|
"pronunciation": [
|
||||||
|
{
|
||||||
|
"phones": [
|
||||||
|
"r",
|
||||||
|
"eu0",
|
||||||
|
"s"
|
||||||
|
],
|
||||||
|
"id": 0
|
||||||
|
}
|
||||||
|
,
|
||||||
|
{
|
||||||
|
"phones": [
|
||||||
|
"m",
|
||||||
|
"a0",
|
||||||
|
"n"
|
||||||
|
],
|
||||||
|
"id": 1
|
||||||
|
}
|
||||||
|
,
|
||||||
|
{
|
||||||
|
"phones": [
|
||||||
|
"m",
|
||||||
|
"a0",
|
||||||
|
"n",
|
||||||
|
"t",
|
||||||
|
"s",
|
||||||
|
"y",
|
||||||
|
"ax"
|
||||||
|
],
|
||||||
|
"id": 2
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"label": "reus"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"return_objects": [
|
||||||
|
"grammar"
|
||||||
|
],
|
||||||
|
"phoneset": "novo70"
|
||||||
|
}
|
||||||
|
|
||||||
|
res = rec.setgrammar(grammar)
|
||||||
|
#print "Set grammar result", res
|
||||||
|
|
||||||
|
|
||||||
|
## === novoapi/backend/session.py ===
|
||||||
|
#import wave
|
||||||
|
#import time
|
||||||
|
#from novoapi.backend.session import rpcid, segmentation
|
||||||
|
|
||||||
|
#wavf = "reus1008-reus.wav"
|
||||||
|
#w = wave.open(wavf, 'r')
|
||||||
|
#nchannels, sampwidth, framerate, nframes, comptype, compname = w.getparams()
|
||||||
|
#buf = w.readframes(nframes)
|
||||||
|
#w.close()
|
||||||
|
|
||||||
|
#buffer_size = 4096
|
||||||
|
#nbytes_sent = 0
|
||||||
|
#start = time.time()
|
||||||
|
#for j in range(0, len(buf), buffer_size):
|
||||||
|
# audio_packet = buf[j:j + buffer_size]
|
||||||
|
# nbytes_sent += len(audio_packet)
|
||||||
|
# rec.conn.send_binary(audio_packet)
|
||||||
|
#rec.conn.send(json.dumps({"jsonrpc": "2.0", "method": "get_result", "id": rpcid.next()}))
|
||||||
|
#print(rpcid.next())
|
||||||
|
#rec.last_message = rec.conn.recv()
|
||||||
|
#message = json.loads(rec.last_message)
|
||||||
|
#result = session.segmentation(message["result"]["words"])
|
||||||
|
#result.export()
|
||||||
|
## ====================================
|
||||||
|
|
||||||
|
def result2pronunciation(result, word):
|
||||||
|
#result_ = res.export()[1]
|
||||||
|
result_ = [result[i] for i in range(len(result)) if result[i]['label'] == word]
|
||||||
|
llh = result_[0]['llh']
|
||||||
|
phones = result_[0]['phones']
|
||||||
|
pronunciation = [phone['label'] for phone in phones]
|
||||||
|
return pronunciation, llh
|
||||||
|
|
||||||
|
|
||||||
|
res = rec.recognize_wav("reus1008-reus.wav")
|
||||||
|
#print "\n\n\nThe pronounced word in reus1008-reus.wav is: REUS\n\n"
|
||||||
|
#print "Recognition result:", json.dumps(res.export(), indent=4)
|
||||||
|
result2pronunciation(res.export(), 'reus')
|
||||||
|
|
||||||
|
#print "\n\n\nThe pronounced word in reus1167-man.wav is: MAN\n\n"
|
||||||
|
res2 = rec.recognize_wav("reus1167-man.wav")
|
||||||
|
#print "Recognition result:", json.dumps(res2.export(), indent=4)
|
||||||
|
result2pronunciation(res2.export(), 'reus')
|
||||||
|
|
||||||
|
#print "\n\n\nThe pronounced word in reus3768-mantsje.wav is: MANTSJE\n\n"
|
||||||
|
res3 = rec.recognize_wav("reus3768-mantsje.wav")
|
||||||
|
#print "Recognition result:", json.dumps(res3.export(), indent=4)
|
||||||
|
result2pronunciation(res3.export(), 'reus')
|