Compare commits

..

No commits in common. "87abbbb95aeb0b72cd5b553cbec8e0901d472e4e" and "7844a56281e4f11df12e52b856512d7c30fe5f31" have entirely different histories.

11 changed files with 118 additions and 344 deletions

Binary file not shown.

Binary file not shown.

BIN
_tmp/translation_key.npy Normal file

Binary file not shown.

View File

@ -10,6 +10,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\forced_alignment\forced_alignment\__init__.py = ..\forced_alignment\forced_alignment\__init__.py
..\forced_alignment\forced_alignment\convert_phone_set.py = ..\forced_alignment\forced_alignment\convert_phone_set.py
..\toolbox\evaluation.py = ..\toolbox\evaluation.py
..\toolbox\toolbox\file_handling.py = ..\toolbox\toolbox\file_handling.py
..\forced_alignment\forced_alignment\htk_dict.py = ..\forced_alignment\forced_alignment\htk_dict.py
..\forced_alignment\forced_alignment\lexicon.py = ..\forced_alignment\forced_alignment\lexicon.py
..\forced_alignment\forced_alignment\mlf.py = ..\forced_alignment\forced_alignment\mlf.py
@ -22,7 +23,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\forced_alignment\forced_alignment\test_environment.py = ..\forced_alignment\forced_alignment\test_environment.py
EndProjectSection
EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "toolbox", "..\toolbox\toolbox.pyproj", "{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}"
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "pyhtk", "..\pyhtk\pyhtk\pyhtk.pyproj", "{75FCEFAF-9397-43FC-8189-DE97ADB77AA5}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -32,8 +33,8 @@ Global
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{4D8C8573-32F0-4A62-9E62-3CE5CC680390}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F0D46C9C-51C6-4989-8A2F-35F2A0C048BE}.Release|Any CPU.ActiveCfg = Release|Any CPU
{75FCEFAF-9397-43FC-8189-DE97ADB77AA5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{75FCEFAF-9397-43FC-8189-DE97ADB77AA5}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View File

@ -23,18 +23,12 @@
</PropertyGroup>
<ItemGroup>
<Compile Include="check_novoapi.py" />
<Compile Include="convert_phone_set.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="convert_xsampa2ipa.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="defaultfiles.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="fame_phoneset.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="fa_test.py">
<SubType>Code</SubType>
</Compile>

View File

@ -1,29 +0,0 @@
"""Module to convert phonemes."""
def multi_character_tokenize(line, multi_character_tokens):
    """Split ``line`` into tokens, preferring the given multi-character tokens.

    At each position the tokens in ``multi_character_tokens`` are tried in the
    order given; the first one that matches is yielded and consumed. When none
    of them matches, a single character is yielded instead. Callers that want
    the longest match to win must therefore pass the tokens longest first.

    Args:
        line (str): the string to be tokenized.
        multi_character_tokens (iterable of str): candidate tokens in priority
            order. Empty tokens are ignored.

    Yields:
        str: the next token of ``line``.
    """
    # Drop empty tokens once instead of re-checking them at every position.
    # Materialising also makes a one-shot iterable safe to pass in.
    tokens = tuple(token for token in multi_character_tokens if token)
    position = 0
    length = len(line)
    # Advance an index instead of re-slicing the remainder after every token,
    # which copied the tail of the string each time.
    while position < length:
        for token in tokens:
            if line.startswith(token, position):
                yield token
                position += len(token)
                break
        else:
            # No multi-character token starts here: emit one character.
            yield line[position]
            position += 1
def split_word(word, multi_character_phones):
    """Split a word into the phones of a given phoneset.

    Leading and trailing whitespace of ``word`` is removed before splitting.

    Args:
        word (str): a word written in the given phoneset.
        multi_character_phones (list): the multi-character phones that are to
            be treated as one phone each. They are tried in the order given,
            so pass them longest first. Such a list can be obtained from a
            phoneset definition such as fame_phoneset.py.

    Returns:
        list: the phones of ``word`` in order of appearance.
    """
    return list(multi_character_tokenize(word.strip(), multi_character_phones))

View File

@ -4,8 +4,7 @@ import os
#cygwin_dir = r'C:\cygwin64\home\Aki\acoustic_model'
#htk_dir = r'C:\Aki\htk_fame'
htk_dir = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk'
htk_dir = r'C:\Aki\htk_fame'
config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
#config_train = os.path.join(cygwin_dir, 'config', 'config.train')
@ -29,21 +28,22 @@ config_hcopy = os.path.join(htk_dir, 'config', 'config.HCopy')
#filePhoneList = config['pyHTK']['filePhoneList']
#AcousticModel = config['pyHTK']['AcousticModel']
repo_dir = r'C:\Users\Aki\source\repos'
repo_dir = r'C:\Users\A.Kunikoshi\source\repos'
ipa_xsampa_converter_dir = os.path.join(repo_dir, 'ipa-xsama-converter')
forced_alignment_module_dir = os.path.join(repo_dir, 'forced_alignment')
accent_classification_dir = os.path.join(repo_dir, 'accent_classification', 'accent_classification')
toolbox_dir = os.path.join(repo_dir, 'toolbox')
pyhtk_dir = os.path.join(repo_dir, 'pyhtk', 'pyhtk')
toolbox_dir = os.path.join(repo_dir, 'toolbox', 'toolbox')
#htk_config_dir = r'c:\Users\A.Kunikoshi\source\repos\forced_alignment\forced_alignment\data\htk\preset_models\aki_dutch_2017'
#config_hvite = os.path.join(htk_config_dir, 'config.HVite')
htk_config_dir = r'c:\Users\A.Kunikoshi\source\repos\forced_alignment\forced_alignment\data\htk\preset_models\aki_dutch_2017'
config_hvite = os.path.join(htk_config_dir, 'config.HVite')
#acoustic_model = os.path.join(htk_config_dir, 'hmmdefs.compo')
#acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
#phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
acoustic_model = r'c:\cygwin64\home\A.Kunikoshi\acoustic_model\model\barbara\hmm128-2\hmmdefs.compo'
phonelist_txt = os.path.join(htk_config_dir, 'phonelist.txt')
WSL_dir = r'C:\OneDrive\WSL'
#fame_dir = os.path.join(WSL_dir, 'kaldi-trunk', 'egs', 'fame')
fame_dir = r'd:\_corpus\fame'
fame_dir = r'f:\_corpus\fame'
fame_s5_dir = os.path.join(fame_dir, 's5')
fame_corpus_dir = os.path.join(fame_dir, 'corpus')

View File

@ -1,5 +1,5 @@
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
os.chdir(r'C:\Users\A.Kunikoshi\source\repos\acoustic_model\acoustic_model')
import sys
from collections import Counter
@ -9,8 +9,24 @@ import numpy as np
import pandas as pd
import defaultfiles as default
import fame_phoneset
import convert_phone_set
#sys.path.append(default.forced_alignment_module_dir)
#from forced_alignment import convert_phone_set
#def find_phone(lexicon_file, phone):
# """ Search where the phone is used in the lexicon. """
# with open(lexicon_file, "rt", encoding="utf-8") as fin:
# lines = fin.read()
# lines = lines.split('\n')
# extracted = []
# for line in lines:
# line = line.split('\t')
# if len(line) > 1:
# pronunciation = line[1]
# if phone in pronunciation:
# extracted.append(line)
# return extracted
#def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
@ -110,6 +126,25 @@ import convert_phone_set
# return ipa
def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
    """Make a script file for HCopy using the filelist in the FAME! corpus.

    Reads ``<fame_dir>/fame/filelists/<dataset>list.txt`` and writes one line
    per entry to ``hcopy_scp`` in the form ``<wav path>\\t<mfc path>``.

    Args:
        fame_dir (path): the directory of the FAME! corpus.
        dataset (str): name of the dataset; it selects both the filelist
            (``<dataset>list.txt``) and the wav sub-directory.
        feature_dir (path): the directory used for the ``.mfc`` paths.
        hcopy_scp (path): the script file to be written.
    """
    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
    wav_dir = os.path.join(fame_dir, 'fame', 'wav', dataset)

    # Stream the filelist line by line instead of loading it as a whole.
    with open(filelist_txt) as fin, open(hcopy_scp, 'w') as fout:
        for line in fin:
            # Strip surrounding whitespace (trailing blanks, stray '\r') so it
            # cannot leak into the generated paths.
            filename = line.strip().replace('.TextGrid', '')
            if len(filename) > 3:  # remove '.', '..' and ''
                wav_file = os.path.join(wav_dir, filename + '.wav')
                mfc_file = os.path.join(feature_dir, filename + '.mfc')
                fout.write(wav_file + '\t' + mfc_file + '\n')
#def make_filelist(input_dir, output_txt):
# """ Make a list of files in the input_dir. """
# filenames = os.listdir(input_dir)
@ -154,147 +189,64 @@ import convert_phone_set
# f.write('{0}\t{1}\n'.format(WORD, key))
def make_hcopy_scp_from_filelist_in_fame(fame_dir, dataset, feature_dir, hcopy_scp):
    """ Make a script file for HCopy using the filelist in FAME! corpus.

    Each entry of ``<fame_dir>/fame/filelists/<dataset>list.txt`` becomes one
    line ``<wav path>\\t<mfc path>`` in ``hcopy_scp``.

    Args:
        fame_dir (path): the directory of FAME corpus.
        dataset (str): 'devel', 'test' or 'train'.
        feature_dir (path): the directory where feature will be stored.
        hcopy_scp (path): a script file for HCopy to be made.

    """
    filelist_txt = os.path.join(fame_dir, 'fame', 'filelists', dataset + 'list.txt')
    wav_dir = os.path.join(fame_dir, 'fame', 'wav', dataset)

    # Read the filelist lazily; there is no need to hold all of it in memory.
    with open(filelist_txt) as fin, open(hcopy_scp, 'w') as fout:
        for raw_line in fin:
            # Surrounding whitespace (trailing blanks, stray '\r') must not
            # end up inside the paths written for HCopy.
            filename = raw_line.strip().replace('.TextGrid', '')
            if len(filename) > 3:  # remove '.', '..' and ''
                wav_file = os.path.join(wav_dir, filename + '.wav')
                mfc_file = os.path.join(feature_dir, filename + '.mfc')
                fout.write(wav_file + '\t' + mfc_file + '\n')
def load_lexicon(lexicon_file):
    """ load lexicon file as Data Frame.

    Args:
        lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation',
            encoded in UTF-8 and without a header line.

    Returns:
        lex (df): lexicon as Data Frame, which has columns 'word' and 'pronunciation'.
            All values are kept as plain strings.

    """
    lex = pd.read_csv(
        lexicon_file,
        delimiter='\t',
        header=None,
        encoding="utf-8",
        # A lexicon is pure text: without these two options pandas turns
        # words such as 'nan', 'null' or 'NA' into float NaN and may infer
        # numeric dtypes, which breaks later string operations.
        dtype=str,
        keep_default_na=False,
    )
    return lex.rename(columns={0: 'word', 1: 'pronunciation'})
def get_phoneset_from_lexicon(lexicon_file, phoneset='asr'):
""" Make a list of phones which appears in the lexicon.
def get_phonelist(lexicon_asr):
""" Make a list of phones which appears in the lexicon. """
Args:
lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
phoneset (str): the phoneset with which lexicon_file is written. 'asr'(default) or 'ipa'.
Returns:
(list_of_phones) (set): the set of phones included in the lexicon_file.
"""
assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''
lex = load_lexicon(lexicon_file)
if phoneset == 'asr':
#with open(lexicon_file, "rt", encoding="utf-8") as fin:
# lines = fin.read()
# lines = lines.split('\n')
# phonelist = set([])
# for line in lines:
# line = line.split('\t')
# if len(line) > 1:
# pronunciation = set(line[1].split())
# phonelist = phonelist | pronunciation
lex = load_lexicon(lexicon_asr)
return set(' '.join(lex['pronunciation']).split(' '))
elif phoneset == 'ipa':
join_pronunciations = ''.join(lex['pronunciation'])
return set(convert_phone_set.split_word(join_pronunciations, fame_phoneset.multi_character_phones_ipa))
import time
def extract_unknown_phones(ipa, known_phones):
"""extract unknown phones in the pronunciation written in IPA.
timer_start = time.time()
Args:
ipa (str): a pronunciation written in IPA.
known_phones (list): list of phones already know.
#def get_translation_key():
dir_tmp = r'c:\Users\A.Kunikoshi\source\repos\acoustic_model\_tmp'
lexicon_ipa = r'f:\_corpus\FAME\lexicon\lex.ipa'
lexicon_asr = r'f:\_corpus\FAME\lexicon\lex.asr'
Returns:
(list_of_phones) (list): unknown phones not included in 'known_phones'.
"""
ipa_split = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
return [i for i in ipa_split if not i in known_phones]
def get_translation_key(lexicon_file_ipa, lexicon_file_asr):
""" get correspondence between lexicon_file_ipa and lexicon_file_asr.
Args:
lexicon_file_ipa (path): lexicon in the format of 'word' /t 'pronunciation (IPA)'.
lexicon_file_asr (path): lexicon in the format of 'word' /t 'pronunciation (asr)'.
the each character of 'pronunciation' should be delimited by ' '.
Returns:
translation_key (dict): translation key from ipa to asr.
(phone_unknown) (list): the list of IPA phones, which does not appear in lexicon_file_asr.
"""
lex_ipa = load_lexicon(lexicon_file_ipa)
lex_asr = load_lexicon(lexicon_file_asr)
phone_unknown = fame_phoneset.phoneset_ipa[:]
lex_ipa = load_lexicon(lexicon_ipa)
lex_asr = load_lexicon(lexicon_asr)
if 0:
phone_to_be_searched = get_phonelist(lexicon_asr)
translation_key = dict()
for word in lex_ipa['word']:
if np.sum(lex_ipa['word'] == word) == 1 and np.sum(lex_asr['word'] == word) == 1:
ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
for word in lex_asr['word']:
if np.sum(lex_asr['word'] == word) == 1 and np.sum(lex_ipa['word'] == word) == 1:
asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
asr_list = asr.split(' ')
# if there are phones which is not in phone_unknown
#if len([True for i in asr_list if i in phone_unknown]) > 0:
if(len(ipa_list) == len(asr_list)):
print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
for ipa_, asr_ in zip(ipa_list, asr_list):
if ipa_ in phone_unknown:
# if there are phones which is not in phone_to_be_searched
if len([True for i in asr_list if i in phone_to_be_searched]) > 0:
if(len(ipa) == len(asr_list)):
print("{0}: {1} --> {2}".format(word, ipa, asr))
for ipa_, asr_ in zip(ipa, asr_list):
if asr_ in phone_to_be_searched:
#if not translation_key[ipa_] == asr_:
translation_key[ipa_] = asr_
phone_unknown.remove(ipa_)
return translation_key, list(phone_unknown)
phone_to_be_searched.remove(asr_)
print("elapsed time: {}".format(time.time() - timer_start))
def find_phone(lexicon_file, phone, phoneset='ipa'):
    """ extract rows where the phone is used in the lexicon_file.

    Args:
        lexicon_file (path): lexicon in the format of 'word' /t 'pronunciation'.
        phone (str): the phone to be searched.
        phoneset (str): the phoneset with which lexicon_file is written. 'asr' or 'ipa'(default).

    Returns:
        extracted (df): rows where the phone is used. The column 'word' holds
            the word and 'pronunciation' holds the pronunciation split into a
            list of phones.

    """
    assert phoneset in ['asr', 'ipa'], 'phoneset should be \'asr\' or \'ipa\''

    lex = load_lexicon(lexicon_file)

    # to reduce the calculation time, only target rows which include 'phone' at least once.
    # The phone is matched literally: phones such as 'i.' contain characters
    # that would act as wildcards if the phone were used as a regular expression.
    candidates = lex[lex['pronunciation'].str.contains(phone, regex=False, na=False)]

    # Collect the hits in a plain list and build the DataFrame once at the
    # end; DataFrame.append is gone from recent pandas and copied the whole
    # frame on every call.
    rows = []
    for _, row in candidates.iterrows():
        if phoneset == 'ipa':
            pronunciation = convert_phone_set.split_word(row['pronunciation'], fame_phoneset.multi_character_phones_ipa)
        else:
            # NOTE(review): assumes an 'asr' pronunciation is a space-delimited
            # phone sequence, as the asr lexicon is treated elsewhere in this
            # module - confirm.
            pronunciation = row['pronunciation'].split(' ')
        # The substring pre-filter can over-match (e.g. 'i' inside 'i:'), so
        # confirm the phone is present as a whole token.
        if phone in pronunciation:
            rows.append([row['word'], pronunciation])
    return pd.DataFrame(rows, columns=['word', 'pronunciation'])
np.save(os.path.join(dir_tmp, 'translation_key.npy'), translation_key)
np.save(os.path.join(dir_tmp, 'phone_to_be_searched.npy'), phone_to_be_searched)
else:
translation_key = np.load(os.path.join(dir_tmp, 'translation_key.npy')).item()
phone_to_be_searched = np.load(os.path.join(dir_tmp, 'phone_to_be_searched.npy')).item()

View File

@ -1,21 +1,21 @@
import sys
import os
os.chdir(r'C:\Users\Aki\source\repos\acoustic_model\acoustic_model')
os.chdir(r'C:\Users\A.Kunikoshi\source\repos\acoustic_model\acoustic_model')
import tempfile
#import configparser
#import subprocess
#from collections import Counter
import time
#import numpy as np
#import pandas as pd
import fame_functions
import defaultfiles as default
sys.path.append(default.pyhtk_dir)
import pyhtk
sys.path.append(default.toolbox_dir)
import file_handling as fh
from htk import pyhtk
import file_handling
## ======================= user define =======================
@ -28,8 +28,8 @@ from htk import pyhtk
dataset_list = ['devel', 'test', 'train']
# procedure
extract_features = 0
conv_lexicon = 1
extract_features = 1
#conv_lexicon = 0
#check_lexicon = 0
#make_mlf = 0
#combine_files = 0
@ -85,12 +85,14 @@ if not os.path.exists(tmp_dir):
## ======================= extract features =======================
if extract_features:
for dataset in dataset_list:
#for dataset in ['test']:
print('==== {} ===='.format(dataset))
# a script file for HCopy
print(">>> making a script file for HCopy... \n")
hcopy_scp = tempfile.NamedTemporaryFile(mode='w', delete=False)
hcopy_scp.close()
#hcopy_scp = os.path.join(default.htk_dir, 'tmp', 'HCopy.scp')
# get a list of features (hcopy.scp) from the filelist in FAME! corpus
feature_dir_ = os.path.join(feature_dir, dataset)
@ -100,71 +102,32 @@ if extract_features:
# extract features
print(">>> extracting features... \n")
fame_functions.make_hcopy_scp_from_filelist_in_fame(default.fame_dir, dataset, feature_dir_, hcopy_scp.name)
#subprocessStr = 'HCopy -C ' + config_hcopy + ' -S ' + hcopy_scp.name
#subprocess.call(subprocessStr, shell=True)
pyhtk.wav2mfc(default.config_hcopy, hcopy_scp.name)
# a script file for HCompV
print(">>> making a script file for HCompV... \n")
## ======================= make a list of features =======================
#if make_feature_list:
# print("==== make a list of features ====\n")
# for dataset in dataset_list:
# print(dataset)
#feature_dir = output_dir + '\\mfc\\' + dataset
hcompv_scp = os.path.join(tmp_dir, dataset + '.scp')
fh.make_filelist(feature_dir_, hcompv_scp, '.mfc')
#am_func.make_filelist(feature_dir, hcompv_scp)
file_handling.make_filelist(feature_dir_, hcompv_scp, '.mfc')
## ======================= convert lexicon from ipa to fame_htk =======================
if conv_lexicon:
print('==== convert lexicon from ipa 2 fame ====\n')
#dir_out = r'c:\Users\Aki\source\repos\acoustic_model\_tmp'
lexicon_dir = os.path.join(default.fame_dir, 'lexicon')
lexicon_ipa = os.path.join(lexicon_dir, 'lex.ipa')
lexicon_asr = os.path.join(lexicon_dir, 'lex.asr')
# get the correspondence between lex_ipa and lex_asr.
lex_asr = fame_functions.load_lexicon(lexicon_asr)
lex_ipa = fame_functions.load_lexicon(lexicon_ipa)
if 1:
timer_start = time.time()
translation_key, phone_unknown = fame_functions.get_translation_key(lexicon_ipa, lexicon_asr)
print("elapsed time: {}".format(time.time() - timer_start))
np.save('translation_key_ipa2asr.npy', translation_key)
np.save('phone_unknown.npy', phone_unknown)
else:
translation_key = np.load('translation_key_ipa2asr.npy').item()
phone_unknown = np.load('phone_unknown.npy')
phone_unknown = list(phone_unknown)
## manually check the correspondence for the phone in phone_unknown.
#p = phone_unknown[0]
#lex_ipa_ = find_phone(lexicon_ipa, p, phoneset='ipa')
#for word in lex_ipa_['word']:
# ipa = lex_ipa[lex_ipa['word'] == word].iat[0, 1]
# if np.sum(lex_asr['word'] == word) > 0:
# asr = lex_asr[lex_asr['word'] == word].iat[0, 1]
# ipa_list = convert_phone_set.split_word(ipa, fame_phoneset.multi_character_phones_ipa)
# asr_list = asr.split(' ')
# if p in ipa_list and (len(ipa_list) == len(asr_list)):
# print("{0}: {1} --> {2}".format(word, ipa_list, asr_list))
# for ipa_, asr_ in zip(ipa_list, asr_list):
# if ipa_ in phone_unknown:
# translation_key[ipa_] = asr_
# phone_unknown.remove(ipa_)
## check if all the phones in lexicon_ipa are in fame_phoneset.py.
#timer_start = time.time()
#phoneset_lex = get_phoneset_from_lexicon(lexicon_ipa, phoneset='ipa')
#print("elapsed time: {}".format(time.time() - timer_start))
#phoneset_py = fame_phoneset.phoneset_ipa
#set(phoneset_lex) - set(phoneset_py)
##timer_start = time.time()
##extracted = find_phone(lexicon_ipa, 'ⁿ')
##print("elapsed time: {}".format(time.time() - timer_start))
# lex.asr is Kaldi compatible version of lex.ipa.
# to check...
#lexicon_ipa = pd.read_table(lex_ipa, names=['word', 'pronunciation'])
@ -177,13 +140,13 @@ if conv_lexicon:
# fout.write("{0}\t{1}\n".format(word, ' '.join(pronunciation_split)))
# convert each lexicon from ipa description to fame_htk phoneset.
#am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
#am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
am_func.ipa2famehtk_lexicon(lex_oov, lex_oov_htk)
am_func.ipa2famehtk_lexicon(lex_asr, lex_asr_htk)
# combine lexicon
# pronunciations which is not found in lex.asr are generated using G2P and listed in lex.oov.
# therefore there is no overlap between lex_asr and lex_oov.
#am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
am_func.combine_lexicon(lex_asr_htk, lex_oov_htk, lex_htk)
## ======================= check if all the phones are successfully converted =======================

View File

@ -1,107 +0,0 @@
""" definition of the phones to be used. """
## phones in IPA.
phoneset_ipa = [
# vowels
'',
'i̯ⁿ',
'y',
'i',
'i.',
'iⁿ',
'i:',
'i:ⁿ',
'ɪ',
'ɪⁿ',
'ɪ.',
#'ɪ:', # not included in lex.ipa
'ɪ:ⁿ',
'e',
'e:',
'e:ⁿ',
'ə',
'əⁿ',
'ə:',
'ɛ',
'ɛ.',
'ɛⁿ',
'ɛ:',
'ɛ:ⁿ',
'a',
'aⁿ',
'a.',
'a:',
'a:ⁿ',
'',
'ṷ.',
'ṷⁿ',
#'ú', # only appears in word 'feeste'(út) and 'gaste'(út) which are 'f e: s t ə' and 'yn' in lex_asr.
'u',
'uⁿ',
'u.',
'u:',
'u:ⁿ',
'ü',
'ü.',
'üⁿ',
'ü:',
'ü:ⁿ',
'o',
'oⁿ',
'o.',
'o:',
'o:ⁿ',
'ö',
'ö.',
'öⁿ',
'ö:',
'ö:ⁿ',
'ɔ',
'ɔ.',
'ɔⁿ',
'ɔ:',
'ɔ:ⁿ',
#'ɔ̈', # not included in lex.ipa
'ɔ̈.',
'ɔ̈:',
# plosives
'p',
'b',
't',
'tⁿ',
'd',
'k',
'g',
'ɡ', # = 'g'
# nasals
'm',
'n',
'ŋ',
# fricatives
'f',
'v',
's',
's:',
'z',
'zⁿ',
'x',
'h',
# tap and flip
'r',
'r.', # only appears in word 'mearpartijestelsel'(does not exist in lex_asr) and 'tenoarpartij'.
'r:', # only appears in word 'mûsearflearmûs' and 'sjochdêr'.
# approximant
'j',
'j.',
'l'
]
## multi character phones, longest first.
# A phone written with more than one character (a base letter plus a
# diacritic, a length mark or a superscript) has len() > 1 but is treated as
# one letter in the codes. Longer phones come first; phones of equal length
# keep the order they have in phoneset_ipa.
multi_character_phones_ipa = sorted(
    (phone for phone in phoneset_ipa if len(phone) > 1),
    key=len,
    reverse=True,
)