152 lines
4.9 KiB
Python
152 lines
4.9 KiB
Python
import os
|
|
import sys
|
|
from collections import Counter
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
import defaultfiles as default
|
|
|
|
sys.path.append(default.forced_alignment_module_dir)
|
|
from forced_alignment import convert_phone_set
|
|
|
|
|
|
def make_hcopy_scp_from_filelist_in_fame(FAME_dir, dataset, feature_dir, hcopy_scp):
    """ Write an HCopy script file from a file list of the FAME! corpus.

    The file list '<dataset>list.txt' is read from the 'fame/filelists'
    folder under FAME_dir (paths are built with Windows-style backslashes).
    For every usable entry one line 'wav path<TAB>mfc path' is written to
    hcopy_scp, where the wav path points into the 'fame/wav/<dataset>'
    folder under FAME_dir and the mfc path into feature_dir.

    param FAME_dir: root directory of the FAME! corpus (string).
    param dataset: name of the dataset; selects the file list and wav folder.
    param feature_dir: directory used for the .mfc paths (string).
    param hcopy_scp: path of the script file to write (overwritten).
    """
    list_path = FAME_dir + '\\fame\\filelists\\' + dataset + 'list.txt'
    with open(list_path) as fin:
        entries = fin.read().split('\n')

    with open(hcopy_scp, 'w') as fout:
        for entry in entries:
            stem = entry.replace('.TextGrid', '')

            # Names of three characters or fewer are skipped; this drops
            # '.', '..' and the empty string left by a trailing newline.
            if len(stem) <= 3:
                continue

            source = FAME_dir + '\\fame\\wav\\' + dataset + '\\' + stem + '.wav'
            target = feature_dir + '\\' + stem + '.mfc'
            fout.write('\t'.join((source, target)) + '\n')
|
|
|
|
|
|
def make_filelist(input_dir, output_txt):
    """ Write the entries of input_dir to output_txt, one path per line.

    Each line is input_dir, a backslash and the entry name, in the order
    returned by os.listdir.

    param input_dir: directory whose entries are listed (string).
    param output_txt: path of the text file to write (overwritten).
    """
    entries = os.listdir(input_dir)

    with open(output_txt, 'w') as fout:
        fout.writelines(input_dir + '\\' + entry + '\n' for entry in entries)
|
|
|
|
|
|
def make_dic(word, pronvar_, fileDic, output_type):
    """
    make dict files which can be used for HTK.

    param word: target word. It is written upper-cased.
    param pronvar_: pronunciation variants of the word, one entry per
        observation; repeated entries are what the frequencies are computed
        from. Entries must be hashable for output_type 1, 2 and 3, because
        they are counted with collections.Counter.
    param fileDic: output dic file (overwritten).
    param output_type:
        0: full. every distinct variant, in the order returned by np.unique.
        1: statistics. 'count<TAB>percentage<TAB>WORD<TAB>variant' per variant.
        2: variants whose frequency is <2% are removed, the others are kept.
        3: top 3. the three most frequent variants.
    raises ValueError: when output_type is not 0, 1, 2 or 3.
    """
    # Reject unknown types up front instead of silently writing an empty file.
    if output_type not in (0, 1, 2, 3):
        raise ValueError('output_type should be an integer between 0 and 3.')

    WORD = word.upper()

    if output_type == 0:  # full
        with open(fileDic, 'w') as f:
            for variant in np.unique(pronvar_):
                f.write('{0}\t{1}\n'.format(WORD, variant))
        return

    counts = Counter(pronvar_)
    total_num = sum(counts.values())
    with open(fileDic, 'w') as f:
        if output_type == 3:  # top 3
            for variant, _ in counts.most_common(3):
                f.write('{0}\t{1}\n'.format(WORD, variant))
            return

        # total_num is only zero when counts is empty, in which case the loop
        # body (and the division) never runs.
        for variant, count in counts.items():
            percentage = count / total_num * 100

            if output_type == 1:  # statistics for every variant
                f.write('{0}\t{1:.2f}\t{2}\t{3}\n'.format(
                    count, percentage, WORD, variant))
            elif percentage >= 2:
                # output_type 2: keep the variant unless it is rarer than 2%.
                f.write('{0}\t{1}\n'.format(WORD, variant))
|
|
|
|
|
|
def get_phonelist(lexicon_file):
    """ Collect the set of phones that appear in the lexicon.

    Every line of the UTF-8 lexicon is split on tabs; the second field is
    taken as the pronunciation and split on whitespace into phones. Lines
    without a tab are ignored.

    param lexicon_file: path of the lexicon file.
    return: set of phone strings.
    """
    with open(lexicon_file, "rt", encoding="utf-8") as fin:
        entries = fin.read().split('\n')

    phones = set()
    for entry in entries:
        fields = entry.split('\t')
        if len(fields) <= 1:
            continue
        phones.update(fields[1].split())
    return phones
|
|
|
|
|
|
def find_phone(lexicon_file, phone):
    """ Find the lexicon entries whose pronunciation contains the phone.

    Every line of the UTF-8 lexicon is split on tabs and the second field is
    taken as the pronunciation. The check is a plain substring test on that
    field, so a phone also matches inside a longer phone name.

    param lexicon_file: path of the lexicon file.
    param phone: string to look for in the pronunciation field.
    return: list of matching entries, each a list of the tab-separated fields.
    """
    with open(lexicon_file, "rt", encoding="utf-8") as fin:
        rows = [entry.split('\t') for entry in fin.read().split('\n')]

    # Rows without a tab have no pronunciation field and can never match.
    return [fields for fields in rows if len(fields) > 1 and phone in fields[1]]
|
|
|
|
|
|
def ipa2famehtk_lexicon(lexicon_file_in, lexicon_file_out):
    """ Convert a lexicon file from IPA to HTK format for FAME! corpus.

    The input is read as a tab-separated table with the columns 'word' and
    'pronunciation'. Spaces are removed from each pronunciation before it is
    passed to convert_phone_set.ipa2famehtk. The output is written as UTF-8,
    one 'WORD<TAB>pronunciation' line per kept entry, with the word
    upper-cased.

    param lexicon_file_in: path of the IPA lexicon (tab-separated).
    param lexicon_file_out: path of the converted lexicon (overwritten).
    """
    # Read every field as a literal string. With the pandas defaults, entries
    # such as 'nan', 'null' or 'NA' become float NaN and numeric-looking
    # entries become numbers, and the string methods below then fail.
    lexicon_in = pd.read_table(
        lexicon_file_in,
        names=['word', 'pronunciation'],
        dtype=str,
        keep_default_na=False)

    with open(lexicon_file_out, "w", encoding="utf-8") as fout:
        for word, pronunciation in zip(lexicon_in['word'], lexicon_in['pronunciation']):
            pronunciation_no_space = pronunciation.replace(' ', '')
            pronunciation_famehtk = convert_phone_set.ipa2famehtk(pronunciation_no_space)

            # Entries whose converted pronunciation contains 'ceh' or 'sh'
            # are left out of the output.
            if 'ceh' not in pronunciation_famehtk and 'sh' not in pronunciation_famehtk:
                fout.write("{0}\t{1}\n".format(word.upper(), pronunciation_famehtk))
|
|
|
|
|
|
def combine_lexicon(lexicon_file1, lexicon_file2, lexicon_out):
    """ Combine two lexicon files and sort by words.

    Both inputs are read as tab-separated tables with the columns 'word' and
    'pronunciation'. Their rows are concatenated, sorted by word in ascending
    order and written to lexicon_out as UTF-8, tab-separated, without header
    or index.

    param lexicon_file1: path of the first lexicon.
    param lexicon_file2: path of the second lexicon.
    param lexicon_out: path of the combined lexicon (overwritten).
    """
    # Read every field as a literal string. With the pandas defaults, words
    # such as 'NA' or 'NULL' become NaN (and are written out empty) and
    # numeric-looking fields are reformatted as numbers.
    read_options = dict(
        names=['word', 'pronunciation'], dtype=str, keep_default_na=False)
    lex1 = pd.read_table(lexicon_file1, **read_options)
    lex2 = pd.read_table(lexicon_file2, **read_options)

    lex = pd.concat([lex1, lex2])
    # A stable sort keeps repeated words (several pronunciation variants of
    # one word) in their input order, so the output is deterministic.
    lex = lex.sort_values(by='word', ascending=True, kind='mergesort')
    lex.to_csv(lexicon_out, index=False, header=False, encoding="utf-8", sep='\t')
|
|
|
|
|
|
def read_fileFA(fileFA):
    """
    read the result file of HTK forced alignment.
    this function only works when input is one word.

    Each line is split on whitespace and the third field is collected as a
    phone label (presumably the first two fields are start and end time;
    verify against the HTK output format in use). Lines with fewer than
    three fields are skipped.

    param fileFA: path of the forced alignment result file.
    return: the collected phone labels joined by single spaces.
    """
    with open(fileFA, 'r') as f:
        lines = f.read().split('\n')

    phones = []
    for line in lines:
        line_split = line.split()
        # The label is read from index 2, so at least three fields are
        # needed; a two-field line used to raise IndexError here.
        if len(line_split) > 2:
            phones.append(line_split[2])

    return ' '.join(phones)
|