accent_classification/accent_classification/data_manipulation.py

41 lines
1.1 KiB
Python

import numpy as np
from sklearn import manifold
import Levenshtein
def extractRandomSample(x, n):
    """Draw ``n`` distinct entries along the first axis of ``x`` at random.

    Parameters
    ----------
    x : ndarray
        Array to sample from. Usually (dnum x dim), but any array with at
        least one axis is accepted (e.g. a 1-D array of strings).
    n : int
        Number of samples to extract. Sampling is without replacement, so
        ``n`` must not exceed ``x.shape[0]``.

    Returns
    -------
    tuple
        ``(xChosen, indexChosen)`` where ``xChosen`` holds the selected
        entries of ``x`` (in the order they were drawn) and ``indexChosen``
        holds their indices along the first axis.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``n`` is larger than the
        number of entries in ``x``.
    """
    xRowMax = x.shape[0]
    # An int population is sampled as if it were np.arange(xRowMax);
    # replace=False guarantees that no index is picked twice.
    indexChosen = np.random.choice(xRowMax, n, replace=False)
    # Index only the first axis so that 1-D inputs work as well as 2-D ones
    # (for 2-D input this is identical to x[indexChosen, :]).
    xChosen = x[indexChosen]
    return (xChosen, indexChosen)
def makeLevenshteinMatrix(x):
    """Build the pairwise Levenshtein distance matrix of the strings in ``x``.

    Parameters
    ----------
    x : ndarray
        1-D string ndarray.

    Returns
    -------
    ndarray
        Integer array of shape ``(len(x), len(x))`` whose ``[i, j]`` entry is
        ``Levenshtein.distance(x[i], x[j])``.
    """
    xRowMax = x.shape[0]
    # Start from zeros: the diagonal (distance of a string to itself) is 0
    # and never needs to be computed.
    xLevenshtein = np.zeros((xRowMax, xRowMax), dtype='int')
    for xRow in range(xRowMax):
        # The edit distance is symmetric, so only the upper triangle is
        # computed and each value is mirrored into the lower triangle.
        for xCol in range(xRow + 1, xRowMax):
            dist = Levenshtein.distance(x[xRow], x[xCol])
            xLevenshtein[xRow, xCol] = dist
            xLevenshtein[xCol, xRow] = dist
    return xLevenshtein
def calcLevenshteinArray(word, x):
    """Compute the Levenshtein distance from ``word`` to every entry of ``x``.

    Parameters
    ----------
    word : str
        Reference string passed as the first argument of
        ``Levenshtein.distance``.
    x : ndarray
        1-D string ndarray of strings to compare against.

    Returns
    -------
    ndarray
        Integer array with the same shape as ``x``; entry ``i`` is
        ``Levenshtein.distance(word, x[i])``.
    """
    distances = np.zeros(x.shape, dtype='int')
    for position, candidate in enumerate(x):
        distances[position] = Levenshtein.distance(word, candidate)
    return distances
def MDS(x):
    """Embed a precomputed dissimilarity matrix into two dimensions.

    ``x`` is handed to scikit-learn's ``manifold.MDS`` as a precomputed
    dissimilarity matrix (``dissimilarity="precomputed"``). The estimator is
    created with ``n_components=2`` and a fixed ``random_state=6``.

    Returns the result of ``fit_transform(x)``.
    """
    embedder = manifold.MDS(
        n_components=2,
        dissimilarity="precomputed",
        random_state=6,
    )
    return embedder.fit_transform(x)