# accent_classification/accent_classification/data_manipulation.py

import numpy as np
from sklearn import manifold
import Levenshtein

def extractRandomSample(x, n):
	"""Pick n distinct rows of x at random.

	x: ndarray (dnum x dim)
	n: number of samples to extract (drawn without replacement)

	Returns a tuple (xChosen, indexChosen):
	xChosen: the rows of x selected by indexChosen
	indexChosen: row indices of the chosen samples
	"""
	rowCount = x.shape[0]
	# replace=False guarantees that no row index is drawn twice.
	indexChosen = np.random.choice(np.arange(rowCount), size=n, replace=False)
	return x[indexChosen, :], indexChosen

def makeLevenshteinMatrix(x):
	"""Build the pairwise Levenshtein distance matrix of the entries of x.

	x: 1d string ndarray

	Returns an integer ndarray of shape (len(x), len(x)) whose entry
	[i, j] is Levenshtein.distance(x[i], x[j]).
	"""
	xRowMax = x.shape[0]
	# The diagonal stays 0: the distance from a string to itself is 0.
	xLevenshtein = np.zeros((xRowMax, xRowMax), dtype='int')

	# The distance is symmetric, so each unordered pair is computed once
	# and mirrored instead of evaluating both (i, j) and (j, i).
	for xRow in range(xRowMax):
		for xCol in range(xRow + 1, xRowMax):
			dist = Levenshtein.distance(x[xRow], x[xCol])
			xLevenshtein[xRow, xCol] = dist
			xLevenshtein[xCol, xRow] = dist
	return xLevenshtein

def calcLevenshteinArray(word, x):
	"""Compute the Levenshtein distance from word to every entry of x.

	word: value compared against each entry (first argument of
	      Levenshtein.distance)
	x: 1d string ndarray

	Returns an integer ndarray with the same shape as x; entry i holds
	Levenshtein.distance(word, x[i]).
	"""
	entryCount = x.shape[0]
	distances = np.zeros(x.shape, dtype='int')
	if entryCount:
		for position, candidate in enumerate(x):
			distances[position] = Levenshtein.distance(word, candidate)
	return distances

def MDS(x, n_components=2, random_state=6):
	"""Embed a precomputed dissimilarity matrix with sklearn's MDS.

	x: dissimilarity matrix; it is handed to manifold.MDS with
	   dissimilarity="precomputed", so it is used as distances directly
	n_components: number of embedding dimensions (default 2)
	random_state: seed forwarded to manifold.MDS (default 6)

	The defaults reproduce the previously hard-coded configuration, so
	MDS(x) behaves as before.

	Returns the output of fit_transform(x), i.e. the embedded coordinates.
	"""
	mds = manifold.MDS(
		n_components=n_components,
		dissimilarity="precomputed",
		random_state=random_state,
	)
	return mds.fit_transform(x)