author Simon Chabot Mon, 12 Nov 2012 18:30:00 +0100 changeset 114 aa61b2e2ef74 parent 113 ecfd9543df77 child 115 8e7f09e0e9ca
[matrix] Make the distance matrix API look like scipy's
 aligner.py file | annotate | diff | comparison | revisions matrix.py file | annotate | diff | comparison | revisions test/test_alignment.py file | annotate | diff | comparison | revisions
--- a/aligner.py	Mon Nov 12 16:45:44 2012 +0100
+++ b/aligner.py	Mon Nov 12 18:30:00 2012 +0100
@@ -206,7 +206,7 @@
items.append(item)

mat = m.globalalignmentmatrix(items)
-    matched = mat.matched(threshold)
+    matched = m.matched(mat, threshold)

if not matched:
return mat, False
--- a/matrix.py	Mon Nov 12 16:45:44 2012 +0100
+++ b/matrix.py	Mon Nov 12 18:30:00 2012 +0100
@@ -21,100 +21,47 @@
from scipy import array, empty
from scipy import where

-class Distancematrix(object):
-    """ Construct and compute a matrix of distance given a distance function.
-
-        Given :
-            input1 = ['Victor Hugo', 'Albert Camus']
-            input2 = ['Victor Wugo', 'Albert Camus']
-            distance = levenshtein
+""" Construct and compute a matrix of distance given a distance function.

-            constructs the following matrix :
-                 +----+----+
-                 | 1  | 11 |
-                 +----+----+
-                 | 11 | 0  |
-                 +----+----+
-    """
+    Given :
+        input1 = ['Victor Hugo', 'Albert Camus']
+        input2 = ['Victor Wugo', 'Albert Camus']
+        distance = levenshtein

-    def __init__(self, weighting, input1, input2, distance, normalized = True, kargs = {}):
-        self.distance = distance
-        self._matrix = empty((len(input1), len(input2)), dtype='float32')
-        self.size = self._matrix.shape
-        self.normalized = normalized
-        self._compute(weighting, input1, input2, kargs)
-
-    def _compute(self, weighting, input1, input2, kargs):
-        for i in xrange(self.size[0]):
-            for j in xrange(self.size[1]):
-                d = 1
-                if input1[i] and input2[j]:
-                    d = self.distance(input1[i], input2[j], **kargs)
-                    if self.normalized:
-                        d = 1 - (1.0 / (1.0 + d))
-                    d *= weighting
-                self._matrix[i, j] = d
-
-    def __getitem__(self, index):
-        return self._matrix[index]
-
-    def __repr__(self):
-        return self._matrix.__repr__()
-
-    def __rmul__(self, number):
-        return self * number
+        constructs the following matrix :
+             +----+----+
+             | 1  | 11 |
+             +----+----+
+             | 11 | 0  |
+             +----+----+
+"""

-    def __mul__(self, val):
-        if not (isinstance(val, int) or isinstance(val, float)
-                or isinstance(val, Distancematrix)):
-            raise NotImplementedError
-
-        other = deepcopy(self)
-        other._matrix *= val
-        return other
-
-        if not isinstance(other, Distancematrix):
-            raise NotImplementedError
-
-        result = deepcopy(self)
-        result._matrix = (self._matrix + other._matrix)
-        return result
-
-    def __sub__(self, other):
-        if not isinstance(other, Distancematrix):
-            raise NotImplementedError
-
-        result = deepcopy(self)
-        result._matrix = (self._matrix - other._matrix)
-        return result
+def cdist(input1, input2, distance, normalized = True, kargs = {}):
+    distmatrix = empty((len(input1), len(input2)), dtype='float32')
+    size = distmatrix.shape
+    for i in xrange(size[0]):
+        for j in xrange(size[1]):
+            d = 1
+            if input1[i] and input2[j]:
+                d = distance(input1[i], input2[j], **kargs)
+                if normalized:
+                    d = 1 - (1.0 / (1.0 + d))
+            distmatrix[i, j] = d
+    return distmatrix

-    def __eq__(self, other):
-        if not isinstance(other, Distancematrix):
-            return False
-
-        if (self._matrix != other._matrix).any():
-            return False
-
-        if self.distance != other.distance:
-            return False
-
-        return True
-
+def matched(distmatrix, cutoff = 0, normalized = False):
+    match = defaultdict(list)
+    if normalized:
+        distmatrix /= distmatrix.max()

-    def matched(self, cutoff = 0, normalized = False):
-        match = defaultdict(list)
-        if normalized:
-            self._matrix /= self._matrix.max()
+    ind = (distmatrix <= cutoff).nonzero()
+    indrow = ind[0].tolist()
+    indcol = ind[1].tolist()

-        ind = (self._matrix <= cutoff).nonzero()
-        indrow = ind[0].tolist()
-        indcol = ind[1].tolist()
+    for (i, j) in zip(indrow, indcol):
+        match[i].append((j, distmatrix[i, j]))

-        for (i, j) in zip(indrow, indcol):
-            match[i].append((j, self._matrix[i, j]))
-
-        return match
+    return match

def globalalignmentmatrix(items):
""" Compute and return the global alignment matrix.
@@ -147,7 +94,7 @@
/!\ All `input1` and `input2` of each tuple must have the same size
in twos
"""
-    globalmatrix = Distancematrix(*items[0])
+    globalmatrix = items[0][0]*cdist(*items[0][1:])
for item in items[1:]:
-        globalmatrix += Distancematrix(*item)
+        globalmatrix += item[0]*cdist(*item[1:])
return globalmatrix
--- a/test/test_alignment.py	Mon Nov 12 16:45:44 2012 +0100
+++ b/test/test_alignment.py	Mon Nov 12 18:30:00 2012 +0100
@@ -49,7 +49,7 @@
geographical)
from alignment.normalize import (lunormalize, loadlemmas, lemmatized, \
roundstr, rgxformat, tokenize, simplify)
-from alignment.matrix import Distancematrix
+import alignment.matrix as am
from alignment.minhashing import Minlsh
from alignment.aligner import parsefile

@@ -197,7 +197,7 @@
self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
self.distance = levenshtein
-        self.matrix = Distancematrix(1, self.input1, self.input2, self.distance, False)
+        self.matrix = am.cdist(self.input1, self.input2, self.distance, False)
def test_matrixconstruction(self):
d = self.distance
i1, i2 = self.input1, self.input2
@@ -214,19 +214,19 @@

#Only the element 1 of input1 has *exactly* matched with the element 1
#of input2
-        self.assertEqual(m.matched(), {1: [(1, 0)]})
+        self.assertEqual(am.matched(m), {1: [(1, 0)]})

#Victor Hugo --> Victor Wugo
#Albert Camus --> Albert Camus, Albert Camu
-        self.assertEqual(m.matched(cutoff = 2),
+        self.assertEqual(am.matched(m, cutoff = 2),
{0: [(0, d(i1[0], i2[0]))], 1: [(1, d(i1[1], i2[1])),
(2, d(i1[1], i2[2]))]})

def test_operation(self):
m = self.matrix
-        self.assertEqual(3 * m, m * 3)
-        self.assertEqual((m - 0.5*m), (0.5 * m))
-        self.assertEqual(m + 10*m - m * 3, 8 * m)
+        self.assertTrue((3 * m == m * 3).all())
+        self.assertTrue(((m - 0.5*m) == (0.5 * m)).all())
+        self.assertTrue(((m + 10*m - m * 3) == (8 * m)).all())

class MinLSHTest(unittest2.TestCase):
def test_all(self):