author Simon Chabot Thu, 25 Oct 2012 16:40:42 +0200 changeset 46 4c2f7553490b parent 45 9f4669700221 child 47 597f6b039bca
[Matrix] Give weighting and normalization at the construction of the matrix. It makes the computation faster.
 matrix.py file | annotate | diff | comparison | revisions test/test_alignment.py file | annotate | diff | comparison | revisions
```--- a/matrix.py	Wed Oct 24 19:10:54 2012 +0200
+++ b/matrix.py	Thu Oct 25 16:40:42 2012 +0200
@@ -38,23 +38,29 @@
+----+----+
"""

-    def __init__(self, input1, input2, distance, defvalue, kargs = {}):
+    def __init__(self, weighting, input1, input2, distance, defvalue,
+                 normalized = True, kargs = {}):
self.distance = distance
self._matrix = lil_matrix((len(input1), len(input2)), dtype='float32')
self.size = self._matrix.get_shape()
self._maxdist = 0
-        self._compute(input1, input2, defvalue, kargs)
+        self.normalized = normalized
+        self._compute(weighting, input1, input2, defvalue, kargs)

-    def _compute(self, input1, input2, defvalue, kargs):
+    def _compute(self, weighting, input1, input2, defvalue, kargs):
for i in xrange(self.size[0]):
for j in xrange(self.size[1]):
-                if not (input1[i] and input2[j]):
-                    self._matrix[i, j] = defvalue
-                    continue
+                d = defvalue
+                if input1[i] and input2[j]:
+                    d = self.distance(input1[i], input2[j], **kargs)

-                self._matrix[i, j] = self.distance(input1[i], input2[j], **kargs)
-                if self._matrix[i, j] > self._maxdist:
-                    self._maxdist = self._matrix[i, j]
+                if self.normalized:
+                    d = 1 - (1.0 / (1.0 + d))
+
+                d *= weighting
+                if d > self._maxdist:
+                    self._maxdist = d
+                self._matrix[i, j] = d

def __getitem__(self, index):
return self._matrix[index]
@@ -128,7 +134,7 @@

- `items` is a list of tuples where each tuple is built as following :

-            `(weighting, input1, input2, distance_function, defvalue, args)`
+            `(weighting, input1, input2, distance_function, defvalue, normalize, args)`

* `input1` : a list of "things" (names, dates, numbers) to align on
`input2`. If a value is unknown, set it as `None`.
@@ -143,6 +149,10 @@
`input2` is unknown. A good idea is to set `defvalue` to an
upper bound of the possible values, to maximize the distance

+            * `normalize` : boolean, if true, the matrix values will be between 0
+                and 1, else the real result of `distance_function` will be
+                stored
+
* `args` : a dictionary of the extra arguments the
`distance_function` could take (as language or granularity)
```
```--- a/test/test_alignment.py	Wed Oct 24 19:10:54 2012 +0200
+++ b/test/test_alignment.py	Thu Oct 25 16:40:42 2012 +0200
@@ -180,7 +180,8 @@
self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
self.distance = levenshtein
-        self.matrix = Distancematrix(self.input1, self.input2, self.distance, 10)
+        self.matrix = Distancematrix(1, self.input1, self.input2, self.distance,
+                                     10, False)
def test_matrixconstruction(self):
d = self.distance
i1, i2 = self.input1, self.input2```