[Matrix] Can pass extra arguments to distance functions
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 24 Oct 2012 14:50:23 +0200
changeset 42 3bee9fcb2080
parent 41 9a3e43aaa29b
child 43 a05bc17fa163
[Matrix] Can pass extra arguments to distance functions
matrix.py
minhashing.py
test.py
test/test_alignment.py
--- a/matrix.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/matrix.py	Wed Oct 24 14:50:23 2012 +0200
@@ -38,23 +38,23 @@
                  +----+----+
     """
 
-    def __init__(self, input1, input2, distance, defvalue = 1):
+    def __init__(self, input1, input2, distance, defvalue, kargs = {}):
         self.distance = distance
         self._matrix = lil_matrix((len(input1), len(input2)), dtype='float32')
         self.size = self._matrix.get_shape()
         self._maxdist = 0
-        self._compute(input1, input2, defvalue)
+        self._compute(input1, input2, defvalue, kargs)
 
-    def _compute(self, input1, input2, defvalue):
-       for i in xrange(self.size[0]):
-           for j in xrange(self.size[1]):
-               if not (input1[i] and input2[j]):
-                   self._matrix[i, j] = defvalue
-                   continue
+    def _compute(self, input1, input2, defvalue, kargs):
+        for i in xrange(self.size[0]):
+            for j in xrange(self.size[1]):
+                if not (input1[i] and input2[j]):
+                    self._matrix[i, j] = defvalue
+                    continue
 
-               self._matrix[i,j] = self.distance(input1[i], input2[j])
-               if self._matrix[i,j] > self._maxdist:
-                   self._maxdist = self._matrix[i,j]
+                self._matrix[i, j] = self.distance(input1[i], input2[j], **kargs)
+                if self._matrix[i, j] > self._maxdist:
+                    self._maxdist = self._matrix[i, j]
 
     def __getitem__(self, index):
         return self._matrix[index]
@@ -126,5 +126,8 @@
         if cutoff > 0: #If more is wanted, return it too
             for (i, j) in rowcol:
                 if self._matrix[i, j] <= cutoff:
-                    match[i].append((j, self._matrix[i, j]))
+                    if normalized:
+                        match[i].append((j, self._matrix[i, j]/self._maxdist))
+                    else:
+                        match[i].append((j, self._matrix[i, j]))
         return match
--- a/minhashing.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/minhashing.py	Wed Oct 24 14:50:23 2012 +0200
@@ -166,7 +166,7 @@
         print ' - %s' % s
 
     print '\nLes phrases *possiblement* similaires sont : '
-    for s in minlsh.findsimilarsentences(15):
+    for s in minlsh.findsimilarsentences(6):
         for e in s:
             print ' -', sentences[e]
         print
--- a/test.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/test.py	Wed Oct 24 14:50:23 2012 +0200
@@ -1,12 +1,12 @@
 # -*- coding:utf-8 -*-
 
-def dbpediasent(filename, maxind = None):
+def dbpediasent(filename, maxind = None, enco = 'unicode_escape'):
     fobj = open(filename)
     fobj.readline()
     for ind, line in enumerate(fobj):
         if maxind and ind >= maxind:
             break
-        line = line.strip().decode('utf-8')
+        line = line.strip().decode(enco)
         line = line.split('> "')[-1].split('"@fr')[0]
         if not line:
             continue
@@ -16,5 +16,5 @@
     ind = [int(i) for i in indastr.split(' ')]
     for i, s in enumerate(dbpediasent(filename, maxind)):
         if i in ind:
-            print s
+            print s.encode('utf-8')
             print
--- a/test/test_alignment.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/test/test_alignment.py	Wed Oct 24 14:50:23 2012 +0200
@@ -180,8 +180,7 @@
         self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
         self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
         self.distance = levenshtein
-        self.matrix = Distancematrix(self.input1, self.input2, self.distance)
-
+        self.matrix = Distancematrix(self.input1, self.input2, self.distance, 10)
     def test_matrixconstruction(self):
         d = self.distance
         i1, i2 = self.input1, self.input2
@@ -202,9 +201,9 @@
 
         #Victor Hugo --> Victor Wugo
         #Albert Camus --> Albert Camus, Albert Camu
-        self.assertEqual(m.matched(cutoff = 0.2, normalized = True),
+        self.assertEqual(m.matched(cutoff = 2),
                         {0: [(0, d(i1[0], i2[0]))], 1: [(1, d(i1[1], i2[1])),
-                                                       (2, d(i1[1], i2[2]))]})
+                                                        (2, d(i1[1], i2[2]))]})
 
     def test_operation(self):
         m = self.matrix