author: Simon Chabot | date: Wed, 24 Oct 2012 14:50:23 +0200 | changeset: 42 (3bee9fcb2080) | parent: 41 (9a3e43aaa29b) | child: 43 (a05bc17fa163)
[Matrix] Can pass extra arguments to distance functions
 matrix.py file | annotate | diff | comparison | revisions minhashing.py file | annotate | diff | comparison | revisions test.py file | annotate | diff | comparison | revisions test/test_alignment.py file | annotate | diff | comparison | revisions
```diff
--- a/matrix.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/matrix.py	Wed Oct 24 14:50:23 2012 +0200
@@ -38,23 +38,23 @@
+----+----+
"""

-    def __init__(self, input1, input2, distance, defvalue = 1):
+    def __init__(self, input1, input2, distance, defvalue, kargs = {}):
self.distance = distance
self._matrix = lil_matrix((len(input1), len(input2)), dtype='float32')
self.size = self._matrix.get_shape()
self._maxdist = 0
-        self._compute(input1, input2, defvalue)
+        self._compute(input1, input2, defvalue, kargs)

-    def _compute(self, input1, input2, defvalue):
-       for i in xrange(self.size[0]):
-           for j in xrange(self.size[1]):
-               if not (input1[i] and input2[j]):
-                   self._matrix[i, j] = defvalue
-                   continue
+    def _compute(self, input1, input2, defvalue, kargs):
+        for i in xrange(self.size[0]):
+            for j in xrange(self.size[1]):
+                if not (input1[i] and input2[j]):
+                    self._matrix[i, j] = defvalue
+                    continue

-               self._matrix[i,j] = self.distance(input1[i], input2[j])
-               if self._matrix[i,j] > self._maxdist:
-                   self._maxdist = self._matrix[i,j]
+                self._matrix[i, j] = self.distance(input1[i], input2[j], **kargs)
+                if self._matrix[i, j] > self._maxdist:
+                    self._maxdist = self._matrix[i, j]

def __getitem__(self, index):
return self._matrix[index]
@@ -126,5 +126,8 @@
if cutoff > 0: #If more is wanted, return it too
for (i, j) in rowcol:
if self._matrix[i, j] <= cutoff:
-                    match[i].append((j, self._matrix[i, j]))
+                    if normalized:
+                        match[i].append((j, self._matrix[i, j]/self._maxdist))
+                    else:
+                        match[i].append((j, self._matrix[i, j]))
return match
```

```diff
--- a/minhashing.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/minhashing.py	Wed Oct 24 14:50:23 2012 +0200
@@ -166,7 +166,7 @@
print ' - %s' % s

print '\nLes phrases *possiblement* similaires sont : '
-    for s in minlsh.findsimilarsentences(15):
+    for s in minlsh.findsimilarsentences(6):
for e in s:
print ' -', sentences[e]
print
```

```diff
--- a/test.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/test.py	Wed Oct 24 14:50:23 2012 +0200
@@ -1,12 +1,12 @@
# -*- coding:utf-8 -*-

-def dbpediasent(filename, maxind = None):
+def dbpediasent(filename, maxind = None, enco = 'unicode_escape'):
fobj = open(filename)
for ind, line in enumerate(fobj):
if maxind and ind >= maxind:
break
-        line = line.strip().decode('utf-8')
+        line = line.strip().decode(enco)
line = line.split('> "')[-1].split('"@fr')[0]
if not line:
continue
@@ -16,5 +16,5 @@
ind = [int(i) for i in indastr.split(' ')]
for i, s in enumerate(dbpediasent(filename, maxind)):
if i in ind:
-            print s
+            print s.encode('utf-8')
print
```

```diff
--- a/test/test_alignment.py	Wed Oct 24 12:37:56 2012 +0200
+++ b/test/test_alignment.py	Wed Oct 24 14:50:23 2012 +0200
@@ -180,8 +180,7 @@
self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
self.distance = levenshtein
-        self.matrix = Distancematrix(self.input1, self.input2, self.distance)
-
+        self.matrix = Distancematrix(self.input1, self.input2, self.distance, 10)
def test_matrixconstruction(self):
d = self.distance
i1, i2 = self.input1, self.input2
@@ -202,9 +201,9 @@

#Victor Hugo --> Victor Wugo
#Albert Camus --> Albert Camus, Albert Camu
-        self.assertEqual(m.matched(cutoff = 0.2, normalized = True),
+        self.assertEqual(m.matched(cutoff = 2),
{0: [(0, d(i1[0], i2[0]))], 1: [(1, d(i1[1], i2[1])),
-                                                       (2, d(i1[1], i2[2]))]})
+                                                        (2, d(i1[1], i2[2]))]})

def test_operation(self):
m = self.matrix
```