[minlsh] Remove the useless rows in the signature matrix while searching
author Simon Chabot <simon.chabot@logilab.fr>
Tue, 06 Nov 2012 10:45:10 +0100
changeset 67 e37a33ec5b41
parent 66 221a299d9fb8
child 68 a8bea044b5e1
[minlsh] Remove the useless rows in the signature matrix while searching. It's done to save memory…
minhashing.py
--- a/minhashing.py	Tue Nov 06 10:48:06 2012 +0100
+++ b/minhashing.py	Tue Nov 06 10:45:10 2012 +0100
@@ -177,8 +177,11 @@
 
         for r in xrange(0, self.sigmatrix.shape[0], bandsize):
             for i in xrange(len(col)):
-                stri = ''.join(str(val) for val in col[i][r:r+bandsize])
+                stri = ''.join(str(val) for val in col[i][:bandsize])
                 buckets[hash(stri)].add(i)
+                ## Let's make some memory space
+                col[i] = col[i][bandsize:] #pop the first rows
+            print "Progress : %.3f" % (r * 100. / self.sigmatrix.shape[0])
 
         if 0 <= sentenceid < self.sigmatrix.shape[1]:
             return set(tuple(v) for v in buckets.values()