[minhashing] Don't needlessly copy the signature matrix
author Simon Chabot <simon.chabot@logilab.fr>
Mon, 12 Nov 2012 16:45:44 +0100
changeset 113 ecfd9543df77
parent 112 b85774902147
child 114 aa61b2e2ef74
[minhashing] Don't needlessly copy the signature matrix
minhashing.py
--- a/minhashing.py	Mon Nov 12 17:22:18 2012 +0100
+++ b/minhashing.py	Mon Nov 12 16:45:44 2012 +0100
@@ -167,13 +167,13 @@
             print "Threshold must be in ]0 ; 1]"
             return
 
-        col = [self.sigmatrix[:, i] for i in xrange(self.sigmatrix.shape[1])]
+        sig = self.sigmatrix
         bandsize = computebandsize(threshold, self.sigmatrix.shape[0])
 
         buckets = defaultdict(set)
-        for r in xrange(0, self.sigmatrix.shape[0], bandsize):
-            for i in xrange(len(col)):
-                buckets[tuple(col[i][r:r+bandsize])].add(i)
+        for r in xrange(0, sig.shape[0], bandsize):
+            for i in xrange(sig.shape[1]):
+                buckets[tuple(sig[r:r+bandsize, i])].add(i)
             #print "Progress : %.3f" % (r * 100. / self.sigmatrix.shape[0])
 
         if sentenceid and 0 <= sentenceid < self.sigmatrix.shape[1]: