[minhashing] Compute the union step by step
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 14 Nov 2012 12:17:02 +0100
changeset 136 91b710275ed3
parent 135 9d13b81e7f26
child 137 96791d189d48
[minhashing] Compute the union step by step
minhashing.py
--- a/minhashing.py	Wed Nov 14 11:23:10 2012 +0100
+++ b/minhashing.py	Wed Nov 14 12:17:02 2012 +0100
@@ -168,17 +168,16 @@
         # It should be inverted here (0 is closed, 1 is far)
         threshold = 1 - threshold
         bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0])
-        nb_bands = self.sigmatrix.shape[0] / bandsize + 1
 
         buckets = defaultdict(set)
-        similars = [set(),] * nb_bands
-        for current_band, r in enumerate(xrange(0, sig.shape[0], bandsize)):
+        similars = set()
+        for r in xrange(0, sig.shape[0], bandsize):
             buckets.clear()
             for i in xrange(sig.shape[1]):
                 buckets[tuple(sig[r:r+bandsize, i])].add(i)
-            similars[current_band] = set(tuple(v) for v in buckets.itervalues()
-                                         if len(v) > 1)
-        return set.union(*similars)
+            similars.update(set(tuple(v) for v in buckets.itervalues()
+                                         if len(v) > 1))
+        return similars
 
 
 if __name__ == '__main__':