[minhashing] Buckets are row-dependent
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 14 Nov 2012 11:23:10 +0100
changeset 135 9d13b81e7f26
parent 134 c5bbcea9dca5
child 136 91b710275ed3
[minhashing] Buckets are row-dependent: “We can use the same hash function for all the bands, but we use a separate bucket array for each band, so columns with the same vector in different bands will not hash to the same bucket.”
minhashing.py
--- a/minhashing.py	Wed Nov 14 11:23:49 2012 +0100
+++ b/minhashing.py	Wed Nov 14 11:23:10 2012 +0100
@@ -168,12 +168,17 @@
         # It should be inverted here (0 is closed, 1 is far)
         threshold = 1 - threshold
         bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0])
+        nb_bands = self.sigmatrix.shape[0] / bandsize + 1
 
         buckets = defaultdict(set)
-        for r in xrange(0, sig.shape[0], bandsize):
+        similars = [set(),] * nb_bands
+        for current_band, r in enumerate(xrange(0, sig.shape[0], bandsize)):
+            buckets.clear()
             for i in xrange(sig.shape[1]):
                 buckets[tuple(sig[r:r+bandsize, i])].add(i)
-        return set(tuple(v) for v in buckets.itervalues() if len(v) > 1)
+            similars[current_band] = set(tuple(v) for v in buckets.itervalues()
+                                         if len(v) > 1)
+        return set.union(*similars)
 
 
 if __name__ == '__main__':