make Minlsh.predict() accept a minclustersize parameter
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Fri, 08 Jul 2016 10:52:40 +0200
changeset 515 56f5b57a4ed6
parent 514 8f59175035a6
child 516 3f5f1b9a2b95
make Minlsh.predict() accept a minclustersize parameter. Client code should be able to choose what the minimum cluster size is rather than hardcoding 2. Closes #6955741.
utils/minhashing.py
--- a/utils/minhashing.py	Thu Feb 18 15:45:53 2016 +0100
+++ b/utils/minhashing.py	Fri Jul 08 10:52:40 2016 +0200
@@ -198,7 +198,7 @@
         ## Solve f(x) = 0, with x having values in [1, nbrows]
         return int(bisect(f, 1, nbrows))
 
-    def predict(self, threshold):
+    def predict(self, threshold, minclustersize=2):
         """ Return a set of tuples of *possible* similar sentences
         """
         if not self._trained:
@@ -222,5 +222,5 @@
             for i in xrange(sig.shape[1]):
                 buckets[tuple(sig[r:r+bandsize, i])].add(i)
             similars.update(set(tuple(v) for v in buckets.itervalues()
-                                         if len(v) > 1))
+                                if len(v) >= minclustersize))
         return similars