[Minhashing] Give a thresdhold instead of a abstract "bandsize"
authorSimon Chabot <simon.chabot@logilab.fr>
Mon, 29 Oct 2012 11:44:41 +0100
changeset 59 0dad7c581201
parent 58 27b66c6cee3a
child 60 8b2e1146fccd
[Minhashing] Give a thresdhold instead of a abstract "bandsize" The thresdhold and the bandsize are related and one can be computed knowning the other. Let's the user give the more explicit one.
minhashing.py
test/test_alignment.py
--- a/minhashing.py	Mon Oct 29 10:21:14 2012 +0100
+++ b/minhashing.py	Mon Oct 29 11:44:41 2012 +0100
@@ -23,6 +23,7 @@
 
 from numpy import ones
 from scipy.sparse import lil_matrix
+from scipy.optimize import bisect
 
 from cubes.alignment.normalize import wordgrams
 
@@ -139,7 +140,7 @@
             self._trained = False
 
 
-    def findsimilarsentences(self, bandsize, sentenceid = -1, dispThreshold = False):
+    def findsimilarsentences(self, threshold, sentenceid = -1):
         """ Return a set of tuples of *possible* similar sentences
 
             If 0 <= sentenceid <= nbsentences is given:
@@ -147,16 +148,32 @@
                 one
         """
 
+        def computebandsize(threshold, nbrows):
+            """ Compute the bandsize according to the threshold given """
+
+            ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
+            ### bands, and r the number of rows per band. And nbrows (the length
+            ### of the matrix is nbrows = b * r, so t ~ (r / L)^(1 / r). So, let's
+            ### find the root of f(x) = (x / L)^(1/r) - t.
+            def f(x):
+                y = pow(x / nbrows, 1. /x) - threshold
+                return y
+
+            ## Solve f(x) = 0, with x values in [1, nbrows]
+            return int(bisect(f, 1, nbrows))
+
+
         if not self._trained:
             print "Train it before"
             return
 
-        col = [self.sigmatrix[:, i] for i in xrange(self.sigmatrix.shape[1])]
-        buckets = defaultdict(set)
+        if not (0 < threshold <= 1):
+            print "Threshold must be in ]0 ; 1]"
+            return
 
-        nbbands = int(self.sigmatrix.shape[0] / bandsize)
-        if dispThreshold:
-            print "threshold is %.3f" % pow(1./nbbands, 1./bandsize)
+        col = [self.sigmatrix[:, i] for i in xrange(self.sigmatrix.shape[1])]
+        bandsize = computebandsize(threshold, self.sigmatrix.shape[0])
+        buckets = defaultdict(set)
 
         for r in xrange(0, self.sigmatrix.shape[0], bandsize):
             for i in xrange(len(col)):
@@ -188,7 +205,7 @@
         print ' - %s' % s
 
     print '\nLes phrases *possiblement* similaires sont : '
-    for s in minlsh.findsimilarsentences(6):
+    for s in minlsh.findsimilarsentences(0.7):
         for e in s:
             print ' -', sentences[e]
         print
--- a/test/test_alignment.py	Mon Oct 29 10:21:14 2012 +0100
+++ b/test/test_alignment.py	Mon Oct 29 11:44:41 2012 +0100
@@ -235,7 +235,7 @@
         lemmas = loadlemmas('../data/french_lemmas.txt')
         minlsh.train((simplify(s, lemmas) for s in sentences), 1, 200)
 
-        self.assertEqual(minlsh.findsimilarsentences(7), set([(0, 1), (2, 4)]))
+        self.assertEqual(minlsh.findsimilarsentences(0.65), set([(0, 1), (2, 4)]))
 
 if __name__ == '__main__':
     unittest2.main()