author Vincent Michel Tue, 13 Nov 2012 15:37:59 +0100 changeset 119 b47df06d0b3d parent 118 e6e327ed2a4c child 120 0c0679c3c537
[minhashing] Modify API + change threshold handling
 minhashing.py file | annotate | diff | comparison | revisions
```--- a/minhashing.py	Tue Nov 13 15:37:25 2012 +0100
+++ b/minhashing.py	Tue Nov 13 15:37:59 2012 +0100
@@ -26,6 +26,7 @@

from alignment.normalize import iter_wordgrams

+
def randomhashfunction(zr):
""" Return a random hash function, mapping x in Z to ZR
h:x -> ax + b mod R
@@ -48,7 +49,7 @@
self._trained = False
self.sigmatrix = None

-    def train(self, sentences, k = 2, siglen = 200):
+    def train(self, sentences, k=2, siglen=200):
""" Train the minlsh on the given sentences.

- `k` is the length of the k-wordgrams used
@@ -102,7 +103,8 @@
sig = np.empty((siglen, nrows))
#Generate the random hash functions
hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
-        #Compute hashing values
+        #Compute hashing values just for once.
+        #Avoid multiple recomputations for the same column.
hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
for i in  xrange(siglen)])

@@ -136,29 +138,23 @@
else:
self._trained = False

-    def findsimilarsentences(self, threshold, sentenceid = None):
-        """ Return a set of tuples of *possible* similar sentences
-
-            If 0 <= sentenceid <= nbsentences is given:
-                return a set of tuples of *possible* similar sentences to this
-                one
-        """
-
-        def computebandsize(threshold, nbrows):
-            """ Compute the bandsize according to the threshold given """
+    def computebandsize(self, threshold, nbrows):
+        """ Compute the bandsize according to the threshold given """

-            ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
-            ### bands, and r the number of rows per band. And nbrows (the length
-            ### of the matrix is nbrows = b * r, so t ~ (r / L)^(1 / r). So, let's
-            ### find the root of f(x) = (x / L)^(1/r) - t.
-            def f(x):
-                y = pow(x / nbrows, 1. /x) - threshold
-                return y
+        ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
+        ### bands, and r the number of rows per band. And nbrows (the length
+        ### of the matrix is nbrows = b * r, so t ~ (r / L)^(1 / r). So, let's
+        ### find the root of f(x) = (x / L)^(1/r) - t.
+        def f(x):
+            y = pow(x / nbrows, 1. /x) - threshold
+            return y

-            ## Solve f(x) = 0, with x having values in [1, nbrows]
-            return int(bisect(f, 1, nbrows))
+        ## Solve f(x) = 0, with x having values in [1, nbrows]
+        return int(bisect(f, 1, nbrows))

-
+    def predict(self, threshold):
+        """ Return a set of tuples of *possible* similar sentences
+        """
if not self._trained:
print "Train it before"
return
@@ -168,18 +164,17 @@
return

sig = self.sigmatrix
-        bandsize = computebandsize(threshold, self.sigmatrix.shape)
+        # Treshold is a percent of similarity
+        # It should be inverted here (0 is closed, 1 is far)
+        threshold = 1 - threshold
+        bandsize = self.computebandsize(threshold, self.sigmatrix.shape)

buckets = defaultdict(set)
for r in xrange(0, sig.shape, bandsize):
for i in xrange(sig.shape):
-            #print "Progress : %.3f" % (r * 100. / self.sigmatrix.shape)
+        return set(tuple(v) for v in buckets.itervalues() if len(v) > 1)

-        if sentenceid and 0 <= sentenceid < self.sigmatrix.shape:
-            return set(tuple(v) for v in buckets.itervalues()
-                       if len(v) > 1 and sentenceid in v)
-        return set(tuple(v) for v in buckets.itervalues() if len(v) > 1)

if __name__ == '__main__':