author Simon Chabot Mon, 12 Nov 2012 17:22:18 +0100 changeset 112 b85774902147 parent 111 27967dcdd5cc child 113 ecfd9543df77
[minhashing] Faster signaturing using numpy
 minhashing.py file | annotate | diff | comparison | revisions
```--- a/minhashing.py	Mon Nov 12 16:46:46 2012 +0100
+++ b/minhashing.py	Mon Nov 12 17:22:18 2012 +0100
@@ -20,7 +20,7 @@
from random import randint
from collections import defaultdict

-from numpy import ones
+import numpy as np
from scipy.sparse import lil_matrix
from scipy.optimize import bisect

@@ -89,7 +89,7 @@
matrixdoc.rows = rows
matrixdoc.data = data

-        return matrixdoc.T
+        return matrixdoc

def _signaturematrix(self, matrixdocument, siglen):
""" Return a matrix where each column is the signature the document
@@ -99,15 +99,18 @@
"""

nrows, ncols = matrixdocument.shape
-        sig = ones((siglen, ncols)) * (nrows + 1)
-        hashfunc = [randomhashfunction(nrows) for _ in xrange(siglen)]
+        sig = np.empty((siglen, nrows))
+        #Generate the random hash functions
+        hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
+        #Compute hashing values
+        hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
+                                for i in  xrange(siglen)])

-        for r in xrange(nrows):
-            hashrs = [(i, func(r)) for i, func in enumerate(hashfunc)]
-            for c in matrixdocument.rows[r]:
-                for i, hashr in hashrs:
-                    if hashr < sig[i, c]:
-                        sig[i, c] = hashr
+        for docind, doc in enumerate(matrixdocument.rows):
+            #Concatenate the needed rows.
+            tmp = np.dstack([hashvalues[:,r] for r in doc])
+            #Take the minimum of hashes
+            sig[:,docind] = np.min(tmp[0], 1)
return sig

def save(self, savefile):```