[minhashing] Add a verbose mode
author Simon Chabot <simon.chabot@logilab.fr>
Thu, 15 Nov 2012 14:38:15 +0100
changeset 142 b5ffd85f4711
parent 141 e1d817d82f62
child 143 e538838ee124
[minhashing] Add a verbose mode
minhashing.py
--- a/minhashing.py	Thu Nov 15 14:17:58 2012 +0100
+++ b/minhashing.py	Thu Nov 15 14:38:15 2012 +0100
@@ -45,9 +45,10 @@
     """ Operate minhashing + locally-sensitive-hashing to find similar sentences
     """
 
-    def __init__(self):
+    def __init__(self, verbose=False):
         self._trained = False
         self.sigmatrix = None
+        self._verbose = verbose
 
     def train(self, sentences, k=2, siglen=200):
         """ Train the minlsh on the given sentences.
@@ -59,7 +60,8 @@
         """
 
         rows, shape = self._buildmatrixdocument(sentences, k)
-        print "Training is done. Wait while signaturing"
+
+        if self._verbose: print "Training is done. Wait while signaturing"
 
         self._computesignaturematrix(rows, shape, siglen)
         self._trained = True
@@ -77,13 +79,15 @@
         """
 
         rows, universe, sizeofuniverse = [], {}, 0
-        for sent in sentences:
+        for nb, sent in enumerate(sentences):
             row = []
             for w in iter_wordgrams(sent, k):
                 row.append(universe.setdefault(w, sizeofuniverse))
                 if row[-1] == sizeofuniverse:
                     sizeofuniverse += 1
             rows.append(row)
+            if self._verbose and nb % 50000 == 0:
+                print nb
 
         return rows, (len(rows), sizeofuniverse)
 
@@ -111,6 +115,8 @@
             #Take the mininum of hashes
             sig[:, docind] = np.min(tmp[0], 1)
             docind += 1
+            if self._verbose and docind % 50000 == 0:
+                print (docind * 100) / nrows
         self.sigmatrix = sig
 
     def save(self, savefile):