[minhashing] consume less memory
authorSimon Chabot <simon.chabot@logilab.fr>
Thu, 15 Nov 2012 09:38:52 +0100
changeset 138 1e3d51f347d5
parent 137 96791d189d48
child 139 f29c1b937abb
[minhashing] consume less memory Storing the whole document matrix was useless because it is a boolean, sparse matrix. Every stored element is equal to one, so it is useless to store the data list: only the positions matter. During the signature step, rows that have already been read are never used again, so they are deleted to save memory.
minhashing.py
--- a/minhashing.py	Wed Nov 14 17:45:49 2012 +0100
+++ b/minhashing.py	Thu Nov 15 09:38:52 2012 +0100
@@ -58,10 +58,10 @@
 
         """
 
-        matrixdocument = self._buildmatrixdocument(sentences, k)
+        rows, shape = self._buildmatrixdocument(sentences, k)
         print "Training is done. Wait while signaturing"
 
-        self.sigmatrix = self._signaturematrix(matrixdocument, siglen)
+        self._computesignaturematrix(rows, shape, siglen)
         self._trained = True
 
 
@@ -76,7 +76,7 @@
 
         """
 
-        rows, data, universe, sizeofuniverse = [], [], {}, 0
+        rows, universe, sizeofuniverse = [], {}, 0
         for sent in sentences:
             row = []
             for w in iter_wordgrams(sent, k):
@@ -84,22 +84,17 @@
                 if row[-1] == sizeofuniverse:
                     sizeofuniverse += 1
             rows.append(row)
-            data.append([1] * len(row))
 
-        matrixdoc = lil_matrix((len(rows), sizeofuniverse))
-        matrixdoc.rows = rows
-        matrixdoc.data = data
+        return rows, (len(rows), sizeofuniverse)
 
-        return matrixdoc
-
-    def _signaturematrix(self, matrixdocument, siglen):
+    def _computesignaturematrix(self, rows, shape, siglen):
         """ Return a matrix where each column is the signature the document
             The signature is composed of `siglen` numbers
 
             The more the documents have rows in commun, the closer they are.
         """
 
-        nrows, ncols = matrixdocument.shape
+        nrows, ncols = shape
         sig = np.empty((siglen, nrows))
         #Generate the random hash functions
         hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
@@ -108,12 +103,15 @@
         hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
                                 for i in  xrange(siglen)])
 
-        for docind, doc in enumerate(matrixdocument.rows):
+        docind = 0
+        while rows:
+            doc = rows.pop(0)
             #Concatenate the needed rows.
-            tmp = np.dstack([hashvalues[:,r] for r in doc])
+            tmp = np.dstack([hashvalues[:, r] for r in doc])
             #Take the mininum of hashes
-            sig[:,docind] = np.min(tmp[0], 1)
-        return sig
+            sig[:, docind] = np.min(tmp[0], 1)
+            docind += 1
+        self.sigmatrix = sig
 
     def save(self, savefile):
         """ Save the training into `savefile` for a future use """