[minhashing] Correctly build k grams in minhashing, closes #221665
author Vincent Michel <vincent.michel@logilab.fr>
Mon, 10 Mar 2014 10:30:38 +0000
changeset 389 9cb4aaf1a111
parent 388 3d6d250a4fa5
child 390 42092b0421da
[minhashing] Correctly build k grams in minhashing, closes #221665
test/test_minhashing.py
utils/minhashing.py
utils/normalize.py
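The bug: the old iter_wordgrams helper in utils/normalize.py only yielded slices of width exactly k, with the trailing slices silently shrinking at the end of the sentence, so the grams of width 1 to k-1 were never produced. The fix moves the generator onto Minlsh as _iter_wordgrams and makes it yield every gram of width 1 up to k. A minimal before/after sketch (illustrative only; the two helper names are made up for the example):

def old_iter_wordgrams(sentence, k):
    # Old behaviour (removed below): only slices of width k, and the
    # trailing slices silently shrink at the end of the sentence
    words = sentence.split(' ')
    for r in xrange(len(words)):
        yield ' '.join(words[r:r + k])

def new_iter_wordgrams(sentence, k):
    # New behaviour: every gram of width 1 up to k
    words = sentence.split(' ')
    for r in xrange(len(words)):
        for width in xrange(1, k + 1):
            if r + width <= len(words):
                yield ' '.join(words[r:r + width])

print list(old_iter_wordgrams('nom de la rose', 2))
# ['nom de', 'de la', 'la rose', 'rose']
print list(new_iter_wordgrams('nom de la rose', 2))
# ['nom', 'nom de', 'de', 'de la', 'la', 'la rose', 'rose']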
--- a/test/test_minhashing.py	Mon Mar 10 10:59:57 2014 +0000
+++ b/test/test_minhashing.py	Mon Mar 10 10:30:38 2014 +0000
@@ -15,14 +15,14 @@
 #
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
-
+from functools import partial
 import unittest2
 from os import path
 import random
 random.seed(6) ### Make sure tests are repeatable
 
 from nazca.utils.normalize import simplify
-from nazca.utils.minhashing import Minlsh
+from nazca.utils.minhashing import Minlsh, count_vectorizer_func
 from nazca.data import FRENCH_LEMMAS
 
 TESTDIR = path.dirname(__file__)
@@ -30,6 +30,24 @@
 
 
 class MinLSHTest(unittest2.TestCase):
+
+    def test_iter_wordgrams(self):
+        sentence = 'nom de la rose'
+        minlsh = Minlsh()
+        results = list(minlsh._iter_wordgrams(sentence, 2))
+        truth = ['nom de', 'nom', 'de la', 'de', 'la rose', 'la', 'rose']
+        self.assertEqual(len(results), len(truth))
+        self.assertEqual(set(results), set(truth))
+
+    def test_iter_wordgrams_sklearn(self):
+        sentences = ('nom de la rose', 'nom de la')
+        tokenizer_func = partial(count_vectorizer_func, min_n=1, max_n=2)
+        minlsh = Minlsh(tokenizer_func=tokenizer_func)
+        rows, shape = minlsh._buildmatrixdocument(sentences, 2)
+        self.assertEqual(shape, (2, 7))
+        self.assertEqual(rows[0], [0, 1, 2, 3, 4, 5, 6])
+        self.assertEqual(rows[1], [0, 1, 2, 4, 5])
+
     def test_all(self):
         sentences = [u"Un nuage flotta dans le grand ciel bleu.",
                      u"Des grands nuages noirs flottent dans le ciel.",
--- a/utils/minhashing.py	Mon Mar 10 10:59:57 2014 +0000
+++ b/utils/minhashing.py	Mon Mar 10 10:30:38 2014 +0000
@@ -23,9 +23,10 @@
 import numpy as np
 from scipy.optimize import bisect
 
-from nazca.utils.normalize import iter_wordgrams
 
-
+###############################################################################
+### UTILITY FUNCTIONS #########################################################
+###############################################################################
 def randomhashfunction(zr):
     """ Return a random hash function, mapping x in Z to ZR
         h:x -> ax + b mod R
@@ -40,14 +41,41 @@
 
     return hashfunc
 
+def count_vectorizer_func(sentences, min_n, max_n):
+    """ Perform tokenization using scikit-learn
+    """
+    from sklearn.feature_extraction.text import CountVectorizer
+    count_vec = CountVectorizer(ngram_range=(min_n, max_n))
+    # Transform and convert to lil to get the rows of the sparse matrix
+    data = count_vec.fit_transform(sentences).tolil()
+    return [list(l) for l in data.rows], data.shape
 
+
+###############################################################################
+### MINHASHING ################################################################
+###############################################################################
 class Minlsh(object):
     """ Operate minhashing + locally-sensitive-hashing to find similar sentences
     """
 
-    def __init__(self, verbose=False):
+    def __init__(self, tokenizer_func=None, verbose=False):
+        """ Initialize a minhashing/lsh object
+
+        Parameters:
+        ==========
+
+           * tokenizer_func is a function that take the sentences
+             as argument and return the rows of the sparse matrix
+             of tokens, and its shape.
+
+           * verbose is a boolean that trigger the display of
+             some informations
+        """
         self._trained = False
         self.sigmatrix = None
+        if tokenizer_func:
+            # Use the given tokenizer_func (the k passed by train() is ignored)
+            self._buildmatrixdocument = lambda x, y: tokenizer_func(x)
         self._verbose = verbose
 
     def train(self, sentences, k=2, siglen=200):
@@ -58,14 +86,20 @@
             - `siglen` the length of the sentences signature
 
         """
-
         rows, shape = self._buildmatrixdocument(sentences, k)
-
-        if self._verbose: print "Training is done. Wait while signaturing"
-
+        if self._verbose:
+            print "Training is done. Wait while signaturing"
         self._computesignaturematrix(rows, shape, siglen)
         self._trained = True
 
+    def _iter_wordgrams(self, sentence, k):
+        """ Generator of k-wordgrams on the given sentence
+        """
+        words = sentence.split(' ')
+        for r in xrange(len(words)):
+            for width in xrange(1, k + 1):
+                if r + width <= len(words):
+                    yield ' '.join(words[r:r + width])
 
     def _buildmatrixdocument(self, sentences, k):
         """ Return a sparse matrix where :
@@ -77,11 +111,11 @@
             sentence c, 0 otherwise
 
         """
-
+        # Default mode: build the universe of all 1..k wordgrams
         rows, universe, sizeofuniverse = [], {}, 0
         for nb, sent in enumerate(sentences):
             row = []
-            for w in iter_wordgrams(sent, k):
+            for w in self._iter_wordgrams(sent, k):
                 row.append(universe.setdefault(w, sizeofuniverse))
                 if row[-1] == sizeofuniverse:
                     sizeofuniverse += 1
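Taken together, the two tokenization paths available after this changeset can be exercised as follows (a minimal sketch; the two short sentences only demonstrate the API and are not a realistic training corpus):

from functools import partial
from nazca.utils.minhashing import Minlsh, count_vectorizer_func

sentences = (u'nom de la rose', u'nom de la')

# Default path: _iter_wordgrams builds all 1..k wordgrams internally
minlsh = Minlsh()
minlsh.train(sentences, k=2, siglen=200)

# scikit-learn path: tokenization is fully delegated to tokenizer_func,
# and the k argument of train() is ignored
minlsh = Minlsh(tokenizer_func=partial(count_vectorizer_func,
                                       min_n=1, max_n=2))
minlsh.train(sentences, siglen=200)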
--- a/utils/normalize.py	Mon Mar 10 10:59:57 2014 +0000
+++ b/utils/normalize.py	Mon Mar 10 10:30:38 2014 +0000
@@ -140,14 +140,6 @@
             chunks.append(chunk)
     return chunks
 
-def iter_wordgrams(sentence, k):
-    """ Generator of k-wordgrams on the given sentence
-    """
-    words = sentence.split(' ')
-    #XXX Call tokenizer
-    for r in xrange(len(words)):
-        yield ' '.join(words[r:r + k])
-
 def lemmatized(sentence, lemmas, tokenizer=None):
     """ Return the lemmatized sentence
     """