[Minhashing] Api fonctionnelle
authorSimon Chabot <simon.chabot@logilab.fr>
Wed, 24 Oct 2012 11:29:08 +0200
changeset 37 dcdb1b91f33a
parent 36 ec24394673eb
child 38 9bcbce18a9ae
[Minhashing] Api fonctionnelle
minhashing.py
normalize.py
test.py
--- a/minhashing.py	Mon Oct 22 18:17:17 2012 +0200
+++ b/minhashing.py	Wed Oct 24 11:29:08 2012 +0200
@@ -16,50 +16,13 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
 
-from nltk import ngrams
 from scipy.sparse import lil_matrix
-from scipy import matrix
-from numpy.random import shuffle
-from numpy import arange, ones
+from numpy import ones
 from random import randint
-
-def kshingle(content, k = 7):
-    """ Return the set of k-grams of content
-
-        eg : content = 'abcab'
-             k = 2
-
-             k-grams are : 'ab', 'bc', 'ca', 'ab'
-             return is set('ab', 'ba', 'ca')
-    """
-
-
-    return set(''.join(t) for t in ngrams(content, k))
+from collections import defaultdict
+from time import time
 
-def matrixdocument(sentences, k = 7):
-    """ Return a sparse matrix where :
-
-        - Each sentence is a column
-        - Each row is a element of the universal set
-
-        Each value (r, c) is set to 1 if the element at row r is in the sentence
-        c, 0 otherwise
-
-    """
-
-    sets = []
-    universe = set()
-    for sent in sentences:
-        sets.append(kshingle(sent, k))
-        universe = universe.union(sets[-1])
-
-    matrixdoc = lil_matrix((len(universe), len(sets)))
-
-    for inde, elt in enumerate(universe):
-        for inds, curset in enumerate(sets):
-            matrixdoc[inde, inds] = int(elt in curset)
-
-    return matrixdoc
+from cubes.alignment.normalize import wordgrams
 
 def randomhashfunction(zr):
     """ Return a random hash function, mapping x in Z to ZR
@@ -70,52 +33,123 @@
     b = randint(1, zr - 1)
 
     def hashfunc(x):
-        return (a * x + b) % zr
+        return ((a * x + b) % zr)
 
     return hashfunc
 
-def signaturematrix(matrixdocument, siglen = 4):
-    """ Return a matrix where each column is the signature the document
-        The signature is composed of siglen number
 
-        The more the document have rows in commun, the closer they are.
+class Minlsh(object):
+    """ Operate minhashing + locally-sensitive-hashing to find similar sentences
     """
 
-    nrows, ncols = matrixdocument.shape
-    sig = ones((siglen, ncols)) * (nrows + 1)
-    hashfunc = [randomhashfunction(nrows) for _ in xrange(siglen)]
+    def __init__(self):
+        self._trained = False
+        self.sigmatrix = None
+
+    def train(self, sentences, k = 2, siglen = 200, dispTime = False):
+        """ Train the minlsh on the given sentences.
 
-    for r in xrange(nrows):
-        for c in matrixdocument.rows[r]:
-            for i, func in enumerate(hashfunc):
-                hashr = func(r)
-                if hashr < sig[i, c]:
-                    sig[i, c] = hashr
-    return sig
+            - `k` is the length of the k-wordgrams used
+              (the lower k is, the faster is the training)
+            - `siglen` the length of the sentences signature
+            - `dispTime` is used for whether the left time should be displayed
+              or not.
 
-def colsimilar(signature, threahold = 0.5):
-    """ Read the signature matrix return index (i, j) if column (ie sentence) i
-        and j are similar
+        """
+
+        matrixdocument = self._buildmatrixdocument(sentences, k, dispTime)
+        if dispTime: print "Training is done. Wait while signaturing"
+
+        self.sigmatrix = self._signaturematrix(matrixdocument, siglen)
+        self._trained = True
 
 
-        /!\ This function should be used for testing purpose only
-    """
-    def sim(a, b):
-        eq = (v[a] == v[b]).sum()
-        return 1.0 * eq / len(v[a])
+    def _buildmatrixdocument(self, sentences, k, dispTime):
+        """ Return a sparse matrix where :
+
+            - Each sentence is a column
+            - Each row is a element of the universal set
+
+            Each value (r, c) is set to 1 if the element at row r is in the
+            sentence c, 0 otherwise
+
+        """
+
+        sets = []
+        universe = set()
+        for sent in sentences:
+            sets.append([w for w in wordgrams(sent, k)])
+        universe = universe.union(*sets)
+        matrixdoc = lil_matrix((len(universe), len(sets)))
+
+        univlen = len(universe)
+        if dispTime:
+            prev, cur = None, time()
+        for inde, elt in enumerate(universe):
+            if dispTime and inde and inde % 200 == 0:
+                prev, cur = cur, time()
+                dt = cur - prev
+                timeleft = int(((univlen - inde) * dt) / 200)
+                print "Time left : %(min)d min %(sec)d s" % {
+                          'min' : timeleft / 60,
+                          'sec' : timeleft - 60 * (timeleft / 60)
+                        }
+            for inds, curset in enumerate(sets):
+                matrixdoc[inde, inds] = int(elt in curset)
+
+        return matrixdoc
+
+    def _signaturematrix(self, matrixdocument, siglen):
+        """ Return a matrix where each column is the signature the document
+            The signature is composed of siglen number
 
-    similarity = []
-    v = [signature[:, i] for i in xrange(signature.shape[1])]
-    for i in xrange(len(v)):
-        for j in xrange(len(v)):
-            if sim(i, j) >= threahold:
-                similarity.append((i, j))
-    return similarity
+            The more the document have rows in commun, the closer they are.
+        """
+
+        nrows, ncols = matrixdocument.shape
+        sig = ones((siglen, ncols)) * (nrows + 1)
+        hashfunc = [randomhashfunction(nrows) for _ in xrange(siglen)]
+
+        for r in xrange(nrows):
+            hashrs = [(i, func(r)) for i, func in enumerate(hashfunc)]
+            for c in matrixdocument.rows[r]:
+                for i, hashr in hashrs:
+                    if hashr < sig[i, c]:
+                        sig[i, c] = hashr
+        return sig
+
+    def findsimilarsentences(self, bandsize, sentenceid = -1, dispThreshold = False):
+        """ Return a set of tuples of *possible* similar sentences
+
+            If 0 <= sentenceid <= nbsentences is given:
+                return a set of tuples of *possible* similar sentences to this
+                one
+        """
+
+        if not self._trained:
+            print "Train it before"
+            return
+
+        col = [self.sigmatrix[:, i] for i in xrange(self.sigmatrix.shape[1])]
+        buckets = defaultdict(set)
+
+        nbbands = int(self.sigmatrix.shape[0] / bandsize)
+        print "threshold is %.3f" % pow(1./nbbands, 1./bandsize)
+        for r in xrange(0, self.sigmatrix.shape[0], bandsize):
+            for i in xrange(len(col)):
+                stri = ''.join(str(val) for val in col[i][r:r+bandsize])
+                buckets[hash(stri)].add(i)
+
+        if sentenceid < 0 or sentenceid >= self.sigmatrix.shape[0]:
+            return set(tuple(v) for v in buckets.values() if len(v) > 1)
+
+        return set(tuple(v) for v in buckets.values()
+                   if len(v) > 1 and sentenceid in v)
 
 if __name__ == '__main__':
-    from normalize import (loadlemmas, lemmatized)
+    from cubes.alignment.normalize import (loadlemmas, simplify)
 
-    sentences = ["j'aime le poisson", "le poisson c'est bon", 
+    sentences = ["j'aime le poisson", "le poisson c'est bon",
                  "je cuis le poisson", "je fais du sport",
                  "le sport c'est bon pour la sante",
                  "pour la sante le sport est bon",
@@ -124,18 +158,16 @@
                  "les carottes sont cuites"]
 
     lemmas = loadlemmas('data/french_lemmas.txt')
-    m = matrixdocument([lemmatized(s, lemmas) for s in sentences], 3)
-    signature = signaturematrix(m, 100)
-    simi = colsimilar(signature)
+    minlsh = Minlsh()
+    minlsh.train((simplify(s, lemmas) for s in sentences), 1, 200)
 
     print 'Les phrases sont : '
     for s in sentences:
         print ' - %s' % s
 
-    print '\nLes phrases similaires sont : '
-    (ip, _) = simi[0]
-    for (i, j) in simi:
-        if ip != i:
-            print
-        print ' -', sentences[i], '---', sentences[j]
-        ip = i
+    print '\nLes phrases *possiblement* similaires sont : '
+    for s in minlsh.findsimilarsentences(5):
+        for e in s:
+            print ' -', sentences[e]
+        print
+
--- a/normalize.py	Mon Oct 22 18:17:17 2012 +0200
+++ b/normalize.py	Wed Oct 24 11:29:08 2012 +0200
@@ -24,6 +24,18 @@
     """ Normalize a sentence (ie remove accents, set to lower, etc) """
     return unormalize(sentence).lower()
 
+def simplify(sentence, lemmas = None):
+    if lemmas:
+        sentence = lemmatized(sentence, lemmas)
+    sentence = sentence.lower()
+    cleansent = ''
+    for s in sentence:
+        if s not in punctuation:
+            cleansent += s
+
+    return cleansent
+
+
 def tokenize(sentence, tokenizer = None):
     """ Tokenize a sentence.
         Use ``tokenizer`` if given, else
@@ -34,6 +46,13 @@
     tokenizer = tokenizer or WordPunctTokenizer
     return [w for w in tokenizer().tokenize(sentence)]
 
+def wordgrams(sentence, k):
+    """ Generator of k-wordgrams on the given sentence
+    """
+    words = sentence.split(' ')
+    for r in xrange(len(words)):
+        yield ' '.join(words[r:r + k])
+
 def loadlemmas(filename):
     """ Return the default lemmas dictionnary
     """
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test.py	Wed Oct 24 11:29:08 2012 +0200
@@ -0,0 +1,20 @@
+# -*- coding:utf-8 -*-
+
+def dbpediasent(filename, maxind = None):
+    fobj = open(filename)
+    fobj.readline()
+    for ind, line in enumerate(fobj):
+        if maxind and ind >= maxind:
+            break
+        line = line.strip().decode('utf-8')
+        line = line.split('> "')[-1].split('"@fr')[0]
+        if not line:
+            continue
+        yield line
+
+def printsents(filename, indastr, maxind = None):
+    ind = [int(i) for i in indastr.split(' ')]
+    for i, s in enumerate(dbpediasent(filename, maxind)):
+        if i in ind:
+            print s
+            print