[Normalizer] Add a normaliser (related to #128998)
author Simon Chabot <simon.chabot@logilab.fr>
Thu, 18 Oct 2012 10:19:15 +0200
changeset 15 3940c3c651a3
parent 14 e02ce920aff6
child 16 7db50b138900
[Normalizer] Add a normaliser (related to #128998)

- unormalize
- tokenize
- lemmatize
- round
distances.py
normalize.py
test/test_alignment.py
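
A rough usage sketch of the new class (not part of the patch; it assumes nltk
is installed and that the tab-separated lemma file shipped in data/ is used,
so the lemmatized output depends on that data):

    from cubes.alignment.normalize import Normalizer

    normalizer = Normalizer('data/french_lemmas.txt')
    normalizer.unormalize(u'Éléphant')             # -> u'elephant'
    normalizer.tokenize(u"J'aime les frites !")    # -> [u'J', u"'", u'aime', u'les', u'frites', u'!']
    normalizer.lemmatized(u"J'aime les frites !")  # -> [u'J', u"'", u'aimer', u'le', u'frite', u'!']
    normalizer.round(3.14159, 2)                   # -> '3.14'
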
--- a/distances.py	Wed Oct 17 16:47:07 2012 +0200
+++ b/distances.py	Thu Oct 18 10:19:15 2012 +0200
@@ -137,4 +137,7 @@
     return abs(diff.days)
 
 def euclidean(a, b):
-    return abs(a - b)
+    try:
+        return abs(a - b)
+    except TypeError:
+        return abs(float(a) - float(b))
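
With this fallback, euclidean also accepts values that do not support
subtraction directly, such as numeric strings (sketch, not part of the patch):

    euclidean(10, 11)      # -> 1, unchanged behaviour
    euclidean('22', '11')  # -> 11.0, via the float() fallback
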
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/normalize.py	Thu Oct 18 10:19:15 2012 +0200
@@ -0,0 +1,62 @@
+# -*- coding:utf-8 -*-
+
+from logilab.common.textutils import unormalize
+from nltk.tokenize import WordPunctTokenizer 
+
+class Normalizer(object):
+    """ Use an object of this class to normalize your data """
+
+    def __init__(self, lemmasfilename='data/french_lemmas.txt'):
+        self.lemmas = None
+        self.lemmasfilename = lemmasfilename
+
+    def unormalize(self, sentence):
+        """ Normalize a sentence (ie remove accents, set to lower, etc) """
+        return unormalize(sentence).lower()
+
+    def tokenize(self, sentence, tokenizer=None):
+        """ Tokenize a sentence.
+            Use ``tokenizer`` if given, otherwise
+            nltk.tokenize.regexp.WordPunctTokenizer.
+
+            In any case, the tokenizer must provide a ``tokenize()`` method.
+        """
+        tokenizer = tokenizer or WordPunctTokenizer
+        return tokenizer().tokenize(sentence)
+
+    def _deflemmas(self):
+        """ Return the default lemmas dictionary
+        """
+        return dict(line.strip().split('\t')
+                    for line in open(self.lemmasfilename)
+                    if len(line.strip().split('\t')) == 2)
+
+    def lemmatized(self, sentence, tokenizer=None, lemmas=None):
+        """ Return the lemmatized sentence
+        """
+        self.lemmas = lemmas or self.lemmas or self._deflemmas()
+        return [self.lemmatized_word(w, self.lemmas)
+                for w in self.tokenize(sentence, tokenizer)]
+
+    def lemmatized_word(self, word, lemmas=None):
+        """ Return the lemmatized word
+        """
+        self.lemmas = lemmas or self.lemmas or self._deflemmas()
+        lemma = self.lemmas.get(word.lower(), word)
+        if '|' in lemma:
+            _words = lemma.split('|')
+            if word.lower() in _words:
+                lemma = word.lower()
+            else:
+                lemma = _words[0]
+        return lemma
+
+    def round(self, number, ndigits=0):
+        """ Return a string of ``number`` rounded to a given precision
+            in decimal digits (default 0 digits).
+
+            If ``number`` is not a float, this method casts it to a float
+            (an exception may be raised if the cast is not possible).
+        """
+
+        return format(round(float(number), ndigits), '0.%df' % ndigits)
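
For reference, _deflemmas() expects one tab-separated "word<TAB>lemma" pair
per line, and a lemma may hold several "|"-separated alternatives handled by
lemmatized_word(). Hypothetical excerpt (the real data ships as
data/french_lemmas.txt):

    aime	aimer
    les	le
    frites	frite
    suis	être|suivre
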
--- a/test/test_alignment.py	Wed Oct 17 16:47:07 2012 +0200
+++ b/test/test_alignment.py	Thu Oct 18 10:19:15 2012 +0200
@@ -1,3 +1,5 @@
+# -*- coding:utf-8 -*-
+#
 # copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 # contact http://www.logilab.fr -- mailto:contact@logilab.fr
 #
@@ -40,6 +42,7 @@
 from cubicweb.devtools import testlib
 from cubes.alignment.distances import (levenshtein, soundex, soundexcode, \
                                        jaccard, temporal, euclidean)
+from cubes.alignment.normalize import Normalizer
 
 class DistancesTest(testlib.CubicWebTC):
     def test_levenshtein(self):
@@ -101,6 +104,28 @@
         self.assertEqual(euclidean(10, 11), 1)
         self.assertEqual(euclidean(-10, 11), 21)
 
+class NormalizerTestCase(testlib.CubicWebTC):
+    def setUp(self):
+        self.normalizer = Normalizer('../data/french_lemmas.txt')
+
+    def test_unormalize(self):
+        self.assertEqual(self.normalizer.unormalize(u'bépoèàÀêùï'),
+                         u'bepoeaaeui')
+
+    def test_tokenize(self):
+        self.assertEqual(self.normalizer.tokenize(u"J'aime les frites !"),
+                         [u'J', u"'", u'aime', u'les', u'frites', u'!'])
+
+    def test_lemmatizer(self):
+        self.assertEqual(self.normalizer.lemmatized(u"J'aime les frites !"),
+                         [u'J', u"'", u'aimer', u'le', u'frite', u'!'])
+
+    def test_round(self):
+        self.assertEqual(self.normalizer.round(3.14159, 2), '3.14')
+        self.assertEqual(self.normalizer.round(3.14159), '3')
+        self.assertEqual(self.normalizer.round('3.14159', 3), '3.142')
+
+
 if __name__ == '__main__':
     from logilab.common.testlib import unittest_main
     unittest_main()