[normalize] Using a class was a bad idea, so I removed it
author Simon Chabot <simon.chabot@logilab.fr>
date Thu, 18 Oct 2012 17:16:44 +0200
changeset 21 2f077077b266
parent 20 16f66a0aaa0e
child 22 0c3ef4909658
distances.py
normalize.py
test/test_alignment.py
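
The gist of the change: the Normalizer class carried no state worth an object (a lemmas file name and a lazily loaded lemmas dict), so its methods become plain module-level functions and the lemmas dictionary is loaded once and passed explicitly. A minimal before/after sketch of the intended usage (the path and the sample sentence are taken from the tests; exact results depend on the lemmas file):

    # before: everything went through a Normalizer instance
    from cubes.alignment.normalize import Normalizer
    normalizer = Normalizer('data/french_lemmas.txt')
    normalizer.lemmatized(u"J'aime les frites !")

    # after: plain functions; the lemmas dict is loaded once and passed around
    from cubes.alignment.normalize import loadlemmas, lemmatized
    lemmas = loadlemmas('data/french_lemmas.txt')
    lemmatized(u"J'aime les frites !", lemmas)
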
--- a/distances.py	Thu Oct 18 16:35:18 2012 +0200
+++ b/distances.py	Thu Oct 18 17:16:44 2012 +0200
@@ -49,6 +49,10 @@
         .:: wiki_ : https://en.wikipedia.org/wiki/Soundex
     """
 
+    if ' ' in word:
+        words = word.split(' ')
+        return ' '.join([soundexcode(w.strip(), language) for w in words])
+
     vowels = 'AEHIOUWY'
     if language.lower() == 'french' :
         consonnantscode = { 'B' : '1', 'P' : '1',
@@ -99,7 +103,8 @@
     """ Return the 1/0 distance between the soundex code of stra and strb.
         0 means they have the same code, 1 they don't
     """
-    return 0 if soundexcode(stra, language) == soundexcode(strb, language) else 1
+    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
+             else 1
 
 def jaccard(stra, strb):
     """ Return the jaccard distance between stra and strb, condering the letters
--- a/normalize.py	Thu Oct 18 16:35:18 2012 +0200
+++ b/normalize.py	Thu Oct 18 17:16:44 2012 +0200
@@ -17,79 +17,69 @@
 
 import re
 from logilab.common.textutils import unormalize
-from nltk.tokenize import WordPunctTokenizer 
-
-class Normalizer(object):
-    """ Use an object of this class to normalize your data """
-
-    def __init__(self, lemmasfilename = 'data/french_lemmas.txt'):
-        self.lemmas = None
-        self.lemmasfilename = lemmasfilename
+from nltk.tokenize import WordPunctTokenizer
 
-    def unormalize(self, sentence):
-        """ Normalize a sentence (ie remove accents, set to lower, etc) """
-        return unormalize(sentence).lower()
+def lunormalize(sentence):
+    """ Normalize a sentence (ie remove accents, set to lower, etc) """
+    return unormalize(sentence).lower()
 
-    def tokenize(self, sentence, tokenizer = None):
-        """ Tokenize a sentence.
-            Use ``tokenizer`` if given, else 
-            nltk.tokenize.regexp.WordPunctTokenizer
+def tokenize(sentence, tokenizer = None):
+    """ Tokenize a sentence.
+        Use ``tokenizer`` if given, else
+        nltk.tokenize.regexp.WordPunctTokenizer
 
-            Anyway, tokenizer must have a ``tokenize()`` method
-        """
-        tokenizer = tokenizer or WordPunctTokenizer
-        return [w for w in tokenizer().tokenize(sentence)]
+        In any case, the tokenizer must have a ``tokenize()`` method
+    """
+    tokenizer = tokenizer or WordPunctTokenizer
+    return [w for w in tokenizer().tokenize(sentence)]
 
-    def _deflemmas(self):
-        """ Return the default lemmas dictionnary
-        """
-        return dict([line.strip().split('\t') 
-                     for line in open(self.lemmasfilename)
-                         if len(line.strip().split('\t'))==2])
+def loadlemmas(filename):
+    """ Return the default lemmas dictionnary
+    """
+    return dict([line.strip().split('\t')
+                 for line in open(filename)
+                     if len(line.strip().split('\t'))==2])
 
-    def lemmatized(self, sentence, tokenizer = None, lemmas = None):
-        """ Return the lemmatized sentence
-        """
-        self.lemmas = lemmas or self.lemmas or self._deflemmas()
-        return [self.lemmatized_word(w, self.lemmas)
-                for w in self.tokenize(sentence, tokenizer)]
+def lemmatized(sentence, lemmas, tokenizer = None):
+    """ Return the lemmatized sentence
+    """
+    return [lemmatized_word(w, lemmas) for w in tokenize(sentence, tokenizer)]
 
-    def lemmatized_word(self, word, lemmas = None):
-        """ Return the lemmatized word
-        """
-        self.lemmas = lemmas or self.lemmas or self._deflemmas()
-        lemma = lemmas.get(word.lower(), word)
-        if '|' in lemma:
-            _words = lemma.split('|')
-            if word.lower() in _words:
-                lemma = word.lower()
-            else:
-                lemma = _words[0]
-        return lemma
+def lemmatized_word(word, lemmas):
+    """ Return the lemmatized word
+    """
+    lemma = lemmas.get(word.lower(), word)
+    if '|' in lemma:
+        _words = lemma.split('|')
+        if word.lower() in _words:
+            lemma = word.lower()
+        else:
+            lemma = _words[0]
+    return lemma
 
-    def round(self, number, ndigits = 0):
-        """Return an unicode string of ``number`` rounded to a given precision
-            in decimal digits (default 0 digits)
+def roundstr(number, ndigits = 0):
+    """Return an unicode string of ``number`` rounded to a given precision
+        in decimal digits (default 0 digits)
 
-            If ``number`` is not a float, this method casts it to a float. (An
-            exception can be raised if it's not possible)
-        """
+        If ``number`` is not a float, this function casts it to a float. (An
+        exception can be raised if it's not possible)
+    """
 
-        return format(round(float(number), ndigits), '0.%df' % ndigits)
+    return format(round(float(number), ndigits), '0.%df' % ndigits)
 
-    def format(self, string, regexp, output):
-        """ Apply the regexp to the ``string`` and return a formatted string
-        according to ``output``
+def rgxformat(string, regexp, output):
+    """ Apply the regexp to the ``string`` and return a formatted string
+    according to ``output``
 
-        eg :
-         normalizer.format(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
-                           r'\[(?P<firstname>\w+) (?p<lastname>\w+) - '
-                           r'(?P<birthdate>.*?) / (?<deathdate>.*?)\]',
-                           u'%(lastname)s, %(firstname)s (%(birthdate)s -'
-                           u'%(deathdate)s)')
+    e.g.:
+        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
+                  r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
+                  r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
+                  u'%(lastname)s, %(firstname)s (%(birthdate)s -'
+                  u'%(deathdate)s)')
 
-         would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
-         """
+    would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
+    """
 
-        match = re.match(regexp, string)
-        return output % match.groupdict()
+    match = re.match(regexp, string)
+    return output % match.groupdict()
--- a/test/test_alignment.py	Thu Oct 18 16:35:18 2012 +0200
+++ b/test/test_alignment.py	Thu Oct 18 17:16:44 2012 +0200
@@ -42,7 +42,8 @@
 from cubicweb.devtools import testlib
 from cubes.alignment.distances import (levenshtein, soundex, soundexcode, \
                                        jaccard, temporal, euclidean)
-from cubes.alignment.normalize import Normalizer
+from cubes.alignment.normalize import (lunormalize, loadlemmas, lemmatized, \
+                                       roundstr, rgxformat, tokenize)
 
 class DistancesTest(testlib.CubicWebTC):
     def test_levenshtein(self):
@@ -114,37 +115,37 @@
 
 class NormalizerTestCase(testlib.CubicWebTC):
     def setUp(self):
-        self.normalizer = Normalizer('../data/french_lemmas.txt')
+        self.lemmas = loadlemmas('../data/french_lemmas.txt')
 
     def test_unormalize(self):
-        self.assertEqual(self.normalizer.unormalize(u'bépoèàÀêùï'),
-                                                    u'bepoeaaeui')
+        self.assertEqual(lunormalize(u'bépoèàÀêùï'),
+                                     u'bepoeaaeui')
 
     def test_tokenize(self):
-        self.assertEqual(self.normalizer.tokenize(u"J'aime les frites !"),
+        self.assertEqual(tokenize(u"J'aime les frites !"),
                          [u'J', u"'", u'aime', u'les', u'frites', u'!',])
 
     def test_lemmatizer(self):
-        self.assertEqual(self.normalizer.lemmatized(u"J'aime les frites !"),
+        self.assertEqual(lemmatized(u"J'aime les frites !", self.lemmas),
                          [u'J', u"'", u'aimer', u'le', u'frite', u'!'])
 
     def test_round(self):
-        self.assertEqual(self.normalizer.round(3.14159, 2), '3.14')
-        self.assertEqual(self.normalizer.round(3.14159), '3')
-        self.assertEqual(self.normalizer.round('3.14159', 3), '3.142')
+        self.assertEqual(roundstr(3.14159, 2), '3.14')
+        self.assertEqual(roundstr(3.14159), '3')
+        self.assertEqual(roundstr('3.14159', 3), '3.142')
 
     def test_format(self):
         string = u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]'
         regex  = r'\[(?P<firstname>\w+) (?P<lastname>\w+) - ' \
                  r'(?P<birthdate>.*) \/ (?P<deathdate>.*?)\]'
         output = u'%(lastname)s, %(firstname)s (%(birthdate)s - %(deathdate)s)'
-        self.assertEqual(self.normalizer.format(string, regex, output),
+        self.assertEqual(rgxformat(string, regex, output),
                          u'Hugo, Victor (26 fev 1802 - 22 mai 1885)')
 
         string = u'http://perdu.com/42/supertop/cool'
         regex  = r'http://perdu.com/(?P<id>\d+).*'
         output = u'%(id)s'
-        self.assertEqual(self.normalizer.format(string, regex, output),
+        self.assertEqual(rgxformat(string, regex, output),
                          u'42')
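
Side note on the distances.py hunk above: soundexcode() now handles strings containing spaces by encoding each word separately and joining the per-word codes with a space, so soundex() can also compare multi-word strings. A rough illustration (the concrete codes depend on the language table, so none are asserted here):

    from cubes.alignment.distances import soundexcode

    # each whitespace-separated word gets its own code,
    # and the codes are joined back with a single space
    code = soundexcode(u'Victor Hugo', 'french')
    assert len(code.split(' ')) == 2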