[Normalizer] Lemmatizer returns a string (better for comparison)
author Simon Chabot <simon.chabot@logilab.fr>
Thu, 18 Oct 2012 18:15:25 +0200
changeset 24 85a396a8f66d
parent 23 d3b07dc25785
child 25 1af8af49bf1f
[Normalizer] Lemmatizer returns a string (better for comparison)
normalize.py
test/test_alignment.py
--- a/normalize.py	Thu Oct 18 17:58:53 2012 +0200
+++ b/normalize.py	Thu Oct 18 18:15:25 2012 +0200
@@ -18,6 +18,7 @@
 import re
 from logilab.common.textutils import unormalize
 from nltk.tokenize import WordPunctTokenizer
+from string import punctuation
 
 def lunormalize(sentence):
     """ Normalize a sentence (ie remove accents, set to lower, etc) """
@@ -43,7 +44,16 @@
 def lemmatized(sentence, lemmas, tokenizer = None):
     """ Return the lemmatized sentence
     """
-    return [lemmatized_word(w, lemmas) for w in tokenize(sentence, tokenizer)]
+    tokenized_sent = tokenize(sentence, tokenizer)
+    tokenized_sentformated = []
+    for w in tokenized_sent:
+        if w in ".,'" and len(tokenized_sentformated) > 0:
+            tokenized_sentformated[-1] += w
+        elif w not in punctuation:
+            tokenized_sentformated.append(w)
+
+    return ' '.join([lemmatized_word(w, lemmas)
+                     for w in tokenized_sentformated])
 
 def lemmatized_word(word, lemmas):
     """ Return the lemmatized word
--- a/test/test_alignment.py	Thu Oct 18 17:58:53 2012 +0200
+++ b/test/test_alignment.py	Thu Oct 18 18:15:25 2012 +0200
@@ -128,7 +128,9 @@
 
     def test_lemmatizer(self):
         self.assertEqual(lemmatized(u"J'aime les frites !", self.lemmas),
-                         [u'J', u"'", u'aimer', u'le', u'frite', u'!'])
+                         u'je aimer le frite')
+        self.assertEqual(lemmatized(u", J'aime les frites", self.lemmas),
+                         u'je aimer le frite')
 
     def test_round(self):
         self.assertEqual(roundstr(3.14159, 2), '3.14')