[ner] Disambiguation word should be case-insensitive, closes #200147
author Vincent Michel <vincent.michel@logilab.fr>
Wed, 08 Jan 2014 16:54:19 +0000
changeset 382 9494bce3a7a9
parent 381 f6b7eff50f7f
child 383 33aaa7c273c1
[ner] Disambiguation word should be case-insensitive, closes #200147
ner/filters.py
test/test_filters.py
--- a/ner/filters.py	Wed Jan 08 16:17:46 2014 +0100
+++ b/ner/filters.py	Wed Jan 08 16:54:19 2014 +0000
@@ -82,9 +82,9 @@
         # Replace named entities
         filtered_named_entities = []
         for uri, peid, token in named_entities:
-            if token.word in parts:
+            if token.word.lower() in parts:
                 # Change URI
-                uri = parts[token.word]
+                uri = parts[token.word.lower()]
             filtered_named_entities.append((uri, peid, token))
         return filtered_named_entities
 
--- a/test/test_filters.py	Wed Jan 08 16:17:46 2014 +0100
+++ b/test/test_filters.py	Wed Jan 08 16:54:19 2014 +0000
@@ -81,6 +81,22 @@
                            Token(word='toto', start=21, end=25,
                                  sentence=Sentence(indice=1, start=16, end=26)))])
 
+    def test_disambiguation_word_case(self):
+        """ Test that disambiguation word matching is case-insensitive """
+        text = 'Hello Toto Tutu. And Toto.'
+        source = NerSourceLexicon({'Toto Tutu': 'http://example.com/toto_tutu',
+                                   'Toto': 'http://example.com/toto'})
+        _filter = NerDisambiguationWordParts()
+        ner = NerProcess((source,), filters=(_filter,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='Toto Tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/toto_tutu', None,
+                           Token(word='Toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
     def test_rules_filter(self):
         """ Test rules filter """
         text = 'Hello toto tutu. And toto.'