[core] Add a NerDisambiguation process
author Vincent Michel <vincent.michel@logilab.fr>
Thu, 27 Jun 2013 18:26:40 +0200
changeset 330 d43bdb4eb8f4
parent 329 2cc2ba9a5532
child 331 f91e69ca8fca
[core] Add a NerDisambiguation process
core.py
test/test_filter.py
--- a/core.py	Thu Jun 27 18:14:03 2013 +0200
+++ b/core.py	Thu Jun 27 18:26:40 2013 +0200
@@ -271,6 +271,31 @@
         return filtered_named_entities
 
 
+class NerdyDisambiguationWordParts(object):
+    """ Disambiguate named entities based on their word parts.
+    E.g.: with "Toto tata" and "toto" found in the same text, "toto"
+    takes the URI of "Toto tata" (parts are matched case-insensitively).
+
+    Call with a list of (uri, peid, token) tuples; returns a new list of
+    the same tuples where only the URI may have changed.
+    """
+    def __call__(self, named_entities):
+        # Map each lowercased word of a multi-word entity to that entity's URI
+        parts = {}
+        for uri, peid, token in named_entities:
+            if ' ' in token.word:
+                for part in token.word.split(' '):
+                    parts[part.lower()] = uri
+        # Replace the URI of entities matching a known part
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            # Keys were lowercased above, so lowercase the lookup as well;
+            # entities without a matching part keep their own URI.
+            uri = parts.get(token.word.lower(), uri)
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
+
+
 ###############################################################################
 ### NER PROCESS ###############################################################
 ###############################################################################
--- a/test/test_filter.py	Thu Jun 27 18:14:03 2013 +0200
+++ b/test/test_filter.py	Thu Jun 27 18:26:40 2013 +0200
@@ -61,6 +61,22 @@
                            Token(word='everyone', start=6, end=14,
                                            sentence=Sentence(indice=0, start=0, end=38))),])
 
+    def test_disambiguation_word_length(self):
+        """ Test disambiguation based on word parts """
+        text = 'Hello toto tutu. And toto.'
+        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
+                                          'toto': 'http://example.com/toto'})
+        _filter = core.NerdyDisambiguationWordParts()
+        nerdy = core.NerdyProcess((source,), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/toto_tutu', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
 
 
 if __name__ == '__main__':