[filters] Add a rule-based replacement filter
author Vincent Michel <vincent.michel@logilab.fr>
Wed, 12 Jun 2013 11:39:19 +0200
changeset 334 3a319fcad5cb
parent 333 47a62334bfaa
child 335 601731a76796
[filters] Add a rule-based replacement filter
core.py
test/test_filter.py
--- a/core.py	Tue Jun 11 09:38:02 2013 +0200
+++ b/core.py	Wed Jun 12 11:39:19 2013 +0200
@@ -274,10 +274,11 @@
 class NerdyDisambiguationWordParts(object):
     """ Disambiguate named entities based on the words parts.
     E.g.:
+          'toto tutu': 'http://example.com/toto_tutu',
+          'toto': 'http://example.com/toto'
 
-    Found "Toto tata" and "toto" in the same text.
-    Replace "Toto tata" and "toto".
-
+          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
+          with 'http://example.com/toto_tutu'
     """
     def __call__(self, named_entities):
         # Create parts dictionnary
@@ -296,6 +297,20 @@
         return filtered_named_entities
 
 
+class NerdyReplacementRulesFilter(object):
+    """ Filter rewriting named-entity URIs from a mapping of replacement rules
+    (original URI -> replacement URI); URIs without a rule are kept as is. """
+    def __init__(self,rules):
+        self.rules = rules  # queried with .get(uri, uri) in __call__
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:  # each entity is a (uri, peid, token) triple
+            uri = self.rules.get(uri, uri)  # fall back to the original URI when no rule matches
+            filtered_named_entities.append((uri, peid, token))  # peid and token pass through unchanged
+        return filtered_named_entities
+
+
 ###############################################################################
 ### NER PROCESS ###############################################################
 ###############################################################################
--- a/test/test_filter.py	Tue Jun 11 09:38:02 2013 +0200
+++ b/test/test_filter.py	Wed Jun 12 11:39:19 2013 +0200
@@ -77,7 +77,22 @@
                            Token(word='toto', start=21, end=25,
                                  sentence=Sentence(indice=1, start=16, end=26)))])
 
-
+    def test_rules_filter(self):
+        """ Check that the rules filter rewrites only the URIs that have a rule """
+        text = 'Hello toto tutu. And toto.'
+        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
+                                          'toto': 'http://example.com/toto'})
+        rules = {'http://example.com/toto': 'http://example.com/tata'}
+        _filter = core.NerdyReplacementRulesFilter(rules)
+        nerdy = core.NerdyProcess((source,), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/tata', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
 
 if __name__ == '__main__':
     unittest2.main()