[named entities] Split core into preprocessors and filters modules, related to #187461
author:      Vincent Michel <vincent.michel@logilab.fr>
date:        Thu, 19 Dec 2013 14:45:43 +0000
changeset:   374:102c6331f3f6
parent:      373:77a3a4107f5c
child:       375:343a4304a259
named_entities/__init__.py
named_entities/filters.py
named_entities/named_entities.py
named_entities/preprocessors.py
named_entities/sources.py
test/test_filter.py
test/test_filters.py
test/test_named_entities.py
test/test_preprocessor.py
test/test_preprocessors.py
--- a/named_entities/__init__.py	Thu Dec 19 14:45:26 2013 +0000
+++ b/named_entities/__init__.py	Thu Dec 19 14:45:43 2013 +0000
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+""" Process/Core functions for Named Entities Recognition.
+"""
+from nazca.utils.tokenizer import RichStringTokenizer
+
+
+###############################################################################
+### NER PROCESS ###############################################################
+###############################################################################
+class NerProcess(object):
+    """ High-level process for Named Entities Recognition
+    """
+
+    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
+        """ Initialise the process.
+
+        :ner_sources: a sequence of NER sources (preprocessors and filters are optional)
+        """
+        self.ner_sources = list(ner_sources)
+        self.preprocessors = preprocessors or []
+        self.filters = filters or []
+        self.unique = unique
+
+    def add_ner_source(self, source):
+        """ Add a NER source
+        """
+        self.ner_sources.append(source)
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def add_filters(self, filter):
+        """ Add a filter
+        """
+        self.filters.append(filter)
+
+    def process_text(self, text):
+        """ High level function for analyzing a text
+        """
+        tokenizer = RichStringTokenizer(text)
+        return self.recognize_tokens(tokenizer)
+
+    def recognize_tokens(self, tokens):
+        """ Recognize Named Entities from a tokenizer or
+        an iterator yielding tokens.
+        """
+        last_stop = 0
+        named_entities = []
+        for token in tokens:
+            if token.start < last_stop:
+                continue # this token overlaps with a previous match
+            word = token.word
+            # Apply preprocessors
+            # XXX Preprocessors may be source dependent
+            for preprocessor in self.preprocessors:
+                token = preprocessor(token)
+                if not token:
+                    break
+            if not token:
+                continue
+            recognized = False
+            for process in self.ner_sources:
+                for uri in process.recognize_token(token):
+                    named_entities.append((uri, process.name, token))
+                    recognized = True
+                    last_stop = token.end
+                    if self.unique:
+                        break
+                if recognized and self.unique:
+                    break
+        # XXX Postprocessing/filters may be source dependent
+        return self.postprocess(named_entities)
+
+    def postprocess(self, named_entities):
+        """ Postprocess the results by applying filters """
+        for filter in self.filters:
+            named_entities = filter(named_entities)
+        return named_entities
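
For reference, a minimal usage sketch of the pieces now split across these modules,
mirroring the tests shipped in this changeset (the lexicon content and the min_occ
value are illustrative only):

    from nazca.named_entities import NerProcess
    from nazca.named_entities.sources import NerSourceLexicon
    from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
    from nazca.named_entities.filters import NerOccurenceFilter

    # A lexicon source maps surface forms to URIs
    source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                               'me': 'http://example.com/me'})
    ner = NerProcess((source,),
                     preprocessors=(NerStopwordsFilterPreprocessor(),),
                     filters=(NerOccurenceFilter(min_occ=2),))
    # process_text returns a list of (uri, source_name, token) tuples
    named_entities = ner.process_text('Hello everyone, this is me speaking. And me.')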
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/named_entities/filters.py	Thu Dec 19 14:45:43 2013 +0000
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+""" Filters for Named Entities Recognition.
+"""
+from nazca.utils.dataio import sparqlquery
+
+
+###############################################################################
+### NER FILTERS ###############################################################
+###############################################################################
+class AbstractNerFilter(object):
+    """ A filter used for cleaning named entity results
+    """
+
+    def __call__(self, named_entities):
+        raise NotImplementedError
+
+
+class NerOccurenceFilter(object):
+    """ A filter based on the number of occurrences of
+    named entities in the results.
+    """
+    def __init__(self, min_occ=None, max_occ=None):
+        self.min_occ = min_occ
+        self.max_occ = max_occ
+
+    def __call__(self, named_entities):
+        uris = [u for u, p, t in named_entities]
+        counts = dict([(u, uris.count(u)) for u in set(uris)])
+        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
+                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
+
+
+class NerRDFTypeFilter(object):
+    """ A filter based on the RDF type of the entity.
+    E.g.
+
+    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
+                                ('http://schema.org/Place',
+                                'http://dbpedia.org/ontology/Agent',
+                                'http://dbpedia.org/ontology/Place'))
+
+    """
+    def __init__(self, endpoint, accepted_types):
+        self.endpoint = endpoint
+        self.accepted_types = accepted_types
+        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        seen_uris = {}
+        for uri, p, t in named_entities:
+            if uri in seen_uris:
+                if seen_uris[uri]:
+                    filtered_named_entities.append((uri, p, t))
+            else:
+                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
+                types = set([r['type']['value'] for r in results])
+                if not len(types.intersection(self.accepted_types)):
+                    seen_uris[uri] = False
+                else:
+                    seen_uris[uri] = True
+                    filtered_named_entities.append((uri, p, t))
+        return filtered_named_entities
+
+
+class NerDisambiguationWordParts(object):
+    """ Disambiguate named entities based on word parts.
+    E.g.:
+          'toto tutu': 'http://example.com/toto_tutu',
+          'toto': 'http://example.com/toto'
+
+          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
+          by 'http://example.com/toto_tutu'
+    """
+    def __call__(self, named_entities):
+        # Create parts dictionary
+        parts = {}
+        for uri, peid, token in named_entities:
+            if ' ' in token.word:
+                for part in token.word.split(' '):
+                    parts[part.lower()] = uri
+        # Replace named entities
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            if token.word in parts:
+                # Change URI
+                uri = parts[token.word]
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
+
+
+class NerReplacementRulesFilter(object):
+    """ Allow defining replacement rules for named entity URIs
+    """
+    def __init__(self, rules):
+        self.rules = rules
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            uri = self.rules.get(uri, uri)
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
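
Since filters are plain callables over the list of (uri, source_name, token) tuples,
a new filter only needs to implement __call__. A hedged sketch of a custom filter
(NerUriPrefixFilter is a hypothetical name, not part of this changeset):

    from nazca.named_entities.filters import AbstractNerFilter

    class NerUriPrefixFilter(AbstractNerFilter):
        """ Keep only named entities whose URI starts with a given prefix. """
        def __init__(self, prefix):
            self.prefix = prefix

        def __call__(self, named_entities):
            # named_entities is a list of (uri, source_name, token) tuples
            return [(uri, peid, token) for uri, peid, token in named_entities
                    if uri.startswith(self.prefix)]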
--- a/named_entities/named_entities.py	Thu Dec 19 14:45:26 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,268 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Core functions for Named Entities Recognition.
-"""
-from nazca.utils.tokenizer import RichStringTokenizer, Token
-from nazca.utils.dataio import sparqlquery
-from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
-
-STOPWORDS = {'fr': FRENCH_STOPWORDS,
-             'en': ENGLISH_STOPWORDS}
-
-
-###############################################################################
-### NER PREPROCESSORS #########################################################
-###############################################################################
-class AbstractNerdyPreprocessor(object):
-    """ Preprocessor
-    """
-
-    def __call__(self, token):
-        raise NotImplementedError
-
-
-class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove token based on the size of the word
-    """
-    def __init__(self, min_size=None, max_size=None):
-        self.min_size = min_size
-        self.max_size = max_size
-
-    def __call__(self, token):
-        if ((self.min_size and len(token.word)<self.min_size)
-            or (self.max_size and len(token.word)>self.max_size)):
-            return None
-        return token
-
-
-class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove token with word in lower case
-    """
-
-    def __call__(self, token):
-        return None if token.word.islower() else token
-
-
-class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
-    """ Lower the first word of each sentence if it is a stopword.
-    """
-    def __init__(self, lang='en'):
-        self.lang = lang
-
-    def __call__(self, token):
-        if (token.start == token.sentence.start and
-            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
-            word = token.word[0].lower() + token.word[1:]
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove stopwords
-    """
-    def __init__(self, split_words=False, lang='en'):
-        self.split_words = split_words
-        self.lang = lang
-
-    def __call__(self, token):
-        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
-        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
-            return None
-        if not self.split_words and token.word.lower() in stopwords:
-            return None
-        return token
-
-
-class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
-    """ Cleanup hashtag
-    """
-    def __call__(self, token):
-        if token.word.startswith('@'):
-            # XXX Split capitalize letter ?
-            # @BarackObama -> Barack Obama
-            word = token.word[1:].replace('_', ' ')
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-###############################################################################
-### NER FILTERS ###############################################################
-###############################################################################
-class AbstractNerdyFilter(object):
-    """ A filter used for cleaning named entities results
-    """
-
-    def __call__(self, named_entities):
-        raise NotImplementedError
-
-
-class NerdyOccurenceFilter(object):
-    """ A filter based on the number of occurence of
-    named entities in the results.
-    """
-    def __init__(self, min_occ=None, max_occ=None):
-        self.min_occ = min_occ
-        self.max_occ = max_occ
-
-    def __call__(self, named_entities):
-        uris = [u for u, p, t in named_entities]
-        counts = dict([(u, uris.count(u)) for u in set(uris)])
-        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
-                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
-
-
-class NerdyRDFTypeFilter(object):
-    """ A filter based on the RDF type on entity
-    E.g.
-
-    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
-                                ('http://schema.org/Place',
-                                'http://dbpedia.org/ontology/Agent',
-                                'http://dbpedia.org/ontology/Place'))
-
-    """
-    def __init__(self, endpoint, accepted_types):
-        self.endpoint = endpoint
-        self.accepted_types = accepted_types
-        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        seen_uris = {}
-        for uri, p, t in named_entities:
-            if uri in seen_uris:
-                if seen_uris[uri]:
-                    filtered_named_entities.append((uri, p, t))
-            else:
-                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
-                types = set([r['type']['value'] for r in results])
-                if not len(types.intersection(self.accepted_types)):
-                    seen_uris[uri] = False
-                else:
-                    seen_uris[uri] = True
-                    filtered_named_entities.append((uri, p, t))
-        return filtered_named_entities
-
-
-class NerdyDisambiguationWordParts(object):
-    """ Disambiguate named entities based on the words parts.
-    E.g.:
-          'toto tutu': 'http://example.com/toto_tutu',
-          'toto': 'http://example.com/toto'
-
-          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
-          by 'http://example.com/toto_tutu'
-    """
-    def __call__(self, named_entities):
-        # Create parts dictionnary
-        parts = {}
-        for uri, peid, token in named_entities:
-            if ' ' in token.word:
-                for part in token.word.split(' '):
-                    parts[part.lower()] = uri
-        # Replace named entities
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            if token.word in parts:
-                # Change URI
-                uri = parts[token.word]
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-class NerdyReplacementRulesFilter(object):
-    """ Allow to define replacement rules for Named Entities
-    """
-    def __init__(self,rules):
-        self.rules = rules
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            uri = self.rules.get(uri, uri)
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-###############################################################################
-### NER PROCESS ###############################################################
-###############################################################################
-class NerdyProcess(object):
-    """ High-level process for Named Entities Recognition
-    """
-
-    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
-        """ Initialise the class.
-
-        :tokenizer: an instance of tokenizer
-        """
-        self.ner_sources = list(ner_sources)
-        self.preprocessors = preprocessors or []
-        self.filters = filters or []
-        self.unique = unique
-
-    def add_ner_source(self, process):
-        """ Add a ner process
-        """
-        self.ner_sources.append(process)
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def add_filters(self, filter):
-        """ Add a filter
-        """
-        self.filters.append(filter)
-
-    def process_text(self, text):
-        """ High level function for analyzing a text
-        """
-        tokenizer = RichStringTokenizer(text)
-        return self.recognize_tokens(tokenizer)
-
-    def recognize_tokens(self, tokens):
-        """ Recognize Named Entities from a tokenizer or
-        an iterator yielding tokens.
-        """
-        last_stop = 0
-        named_entities = []
-        for token in tokens:
-            if token.start < last_stop:
-                continue # this token overlaps with a previous match
-            word = token.word
-            # Applies preprocessors
-            # XXX Preprocessors may be sources dependant
-            for preprocessor in self.preprocessors:
-                token = preprocessor(token)
-                if not token:
-                    break
-            if not token:
-                continue
-            recognized = False
-            for process in self.ner_sources:
-                for uri in process.recognize_token(token):
-                    named_entities.append((uri, process.name, token))
-                    recognized = True
-                    last_stop = token.end
-                    if self.unique:
-                        break
-                if recognized and self.unique:
-                    break
-        # XXX Postprocess/filters may be sources dependant
-        return self.postprocess(named_entities)
-
-    def postprocess(self, named_entities):
-        """ Postprocess the results by applying filters """
-        for filter in self.filters:
-            named_entities = filter(named_entities)
-        return named_entities
-
-
-###############################################################################
-### NER RELATIONS PROCESS #####################################################
-###############################################################################
-class NerdyRelationsProcess(object):
-    """ Process for building simple relation from named entities results
-    """
-    pass
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/named_entities/preprocessors.py	Thu Dec 19 14:45:43 2013 +0000
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+""" Preprocessors for Named Entities Recognition.
+"""
+from nazca.utils.tokenizer import Token
+from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
+
+STOPWORDS = {'fr': FRENCH_STOPWORDS,
+             'en': ENGLISH_STOPWORDS}
+
+
+###############################################################################
+### NER PREPROCESSORS #########################################################
+###############################################################################
+class AbstractNerPreprocessor(object):
+    """ Preprocessor
+    """
+
+    def __call__(self, token):
+        raise NotImplementedError
+
+
+class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
+    """ Remove tokens based on the length of their word
+    """
+    def __init__(self, min_size=None, max_size=None):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, token):
+        if ((self.min_size and len(token.word)<self.min_size)
+            or (self.max_size and len(token.word)>self.max_size)):
+            return None
+        return token
+
+
+class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
+    """ Remove tokens whose word is all lower case
+    """
+
+    def __call__(self, token):
+        return None if token.word.islower() else token
+
+
+class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
+    """ Lowercase the first word of each sentence if it is a stopword.
+    """
+    def __init__(self, lang='en'):
+        self.lang = lang
+
+    def __call__(self, token):
+        if (token.start == token.sentence.start and
+            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
+            word = token.word[0].lower() + token.word[1:]
+            return Token(word, token.start, token.end, token.sentence)
+        return token
+
+
+class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
+    """ Remove stopwords
+    """
+    def __init__(self, split_words=False, lang='en'):
+        self.split_words = split_words
+        self.lang = lang
+
+    def __call__(self, token):
+        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
+        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
+            return None
+        if not self.split_words and token.word.lower() in stopwords:
+            return None
+        return token
+
+
+class NerHashTagPreprocessor(AbstractNerPreprocessor):
+    """ Clean up hashtag/mention tokens (e.g. @Barack_Obama -> Barack Obama)
+    """
+    def __call__(self, token):
+        if token.word.startswith('@'):
+            # XXX Split on capitalized letters?
+            # @BarackObama -> Barack Obama
+            word = token.word[1:].replace('_', ' ')
+            return Token(word, token.start, token.end, token.sentence)
+        return token
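
Preprocessors receive a single Token and return either a Token (possibly rewritten)
or None to drop it. A hedged sketch of a custom preprocessor following that contract
(the class name is hypothetical, not part of this changeset):

    from nazca.utils.tokenizer import Token
    from nazca.named_entities.preprocessors import AbstractNerPreprocessor

    class NerStripPunctuationPreprocessor(AbstractNerPreprocessor):
        """ Strip trailing punctuation from the token word. """
        def __call__(self, token):
            word = token.word.rstrip('.,;:!?')
            if not word:
                # Nothing left: drop the token entirely
                return None
            return Token(word, token.start, token.end, token.sentence)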
--- a/named_entities/sources.py	Thu Dec 19 14:45:26 2013 +0000
+++ b/named_entities/sources.py	Thu Dec 19 14:45:43 2013 +0000
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-""" Core functions for Named Entities Recognition.
+""" Sources for Named Entities Recognition.
 """
 from nazca.utils.tokenizer import Token
 from nazca.utils.dataio import sparqlquery, rqlquery
@@ -103,7 +103,7 @@
     """ High-level source for Named Entities Recognition
     SPARQL version
 
-   >>> from nerdy.core import NerSourceSparql
+   >>> from nazca.named_entities.sources import NerSourceSparql
    >>> ner_source = NerSourceSparql('''SELECT ?uri
                                          WHERE{
                                          ?uri rdfs:label "%(word)s"@en}''',
@@ -121,4 +121,4 @@
     def query_word(self, word):
         """ Query a word for a Named Entities Recognition process
         """
-        return [r['uri']['value'] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
+        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
--- a/test/test_filter.py	Thu Dec 19 14:45:26 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core
-from nerdy.tokenizer import Token, Sentence
-
-
-class FilterTest(unittest2.TestCase):
-    """ Test of filters """
-
-    def test_occurence_filter_min_occ(self):
-        """ Test occurence filter """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        _filter = core.NerdyOccurenceFilter(min_occ=2)
-        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_occurence_filter_max_occ(self):
-        """ Test occurence filter """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        _filter = core.NerdyOccurenceFilter(max_occ=1)
-        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),])
-
-    def test_disambiguation_word_length(self):
-        """ Test occurence filter """
-        text = 'Hello toto tutu. And toto.'
-        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
-                                          'toto': 'http://example.com/toto'})
-        _filter = core.NerdyDisambiguationWordParts()
-        nerdy = core.NerdyProcess((source,), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto_tutu', None,
-                           Token(word='toto tutu', start=6, end=15,
-                                 sentence=Sentence(indice=0, start=0, end=16))),
-                          ('http://example.com/toto_tutu', None,
-                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
-
-    def test_rules_filter(self):
-        """ Test rules filter """
-        text = 'Hello toto tutu. And toto.'
-        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
-                                          'toto': 'http://example.com/toto'})
-        rules = {'http://example.com/toto': 'http://example.com/tata'}
-        _filter = core.NerdyReplacementRulesFilter(rules)
-        nerdy = core.NerdyProcess((source,), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto_tutu', None,
-                           Token(word='toto tutu', start=6, end=15,
-                                 sentence=Sentence(indice=0, start=0, end=16))),
-                          ('http://example.com/tata', None,
-                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_filters.py	Thu Dec 19 14:45:43 2013 +0000
@@ -0,0 +1,100 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nazca.named_entities import NerProcess, filters
+from nazca.named_entities.sources import NerSourceLexicon
+from nazca.utils.tokenizer import Token, Sentence
+
+
+class FilterTest(unittest2.TestCase):
+    """ Test of filters """
+
+    def test_occurence_filter_min_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                    'me': 'http://example.com/me'})
+        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
+        _filter = filters.NerOccurenceFilter(min_occ=2)
+        ner = NerProcess((source1, source2), filters=(_filter,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_occurence_filter_max_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                    'me': 'http://example.com/me'})
+        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
+        _filter = filters.NerOccurenceFilter(max_occ=1)
+        ner = NerProcess((source1, source2), filters=(_filter,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),])
+
+    def test_disambiguation_word_length(self):
+        """ Test disambiguation word parts filter """
+        text = 'Hello toto tutu. And toto.'
+        source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
+                                   'toto': 'http://example.com/toto'})
+        _filter = filters.NerDisambiguationWordParts()
+        ner = NerProcess((source,), filters=(_filter,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/toto_tutu', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
+    def test_rules_filter(self):
+        """ Test rules filter """
+        text = 'Hello toto tutu. And toto.'
+        source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
+                                   'toto': 'http://example.com/toto'})
+        rules = {'http://example.com/toto': 'http://example.com/tata'}
+        _filter = filters.NerReplacementRulesFilter(rules)
+        ner = NerProcess((source,), filters=(_filter,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/tata', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- a/test/test_named_entities.py	Thu Dec 19 14:45:26 2013 +0000
+++ b/test/test_named_entities.py	Thu Dec 19 14:45:43 2013 +0000
@@ -20,12 +20,13 @@
 from nazca.named_entities.sources import (NerSourceLexicon,
                                           NerSourceSparql,
                                           NerSourceRql)
-from nazca.named_entities import named_entities as core
+from nazca.named_entities import NerProcess
 from nazca.utils.tokenizer import Token, Sentence
+from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
 
 
-class CoreTest(unittest2.TestCase):
-    """ Test of core """
+class NerTest(unittest2.TestCase):
+    """ Test of the NER process """
 
     def test_lexicon_source(self):
         """ Test lexicon source """
@@ -51,21 +52,22 @@
     def test_sparql_source(self):
         """ Test sparql source """
         source = NerSourceSparql(u'http://dbpedia.org/sparql',
-                                 u'''SELECT ?uri
+                                 u'''SELECT DISTINCT ?uri
                                      WHERE{
-                                     ?uri rdfs:label "Python"@en .
+                                     ?uri rdfs:label "%(word)s"@en .
                                      ?uri rdf:type ?type}''')
-        self.assertEqual(source.query_word('cubicweb'),
-                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+        self.assertEqual(source.query_word('Python'),
+                         [u'http://dbpedia.org/resource/Python',
+                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
                           u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
 
-    def test_nerdy_process(self):
-        """ Test nerdy process """
+    def test_ner_process(self):
+        """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
         source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                    'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
+        ner = NerProcess((source,))
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
                            Token(word='everyone', start=6, end=14,
@@ -77,15 +79,15 @@
                            Token(word='me', start=43, end=45,
                                            sentence=Sentence(indice=1, start=38, end=46)))])
 
-    def test_nerdy_process_multisources(self):
-        """ Test nerdy process """
+    def test_ner_process_multisources(self):
+        """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
         source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
         # Two sources, not unique
-        nerdy = core.NerdyProcess((source1, source2))
-        named_entities = nerdy.process_text(text)
+        ner = NerProcess((source1, source2))
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
                            Token(word='everyone', start=6, end=14,
@@ -103,8 +105,8 @@
                            Token(word='me', start=43, end=45,
                                            sentence=Sentence(indice=1, start=38, end=46)))])
         # Two sources, unique
-        nerdy = core.NerdyProcess((source1, source2), unique=True)
-        named_entities = nerdy.process_text(text)
+        ner = NerProcess((source1, source2), unique=True)
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
                            Token(word='everyone', start=6, end=14,
@@ -116,8 +118,8 @@
                            Token(word='me', start=43, end=45,
                                            sentence=Sentence(indice=1, start=38, end=46)))])
         # Two sources inversed, unique
-        nerdy = core.NerdyProcess((source2, source1), unique=True)
-        named_entities = nerdy.process_text(text)
+        ner = NerProcess((source2, source1), unique=True)
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
                            Token(word='everyone', start=6, end=14,
@@ -129,14 +131,14 @@
                            Token(word='me', start=43, end=45,
                                            sentence=Sentence(indice=1, start=38, end=46)))])
 
-    def test_nerdy_process_add_sources(self):
-        """ Test nerdy process """
+    def test_ner_process_add_sources(self):
+        """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
         source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
-        nerdy = core.NerdyProcess((source1,))
-        named_entities = nerdy.process_text(text)
+        ner = NerProcess((source1,))
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
                            Token(word='everyone', start=6, end=14,
@@ -148,8 +150,8 @@
                            Token(word='me', start=43, end=45,
                                            sentence=Sentence(indice=1, start=38, end=46))),])
         # Two sources, not unique
-        nerdy.add_ner_source(source2)
-        named_entities = nerdy.process_text(text)
+        ner.add_ner_source(source2)
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
                            Token(word='everyone', start=6, end=14,
@@ -167,27 +169,27 @@
                            Token(word='me', start=43, end=45,
                                            sentence=Sentence(indice=1, start=38, end=46)))])
 
-    def test_nerdy_process_preprocess(self):
-        """ Test nerdy process """
+    def test_ner_process_preprocess(self):
+        """ Test ner process """
         text = 'Hello Toto, this is   me speaking. And me.'
         source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                                    'me': 'http://example.com/me'})
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        nerdy = core.NerdyProcess((source,),
+        preprocessor = NerStopwordsFilterPreprocessor()
+        ner = NerProcess((source,),
                                   preprocessors=(preprocessor,))
-        named_entities = nerdy.process_text(text)
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities, [('http://example.com/toto', None,
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
-    def test_nerdy_process_add_preprocess(self):
-        """ Test nerdy process """
+    def test_ner_process_add_preprocess(self):
+        """ Test ner process """
         text = 'Hello Toto, this is   me speaking. And me.'
         source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                                    'me': 'http://example.com/me'})
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        nerdy = core.NerdyProcess((source,),)
-        named_entities = nerdy.process_text(text)
+        preprocessor = NerStopwordsFilterPreprocessor()
+        ner = NerProcess((source,),)
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/toto', None,
                            Token(word='Toto', start=6, end=10,
@@ -198,20 +200,20 @@
                           ('http://example.com/me', None,
                            Token(word='me', start=39, end=41,
                                  sentence=Sentence(indice=1, start=34, end=42)))])
-        nerdy.add_preprocessors(preprocessor)
-        named_entities = nerdy.process_text(text)
+        ner.add_preprocessors(preprocessor)
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities, [('http://example.com/toto', None,
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
-    def test_nerdy_process_chained_word(self):
-        """ Test nerdy process """
+    def test_ner_process_chained_word(self):
+        """ Test ner process """
         text = 'Hello everyone me, this is   me speaking. And me.'
         source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                    'everyone me': 'http://example.com/everyone_me',
                                    'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
+        ner = NerProcess((source,))
+        named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone_me', None,
                            Token(word='everyone me', start=6, end=17,
--- a/test/test_preprocessor.py	Thu Dec 19 14:45:26 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core, tokenizer
-
-
-class PreprocessorTest(unittest2.TestCase):
-    """ Test of preprocessors """
-
-    def test_lowercasefilter(self):
-        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('toto Tata', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('toto tata', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-
-    def test_wordsizefilter(self):
-        preprocessor = core.NerdyWordSizeFilterPreprocessor()
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('to', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('to', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-
-    def test_lowerfirstword(self):
-        preprocessor = core.NerdyLowerFirstWordPreprocessor()
-        sentence = tokenizer.Sentence(0, 0, 20)
-        # Start of the sentence
-        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
-        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
-        token2 = tokenizer.Token('us tata', 0, 4, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        # Not start of the sentence
-        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
-        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
-        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-
-    def test_stopwordsfilter(self):
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        token = tokenizer.Token('Toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('Us', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('Us there', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        # Split words
-        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
-        token = tokenizer.Token('Us there', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('Us there toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-
-    def test_hashtag(self):
-        preprocessor = core.NerdyHashTagPreprocessor()
-        token = tokenizer.Token('Toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
-        token2 = tokenizer.Token('BarackObama', 0, 4, None)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
-        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
-        self.assertEqual(preprocessor(token1), token2)
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_preprocessors.py	Thu Dec 19 14:45:43 2013 +0000
@@ -0,0 +1,98 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nazca.utils import tokenizer
+from nazca.named_entities import preprocessors
+
+
+class PreprocessorTest(unittest2.TestCase):
+    """ Test of preprocessors """
+
+    def test_lowercasefilter(self):
+        preprocessor = preprocessors.NerLowerCaseFilterPreprocessor()
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('toto Tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('toto tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+
+    def test_wordsizefilter(self):
+        preprocessor = preprocessors.NerWordSizeFilterPreprocessor()
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        preprocessor = preprocessors.NerWordSizeFilterPreprocessor(min_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        preprocessor = preprocessors.NerWordSizeFilterPreprocessor(max_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_lowerfirstword(self):
+        preprocessor = preprocessors.NerLowerFirstWordPreprocessor()
+        sentence = tokenizer.Sentence(0, 0, 20)
+        # Start of the sentence
+        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
+        token2 = tokenizer.Token('us tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        # Not start of the sentence
+        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+
+    def test_stopwordsfilter(self):
+        preprocessor = preprocessors.NerStopwordsFilterPreprocessor()
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('Us', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        # Split words
+        preprocessor = preprocessors.NerStopwordsFilterPreprocessor(split_words=True)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_hashtag(self):
+        preprocessor = preprocessors.NerHashTagPreprocessor()
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
+        token2 = tokenizer.Token('BarackObama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
+        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+
+
+if __name__ == '__main__':
+    unittest2.main()
+