Rename ner to named_entities, related to #187461
author vincent.michel@logilab.fr
Thu, 19 Dec 2013 14:45:04 +0000
changeset 370 4a79af983c29
parent 369 7019bc0cab44
child 371 b104bae5b9c2
Rename ner to named_entities, related to #187461
named_entities/__init__.py
named_entities/core.py
named_entities/tokenizer.py
ner/__init__.py
ner/core.py
ner/tokenizer.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/named_entities/core.py	Thu Dec 19 14:45:04 2013 +0000
@@ -0,0 +1,396 @@
+# -*- coding: utf-8 -*-
+""" Core functions for Named Entities Recognition.
+"""
+from named_entities.tokenizer import RichStringTokenizer, Token
+from named_entities.dataio import sparql_query, rql_url_query, rql_appid_query
+from named_entities.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
+
+STOPWORDS = {'fr': FRENCH_STOPWORDS,
+             'en': ENGLISH_STOPWORDS}
+
+# XXX Add SQL source ?
+# XXX NER preprocessor
+
+###############################################################################
+### NER SOURCE ################################################################
+###############################################################################
+class AbstractNerdySource(object):
+    """ High-level source for Named Entities Recognition
+    """
+
+    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.query = query
+        self.endpoint = endpoint
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def recognize_token(self, token):
+        """ Recognize a token
+        """
+        # Apply source-specific preprocessors
+        for preprocessor in self.preprocessors:
+            token = preprocessor(token)
+            if not token:
+                return []
+        if self.use_cache and token.word in self._recognized_cache:
+            return self._recognized_cache[token.word]
+        uris = self.query_word(token.word) if token.word else []
+        if self.use_cache:
+            self._recognized_cache[token.word] = uris
+        return uris
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        raise NotImplementedError
+
+
+class NerdySourceLexical(AbstractNerdySource):
+    """ Source based on a (pre-computed) dictionnary of words (token, uri)
+    """
+    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
+        self.lexicon = lexicon
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        uri = self.lexicon.get(word)
+        return [uri] if uri else []
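+
+# A minimal usage sketch (hypothetical lexicon, not part of the module):
+# NerdySourceLexical resolves words against an in-memory mapping, so it can be
+# exercised without any network access.
+#
+# >>> source = NerdySourceLexical({'Victor Hugo': 'http://example.org/victor_hugo'},
+# ...                             name='mylexicon')
+# >>> source.query_word('Victor Hugo')
+# ['http://example.org/victor_hugo']
+# >>> source.query_word('unknown')
+# []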
+
+
+class NerdySourceLocalRql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    Local RQL version
+    """
+
+    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.query = query
+        self.session = session
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
+
+
+class NerdySourceAppidRql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    Appid RQL version
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
+
+
+class NerdySourceUrlRql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    Url RQL version
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
+
+
+class NerdySourceSparql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    SPARQL version
+
+   >>> from nerdy.core import NerdySourceSparql
+   >>> ner_source = NerdySourceSparql('''SELECT ?uri
+                                         WHERE{
+                                         ?uri rdfs:label "%(word)s"@en}''',
+			                 'http://dbpedia.org/sparql')
+   >>> print ner_source.recognize_token('Victor Hugo')
+		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
+		     'http://dbpedia.org/resource/Victor_Hugo',
+		     'http://dbpedia.org/class/yago/VictorHugo',
+		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
+
+
+###############################################################################
+### NER PREPROCESSORS #########################################################
+###############################################################################
+class AbstractNerdyPreprocessor(object):
+    """ Preprocessor
+    """
+
+    def __call__(self, token):
+        raise NotImplementedError
+
+
+class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
+    """ Remove token based on the size of the word
+    """
+    def __init__(self, min_size=None, max_size=None):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, token):
+        if ((self.min_size and len(token.word)<self.min_size)
+            or (self.max_size and len(token.word)>self.max_size)):
+            return None
+        return token
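+
+# Illustrative sketch (hypothetical values): the size filter drops tokens whose
+# word is shorter than min_size or longer than max_size characters.
+#
+# >>> preprocessor = NerdyWordSizeFilterPreprocessor(min_size=2, max_size=10)
+# >>> preprocessor(Token('Hello', 0, 5, None)).word
+# 'Hello'
+# >>> preprocessor(Token('a', 0, 1, None)) is None
+# True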
+
+
+class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
+    """ Remove token with word in lower case
+    """
+
+    def __call__(self, token):
+        return None if token.word.islower() else token
+
+
+class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
+    """ Lower the first word of each sentence if it is a stopword.
+    """
+    def __init__(self, lang='en'):
+        self.lang = lang
+
+    def __call__(self, token):
+        if (token.start == token.sentence.start and
+            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
+            word = token.word[0].lower() + token.word[1:]
+            return Token(word, token.start, token.end, token.sentence)
+        return token
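+
+# Illustrative sketch, assuming 'the' is in ENGLISH_STOPWORDS: at a sentence
+# start, a leading stopword is lowercased so that e.g. 'The Louvre' can still
+# match a lexicon entry 'the Louvre'.
+#
+# >>> from named_entities.tokenizer import Sentence
+# >>> sentence = Sentence(0, 0, 11)
+# >>> NerdyLowerFirstWordPreprocessor()(Token('The Louvre', 0, 10, sentence)).word
+# 'the Louvre'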
+
+
+class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
+    """ Remove stopwords
+    """
+    def __init__(self, split_words=False, lang='en'):
+        self.split_words = split_words
+        self.lang = lang
+
+    def __call__(self, token):
+        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
+        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
+            return None
+        if not self.split_words and token.word.lower() in stopwords:
+            return None
+        return token
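+
+# Illustrative sketch, assuming 'the' is in ENGLISH_STOPWORDS: with
+# split_words=False, only tokens that are exactly a stopword are dropped.
+#
+# >>> preprocessor = NerdyStopwordsFilterPreprocessor()
+# >>> preprocessor(Token('the', 0, 3, None)) is None
+# True
+# >>> preprocessor(Token('Paris', 4, 9, None)).word
+# 'Paris'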
+
+
+class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
+    """ Cleanup hashtag
+    """
+    def __call__(self, token):
+        if token.word.startswith('@'):
+            # XXX Split on capitalized letters?
+            # @BarackObama -> Barack Obama
+            word = token.word[1:].replace('_', ' ')
+            return Token(word, token.start, token.end, token.sentence)
+        return token
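+
+# Illustrative sketch: a Twitter-style mention loses its leading '@' and its
+# underscores, while the token positions are left unchanged.
+#
+# >>> NerdyHashTagPreprocessor()(Token('@Victor_Hugo', 0, 12, None)).word
+# 'Victor Hugo'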
+
+
+###############################################################################
+### NER FILTERS ###############################################################
+###############################################################################
+class AbstractNerdyFilter(object):
+    """ A filter used for cleaning named entities results
+    """
+
+    def __call__(self, named_entities):
+        raise NotImplementedError
+
+
+class NerdyOccurenceFilter(object):
+    """ A filter based on the number of occurence of
+    named entities in the results.
+    """
+    def __init__(self, min_occ=None, max_occ=None):
+        self.min_occ = min_occ
+        self.max_occ = max_occ
+
+    def __call__(self, named_entities):
+        uris = [u for u, p, t in named_entities]
+        counts = dict([(u, uris.count(u)) for u in set(uris)])
+        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
+                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
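+
+# Illustrative sketch (hypothetical triples): keep only the URIs that occur at
+# least twice; each named entity is a (uri, source_name, token) triple.
+#
+# >>> entities = [('http://example.org/a', None, None),
+# ...             ('http://example.org/a', None, None),
+# ...             ('http://example.org/b', None, None)]
+# >>> NerdyOccurenceFilter(min_occ=2)(entities)
+# [('http://example.org/a', None, None), ('http://example.org/a', None, None)]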
+
+
+class NerdyRDFTypeFilter(object):
+    """ A filter based on the RDF type on entity
+    E.g.
+
+    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
+                                ('http://schema.org/Place',
+                                'http://dbpedia.org/ontology/Agent',
+                                'http://dbpedia.org/ontology/Place'))
+
+    """
+    def __init__(self, endpoint, accepted_types):
+        self.endpoint = endpoint
+        self.accepted_types = accepted_types
+        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        seen_uris = {}
+        for uri, p, t in named_entities:
+            if uri in seen_uris:
+                if seen_uris[uri]:
+                    filtered_named_entities.append((uri, p, t))
+            else:
+                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
+                types = set([r['type']['value'] for r in results])
+                if types.intersection(self.accepted_types):
+                    seen_uris[uri] = True
+                    filtered_named_entities.append((uri, p, t))
+                else:
+                    seen_uris[uri] = False
+        return filtered_named_entities
+
+
+class NerdyDisambiguationWordParts(object):
+    """ Disambiguate named entities based on the words parts.
+    E.g.:
+          'toto tutu': 'http://example.com/toto_tutu',
+          'toto': 'http://example.com/toto'
+
+          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
+          by 'http://example.com/toto_tutu'
+    """
+    def __call__(self, named_entities):
+        # Create the parts dictionary
+        parts = {}
+        for uri, peid, token in named_entities:
+            if ' ' in token.word:
+                for part in token.word.split(' '):
+                    parts[part.lower()] = uri
+        # Replace named entities
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            if token.word.lower() in parts:
+                # Parts are stored lowercased, so look up the lowercased word
+                uri = parts[token.word.lower()]
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
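+
+# Illustrative sketch (hypothetical URIs): the single-word token 'toto' inherits
+# the URI of the multi-word token 'toto tutu' found in the same result set.
+#
+# >>> entities = [('http://example.com/toto_tutu', None, Token('toto tutu', 0, 9, None)),
+# ...             ('http://example.com/toto', None, Token('toto', 20, 24, None))]
+# >>> [uri for uri, peid, token in NerdyDisambiguationWordParts()(entities)]
+# ['http://example.com/toto_tutu', 'http://example.com/toto_tutu']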
+
+
+class NerdyReplacementRulesFilter(object):
+    """ Allow to define replacement rules for Named Entities
+    """
+    def __init__(self, rules):
+        self.rules = rules
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            uri = self.rules.get(uri, uri)
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
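+
+# Illustrative sketch (hypothetical rule): URIs are rewritten through the rules
+# mapping; URIs without a rule pass through unchanged.
+#
+# >>> rules = {'http://example.org/old': 'http://example.org/new'}
+# >>> NerdyReplacementRulesFilter(rules)([('http://example.org/old', None, None)])
+# [('http://example.org/new', None, None)]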
+
+
+###############################################################################
+### NER PROCESS ###############################################################
+###############################################################################
+class NerdyProcess(object):
+    """ High-level process for Named Entities Recognition
+    """
+
+    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
+        """ Initialise the class.
+
+        :tokenizer: an instance of tokenizer
+        """
+        self.ner_sources = list(ner_sources)
+        self.preprocessors = preprocessors or []
+        self.filters = filters or []
+        self.unique = unique
+
+    def add_ner_source(self, process):
+        """ Add a ner process
+        """
+        self.ner_sources.append(process)
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def add_filters(self, filter):
+        """ Add a filter
+        """
+        self.filters.append(filter)
+
+    def process_text(self, text):
+        """ High level function for analyzing a text
+        """
+        tokenizer = RichStringTokenizer(text)
+        return self.recognize_tokens(tokenizer)
+
+    def recognize_tokens(self, tokens):
+        """ Recognize Named Entities from a tokenizer or
+        an iterator yielding tokens.
+        """
+        last_stop = 0
+        named_entities = []
+        for token in tokens:
+            if token.start < last_stop:
+                continue # this token overlaps with a previous match
+            # Apply preprocessors
+            # XXX Preprocessors may be source-dependent
+            for preprocessor in self.preprocessors:
+                token = preprocessor(token)
+                if not token:
+                    break
+            if not token:
+                continue
+            recognized = False
+            for process in self.ner_sources:
+                for uri in process.recognize_token(token):
+                    named_entities.append((uri, process.name, token))
+                    recognized = True
+                    last_stop = token.end
+                    if self.unique:
+                        break
+                if recognized and self.unique:
+                    break
+        # XXX Postprocessing/filters may be source-dependent
+        return self.postprocess(named_entities)
+
+    def postprocess(self, named_entities):
+        """ Postprocess the results by applying filters """
+        for filter in self.filters:
+            named_entities = filter(named_entities)
+        return named_entities
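+
+# End-to-end usage sketch (hypothetical lexicon and text): wire a source, a
+# preprocessor and a filter into a process, then run it over raw text. The
+# exact output depends on the stopword lists shipped with the package.
+#
+# >>> source = NerdySourceLexical({'Victor Hugo': 'http://example.org/victor_hugo'})
+# >>> process = NerdyProcess([source],
+# ...                        preprocessors=[NerdyStopwordsFilterPreprocessor()],
+# ...                        filters=[NerdyOccurenceFilter(min_occ=1)])
+# >>> [uri for uri, source_name, token in process.process_text('Victor Hugo is a writer.')]
+# ['http://example.org/victor_hugo']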
+
+
+###############################################################################
+### NER RELATIONS PROCESS #####################################################
+###############################################################################
+class NerdyRelationsProcess(object):
+    """ Process for building simple relation from named entities results
+    """
+    pass
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/named_entities/tokenizer.py	Thu Dec 19 14:45:04 2013 +0000
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+""" Tokenizer for sentences/words segmentation.
+"""
+import collections
+import re
+
+
+Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
+Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
+
+
+class RichStringTokenizer(object):
+    """Tokenizer for Yams' RichString content.
+
+    The tokenizer uses a variable-length sliding window, yielding, for each
+    start position, tokens of token_max_size down to token_min_size words.
+    """
+
+    def __init__(self, text, token_min_size=1, token_max_size=3):
+        """
+        :token_min_size: minimum number of words required to be a valid token
+        :token_max_size: maximum number of words allowed in a token
+        """
+        self.text = text
+        self.token_min_size = token_min_size
+        self.token_max_size = token_max_size
+
+    def iter_tokens(self, text):
+        """ Iterate tokens over a text
+        """
+        # Compute sentences
+        sentences = self.find_sentences(text)
+        # Compute words
+        words = list(re.finditer(r'[\w@-]+', text, re.UNICODE))
+        indice = 0
+        while indice < len(words):
+            # Find the sentence containing the first word of the window
+            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
+            # Sliding windows over the different words for each sentence
+            remaining = len(words) - indice
+            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
+                _words = words[indice:indice+length]
+                if _words[-1].start() > current_sentence.end:
+                    # The last word is not in the same sentence anymore, skip this window
+                    continue
+                normalized_word = ' '.join([w.group() for w in _words]).strip()
+                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
+            indice += 1
+
+    def find_sentences(self, text):
+        """ Find the sentences
+        """
+        return [Sentence(ind, s.start(), s.end()) for ind, s in
+                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
+
+    def load_text(self, text):
+        """ Load the text to be tokenized
+        """
+        self.text = text
+
+    def __iter__(self):
+        """ Iterator over the text given in the object instantiation
+        """
+        for t in self.iter_tokens(self.text):
+            yield t
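+
+# Usage sketch (hypothetical text): for each start word, the tokenizer yields
+# windows of token_max_size down to token_min_size words, never crossing a
+# sentence boundary.
+#
+# >>> tokenizer = RichStringTokenizer('Hello everyone. Bye.', token_max_size=2)
+# >>> [token.word for token in tokenizer]
+# ['Hello everyone', 'Hello', 'everyone', 'Bye']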
--- a/ner/core.py	Thu Dec 19 14:44:58 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,396 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Core functions for Named Entities Recognition.
-"""
-from nerdy.tokenizer import RichStringTokenizer, Token
-from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
-from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
-
-STOPWORDS = {'fr': FRENCH_STOPWORDS,
-             'en': ENGLISH_STOPWORDS}
-
-# XXX Add SQL source ?
-# XXX NER preprocessor
-
-###############################################################################
-### NER SOURCE ################################################################
-###############################################################################
-class AbstractNerdySource(object):
-    """ High-level source for Named Entities Recognition
-    """
-
-    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.endpoint = endpoint
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def recognize_token(self, token):
-        """ Recognize a token
-        """
-        # Applies source specific preprocessors
-        for preprocessor in self.preprocessors:
-            token = preprocessor(token)
-            if not token:
-                return []
-        if self.use_cache and token.word in self._recognized_cache:
-            return self._recognized_cache[token.word]
-        uris = self.query_word(token.word) if token.word else []
-        if self.use_cache:
-            self._recognized_cache[token.word] = uris
-        return uris
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        raise NotImplementedError
-
-
-class NerdySourceLexical(AbstractNerdySource):
-    """ Source based on a (pre-computed) dictionnary of words (token, uri)
-    """
-    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
-        self.lexicon = lexicon
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        uri = self.lexicon.get(word)
-        return [uri,] if uri else []
-
-
-class NerdySourceLocalRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Local RQL version
-    """
-
-    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.session = session
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
-
-
-class NerdySourceAppidRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Appid RQL version
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
-
-
-class NerdySourceUrlRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Url RQL version
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
-
-
-class NerdySourceSparql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    SPARQL version
-
-   >>> from nerdy.core import NerdySourceSparql
-   >>> ner_source = NerdySourceSparql('''SELECT ?uri
-                                         WHERE{
-                                         ?uri rdfs:label "%(word)s"@en}''',
-			                 'http://dbpedia.org/sparql')
-   >>> print ner_source.recognize_token('Victor Hugo')
-		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
-		     'http://dbpedia.org/resource/Victor_Hugo',
-		     'http://dbpedia.org/class/yago/VictorHugo',
-		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
-
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
-
-
-###############################################################################
-### NER PREPROCESSORS #########################################################
-###############################################################################
-class AbstractNerdyPreprocessor(object):
-    """ Preprocessor
-    """
-
-    def __call__(self, token):
-        raise NotImplementedError
-
-
-class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove token based on the size of the word
-    """
-    def __init__(self, min_size=None, max_size=None):
-        self.min_size = min_size
-        self.max_size = max_size
-
-    def __call__(self, token):
-        if ((self.min_size and len(token.word)<self.min_size)
-            or (self.max_size and len(token.word)>self.max_size)):
-            return None
-        return token
-
-
-class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove token with word in lower case
-    """
-
-    def __call__(self, token):
-        return None if token.word.islower() else token
-
-
-class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
-    """ Lower the first word of each sentence if it is a stopword.
-    """
-    def __init__(self, lang='en'):
-        self.lang = lang
-
-    def __call__(self, token):
-        if (token.start == token.sentence.start and
-            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
-            word = token.word[0].lower() + token.word[1:]
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove stopwords
-    """
-    def __init__(self, split_words=False, lang='en'):
-        self.split_words = split_words
-        self.lang = lang
-
-    def __call__(self, token):
-        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
-        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
-            return None
-        if not self.split_words and token.word.lower() in stopwords:
-            return None
-        return token
-
-
-class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
-    """ Cleanup hashtag
-    """
-    def __call__(self, token):
-        if token.word.startswith('@'):
-            # XXX Split capitalize letter ?
-            # @BarackObama -> Barack Obama
-            word = token.word[1:].replace('_', ' ')
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-###############################################################################
-### NER FILTERS ###############################################################
-###############################################################################
-class AbstractNerdyFilter(object):
-    """ A filter used for cleaning named entities results
-    """
-
-    def __call__(self, named_entities):
-        raise NotImplementedError
-
-
-class NerdyOccurenceFilter(object):
-    """ A filter based on the number of occurence of
-    named entities in the results.
-    """
-    def __init__(self, min_occ=None, max_occ=None):
-        self.min_occ = min_occ
-        self.max_occ = max_occ
-
-    def __call__(self, named_entities):
-        uris = [u for u, p, t in named_entities]
-        counts = dict([(u, uris.count(u)) for u in set(uris)])
-        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
-                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
-
-
-class NerdyRDFTypeFilter(object):
-    """ A filter based on the RDF type on entity
-    E.g.
-
-    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
-                                ('http://schema.org/Place',
-                                'http://dbpedia.org/ontology/Agent',
-                                'http://dbpedia.org/ontology/Place'))
-
-    """
-    def __init__(self, endpoint, accepted_types):
-        self.endpoint = endpoint
-        self.accepted_types = accepted_types
-        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        seen_uris = {}
-        for uri, p, t in named_entities:
-            if uri in seen_uris:
-                if seen_uris[uri]:
-                    filtered_named_entities.append((uri, p, t))
-            else:
-                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
-                types = set([r['type']['value'] for r in results])
-                if not len(types.intersection(self.accepted_types)):
-                    seen_uris[uri] = False
-                else:
-                    seen_uris[uri] = True
-                    filtered_named_entities.append((uri, p, t))
-        return filtered_named_entities
-
-
-class NerdyDisambiguationWordParts(object):
-    """ Disambiguate named entities based on the words parts.
-    E.g.:
-          'toto tutu': 'http://example.com/toto_tutu',
-          'toto': 'http://example.com/toto'
-
-          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
-          by 'http://example.com/toto_tutu'
-    """
-    def __call__(self, named_entities):
-        # Create parts dictionnary
-        parts = {}
-        for uri, peid, token in named_entities:
-            if ' ' in token.word:
-                for part in token.word.split(' '):
-                    parts[part.lower()] = uri
-        # Replace named entities
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            if token.word in parts:
-                # Change URI
-                uri = parts[token.word]
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-class NerdyReplacementRulesFilter(object):
-    """ Allow to define replacement rules for Named Entities
-    """
-    def __init__(self,rules):
-        self.rules = rules
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            uri = self.rules.get(uri, uri)
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-###############################################################################
-### NER PROCESS ###############################################################
-###############################################################################
-class NerdyProcess(object):
-    """ High-level process for Named Entities Recognition
-    """
-
-    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
-        """ Initialise the class.
-
-        :tokenizer: an instance of tokenizer
-        """
-        self.ner_sources = list(ner_sources)
-        self.preprocessors = preprocessors or []
-        self.filters = filters or []
-        self.unique = unique
-
-    def add_ner_source(self, process):
-        """ Add a ner process
-        """
-        self.ner_sources.append(process)
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def add_filters(self, filter):
-        """ Add a filter
-        """
-        self.filters.append(filter)
-
-    def process_text(self, text):
-        """ High level function for analyzing a text
-        """
-        tokenizer = RichStringTokenizer(text)
-        return self.recognize_tokens(tokenizer)
-
-    def recognize_tokens(self, tokens):
-        """ Recognize Named Entities from a tokenizer or
-        an iterator yielding tokens.
-        """
-        last_stop = 0
-        named_entities = []
-        for token in tokens:
-            if token.start < last_stop:
-                continue # this token overlaps with a previous match
-            word = token.word
-            # Applies preprocessors
-            # XXX Preprocessors may be sources dependant
-            for preprocessor in self.preprocessors:
-                token = preprocessor(token)
-                if not token:
-                    break
-            if not token:
-                continue
-            recognized = False
-            for process in self.ner_sources:
-                for uri in process.recognize_token(token):
-                    named_entities.append((uri, process.name, token))
-                    recognized = True
-                    last_stop = token.end
-                    if self.unique:
-                        break
-                if recognized and self.unique:
-                    break
-        # XXX Postprocess/filters may be sources dependant
-        return self.postprocess(named_entities)
-
-    def postprocess(self, named_entities):
-        """ Postprocess the results by applying filters """
-        for filter in self.filters:
-            named_entities = filter(named_entities)
-        return named_entities
-
-
-###############################################################################
-### NER RELATIONS PROCESS #####################################################
-###############################################################################
-class NerdyRelationsProcess(object):
-    """ Process for building simple relation from named entities results
-    """
-    pass
--- a/ner/tokenizer.py	Thu Dec 19 14:44:58 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Tokenizer for sentences/words segmentation.
-"""
-import itertools
-import collections
-import re
-
-
-Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
-Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
-
-
-class RichStringTokenizer(object):
-    """Tokenizer for Yams' RichString content.
-
-    The tokenizer uses a variable-length sliding window, i.e. a sliding
-    window yielding tokens of N words.
-    """
-
-    def __init__(self, text, token_min_size=1, token_max_size=3):
-        """
-        :token_min_size: minimum number of words required to be a valid token
-        :token_max_size: minimum number of words required to be a valid token
-        """
-        self.text = text
-        self.token_min_size = token_min_size
-        self.token_max_size = token_max_size
-
-    def iter_tokens(self, text):
-        """ Iterate tokens over a text
-        """
-        # Compute sentences
-        sentences = self.find_sentences(text)
-        # Compute words
-        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
-        indice = 0
-        while indice < len(words):
-            # Choose the current sentence of the first word
-            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
-            # Sliding windows over the different words for each sentence
-            remaining = len(words) - indice
-            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
-                _words = words[indice:indice+length]
-                if _words[-1].start() > current_sentence.end:
-                    # The last word in not in the same sentence anymore, split
-                    continue
-                normalized_word = ' '.join([w.group() for w in _words]).strip()
-                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
-            indice += 1
-
-    def find_sentences(self, text):
-        """ Find the sentences
-        """
-        return [Sentence(ind, s.start(), s.end()) for ind, s in
-                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
-
-    def load_text(self, text):
-        """ Load the text to be tokenized
-        """
-        self.text = text
-
-    def __iter__(self):
-        """ Iterator over the text given in the object instantiation
-        """
-        for t in self.iter_tokens(self.text):
-            yield t