[nerdy] Move nerdy into ner directory.
author vincent.michel@logilab.fr
Tue, 22 Oct 2013 15:53:25 +0200
changeset 343 ccd7e8632d19
parent 342 feaa3ebfc9fc
child 344 2bc47c2b1c0f
[nerdy] Move nerdy into ner directory.
__init__.py
__pkginfo__.py
core.py
dataio.py
debian/changelog
debian/compat
debian/control
debian/copyright
debian/rules
doc.rst
ner/__init__.py
ner/__pkginfo__.py
ner/core.py
ner/dataio.py
ner/debian/changelog
ner/debian/compat
ner/debian/control
ner/debian/copyright
ner/debian/rules
ner/doc.rst
ner/python-nerdy.spec
ner/setup.py
ner/stopwords.py
ner/test/test_core.py
ner/test/test_dataio.py
ner/test/test_filter.py
ner/test/test_preprocessor.py
ner/test/test_tokenizer.py
ner/tokenizer.py
python-nerdy.spec
setup.py
stopwords.py
test/test_core.py
test/test_dataio.py
test/test_filter.py
test/test_preprocessor.py
test/test_tokenizer.py
tokenizer.py
--- a/__pkginfo__.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-"""Nerdy packaging information."""
-__docformat__ = "restructuredtext en"
-import sys
-
-distname = 'nerdy'
-modname = 'nerdy'
-
-numversion = (0, 1, 0)
-version = '.'.join([str(num) for num in numversion])
-
-license = 'LGPL' # 2.1 or later
-description = "Python library for data alignment"
-web = "https://www.logilab.org/project/nerdy"
-author = "Logilab"
-author_email = "contact@logilab.fr"
-
-
-from os.path import join
-scripts = []
-include_dirs = []
-
-if sys.version_info < (2, 7):
-    install_requires = ['unittest2 >= 0.5.1']
--- a/core.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,396 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Core functions for Named Entities Recognition.
-"""
-from nerdy.tokenizer import RichStringTokenizer, Token
-from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
-from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
-
-STOPWORDS = {'fr': FRENCH_STOPWORDS,
-             'en': ENGLISH_STOPWORDS}
-
-# XXX Add SQL source ?
-# XXX NER preprocessor
-
-###############################################################################
-### NER SOURCE ################################################################
-###############################################################################
-class AbstractNerdySource(object):
-    """ High-level source for Named Entities Recognition
-    """
-
-    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.endpoint = endpoint
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def recognize_token(self, token):
-        """ Recognize a token
-        """
-        # Applies source specific preprocessors
-        for preprocessor in self.preprocessors:
-            token = preprocessor(token)
-            if not token:
-                return []
-        if self.use_cache and token.word in self._recognized_cache:
-            return self._recognized_cache[token.word]
-        uris = self.query_word(token.word) if token.word else []
-        if self.use_cache:
-            self._recognized_cache[token.word] = uris
-        return uris
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        raise NotImplementedError
-
-
-class NerdySourceLexical(AbstractNerdySource):
-    """ Source based on a (pre-computed) dictionnary of words (token, uri)
-    """
-    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
-        self.lexicon = lexicon
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        uri = self.lexicon.get(word)
-        return [uri,] if uri else []
-
-
-class NerdySourceLocalRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Local RQL version
-    """
-
-    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.session = session
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
-
-
-class NerdySourceAppidRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Appid RQL version
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
-
-
-class NerdySourceUrlRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Url RQL version
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
-
-
-class NerdySourceSparql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    SPARQL version
-
-   >>> from nerdy.core import NerdySourceSparql
-   >>> ner_source = NerdySourceSparql('''SELECT ?uri
-                                         WHERE{
-                                         ?uri rdfs:label "%(word)s"@en}''',
-			                 'http://dbpedia.org/sparql')
-   >>> print ner_source.query_word('Victor Hugo')
-		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
-		     'http://dbpedia.org/resource/Victor_Hugo',
-		     'http://dbpedia.org/class/yago/VictorHugo',
-		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
-
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
-
-
-###############################################################################
-### NER PREPROCESSORS #########################################################
-###############################################################################
-class AbstractNerdyPreprocessor(object):
-    """ Preprocessor
-    """
-
-    def __call__(self, token):
-        raise NotImplementedError
-
-
-class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove token based on the size of the word
-    """
-    def __init__(self, min_size=None, max_size=None):
-        self.min_size = min_size
-        self.max_size = max_size
-
-    def __call__(self, token):
-        if ((self.min_size and len(token.word)<self.min_size)
-            or (self.max_size and len(token.word)>self.max_size)):
-            return None
-        return token
-
-
-class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove token with word in lower case
-    """
-
-    def __call__(self, token):
-        return None if token.word.islower() else token
-
-
-class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
-    """ Lower the first word of each sentence if it is a stopword.
-    """
-    def __init__(self, lang='en'):
-        self.lang = lang
-
-    def __call__(self, token):
-        if (token.start == token.sentence.start and
-            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
-            word = token.word[0].lower() + token.word[1:]
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
-    """ Remove stopwords
-    """
-    def __init__(self, split_words=False, lang='en'):
-        self.split_words = split_words
-        self.lang = lang
-
-    def __call__(self, token):
-        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
-        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
-            return None
-        if not self.split_words and token.word.lower() in stopwords:
-            return None
-        return token
-
-
-class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
-    """ Cleanup hashtag
-    """
-    def __call__(self, token):
-        if token.word.startswith('@'):
-            # XXX Split capitalize letter ?
-            # @BarackObama -> Barack Obama
-            word = token.word[1:].replace('_', ' ')
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-###############################################################################
-### NER FILTERS ###############################################################
-###############################################################################
-class AbstractNerdyFilter(object):
-    """ A filter used for cleaning named entities results
-    """
-
-    def __call__(self, named_entities):
-        raise NotImplementedError
-
-
-class NerdyOccurenceFilter(object):
-    """ A filter based on the number of occurence of
-    named entities in the results.
-    """
-    def __init__(self, min_occ=None, max_occ=None):
-        self.min_occ = min_occ
-        self.max_occ = max_occ
-
-    def __call__(self, named_entities):
-        uris = [u for u, p, t in named_entities]
-        counts = dict([(u, uris.count(u)) for u in set(uris)])
-        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
-                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
-
-
-class NerdyRDFTypeFilter(object):
-    """ A filter based on the RDF type on entity
-    E.g.
-
-    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
-                                ('http://schema.org/Place',
-                                'http://dbpedia.org/ontology/Agent',
-                                'http://dbpedia.org/ontology/Place'))
-
-    """
-    def __init__(self, endpoint, accepted_types):
-        self.endpoint = endpoint
-        self.accepted_types = accepted_types
-        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        seen_uris = {}
-        for uri, p, t in named_entities:
-            if uri in seen_uris:
-                if seen_uris[uri]:
-                    filtered_named_entities.append((uri, p, t))
-            else:
-                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
-                types = set([r['type']['value'] for r in results])
-                if not len(types.intersection(self.accepted_types)):
-                    seen_uris[uri] = False
-                else:
-                    seen_uris[uri] = True
-                    filtered_named_entities.append((uri, p, t))
-        return filtered_named_entities
-
-
-class NerdyDisambiguationWordParts(object):
-    """ Disambiguate named entities based on the words parts.
-    E.g.:
-          'toto tutu': 'http://example.com/toto_tutu',
-          'toto': 'http://example.com/toto'
-
-          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
-          by 'http://example.com/toto_tutu'
-    """
-    def __call__(self, named_entities):
-        # Create parts dictionary
-        parts = {}
-        for uri, peid, token in named_entities:
-            if ' ' in token.word:
-                for part in token.word.split(' '):
-                    parts[part.lower()] = uri
-        # Replace named entities
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            if token.word in parts:
-                # Change URI
-                uri = parts[token.word]
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-class NerdyReplacementRulesFilter(object):
-    """ Allow to define replacement rules for Named Entities
-    """
-    def __init__(self,rules):
-        self.rules = rules
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            uri = self.rules.get(uri, uri)
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-###############################################################################
-### NER PROCESS ###############################################################
-###############################################################################
-class NerdyProcess(object):
-    """ High-level process for Named Entities Recognition
-    """
-
-    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
-        """ Initialise the class.
-
-        :tokenizer: an instance of tokenizer
-        """
-        self.ner_sources = list(ner_sources)
-        self.preprocessors = preprocessors or []
-        self.filters = filters or []
-        self.unique = unique
-
-    def add_ner_source(self, process):
-        """ Add a ner process
-        """
-        self.ner_sources.append(process)
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def add_filters(self, filter):
-        """ Add a filter
-        """
-        self.filters.append(filter)
-
-    def process_text(self, text):
-        """ High level function for analyzing a text
-        """
-        tokenizer = RichStringTokenizer(text)
-        return self.recognize_tokens(tokenizer)
-
-    def recognize_tokens(self, tokens):
-        """ Recognize Named Entities from a tokenizer or
-        an iterator yielding tokens.
-        """
-        last_stop = 0
-        named_entities = []
-        for token in tokens:
-            if token.start < last_stop:
-                continue # this token overlaps with a previous match
-            word = token.word
-            # Applies preprocessors
-            # XXX Preprocessors may be source-dependent
-            for preprocessor in self.preprocessors:
-                token = preprocessor(token)
-                if not token:
-                    break
-            if not token:
-                continue
-            recognized = False
-            for process in self.ner_sources:
-                for uri in process.recognize_token(token):
-                    named_entities.append((uri, process.name, token))
-                    recognized = True
-                    last_stop = token.end
-                    if self.unique:
-                        break
-                if recognized and self.unique:
-                    break
-        # XXX Postprocess/filters may be source-dependent
-        return self.postprocess(named_entities)
-
-    def postprocess(self, named_entities):
-        """ Postprocess the results by applying filters """
-        for filter in self.filters:
-            named_entities = filter(named_entities)
-        return named_entities
-
-
-###############################################################################
-### NER RELATIONS PROCESS #####################################################
-###############################################################################
-class NerdyRelationsProcess(object):
-    """ Process for building simple relation from named entities results
-    """
-    pass
--- a/dataio.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,140 +0,0 @@
-# -*- coding: utf-8 -*-
-""" IO for Named Entities Recognition.
-"""
-import json
-import urllib
-import lxml.etree as ET
-
-
-###############################################################################
-### SPARQL UTILITIES ##########################################################
-###############################################################################
-def sparql_query(query, endpoint):
-    """ Execute a query on an endpoint:
-
-    sparql_query(query=u'''SELECT ?uri ?type
-                           WHERE{
-                           ?uri rdfs:label "Python"@en .
-                           ?uri rdf:type ?type}''',
-                           endpoint=u'http://dbpedia.org/sparql')
-    """
-    from SPARQLWrapper import SPARQLWrapper, JSON
-    sparql = SPARQLWrapper(endpoint)
-    sparql.setQuery(query)
-    sparql.setReturnFormat(JSON)
-    try:
-        rawresults = sparql.query().convert()
-        labels = rawresults['head']['vars']
-        return rawresults["results"]["bindings"]
-    except:
-        print 'Error in sparql query'
-        return []
-
-
-###############################################################################
-### RQL UTILITIES #############################################################
-###############################################################################
-def get_cw_cnx(endpoint):
-    """ Get a cnx on a CubicWeb database
-    """
-    from cubicweb import dbapi
-    from cubicweb.cwconfig import CubicWebConfiguration
-    from cubicweb.entities import AnyEntity
-    CubicWebConfiguration.load_cwctl_plugins()
-    config = CubicWebConfiguration.config_for(endpoint)
-    sourceinfo = config.sources()['admin']
-    login = sourceinfo['login']
-    password = sourceinfo['password']
-    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
-    req = cnx.request()
-    return req
-
-def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
-    """ Execute a query on an appid endpoint:
-
-    rql_query('Any X WHERE X label "Python"', 'localhost')
-
-    Additional arguments can be passed to be properly substituted
-    in the execute() function.
-    """
-    if endpoint in _cache_cnx:
-        cnx = _cache_cnx[endpoint]
-    else:
-        cnx = get_cw_cnx(endpoint)
-        _cache_cnx[endpoint] = cnx
-    return cnx.execute(query, kwargs)
-
-def rql_url_query(query, endpoint):
-    """ Execute a query on an url endpoint:
-
-    rql_query('Any X WHERE X label "Python"', 'localhost')
-    """
-    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
-    return json.loads(urllib.urlopen(url).read())
-
-
-###############################################################################
-### OUTPUT UTILITIES ##########################################################
-###############################################################################
-class AbstractNerdyPrettyPrint(object):
-    """ Pretty print the output of a Nerdy process
-    """
-
-    def pprint_text(self, text, named_entities, **kwargs):
-        newtext = u''
-        indice = 0
-        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
-        while indice < len(text):
-            if indice in tindices:
-                uri, t = tindices[indice]
-                words = text[t.start:t.end]
-                fragment = self.pprint_entity(uri, words, **kwargs)
-                if not self.is_valid(newtext+fragment+text[t.end:]):
-                    fragment = words
-                newtext += fragment
-                indice = t.end
-            else:
-                newtext += text[indice]
-                indice += 1
-        return newtext
-
-    def pprint_entity(self, uri, word, **kwargs):
-        """ Pretty print an entity """
-        raise NotImplementedError
-
-    def is_valid(self, newtext):
-        """Override to check the validity of the prettified content at each
-        enrichment step"""
-        return True
-
-
-class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
-    """ Pretty print the output of a Nerdy process
-    """
-
-    def pprint_entity(self, uri, word, **kwargs):
-        """ Pretty print an entity """
-        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
-        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
-
-
-class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
-
-    XHTML_DOC_TEMPLATE = '''\
-<?xml version="1.0" encoding="UTF-8" ?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
-<title>nerdy</title>
-</head>
-<body><div>%s</div></body>
-</html>'''
-
-    def is_valid(self, html):
-        try:
-            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
-                          parser=ET.XMLParser(dtd_validation=True))
-        except ET.XMLSyntaxError:
-            return False
-        return True
--- a/debian/changelog	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-nerdy (0.1.0-1) unstable; urgency=low
-
-  * Initial release of the Nerdy package for Named Entities Recognition in Python.
-
- -- Vincent michel <Vincent.Michel@logilab.fr>  Tue, 11 Jun 2013 13:59:22 +0200
-
--- a/debian/compat	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-7
--- a/debian/control	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-Source: nerdy
-Section: python
-Priority: optional
-Maintainer: LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
-Build-Depends: debhelper (>= 7), python (>=2.5), python-support
-Standards-Version: 3.9.3
-XS-Python-Version: >= 2.5
-
-Package: python-nerdy
-Architecture: all
-Depends: ${python:Depends}
-Description: Python library for Named Entities Recognition.
--- a/debian/copyright	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-Upstream Author:
-
-  LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
-
-Copyright:
-
-Copyright (c) 2013 LOGILAB S.A. (Paris, FRANCE).
-http://www.logilab.fr -- mailto:contact@logilab.fr
--- a/debian/rules	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-#!/usr/bin/make -f
-# Sample debian/rules that uses debhelper.
-# GNU copyright 1997 to 1999 by Joey Hess.
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-build: build-arch build-indep
-build-arch:
-	# Nothing to do
-build-indep: build-stamp
-build-stamp:
-	dh_testdir
-	NO_SETUPTOOLS=1 python setup.py -q build
-	touch build-stamp
-
-clean:
-	dh_testdir
-	dh_testroot
-	rm -f build-stamp configure-stamp
-	rm -rf build
-	find . -name "*.pyc" | xargs rm -f
-	dh_clean
-
-install: build
-	dh_testdir
-	dh_testroot
-	dh_clean -k
-	dh_installdirs -i
-	NO_SETUPTOOLS=1 python setup.py -q install --no-compile --prefix=debian/python-nerdy/usr/
-
-
-# Build architecture-independent files here.
-binary-indep: build install
-	dh_testdir
-	dh_testroot
-	dh_install -i
-	dh_installchangelogs -i
-	dh_installexamples -i
-	dh_installdocs -i
-	dh_installman -i
-	dh_pysupport -i
-	dh_link -i
-	dh_compress -i -X.py -X.ini -X.xml -Xtest
-	dh_fixperms -i
-	dh_installdeb -i
-	dh_gencontrol -i
-	dh_md5sums -i
-	dh_builddeb -i
-
-
-# Build architecture-dependent files here.
-binary-arch:
-
-binary: binary-indep
-.PHONY: build clean binary-arch binary-indep binary
--- a/doc.rst	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-=====================================================
- NERDY - A Named Entities Recognition Python Library
-=====================================================
-
-Examples of NerdySource
-=======================
-
-
-NerdySourceSparql
------------------
-
-Simple NerdySourceSparql on Dbpedia sparql endpoint::
-
-   .. sourcecode:: python
-
-   >>> from nerdy.core import NerdySourceSparql
-   >>> ner_source = NerdySourceSparql('''SELECT distinct ?uri
-                                         WHERE{
-                                         ?uri rdfs:label "%(word)s"@en}''',
-			                 'http://dbpedia.org/sparql')
-   >>> print ner_source.query_word('Victor Hugo')
-   ...     ['http://dbpedia.org/resource/Category:Victor_Hugo',
-	    'http://dbpedia.org/resource/Victor_Hugo',
-	    'http://dbpedia.org/class/yago/VictorHugo',
-	    'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-	    'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-	    'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
-
-
-With restriction in the SPARQL query::
-
-   .. sourcecode:: python
-
-   >>> from nerdy.core import NerdySourceSparql
-   >>> ner_source = NerdySourceSparql('''SELECT distinct ?uri
-                                         WHERE{
-                                         ?uri rdfs:label "%(word)s"@en .
-                                         ?p foaf:primaryTopic ?uri}''',
-			                 'http://dbpedia.org/sparql')
-   >>> print ner_source.query_word('Victor Hugo')
-   ...    ['http://dbpedia.org/resource/Victor_Hugo']
-
-
-
-NerdySourceUrlRql
------------------
-
-Simple NerdySourceUrlRql on a Rql endpoint::
-
-   .. sourcecode:: python
-
-   >>> from nerdy.core import NerdySourceUrlRql
-   >>> ner_source = NerdySourceUrlRql('Any U WHERE X cwuri U, X name "%(word)s"',
-		                        'http://www.cubicweb.org')
-   >>> print ner_source.query_word('apycot')
-   ...     [u'http://www.cubicweb.org/1310453', u'http://www.cubicweb.org/749162']
-
-
-
-Examples of full Nerdy process
-==============================
-
-
-1 - Define some text
---------------------
-
-For example, this text comes from Dbpedia (http://dbpedia.org/page/Victor_Hugo)::
-
-    .. sourcecode:: python
-
-   >>> from nerdy import core, dataio
-
-   >>> text = u"""Victor Hugo, né le 26 février 1802 à Besançon et mort le 22 mai 1885 à Paris, est un poète, dramaturge et prosateur romantique considéré comme l'un des plus importants écrivains de langue française. Il est aussi une personnalité politique et un intellectuel engagé qui a compté dans l'Histoire du XIX siècle. Victor Hugo occupe une place marquante dans l'histoire des lettres françaises au XIX siècle, dans des genres et des domaines d'une remarquable variété. Il est poète lyrique avec des recueils comme Odes et Ballades (1826), Les Feuilles d'automne (1831) ou Les Contemplations (1856), mais il est aussi poète engagé contre Napoléon III dans Les Châtiments (1853) ou encore poète épique avec La Légende des siècles (1859 et 1877). Il est également un romancier du peuple qui rencontre un grand succès populaire avec par exemple Notre-Dame de Paris (1831), et plus encore avec Les Misérables (1862). Au théâtre, il expose sa théorie du drame romantique dans sa préface de Cromwell en 1827 et l'illustre principalement avec Hernani en 1830 et Ruy Blas en 1838. Son œuvre multiple comprend aussi des discours politiques à la Chambre des pairs, à l'Assemblée constituante et à l'Assemblée législative, notamment sur la peine de mort, l'école ou l'Europe, des récits de voyages (Le Rhin, 1842, ou Choses vues, posthumes, 1887 et 1890), et une correspondance abondante. Victor Hugo a fortement contribué au renouvellement de la poésie et du théâtre ; il a été admiré par ses contemporains et l'est encore, mais il a été aussi contesté par certains auteurs modernes. Il a aussi permis à de nombreuses générations de développer une réflexion sur l'engagement de l'écrivain dans la vie politique et sociale grâce à ses multiples prises de position qui le condamneront à l'exil pendant les vingt ans du Second Empire. Ses choix, à la fois moraux et politiques, durant la deuxième partie de sa vie, et son œuvre hors du commun ont fait de lui un personnage emblématique que la Troisième République a honoré à sa mort le 22 mai 1885 par des funérailles nationales qui ont accompagné le transfert de sa dépouille au Panthéon de Paris, le 31 mai 1885."""
-
-
-2 - Define a source
--------------------
-
-Now, define a source for the Named Entities::
-
-    .. sourcecode:: python
-
-    >>> dbpedia_sparql_source = core.NerdySourceSparql('''SELECT distinct ?uri
-             		       				 WHERE{
- 							 ?uri rdfs:label "%(word)s"@en .
- 							 ?p foaf:primaryTopic ?uri}''',
- 							 'http://dbpedia.org/sparql',
- 							 use_cache=True)
-    >>> nerdy_sources = [dbpedia_sparql_source,]
-
-
-3 - Define some preprocessors
------------------------------
-
-Define some preprocessors that will cleanup the words before matching::
-
-    .. sourcecode:: python
-
-    >>> preprocessors = [core.NerdyLowerCaseFilterPreprocessor(),
-        	         core.NerdyStopwordsFilterPreprocessor()]
-
-
-4 - Define the Nerdy process
-----------------------------
-
-Define the process and process the text::
-
-    .. sourcecode:: python
-
-    >>> nerdy = core.NerdyProcess(nerdy_sources, preprocessors=preprocessors)
-    >>> named_entities = nerdy.process_text(text)
-    >>> print named_entities
-
-
-5 - Pretty print the output
-----------------------------
-
-And finally, we can print the output as HTML with links::
-
-    .. sourcecode:: python
-
-    >>> html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
-    >>> print html
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/__pkginfo__.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,39 @@
+# -*- coding:utf-8 -*-
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+"""Nerdy packaging information."""
+__docformat__ = "restructuredtext en"
+import sys
+
+distname = 'nerdy'
+modname = 'nerdy'
+
+numversion = (0, 1, 0)
+version = '.'.join([str(num) for num in numversion])
+
+license = 'LGPL' # 2.1 or later
+description = "Python library for Named Entities Recognition"
+web = "https://www.logilab.org/project/nerdy"
+author = "Logilab"
+author_email = "contact@logilab.fr"
+
+
+from os.path import join
+scripts = []
+include_dirs = []
+
+if sys.version_info < (2, 7):
+    install_requires = ['unittest2 >= 0.5.1']
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/core.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,396 @@
+# -*- coding: utf-8 -*-
+""" Core functions for Named Entities Recognition.
+"""
+from nerdy.tokenizer import RichStringTokenizer, Token
+from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
+from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
+
+STOPWORDS = {'fr': FRENCH_STOPWORDS,
+             'en': ENGLISH_STOPWORDS}
+
+# XXX Add SQL source ?
+# XXX NER preprocessor
+
+###############################################################################
+### NER SOURCE ################################################################
+###############################################################################
+class AbstractNerdySource(object):
+    """ High-level source for Named Entities Recognition
+    """
+
+    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.query = query
+        self.endpoint = endpoint
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def recognize_token(self, token):
+        """ Recognize a token
+        """
+        # Applies source specific preprocessors
+        for preprocessor in self.preprocessors:
+            token = preprocessor(token)
+            if not token:
+                return []
+        if self.use_cache and token.word in self._recognized_cache:
+            return self._recognized_cache[token.word]
+        uris = self.query_word(token.word) if token.word else []
+        if self.use_cache:
+            self._recognized_cache[token.word] = uris
+        return uris
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        raise NotImplementedError
+
+
+class NerdySourceLexical(AbstractNerdySource):
+    """ Source based on a (pre-computed) dictionnary of words (token, uri)
+    """
+    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
+        self.lexicon = lexicon
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        uri = self.lexicon.get(word)
+        return [uri,] if uri else []
+
+
+class NerdySourceLocalRql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    Local RQL version
+    """
+
+    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.query = query
+        self.session = session
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
+
+
+class NerdySourceAppidRql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    Appid RQL version
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
+
+
+class NerdySourceUrlRql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    Url RQL version
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
+
+
+class NerdySourceSparql(AbstractNerdySource):
+    """ High-level source for Named Entities Recognition
+    SPARQL version
+
+   >>> from nerdy.core import NerdySourceSparql
+   >>> ner_source = NerdySourceSparql('''SELECT ?uri
+                                         WHERE{
+                                         ?uri rdfs:label "%(word)s"@en}''',
+			                 'http://dbpedia.org/sparql')
+   >>> print ner_source.query_word('Victor Hugo')
+		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
+		     'http://dbpedia.org/resource/Victor_Hugo',
+		     'http://dbpedia.org/class/yago/VictorHugo',
+		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
+
+
+###############################################################################
+### NER PREPROCESSORS #########################################################
+###############################################################################
+class AbstractNerdyPreprocessor(object):
+    """ Preprocessor
+    """
+
+    def __call__(self, token):
+        raise NotImplementedError
+
+
+class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
+    """ Remove token based on the size of the word
+    """
+    def __init__(self, min_size=None, max_size=None):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, token):
+        if ((self.min_size and len(token.word)<self.min_size)
+            or (self.max_size and len(token.word)>self.max_size)):
+            return None
+        return token
+
+
+class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
+    """ Remove token with word in lower case
+    """
+
+    def __call__(self, token):
+        return None if token.word.islower() else token
+
+
+class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
+    """ Lower the first word of each sentence if it is a stopword.
+    """
+    def __init__(self, lang='en'):
+        self.lang = lang
+
+    def __call__(self, token):
+        if (token.start == token.sentence.start and
+            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
+            word = token.word[0].lower() + token.word[1:]
+            return Token(word, token.start, token.end, token.sentence)
+        return token
+
+
+class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
+    """ Remove stopwords
+    """
+    def __init__(self, split_words=False, lang='en'):
+        self.split_words = split_words
+        self.lang = lang
+
+    def __call__(self, token):
+        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
+        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
+            return None
+        if not self.split_words and token.word.lower() in stopwords:
+            return None
+        return token
+
+
+class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
+    """ Cleanup hashtag
+    """
+    def __call__(self, token):
+        if token.word.startswith('@'):
+            # XXX Split capitalize letter ?
+            # @BarackObama -> Barack Obama
+            word = token.word[1:].replace('_', ' ')
+            return Token(word, token.start, token.end, token.sentence)
+        return token
+
+
+###############################################################################
+### NER FILTERS ###############################################################
+###############################################################################
+class AbstractNerdyFilter(object):
+    """ A filter used for cleaning named entities results
+    """
+
+    def __call__(self, named_entities):
+        raise NotImplementedError
+
+
+class NerdyOccurenceFilter(object):
+    """ A filter based on the number of occurence of
+    named entities in the results.
+    """
+    def __init__(self, min_occ=None, max_occ=None):
+        self.min_occ = min_occ
+        self.max_occ = max_occ
+
+    def __call__(self, named_entities):
+        uris = [u for u, p, t in named_entities]
+        counts = dict([(u, uris.count(u)) for u in set(uris)])
+        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
+                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
+
+
+class NerdyRDFTypeFilter(object):
+    """ A filter based on the RDF type on entity
+    E.g.
+
+    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
+                                ('http://schema.org/Place',
+                                'http://dbpedia.org/ontology/Agent',
+                                'http://dbpedia.org/ontology/Place'))
+
+    """
+    def __init__(self, endpoint, accepted_types):
+        self.endpoint = endpoint
+        self.accepted_types = accepted_types
+        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        seen_uris = {}
+        for uri, p, t in named_entities:
+            if uri in seen_uris:
+                if seen_uris[uri]:
+                    filtered_named_entities.append((uri, p, t))
+            else:
+                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
+                types = set([r['type']['value'] for r in results])
+                if not len(types.intersection(self.accepted_types)):
+                    seen_uris[uri] = False
+                else:
+                    seen_uris[uri] = True
+                    filtered_named_entities.append((uri, p, t))
+        return filtered_named_entities
+
+
+class NerdyDisambiguationWordParts(object):
+    """ Disambiguate named entities based on the words parts.
+    E.g.:
+          'toto tutu': 'http://example.com/toto_tutu',
+          'toto': 'http://example.com/toto'
+
+          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
+          by 'http://example.com/toto_tutu'
+    """
+    def __call__(self, named_entities):
+        # Create parts dictionary
+        parts = {}
+        for uri, peid, token in named_entities:
+            if ' ' in token.word:
+                for part in token.word.split(' '):
+                    parts[part.lower()] = uri
+        # Replace named entities
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            if token.word in parts:
+                # Change URI
+                uri = parts[token.word]
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
+
+
+class NerdyReplacementRulesFilter(object):
+    """ Allow to define replacement rules for Named Entities
+    """
+    def __init__(self,rules):
+        self.rules = rules
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            uri = self.rules.get(uri, uri)
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
+
+
+###############################################################################
+### NER PROCESS ###############################################################
+###############################################################################
+class NerdyProcess(object):
+    """ High-level process for Named Entities Recognition
+    """
+
+    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
+        """ Initialise the class.
+
+        :tokenizer: an instance of tokenizer
+        """
+        self.ner_sources = list(ner_sources)
+        self.preprocessors = preprocessors or []
+        self.filters = filters or []
+        self.unique = unique
+
+    def add_ner_source(self, process):
+        """ Add a ner process
+        """
+        self.ner_sources.append(process)
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def add_filters(self, filter):
+        """ Add a filter
+        """
+        self.filters.append(filter)
+
+    def process_text(self, text):
+        """ High level function for analyzing a text
+        """
+        tokenizer = RichStringTokenizer(text)
+        return self.recognize_tokens(tokenizer)
+
+    def recognize_tokens(self, tokens):
+        """ Recognize Named Entities from a tokenizer or
+        an iterator yielding tokens.
+        """
+        last_stop = 0
+        named_entities = []
+        for token in tokens:
+            if token.start < last_stop:
+                continue # this token overlaps with a previous match
+            word = token.word
+            # Applies preprocessors
+            # XXX Preprocessors may be source-dependent
+            for preprocessor in self.preprocessors:
+                token = preprocessor(token)
+                if not token:
+                    break
+            if not token:
+                continue
+            recognized = False
+            for process in self.ner_sources:
+                for uri in process.recognize_token(token):
+                    named_entities.append((uri, process.name, token))
+                    recognized = True
+                    last_stop = token.end
+                    if self.unique:
+                        break
+                if recognized and self.unique:
+                    break
+        # XXX Postprocess/filters may be source-dependent
+        return self.postprocess(named_entities)
+
+    def postprocess(self, named_entities):
+        """ Postprocess the results by applying filters """
+        for filter in self.filters:
+            named_entities = filter(named_entities)
+        return named_entities
+
+
+###############################################################################
+### NER RELATIONS PROCESS #####################################################
+###############################################################################
+class NerdyRelationsProcess(object):
+    """ Process for building simple relation from named entities results
+    """
+    pass
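
A minimal usage sketch of the classes added in ner/core.py above (not part of this changeset): it wires a NerdySourceLexical source into a NerdyProcess and runs it over a short string. The lexicon URI and the sample sentence are made up for illustration, and the exact tokens produced depend on RichStringTokenizer in ner/tokenizer.py.

    # Hypothetical example, not part of the patch: in-memory NER with the
    # NerdySourceLexical and NerdyProcess classes defined in core.py.
    from nerdy.core import NerdySourceLexical, NerdyProcess

    lexicon_source = NerdySourceLexical(
        {u'Victor Hugo': u'http://example.org/victor_hugo'},  # made-up URI
        name='lexicon')
    process = NerdyProcess([lexicon_source])
    named_entities = process.process_text(u'Victor Hugo was born in 1802.')
    # named_entities is a list of (uri, source name, token) tuples, e.g.
    # (u'http://example.org/victor_hugo', 'lexicon', <Token ...>)
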
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/dataio.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+""" IO for Named Entities Recognition.
+"""
+import json
+import urllib
+import lxml.etree as ET
+
+
+###############################################################################
+### SPARQL UTILITIES ##########################################################
+###############################################################################
+def sparql_query(query, endpoint):
+    """ Execute a query on an endpoint:
+
+    sparql_query(query=u'''SELECT ?uri ?type
+                           WHERE{
+                           ?uri rdfs:label "Python"@en .
+                           ?uri rdf:type ?type}''',
+                           endpoint=u'http://dbpedia.org/sparql')
+    """
+    from SPARQLWrapper import SPARQLWrapper, JSON
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    try:
+        rawresults = sparql.query().convert()
+        labels = rawresults['head']['vars']
+        return rawresults["results"]["bindings"]
+    except:
+        print 'Error in sparql query'
+        return []
+
+
+###############################################################################
+### RQL UTILITIES #############################################################
+###############################################################################
+def get_cw_cnx(endpoint):
+    """ Get a cnx on a CubicWeb database
+    """
+    from cubicweb import dbapi
+    from cubicweb.cwconfig import CubicWebConfiguration
+    from cubicweb.entities import AnyEntity
+    CubicWebConfiguration.load_cwctl_plugins()
+    config = CubicWebConfiguration.config_for(endpoint)
+    sourceinfo = config.sources()['admin']
+    login = sourceinfo['login']
+    password = sourceinfo['password']
+    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
+    req = cnx.request()
+    return req
+
+def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
+    """ Execute a query on an appid endpoint:
+
+    rql_query('Any X WHERE X label "Python"', 'localhost')
+
+    Additional arguments can be passed to be properly substituted
+    in the execute() function.
+    """
+    if endpoint in _cache_cnx:
+        cnx = _cache_cnx[endpoint]
+    else:
+        cnx = get_cw_cnx(endpoint)
+        _cache_cnx[endpoint] = cnx
+    return cnx.execute(query, kwargs)
+
+def rql_url_query(query, endpoint):
+    """ Execute a query on an url endpoint:
+
+    rql_query('Any X WHERE X label "Python"', 'localhost')
+    """
+    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
+    return json.loads(urllib.urlopen(url).read())
+
+
+###############################################################################
+### OUTPUT UTILITIES ##########################################################
+###############################################################################
+class AbstractNerdyPrettyPrint(object):
+    """ Pretty print the output of a Nerdy process
+    """
+
+    def pprint_text(self, text, named_entities, **kwargs):
+        newtext = u''
+        indice = 0
+        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
+        while indice < len(text):
+            if indice in tindices:
+                uri, t = tindices[indice]
+                words = text[t.start:t.end]
+                fragment = self.pprint_entity(uri, words, **kwargs)
+                if not self.is_valid(newtext+fragment+text[t.end:]):
+                    fragment = words
+                newtext += fragment
+                indice = t.end
+            else:
+                newtext += text[indice]
+                indice += 1
+        return newtext
+
+    def pprint_entity(self, uri, word, **kwargs):
+        """ Pretty print an entity """
+        raise NotImplementedError
+
+    def is_valid(self, newtext):
+        """Override to check the validity of the prettified content at each
+        enrichment step"""
+        return True
+
+
+class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
+    """ Pretty print the output of a Nerdy process
+    """
+
+    def pprint_entity(self, uri, word, **kwargs):
+        """ Pretty print an entity """
+        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
+        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
+
+
+class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
+
+    XHTML_DOC_TEMPLATE = '''\
+<?xml version="1.0" encoding="UTF-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
+<title>nerdy</title>
+</head>
+<body><div>%s</div></body>
+</html>'''
+
+    def is_valid(self, html):
+        try:
+            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
+                          parser=ET.XMLParser(dtd_validation=True))
+        except ET.XMLSyntaxError:
+            return False
+        return True
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/debian/changelog	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,6 @@
+nerdy (0.1.0-1) unstable; urgency=low
+
+  * Initial release of the Nerdy package for Named Entities Recognition in Python.
+
+ -- Vincent michel <Vincent.Michel@logilab.fr>  Tue, 11 Jun 2013 13:59:22 +0200
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/debian/compat	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,1 @@
+7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/debian/control	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,12 @@
+Source: nerdy
+Section: python
+Priority: optional
+Maintainer: LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
+Build-Depends: debhelper (>= 7), python (>=2.5), python-support
+Standards-Version: 3.9.3
+XS-Python-Version: >= 2.5
+
+Package: python-nerdy
+Architecture: all
+Depends: ${python:Depends}
+Description: Python library for Named Entities Recognition.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/debian/copyright	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,8 @@
+Upstream Author:
+
+  LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
+
+Copyright:
+
+Copyright (c) 2013 LOGILAB S.A. (Paris, FRANCE).
+http://www.logilab.fr -- mailto:contact@logilab.fr
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/debian/rules	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,55 @@
+#!/usr/bin/make -f
+# Sample debian/rules that uses debhelper.
+# GNU copyright 1997 to 1999 by Joey Hess.
+
+# Uncomment this to turn on verbose mode.
+#export DH_VERBOSE=1
+build: build-arch build-indep
+build-arch:
+	# Nothing to do
+build-indep: build-stamp
+build-stamp:
+	dh_testdir
+	NO_SETUPTOOLS=1 python setup.py -q build
+	touch build-stamp
+
+clean:
+	dh_testdir
+	dh_testroot
+	rm -f build-stamp configure-stamp
+	rm -rf build
+	find . -name "*.pyc" | xargs rm -f
+	dh_clean
+
+install: build
+	dh_testdir
+	dh_testroot
+	dh_clean -k
+	dh_installdirs -i
+	NO_SETUPTOOLS=1 python setup.py -q install --no-compile --prefix=debian/python-nerdy/usr/
+
+
+# Build architecture-independent files here.
+binary-indep: build install
+	dh_testdir
+	dh_testroot
+	dh_install -i
+	dh_installchangelogs -i
+	dh_installexamples -i
+	dh_installdocs -i
+	dh_installman -i
+	dh_pysupport -i
+	dh_link -i
+	dh_compress -i -X.py -X.ini -X.xml -Xtest
+	dh_fixperms -i
+	dh_installdeb -i
+	dh_gencontrol -i
+	dh_md5sums -i
+	dh_builddeb -i
+
+
+# Build architecture-dependent files here.
+binary-arch:
+
+binary: binary-indep
+.PHONY: build clean binary-arch binary-indep binary
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/doc.rst	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,123 @@
+=====================================================
+ NERDY - A Named Entities Recognition Python Library
+=====================================================
+
+Examples of NerdySource
+=======================
+
+
+NerdySourceSparql
+-----------------
+
+Simple NerdySourceSparql on Dbpedia sparql endpoint::
+
+   .. sourcecode:: python
+
+   >>> from nerdy.core import NerdySourceSparql
+   >>> ner_source = NerdySourceSparql('''SELECT distinct ?uri
+                                         WHERE{
+                                         ?uri rdfs:label "%(word)s"@en}''',
+			                 'http://dbpedia.org/sparql')
+   >>> print ner_source.query_word('Victor Hugo')
+   ...     ['http://dbpedia.org/resource/Category:Victor_Hugo',
+	    'http://dbpedia.org/resource/Victor_Hugo',
+	    'http://dbpedia.org/class/yago/VictorHugo',
+	    'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+	    'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+	    'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+
+
+With restriction in the SPARQL query::
+
+   .. sourcecode:: python
+
+   >>> from nerdy.core import NerdySourceSparql
+   >>> ner_source = NerdySourceSparql('''SELECT distinct ?uri
+                                         WHERE{
+                                         ?uri rdfs:label "%(word)s"@en .
+                                         ?p foaf:primaryTopic ?uri}''',
+			                 'http://dbpedia.org/sparql')
+   >>> print ner_source.query_word('Victor Hugo')
+   ...    ['http://dbpedia.org/resource/Victor_Hugo']
+
+
+
+NerdySourceUrlRql
+-----------------
+
+Simple NerdySourceUrlRql on a Rql endpoint::
+
+   .. sourcecode:: python
+
+   >>> from nerdy.core import NerdySourceUrlRql
+   >>> ner_source = NerdySourceUrlRql('Any U WHERE X cwuri U, X name "%(word)s"',
+		                        'http://www.cubicweb.org')
+   >>> print ner_source.query_word('apycot')
+   ...     [u'http://www.cubicweb.org/1310453', u'http://www.cubicweb.org/749162']
+
+
+
+Examples of full Nerdy process
+==============================
+
+
+1 - Define some text
+--------------------
+
+For example, this text comes from Dbpedia (http://dbpedia.org/page/Victor_Hugo)::
+
+    .. sourcecode:: python
+
+   >>> from nerdy import core, dataio
+
+   >>> text = u"""Victor Hugo, né le 26 février 1802 à Besançon et mort le 22 mai 1885 à Paris, est un poète, dramaturge et prosateur romantique considéré comme l'un des plus importants écrivains de langue française. Il est aussi une personnalité politique et un intellectuel engagé qui a compté dans l'Histoire du XIX siècle. Victor Hugo occupe une place marquante dans l'histoire des lettres françaises au XIX siècle, dans des genres et des domaines d'une remarquable variété. Il est poète lyrique avec des recueils comme Odes et Ballades (1826), Les Feuilles d'automne (1831) ou Les Contemplations (1856), mais il est aussi poète engagé contre Napoléon III dans Les Châtiments (1853) ou encore poète épique avec La Légende des siècles (1859 et 1877). Il est également un romancier du peuple qui rencontre un grand succès populaire avec par exemple Notre-Dame de Paris (1831), et plus encore avec Les Misérables (1862). Au théâtre, il expose sa théorie du drame romantique dans sa préface de Cromwell en 1827 et l'illustre principalement avec Hernani en 1830 et Ruy Blas en 1838. Son œuvre multiple comprend aussi des discours politiques à la Chambre des pairs, à l'Assemblée constituante et à l'Assemblée législative, notamment sur la peine de mort, l'école ou l'Europe, des récits de voyages (Le Rhin, 1842, ou Choses vues, posthumes, 1887 et 1890), et une correspondance abondante. Victor Hugo a fortement contribué au renouvellement de la poésie et du théâtre ; il a été admiré par ses contemporains et l'est encore, mais il a été aussi contesté par certains auteurs modernes. Il a aussi permis à de nombreuses générations de développer une réflexion sur l'engagement de l'écrivain dans la vie politique et sociale grâce à ses multiples prises de position qui le condamneront à l'exil pendant les vingt ans du Second Empire. Ses choix, à la fois moraux et politiques, durant la deuxième partie de sa vie, et son œuvre hors du commun ont fait de lui un personnage emblématique que la Troisième République a honoré à sa mort le 22 mai 1885 par des funérailles nationales qui ont accompagné le transfert de sa dépouille au Panthéon de Paris, le 31 mai 1885."""
+
+
+2 - Define a source
+-------------------
+
+Now, define a source for the Named Entities::
+
+    .. sourcecode:: python
+
+    >>> dbpedia_sparql_source = core.NerdySourceSparql('''SELECT distinct ?uri
+             		       				 WHERE{
+ 							 ?uri rdfs:label "%(word)s"@en .
+ 							 ?p foaf:primaryTopic ?uri}''',
+ 							 'http://dbpedia.org/sparql',
+ 							 use_cache=True)
+    >>> nerdy_sources = [dbpedia_sparql_source,]
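+
+The source can be queried directly for a word. With the query above, this
+should give something like (the exact URIs returned by DBpedia may change
+over time)::
+
+    .. sourcecode:: python
+
+    >>> print dbpedia_sparql_source.query_word('Victor Hugo')
+    ['http://dbpedia.org/resource/Victor_Hugo']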
+
+
+3 - Define some preprocessors
+-----------------------------
+
+Define some preprocessors that will clean up the words before matching::
+
+    .. sourcecode:: python
+
+    >>> preprocessors = [core.NerdyLowerCaseFilterPreprocessor(),
+        	         core.NerdyStopwordsFilterPreprocessor()]
+
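+A preprocessor is a callable that takes a token and returns it (possibly
+modified), or None to discard it. A minimal illustration, mirroring the unit
+tests::
+
+    .. sourcecode:: python
+
+    >>> from nerdy.tokenizer import Token
+    >>> stopwords_filter = core.NerdyStopwordsFilterPreprocessor()
+    >>> stopwords_filter(Token('Toto', 0, 4, None))
+    Token(word='Toto', start=0, end=4, sentence=None)
+    >>> stopwords_filter(Token('Us', 0, 4, None))   # a stopword: discarded (returns None)
+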
+
+4 - Define the Nerdy process
+----------------------------
+
+Define the process and process the text::
+
+    .. sourcecode:: python
+
+    >>> nerdy = core.NerdyProcess(nerdy_sources, preprocessors=preprocessors)
+    >>> named_entities = nerdy.process_text(text)
+    >>> print named_entities
+
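+Each item of named_entities is a 3-tuple holding the URI found for the token,
+the name of the matching source (None when the source was not given one, as
+here), and the recognized token. With a purely lexical source the structure
+looks as follows (mirroring the unit tests; the DBpedia results for the text
+above will of course differ)::
+
+    .. sourcecode:: python
+
+    >>> source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+    ...                                   'me': 'http://example.com/me'})
+    >>> core.NerdyProcess((source,)).process_text('Hello everyone, this is   me speaking. And me.')[0]
+    ('http://example.com/everyone', None, Token(word='everyone', start=6, end=14, sentence=Sentence(indice=0, start=0, end=38)))
+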
+
+5 - Pretty print the output
+---------------------------
+
+And finally, we can print the output as HTML with links::
+
+    .. sourcecode:: python
+
+    >>> html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
+    >>> print html
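+
+The pretty printer can also add a CSS class on the generated links, through
+its html_class argument (this is exercised in test/test_dataio.py)::
+
+    .. sourcecode:: python
+
+    >>> html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
+    >>> print html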
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/python-nerdy.spec	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,48 @@
+%if 0%{?el5}
+%define python python26
+%define __python /usr/bin/python2.6
+%{!?python_scriptarch: %define python_scriptarch %(%{__python} -c "from distutils.sysconfig import get_python_lib; from os.path import join; print join(get_python_lib(1, 1), 'scripts')")}
+%else
+%define python python
+%define __python /usr/bin/python
+%endif
+
+Name:           %{python}-nerdy
+Version:        0.1.0
+Release:        logilab.1%{?dist}
+Summary:        Python library for data alignment
+Group:          Development/Languages/Python
+License:        LGPL
+Source0:        nerdy-%{version}.tar.gz
+
+BuildArch:      noarch
+BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-buildroot
+
+BuildRequires:  %{python}
+Requires:       %{python}, %{python}-lxml
+
+
+%description
+entity / relation schema
+
+%prep
+%setup -q -n nerdy-%{version}
+
+%build
+%{__python} setup.py build
+%if 0%{?el5}
+# change the python version in shebangs
+find . -name '*.py' -type f -print0 |  xargs -0 sed -i '1,3s;^#!.*python.*$;#! /usr/bin/python2.6;'
+%endif
+
+%install
+rm -rf $RPM_BUILD_ROOT
+NO_SETUPTOOLS=1 %{__python} setup.py install -O1 --skip-build --root $RPM_BUILD_ROOT %{?python_scriptarch: --install-scripts=%{python_scriptarch}}
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files 
+%defattr(-, root, root)
+/*
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/setup.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,27 @@
+# -*- coding:utf-8 -*-
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+from distutils.core import setup
+
+setup(name='nerdy',
+      version='0.1.0',
+      description='Python library for data alignment',
+      author='LOGILAB S.A. (Paris, FRANCE)',
+      author_email=' <contact@logilab.fr>',
+      url='https://www.logilab.org/project/nerdy',
+      package_dir={'nerdy': '.'},
+      packages=['nerdy'],
+     )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/stopwords.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+"""
+Stopwords in different languages.
+"""
+
+FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
+
+
+ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 
'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
+
+
+ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 
'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
+
+
+ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 
'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/test/test_core.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,225 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core
+from nerdy.tokenizer import Token, Sentence
+
+
+class CoreTest(unittest2.TestCase):
+    """ Test of core """
+
+    def test_lexical_source(self):
+        """ Test lexical source """
+        lexicon = {'everyone': 'http://example.com/everyone',
+                   'me': 'http://example.com/me'}
+        source = core.NerdySourceLexical(lexicon)
+        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
+        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
+        self.assertEqual(source.query_word('me everyone'), [])
+        self.assertEqual(source.query_word('toto'), [])
+        # Token
+        token = Token('me', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
+        token = Token('ma', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), [])
+
+    def test_rql_source(self):
+        """ Test rql source """
+        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
+                                       'http://www.cubicweb.org')
+        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
+
+    def test_sparql_source(self):
+        """ Test sparql source """
+        source = core.NerdySourceSparql(u'''SELECT ?uri
+                                            WHERE{
+                                            ?uri rdfs:label "Python"@en .
+                                            ?uri rdf:type ?type}''',
+                                        u'http://dbpedia.org/sparql')
+        self.assertEqual(source.query_word('cubicweb'),
+                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
+
+    def test_nerdy_process(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_multisources(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        # Two sources, not unique
+        nerdy = core.NerdyProcess((source1, source2))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources, unique
+        nerdy = core.NerdyProcess((source1, source2), unique=True)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources inversed, unique
+        nerdy = core.NerdyProcess((source2, source1), unique=True)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_add_sources(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        nerdy = core.NerdyProcess((source1,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),])
+        # Two sources, not unique
+        nerdy.add_ner_source(source2)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_preprocess(self):
+        """ Test nerdy process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
+                                          'me': 'http://example.com/me'})
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        nerdy = core.NerdyProcess((source,),
+                                  preprocessors=(preprocessor,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_nerdy_process_add_preprocess(self):
+        """ Test nerdy process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
+                                          'me': 'http://example.com/me'})
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        nerdy = core.NerdyProcess((source,),)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto', None,
+                           Token(word='Toto', start=6, end=10,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=22, end=24,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=39, end=41,
+                                 sentence=Sentence(indice=1, start=34, end=42)))])
+        nerdy.add_preprocessors(preprocessor)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_nerdy_process_chained_word(self):
+        """ Test nerdy process """
+        text = 'Hello everyone me, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'everyone me': 'http://example.com/everyone_me',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone_me', None,
+                           Token(word='everyone me', start=6, end=17,
+                                 sentence=Sentence(indice=0, start=0, end=41))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=29, end=31,
+                                 sentence=Sentence(indice=0, start=0, end=41))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/test/test_dataio.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,85 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import dataio, core
+
+
+class DataioTest(unittest2.TestCase):
+    """ Test of dataio """
+
+    def test_sparql_query(self):
+        results = dataio.sparql_query(query=u'''SELECT ?uri
+                                                WHERE{
+                                                ?uri rdfs:label "Python"@en .
+                                                ?uri rdf:type ?type}''',
+                                      endpoint=u'http://dbpedia.org/sparql')
+        truth = [{u'uri':
+                  {u'type': u'uri',
+                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
+                 {u'uri':
+                  {u'type': u'uri',
+                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
+        self.assertEqual(results, truth)
+
+    def test_rql_url_query(self):
+        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
+                                       'http://www.cubicweb.org')
+        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
+
+    def test_prettyprint(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
+                                u'this is   <a href="http://example.com/me">me</a> speaking. '
+                                u'And <a href="http://example.com/me">me</a>.'))
+
+    def test_prettyprint_class(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
+                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
+                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
+
+
+class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
+
+    def test_valid(self):
+        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            '<p>coucou</p>'))
+
+    def test_valid_unicode(self):
+        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            u'<p>hé</p>'))
+
+    def test_invalid(self):
+        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            '<p><div>coucou</div></p>'))
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/test/test_filter.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,99 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core
+from nerdy.tokenizer import Token, Sentence
+
+
+class FilterTest(unittest2.TestCase):
+    """ Test of filters """
+
+    def test_occurence_filter_min_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        _filter = core.NerdyOccurenceFilter(min_occ=2)
+        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_occurence_filter_max_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        _filter = core.NerdyOccurenceFilter(max_occ=1)
+        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),])
+
+    def test_disambiguation_word_length(self):
+        """ Test occurence filter """
+        text = 'Hello toto tutu. And toto.'
+        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
+                                          'toto': 'http://example.com/toto'})
+        _filter = core.NerdyDisambiguationWordParts()
+        nerdy = core.NerdyProcess((source,), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/toto_tutu', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
+    def test_rules_filter(self):
+        """ Test rules filter """
+        text = 'Hello toto tutu. And toto.'
+        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
+                                          'toto': 'http://example.com/toto'})
+        rules = {'http://example.com/toto': 'http://example.com/tata'}
+        _filter = core.NerdyReplacementRulesFilter(rules)
+        nerdy = core.NerdyProcess((source,), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/tata', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/test/test_preprocessor.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,97 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core, tokenizer
+
+
+class PreprocessorTest(unittest2.TestCase):
+    """ Test of preprocessors """
+
+    def test_lowercasefilter(self):
+        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('toto Tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('toto tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+
+    def test_wordsizefilter(self):
+        preprocessor = core.NerdyWordSizeFilterPreprocessor()
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_lowerfirstword(self):
+        preprocessor = core.NerdyLowerFirstWordPreprocessor()
+        sentence = tokenizer.Sentence(0, 0, 20)
+        # Start of the sentence
+        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
+        token2 = tokenizer.Token('us tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        # Not start of the sentence
+        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+
+    def test_stopwordsfilter(self):
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('Us', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        # Split words
+        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_hashtag(self):
+        preprocessor = core.NerdyHashTagPreprocessor()
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
+        token2 = tokenizer.Token('BarackObama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
+        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/test/test_tokenizer.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,88 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
+
+
+class TokenizerTest(unittest2.TestCase):
+    """ Test of tokenizer """
+
+    def test_richstringtokenizer(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 18)
+        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
+        self.assertEqual(tokens[0], t1)
+        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
+        self.assertEqual(tokens[16], t2)
+
+    def test_richstringtokenizer_loadtext(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 18)
+        tokenizer.load_text('Hello everyone')
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 3)
+
+    def test_richstringtokenizer_minsize(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=2,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 10)
+        t1 =  Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
+        self.assertEqual(tokens[8], t1)
+
+    def test_richstringtokenizer_maxsize(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=4)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 21)
+        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
+        self.assertEqual(tokens[18], t1)
+
+    def test_richstringtokenizer_sentences(self):
+        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=4)
+        sentences = tokenizer.find_sentences(text)
+        self.assertEqual(len(sentences), 4)
+        self.assertEqual(text[sentences[0].start:sentences[0].end],
+                         'Hello everyone, this is   me speaking.')
+        self.assertEqual(text[sentences[1].start:sentences[1].end],
+                         ' And me !')
+        self.assertEqual(text[sentences[2].start:sentences[2].end],
+                         'Why not me ?')
+        self.assertEqual(text[sentences[3].start:sentences[3].end],
+                         ' Blup')
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/tokenizer.py	Tue Oct 22 15:53:25 2013 +0200
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+""" Tokenizer for sentences/words segmentation.
+"""
+import itertools
+import collections
+import re
+
+
+Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
+Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
+
+
+class RichStringTokenizer(object):
+    """Tokenizer for Yams' RichString content.
+
+    The tokenizer uses a variable-length sliding window: at each word
+    position, it yields tokens of token_max_size down to token_min_size
+    words (see iter_tokens).
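+
+    For instance (mirroring test/test_tokenizer.py), tokenizing the text
+    'Hello everyone, this is   me speaking. And me.' with token_min_size=1
+    and token_max_size=3 yields 18 tokens, the first one being
+    Token(word='Hello everyone this', start=0, end=20,
+          sentence=Sentence(indice=0, start=0, end=38)).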
+    """
+
+    def __init__(self, text, token_min_size=1, token_max_size=3):
+        """
+        :token_min_size: minimum number of words required to be a valid token
+        :token_max_size: maximum number of words allowed in a token
+        """
+        self.text = text
+        self.token_min_size = token_min_size
+        self.token_max_size = token_max_size
+
+    def iter_tokens(self, text):
+        """ Iterate tokens over a text
+        """
+        # Compute sentences
+        sentences = self.find_sentences(text)
+        # Compute words
+        words = list(re.finditer(r'[\w@-]+', text, re.UNICODE))
+        indice = 0
+        while indice < len(words):
+            # Choose the current sentence of the first word
+            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
+            # Sliding windows over the different words for each sentence
+            remaining = len(words) - indice
+            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
+                _words = words[indice:indice+length]
+                if _words[-1].start() > current_sentence.end:
+                    # The last word is not in the same sentence anymore, split
+                    continue
+                normalized_word = ' '.join([w.group() for w in _words]).strip()
+                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
+            indice += 1
+
+    def find_sentences(self, text):
+        """ Find the sentences
+        """
+        return [Sentence(ind, s.start(), s.end()) for ind, s in
+                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
+
+    def load_text(self, text):
+        """ Load the text to be tokenized
+        """
+        self.text = text
+
+    def __iter__(self):
+        """ Iterator over the text given in the object instantiation
+        """
+        for t in self.iter_tokens(self.text):
+            yield t
--- a/python-nerdy.spec	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-%if 0%{?el5}
-%define python python26
-%define __python /usr/bin/python2.6
-%{!?python_scriptarch: %define python_scriptarch %(%{__python} -c "from distutils.sysconfig import get_python_lib; from os.path import join; print join(get_python_lib(1, 1), 'scripts')")}
-%else
-%define python python
-%define __python /usr/bin/python
-%endif
-
-Name:           %{python}-nerdy
-Version:        0.1.0
-Release:        logilab.1%{?dist}
-Summary:        Python library for data alignment
-Group:          Development/Languages/Python
-License:        LGPL
-Source0:        nerdy-%{version}.tar.gz
-
-BuildArch:      noarch
-BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-buildroot
-
-BuildRequires:  %{python}
-Requires:       %{python}, %{python}-lxml
-
-
-%description
-entity / relation schema
-
-%prep
-%setup -q -n nerdy-%{version}
-
-%build
-%{__python} setup.py build
-%if 0%{?el5}
-# change the python version in shebangs
-find . -name '*.py' -type f -print0 |  xargs -0 sed -i '1,3s;^#!.*python.*$;#! /usr/bin/python2.6;'
-%endif
-
-%install
-rm -rf $RPM_BUILD_ROOT
-NO_SETUPTOOLS=1 %{__python} setup.py install -O1 --skip-build --root $RPM_BUILD_ROOT %{?python_scriptarch: --install-scripts=%{python_scriptarch}}
-
-%clean
-rm -rf $RPM_BUILD_ROOT
-
-%files 
-%defattr(-, root, root)
-/*
-
--- a/setup.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-from distutils.core import setup
-
-setup(name='nerdy',
-      version='0.1.0',
-      description='Python library for data alignment',
-      author='LOGILAB S.A. (Paris, FRANCE)',
-      author_email=' <contact@logilab.fr>',
-      url='https://www.logilab.org/project/nerdy',
-      package_dir={'nerdy': '.'},
-      packages=['nerdy'],
-     )
--- a/stopwords.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Stopwords in different languages.
-"""
-
-FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
-
-
-ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 
'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
-
-
-ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 
'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
-
-
-ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 
'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
--- a/test/test_core.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core
-from nerdy.tokenizer import Token, Sentence
-
-
-class CoreTest(unittest2.TestCase):
-    """ Test of core """
-
-    def test_lexical_source(self):
-        """ Test lexical source """
-        lexicon = {'everyone': 'http://example.com/everyone',
-                   'me': 'http://example.com/me'}
-        source = core.NerdySourceLexical(lexicon)
-        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
-        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
-        self.assertEqual(source.query_word('me everyone'), [])
-        self.assertEqual(source.query_word('toto'), [])
-        # Token
-        token = Token('me', 0, 2, None)
-        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
-        token = Token('ma', 0, 2, None)
-        self.assertEqual(source.recognize_token(token), [])
-
-    def test_rql_source(self):
-        """ Test rql source """
-        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
-                                       'http://www.cubicweb.org')
-        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
-
-    def test_sparql_source(self):
-        """ Test sparql source """
-        source = core.NerdySourceSparql(u'''SELECT ?uri
-                                            WHERE{
-                                            ?uri rdfs:label "Python"@en .
-                                            ?uri rdf:type ?type}''',
-                                        u'http://dbpedia.org/sparql')
-        self.assertEqual(source.query_word('cubicweb'),
-                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
-                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
-
-    def test_nerdy_process(self):
-        """ Test nerdy process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_nerdy_process_multisources(self):
-        """ Test nerdy process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        # Two sources, not unique
-        nerdy = core.NerdyProcess((source1, source2))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-        # Two sources, unique
-        nerdy = core.NerdyProcess((source1, source2), unique=True)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-        # Two sources inversed, unique
-        nerdy = core.NerdyProcess((source2, source1), unique=True)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_nerdy_process_add_sources(self):
-        """ Test nerdy process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        nerdy = core.NerdyProcess((source1,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),])
-        # Two sources, not unique
-        nerdy.add_ner_source(source2)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_nerdy_process_preprocess(self):
-        """ Test nerdy process """
-        text = 'Hello Toto, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
-                                          'me': 'http://example.com/me'})
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        nerdy = core.NerdyProcess((source,),
-                                  preprocessors=(preprocessor,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities, [('http://example.com/toto', None,
-                                           Token(word='Toto', start=6, end=10,
-                                                 sentence=Sentence(indice=0, start=0, end=34)))])
-
-    def test_nerdy_process_add_preprocess(self):
-        """ Test nerdy process """
-        text = 'Hello Toto, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
-                                          'me': 'http://example.com/me'})
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        nerdy = core.NerdyProcess((source,),)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto', None,
-                           Token(word='Toto', start=6, end=10,
-                                 sentence=Sentence(indice=0, start=0, end=34))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=22, end=24,
-                                 sentence=Sentence(indice=0, start=0, end=34))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=39, end=41,
-                                 sentence=Sentence(indice=1, start=34, end=42)))])
-        nerdy.add_preprocessors(preprocessor)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities, [('http://example.com/toto', None,
-                                           Token(word='Toto', start=6, end=10,
-                                                 sentence=Sentence(indice=0, start=0, end=34)))])
-
-    def test_nerdy_process_chained_word(self):
-        """ Test nerdy process """
-        text = 'Hello everyone me, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'everyone me': 'http://example.com/everyone_me',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone_me', None,
-                           Token(word='everyone me', start=6, end=17,
-                                 sentence=Sentence(indice=0, start=0, end=41))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=29, end=31,
-                                 sentence=Sentence(indice=0, start=0, end=41))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/test/test_dataio.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import dataio, core
-
-
-class DataioTest(unittest2.TestCase):
-    """ Test of dataio """
-
-    def test_sparql_query(self):
-        results = dataio.sparql_query(query=u'''SELECT ?uri
-                                                WHERE{
-                                                ?uri rdfs:label "Python"@en .
-                                                ?uri rdf:type ?type}''',
-                                      endpoint=u'http://dbpedia.org/sparql')
-        truth = [{u'uri':
-                  {u'type': u'uri',
-                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
-                 {u'uri':
-                  {u'type': u'uri',
-                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
-        self.assertEqual(results, truth)
-
-    def test_rql_url_query(self):
-        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
-                                       'http://www.cubicweb.org')
-        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
-
-    def test_prettyprint(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
-        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
-                                u'this is   <a href="http://example.com/me">me</a> speaking. '
-                                u'And <a href="http://example.com/me">me</a>.'))
-
-    def test_prettyprint_class(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
-        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
-                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
-                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
-
-
-class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
-
-    def test_valid(self):
-        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            '<p>coucou</p>'))
-
-    def test_valid_unicode(self):
-        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            u'<p>hé</p>'))
-
-    def test_invalid(self):
-        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            '<p><div>coucou</div></p>'))
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/test/test_filter.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core
-from nerdy.tokenizer import Token, Sentence
-
-
-class FilterTest(unittest2.TestCase):
-    """ Test of filters """
-
-    def test_occurence_filter_min_occ(self):
-        """ Test occurence filter """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        _filter = core.NerdyOccurenceFilter(min_occ=2)
-        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_occurence_filter_max_occ(self):
-        """ Test occurence filter """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        _filter = core.NerdyOccurenceFilter(max_occ=1)
-        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),])
-
-    def test_disambiguation_word_length(self):
-        """ Test occurence filter """
-        text = 'Hello toto tutu. And toto.'
-        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
-                                          'toto': 'http://example.com/toto'})
-        _filter = core.NerdyDisambiguationWordParts()
-        nerdy = core.NerdyProcess((source,), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto_tutu', None,
-                           Token(word='toto tutu', start=6, end=15,
-                                 sentence=Sentence(indice=0, start=0, end=16))),
-                          ('http://example.com/toto_tutu', None,
-                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
-
-    def test_rules_filter(self):
-        """ Test rules filter """
-        text = 'Hello toto tutu. And toto.'
-        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
-                                          'toto': 'http://example.com/toto'})
-        rules = {'http://example.com/toto': 'http://example.com/tata'}
-        _filter = core.NerdyReplacementRulesFilter(rules)
-        nerdy = core.NerdyProcess((source,), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto_tutu', None,
-                           Token(word='toto tutu', start=6, end=15,
-                                 sentence=Sentence(indice=0, start=0, end=16))),
-                          ('http://example.com/tata', None,
-                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/test/test_preprocessor.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core, tokenizer
-
-
-class PreprocessorTest(unittest2.TestCase):
-    """ Test of preprocessors """
-
-    def test_lowercasefilter(self):
-        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('toto Tata', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('toto tata', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-
-    def test_wordsizefilter(self):
-        preprocessor = core.NerdyWordSizeFilterPreprocessor()
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('to', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('to', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-
-    def test_lowerfirstword(self):
-        preprocessor = core.NerdyLowerFirstWordPreprocessor()
-        sentence = tokenizer.Sentence(0, 0, 20)
-        # Start of the sentence
-        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
-        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
-        token2 = tokenizer.Token('us tata', 0, 4, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        # Not start of the sentence
-        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
-        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
-        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-
-    def test_stopwordsfilter(self):
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        token = tokenizer.Token('Toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('Us', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('Us there', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        # Split words
-        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
-        token = tokenizer.Token('Us there', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('Us there toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-
-    def test_hashtag(self):
-        preprocessor = core.NerdyHashTagPreprocessor()
-        token = tokenizer.Token('Toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
-        token2 = tokenizer.Token('BarackObama', 0, 4, None)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
-        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
-        self.assertEqual(preprocessor(token1), token2)
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/test/test_tokenizer.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,88 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
-
-
-class TokenizerTest(unittest2.TestCase):
-    """ Test of tokenizer """
-
-    def test_richstringtokenizer(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=3)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 18)
-        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
-        self.assertEqual(tokens[0], t1)
-        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
-        self.assertEqual(tokens[16], t2)
-
-    def test_richstringtokenizer_loadtext(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=3)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 18)
-        tokenizer.load_text('Hello everyone')
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 3)
-
-    def test_richstringtokenizer_minsize(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=2,
-                                        token_max_size=3)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 10)
-        t1 =  Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
-        self.assertEqual(tokens[8], t1)
-
-    def test_richstringtokenizer_maxsize(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=4)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 21)
-        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
-        self.assertEqual(tokens[18], t1)
-
-    def test_richstringtokenizer_sentences(self):
-        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=4)
-        sentences = tokenizer.find_sentences(text)
-        self.assertEqual(len(sentences), 4)
-        self.assertEqual(text[sentences[0].start:sentences[0].end],
-                         'Hello everyone, this is   me speaking.')
-        self.assertEqual(text[sentences[1].start:sentences[1].end],
-                         ' And me !')
-        self.assertEqual(text[sentences[2].start:sentences[2].end],
-                         'Why not me ?')
-        self.assertEqual(text[sentences[3].start:sentences[3].end],
-                         ' Blup')
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/tokenizer.py	Sun Jul 14 23:18:38 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Tokenizer for sentences/words segmentation.
-"""
-import itertools
-import collections
-import re
-
-
-Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
-Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
-
-
-class RichStringTokenizer(object):
-    """Tokenizer for Yams' RichString content.
-
-    The tokenizer uses a variable-length sliding window, i.e. a sliding
-    window yielding tokens of N words.
-    """
-
-    def __init__(self, text, token_min_size=1, token_max_size=3):
-        """
-        :token_min_size: minimum number of words required to be a valid token
-        :token_max_size: minimum number of words required to be a valid token
-        """
-        self.text = text
-        self.token_min_size = token_min_size
-        self.token_max_size = token_max_size
-
-    def iter_tokens(self, text):
-        """ Iterate tokens over a text
-        """
-        # Compute sentences
-        sentences = self.find_sentences(text)
-        # Compute words
-        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
-        indice = 0
-        while indice < len(words):
-            # Choose the current sentence of the first word
-            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
-            # Sliding windows over the different words for each sentence
-            remaining = len(words) - indice
-            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
-                _words = words[indice:indice+length]
-                if _words[-1].start() > current_sentence.end:
-                    # The last word in not in the same sentence anymore, split
-                    continue
-                normalized_word = ' '.join([w.group() for w in _words]).strip()
-                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
-            indice += 1
-
-    def find_sentences(self, text):
-        """ Find the sentences
-        """
-        return [Sentence(ind, s.start(), s.end()) for ind, s in
-                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
-
-    def load_text(self, text):
-        """ Load the text to be tokenized
-        """
-        self.text = text
-
-    def __iter__(self):
-        """ Iterator over the text given in the object instantiation
-        """
-        for t in self.iter_tokens(self.text):
-            yield t