[ner] Remove unused files and move tests, related to #187461
author Vincent Michel <vincent.michel@logilab.fr>
Thu, 19 Dec 2013 14:44:44 +0000
changeset 368 61a56bf04d36
parent 367 e9b7a47e8d3e
child 369 7019bc0cab44
[ner] Remove unused files and move tests, related to #187461
ner/__pkginfo__.py
ner/debian/changelog
ner/debian/compat
ner/debian/control
ner/debian/copyright
ner/debian/rules
ner/python-nerdy.spec
ner/setup.py
ner/stopwords.py
ner/test/test_core.py
ner/test/test_dataio.py
ner/test/test_filter.py
ner/test/test_preprocessor.py
ner/test/test_tokenizer.py
reference_data/stopwords.py
reference_data/us_states.txt
test/test_core.py
test/test_filter.py
test/test_ner_dataio.py
test/test_preprocessor.py
test/test_tokenizer.py
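The relocated test modules under test/ keep their unittest2-based structure, so they can still be discovered and run from the repository root after this move. A minimal sketch of such a runner, assuming unittest2 is installed and that the modules follow the test_*.py naming shown above (this script is illustrative only, not part of the changeset):

import unittest2

# Discover the relocated test modules in the top-level test/ directory.
# Directory name and pattern are assumptions based on the file list above.
suite = unittest2.TestLoader().discover('test', pattern='test_*.py')
unittest2.TextTestRunner(verbosity=2).run(suite)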
--- a/ner/__pkginfo__.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-"""Nerdy packaging information."""
-__docformat__ = "restructuredtext en"
-import sys
-
-distname = 'nerdy'
-modname = 'nerdy'
-
-numversion = (0, 1, 0)
-version = '.'.join([str(num) for num in numversion])
-
-license = 'LGPL' # 2.1 or later
-description = "Python library for data alignment"
-web = "https://www.logilab.org/project/nerdy"
-author = "Logilab"
-author_email = "contact@logilab.fr"
-
-
-from os.path import join
-scripts = []
-include_dirs = []
-
-if sys.version_info < (2, 7):
-    install_requires = ['unittest2 >= 0.5.1']
--- a/ner/debian/changelog	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-nerdy (0.1.0-1) unstable; urgency=low
-
-  * Initial release of the Nerdy package for Named Entities Recognition in Python.
-
- -- Vincent michel <Vincent.Michel@logilab.fr>  Tue, 11 Jun 2013 13:59:22 +0200
-
--- a/ner/debian/compat	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-7
--- a/ner/debian/control	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-Source: nerdy
-Section: python
-Priority: optional
-Maintainer: LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
-Build-Depends: debhelper (>= 7), python (>=2.5), python-support
-Standards-Version: 3.9.3
-XS-Python-Version: >= 2.5
-
-Package: python-nerdy
-Architecture: all
-Depends: ${python:Depends}
-Description: Python library for Named Entities Recognition.
--- a/ner/debian/copyright	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-Upstream Author:
-
-  LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
-
-Copyright:
-
-Copyright (c) 2013 LOGILAB S.A. (Paris, FRANCE).
-http://www.logilab.fr -- mailto:contact@logilab.fr
--- a/ner/debian/rules	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-#!/usr/bin/make -f
-# Sample debian/rules that uses debhelper.
-# GNU copyright 1997 to 1999 by Joey Hess.
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-build: build-arch build-indep
-build-arch:
-	# Nothing to do
-build-indep: build-stamp
-build-stamp:
-	dh_testdir
-	NO_SETUPTOOLS=1 python setup.py -q build
-	touch build-stamp
-
-clean:
-	dh_testdir
-	dh_testroot
-	rm -f build-stamp configure-stamp
-	rm -rf build
-	find . -name "*.pyc" | xargs rm -f
-	dh_clean
-
-install: build
-	dh_testdir
-	dh_testroot
-	dh_clean -k
-	dh_installdirs -i
-	NO_SETUPTOOLS=1 python setup.py -q install --no-compile --prefix=debian/python-nerdy/usr/
-
-
-# Build architecture-independent files here.
-binary-indep: build install
-	dh_testdir
-	dh_testroot
-	dh_install -i
-	dh_installchangelogs -i
-	dh_installexamples -i
-	dh_installdocs -i
-	dh_installman -i
-	dh_pysupport -i
-	dh_link -i
-	dh_compress -i -X.py -X.ini -X.xml -Xtest
-	dh_fixperms -i
-	dh_installdeb -i
-	dh_gencontrol -i
-	dh_md5sums -i
-	dh_builddeb -i
-
-
-# Build architecture-dependent files here.
-binary-arch:
-
-binary: binary-indep
-.PHONY: build clean binary-arch binary-indep binary
--- a/ner/python-nerdy.spec	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-%if 0%{?el5}
-%define python python26
-%define __python /usr/bin/python2.6
-%{!?python_scriptarch: %define python_scriptarch %(%{__python} -c "from distutils.sysconfig import get_python_lib; from os.path import join; print join(get_python_lib(1, 1), 'scripts')")}
-%else
-%define python python
-%define __python /usr/bin/python
-%endif
-
-Name:           %{python}-nerdy
-Version:        0.1.0
-Release:        logilab.1%{?dist}
-Summary:        Python library for data alignment
-Group:          Development/Languages/Python
-License:        LGPL
-Source0:        nerdy-%{version}.tar.gz
-
-BuildArch:      noarch
-BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-buildroot
-
-BuildRequires:  %{python}
-Requires:       %{python}, %{python}-lxml
-
-
-%description
-entity / relation schema
-
-%prep
-%setup -q -n nerdy-%{version}
-
-%build
-%{__python} setup.py build
-%if 0%{?el5}
-# change the python version in shebangs
-find . -name '*.py' -type f -print0 |  xargs -0 sed -i '1,3s;^#!.*python.*$;#! /usr/bin/python2.6;'
-%endif
-
-%install
-rm -rf $RPM_BUILD_ROOT
-NO_SETUPTOOLS=1 %{__python} setup.py install -O1 --skip-build --root $RPM_BUILD_ROOT %{?python_scriptarch: --install-scripts=%{python_scriptarch}}
-
-%clean
-rm -rf $RPM_BUILD_ROOT
-
-%files 
-%defattr(-, root, root)
-/*
-
--- a/ner/setup.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-from distutils.core import setup
-
-setup(name='nerdy',
-      version='0.1.0',
-      description='Python library for data alignment',
-      author='LOGILAB S.A. (Paris, FRANCE)',
-      author_email=' <contact@logilab.fr>',
-      url='https://www.logilab.org/project/nerdy',
-      package_dir={'nerdy': '.'},
-      packages=['nerdy'],
-     )
--- a/ner/stopwords.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Stopwords in different languages.
-"""
-
-FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
-
-
-ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
-
-
-ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
-
-
-ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
--- a/ner/test/test_core.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core
-from nerdy.tokenizer import Token, Sentence
-
-
-class CoreTest(unittest2.TestCase):
-    """ Test of core """
-
-    def test_lexical_source(self):
-        """ Test lexical source """
-        lexicon = {'everyone': 'http://example.com/everyone',
-                   'me': 'http://example.com/me'}
-        source = core.NerdySourceLexical(lexicon)
-        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
-        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
-        self.assertEqual(source.query_word('me everyone'), [])
-        self.assertEqual(source.query_word('toto'), [])
-        # Token
-        token = Token('me', 0, 2, None)
-        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
-        token = Token('ma', 0, 2, None)
-        self.assertEqual(source.recognize_token(token), [])
-
-    def test_rql_source(self):
-        """ Test rql source """
-        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
-                                       'http://www.cubicweb.org')
-        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
-
-    def test_sparql_source(self):
-        """ Test sparql source """
-        source = core.NerdySourceSparql(u'''SELECT ?uri
-                                            WHERE{
-                                            ?uri rdfs:label "Python"@en .
-                                            ?uri rdf:type ?type}''',
-                                        u'http://dbpedia.org/sparql')
-        self.assertEqual(source.query_word('cubicweb'),
-                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
-                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
-
-    def test_nerdy_process(self):
-        """ Test nerdy process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_nerdy_process_multisources(self):
-        """ Test nerdy process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        # Two sources, not unique
-        nerdy = core.NerdyProcess((source1, source2))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-        # Two sources, unique
-        nerdy = core.NerdyProcess((source1, source2), unique=True)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-        # Two sources inversed, unique
-        nerdy = core.NerdyProcess((source2, source1), unique=True)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_nerdy_process_add_sources(self):
-        """ Test nerdy process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        nerdy = core.NerdyProcess((source1,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),])
-        # Two sources, not unique
-        nerdy.add_ner_source(source2)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_nerdy_process_preprocess(self):
-        """ Test nerdy process """
-        text = 'Hello Toto, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
-                                          'me': 'http://example.com/me'})
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        nerdy = core.NerdyProcess((source,),
-                                  preprocessors=(preprocessor,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities, [('http://example.com/toto', None,
-                                           Token(word='Toto', start=6, end=10,
-                                                 sentence=Sentence(indice=0, start=0, end=34)))])
-
-    def test_nerdy_process_add_preprocess(self):
-        """ Test nerdy process """
-        text = 'Hello Toto, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
-                                          'me': 'http://example.com/me'})
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        nerdy = core.NerdyProcess((source,),)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto', None,
-                           Token(word='Toto', start=6, end=10,
-                                 sentence=Sentence(indice=0, start=0, end=34))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=22, end=24,
-                                 sentence=Sentence(indice=0, start=0, end=34))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=39, end=41,
-                                 sentence=Sentence(indice=1, start=34, end=42)))])
-        nerdy.add_preprocessors(preprocessor)
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities, [('http://example.com/toto', None,
-                                           Token(word='Toto', start=6, end=10,
-                                                 sentence=Sentence(indice=0, start=0, end=34)))])
-
-    def test_nerdy_process_chained_word(self):
-        """ Test nerdy process """
-        text = 'Hello everyone me, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'everyone me': 'http://example.com/everyone_me',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone_me', None,
-                           Token(word='everyone me', start=6, end=17,
-                                 sentence=Sentence(indice=0, start=0, end=41))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=29, end=31,
-                                 sentence=Sentence(indice=0, start=0, end=41))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/ner/test/test_dataio.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import dataio, core
-
-
-class DataioTest(unittest2.TestCase):
-    """ Test of dataio """
-
-    def test_sparql_query(self):
-        results = dataio.sparql_query(query=u'''SELECT ?uri
-                                                WHERE{
-                                                ?uri rdfs:label "Python"@en .
-                                                ?uri rdf:type ?type}''',
-                                      endpoint=u'http://dbpedia.org/sparql')
-        truth = [{u'uri':
-                  {u'type': u'uri',
-                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
-                 {u'uri':
-                  {u'type': u'uri',
-                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
-        self.assertEqual(results, truth)
-
-    def test_rql_url_query(self):
-        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
-                                       'http://www.cubicweb.org')
-        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
-
-    def test_prettyprint(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
-        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
-                                u'this is   <a href="http://example.com/me">me</a> speaking. '
-                                u'And <a href="http://example.com/me">me</a>.'))
-
-    def test_prettyprint_class(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
-        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
-                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
-                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
-
-
-class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
-
-    def test_valid(self):
-        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            '<p>coucou</p>'))
-
-    def test_valid_unicode(self):
-        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            u'<p>hé</p>'))
-
-    def test_invalid(self):
-        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            '<p><div>coucou</div></p>'))
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/ner/test/test_filter.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core
-from nerdy.tokenizer import Token, Sentence
-
-
-class FilterTest(unittest2.TestCase):
-    """ Test of filters """
-
-    def test_occurence_filter_min_occ(self):
-        """ Test occurence filter """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        _filter = core.NerdyOccurenceFilter(min_occ=2)
-        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_occurence_filter_max_occ(self):
-        """ Test occurence filter """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
-        _filter = core.NerdyOccurenceFilter(max_occ=1)
-        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),])
-
-    def test_disambiguation_word_length(self):
-        """ Test occurence filter """
-        text = 'Hello toto tutu. And toto.'
-        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
-                                          'toto': 'http://example.com/toto'})
-        _filter = core.NerdyDisambiguationWordParts()
-        nerdy = core.NerdyProcess((source,), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto_tutu', None,
-                           Token(word='toto tutu', start=6, end=15,
-                                 sentence=Sentence(indice=0, start=0, end=16))),
-                          ('http://example.com/toto_tutu', None,
-                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
-
-    def test_rules_filter(self):
-        """ Test rules filter """
-        text = 'Hello toto tutu. And toto.'
-        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
-                                          'toto': 'http://example.com/toto'})
-        rules = {'http://example.com/toto': 'http://example.com/tata'}
-        _filter = core.NerdyReplacementRulesFilter(rules)
-        nerdy = core.NerdyProcess((source,), filters=(_filter,))
-        named_entities = nerdy.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto_tutu', None,
-                           Token(word='toto tutu', start=6, end=15,
-                                 sentence=Sentence(indice=0, start=0, end=16))),
-                          ('http://example.com/tata', None,
-                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/ner/test/test_preprocessor.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import core, tokenizer
-
-
-class PreprocessorTest(unittest2.TestCase):
-    """ Test of preprocessors """
-
-    def test_lowercasefilter(self):
-        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('toto Tata', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('toto tata', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-
-    def test_wordsizefilter(self):
-        preprocessor = core.NerdyWordSizeFilterPreprocessor()
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('to', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
-        token = tokenizer.Token('toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('to', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-
-    def test_lowerfirstword(self):
-        preprocessor = core.NerdyLowerFirstWordPreprocessor()
-        sentence = tokenizer.Sentence(0, 0, 20)
-        # Start of the sentence
-        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
-        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
-        token2 = tokenizer.Token('us tata', 0, 4, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        # Not start of the sentence
-        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
-        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
-        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
-        self.assertEqual(preprocessor(token1), token2)
-
-    def test_stopwordsfilter(self):
-        preprocessor = core.NerdyStopwordsFilterPreprocessor()
-        token = tokenizer.Token('Toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token = tokenizer.Token('Us', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('Us there', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        # Split words
-        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
-        token = tokenizer.Token('Us there', 0, 4, None)
-        self.assertEqual(preprocessor(token), None)
-        token = tokenizer.Token('Us there toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-
-    def test_hashtag(self):
-        preprocessor = core.NerdyHashTagPreprocessor()
-        token = tokenizer.Token('Toto', 0, 4, None)
-        self.assertEqual(preprocessor(token), token)
-        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
-        token2 = tokenizer.Token('BarackObama', 0, 4, None)
-        self.assertEqual(preprocessor(token1), token2)
-        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
-        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
-        self.assertEqual(preprocessor(token1), token2)
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/ner/test/test_tokenizer.py	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,88 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
-
-
-class TokenizerTest(unittest2.TestCase):
-    """ Test of tokenizer """
-
-    def test_richstringtokenizer(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=3)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 18)
-        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
-        self.assertEqual(tokens[0], t1)
-        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
-        self.assertEqual(tokens[16], t2)
-
-    def test_richstringtokenizer_loadtext(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=3)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 18)
-        tokenizer.load_text('Hello everyone')
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 3)
-
-    def test_richstringtokenizer_minsize(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=2,
-                                        token_max_size=3)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 10)
-        t1 =  Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
-        self.assertEqual(tokens[8], t1)
-
-    def test_richstringtokenizer_maxsize(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=4)
-        tokens = list(tokenizer)
-        self.assertEqual(len(tokens), 21)
-        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
-        self.assertEqual(tokens[18], t1)
-
-    def test_richstringtokenizer_sentences(self):
-        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
-        tokenizer = RichStringTokenizer(text,
-                                        token_min_size=1,
-                                        token_max_size=4)
-        sentences = tokenizer.find_sentences(text)
-        self.assertEqual(len(sentences), 4)
-        self.assertEqual(text[sentences[0].start:sentences[0].end],
-                         'Hello everyone, this is   me speaking.')
-        self.assertEqual(text[sentences[1].start:sentences[1].end],
-                         ' And me !')
-        self.assertEqual(text[sentences[2].start:sentences[2].end],
-                         'Why not me ?')
-        self.assertEqual(text[sentences[3].start:sentences[3].end],
-                         ' Blup')
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reference_data/stopwords.py	Thu Dec 19 14:44:44 2013 +0000
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+"""
+Stopwords in different languages.
+"""
+
+FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
+
+
+ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 
'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
+
+
+ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 
'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
+
+
+ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 
'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
--- a/reference_data/us_states.txt	Thu Dec 19 14:41:47 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,210 +0,0 @@
-
-# See http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
-# WARNING: The name of each state should be in French
-# (e.g. "Floride", not "Florida")
-US_STATES = {'AK': 'Alaska',
-             'AL': 'Alabama',
-             'AR': 'Arkansas',
-             'AZ': 'Arizona',
-             'Ala.': 'Alabama',
-             'Alas.': 'Alaska',
-             'Alaska': 'Alaska',
-             'Ariz.': 'Arizona',
-             'Ark.': 'Arkansas',
-             'Az.': 'Arizona',
-             'CA': 'Californie',
-             'CF': 'Californie',
-             'CL': 'Colorado',
-             'CO': 'Colorado',
-             'CT': 'Connecticut',
-             'Ca.': 'Californie',
-             'Cal.': 'Californie',
-             'Cali.': 'Californie',
-             'Calif.': 'Californie',
-             'Col.': 'Colorado',
-             'Colo.': 'Colorado',
-             'Conn.': 'Connecticut',
-             'Ct.': 'Connecticut',
-             'D.C.': 'District of ColuFederal district',
-             'DC': 'District of ColuFederal district',
-             'DE': 'Delaware',
-             'DL': 'Delaware',
-             'De.': 'Delaware',
-             'Del.': 'Delaware',
-             'FL': 'Floride',
-             'Fl.': 'Floride',
-             'Fla.': 'Floride',
-             'Flor.': 'Floride',
-             'GA': u'Géorgie',
-             'Ga.': u'Géorgie',
-             'H.I.': 'Hawaii',
-             'HA': 'Hawaii',
-             'HI': 'Hawaii',
-             'Hawaii': 'Hawaii',
-             'IA': 'Iowa',
-             'ID': 'Idaho',
-             'IL': 'Illinois',
-             'IN': 'Indiana',
-             'Ia.': 'Iowa',
-             'Id.': 'Idaho',
-             'Ida.': 'Idaho',
-             'Idaho': 'Idaho',
-             'Il.': 'Illinois',
-             "Ill's": 'Illinois',
-             'Ill.': 'Illinois',
-             'Ills.': 'Illinois',
-             'In.': 'Indiana',
-             'Ind.': 'Indiana',
-             'Ioa.': 'Iowa',
-             'Iowa': 'Iowa',
-             'KA': 'Kansas',
-             'KS': 'Kansas',
-             'KY': 'Kentucky',
-             'Ka.': 'Kansas',
-             'Kan.': 'Kansas',
-             'Kans.': 'Kansas',
-             'Ks.': 'Kansas',
-             'Ky.': 'Kentucky',
-             'LA': 'Louisiane',
-             'La.': 'Louisiane',
-             'MA': 'Massachusetts',
-             'MC': 'Michigan',
-             'MD': 'Maryland',
-             'ME': 'Maine',
-             'MI': 'Mississippi',
-             'MN': 'Minnesota',
-             'MO': 'Missouri',
-             'MS': 'Mississippi',
-             'MT': 'Montana',
-             'Maine': 'Maine',
-             'Mass.': 'Massachusetts',
-             'Md.': 'Maryland',
-             'Me.': 'Maine',
-             'Mich.': 'Michigan',
-             'Minn.': 'Minnesota',
-             'Miss.': 'Mississippi',
-             'Mn.': 'Minnesota',
-             'Mo.': 'Missouri',
-             'Mont.': 'Montana',
-             'N. Car.': 'Caroline du Nord',
-             'N. Dak.': 'Dakota du Nord',
-             'N. Mex.': 'Nouveau-Mexique',
-             'N. York': 'New York',
-             'N.C.': 'Caroline du Nord',
-             'N.D.': 'Dakota du Nord',
-             'N.H.': 'New Hampshire',
-             'N.J.': 'New Jersey',
-             'N.M.': 'Nouveau-Mexique',
-             'N.Y.': 'New York',
-             'NB': 'Nebraska',
-             'NC': 'Caroline du Nord',
-             'ND': 'Dakota du Nord',
-             'NE': 'Nebraska',
-             'NH': 'New Hampshire',
-             'NJ': 'New Jersey',
-             'NM': 'Nouveau-Mexique',
-             'NV': 'Nevada',
-             'NY': 'New York',
-             'Neb.': 'Nebraska',
-             'Nebr.': 'Nebraska',
-             'Nev.': 'Nevada',
-             'New M.': 'Nouveau-Mexique',
-             'NoDak': 'Dakota du Nord',
-             'Nv.': 'Nevada',
-             'O.': 'Ohio',
-             'OH': 'Ohio',
-             'OK': 'Oklahoma',
-             'OR': 'Oregon',
-             'Oh.': 'Ohio',
-             'Ohio': 'Ohio',
-             'Ok.': 'Oklahoma',
-             'Okla.': 'Oklahoma',
-             'Or.': 'Oregon',
-             'Ore.': 'Oregon',
-             'Oreg.': 'Oregon',
-             'PA': 'Pennsylvanie',
-             'Pa.': 'Pennsylvanie',
-             'R.I.': 'Rhode Island',
-             'R.I. & P.P.': 'Rhode Island',
-             'RI': 'Rhode Island',
-             'S. Car.': 'Caroline du Sud',
-             'S. Dak.': 'Dakota du Sud',
-             'S.C.': 'Caroline du Sud',
-             'S.D.': 'Dakota du Sud',
-             'SC': 'Caroline du Sud',
-             'SD': 'Dakota du Sud',
-             'SoDak': 'Dakota du Sud',
-             'State': 'Utah',
-             'TN': 'Tennessee',
-             'TX': 'Texas',
-             'Tenn.': 'Tennessee',
-             'Tex.': 'Texas',
-             'Texas': 'Texas',
-             'Tn.': 'Tennessee',
-             'Tx.': 'Texas',
-             'US-AL': 'Alabama',
-             'US-AR': 'Arkansas',
-             'US-AZ': 'Arizona',
-             'US-CA': 'Californie',
-             'US-CO': 'Colorado',
-             'US-CT': 'Connecticut',
-             'US-DC': 'District of ColuFederal district',
-             'US-DE': 'Delaware',
-             'US-FL': 'Floride',
-             'US-GA': u'Géorgie',
-             'US-IL': 'Illinois',
-             'US-IN': 'Indiana',
-             'US-KY': 'Kentucky',
-             'US-LA': 'Louisiane',
-             'US-MA': 'Massachusetts',
-             'US-MD': 'Maryland',
-             'US-MI': 'Michigan',
-             'US-MN': 'Minnesota',
-             'US-MO': 'Missouri',
-             'US-MS': 'Mississippi',
-             'US-MT': 'Montana',
-             'US-NC': 'Caroline du Nord',
-             'US-ND': 'Dakota du Nord',
-             'US-NE': 'Nebraska',
-             'US-NH': 'New Hampshire',
-             'US-NJ': 'New Jersey',
-             'US-NM': 'Nouveau-Mexique',
-             'US-NY': 'New York',
-             'US-OK': 'Oklahoma',
-             'US-PA': 'Pennsylvanie',
-             'US-RI': 'Rhode Island',
-             'US-SC': 'Caroline du Sud',
-             'US-SD': 'Dakota du Sud',
-             'US-TN': 'Tennessee',
-             'US-VA': 'Virginia',
-             'US-VT': 'Vermont',
-             'US-WA': 'Washington',
-             'US-WI': 'Wisconsin',
-             'US-WV': 'Virginie occidentale',
-             'US-WY': 'Wyoming',
-             'UT': 'Utah',
-             'Ut.': 'Utah',
-             'Utah': 'Utah',
-             'VA': 'Virginia',
-             'VT': 'Vermont',
-             'Va.': 'Virginia',
-             'Vt.': 'Vermont',
-             'W. Va.': 'Virginie occidentale',
-             'W. Virg.': 'Virginie occidentale',
-             'W.V.': 'Virginie occidentale',
-             'W.Va.': 'Virginie occidentale',
-             'WA': 'Washington',
-             'WI': 'Wisconsin',
-             'WN': 'Washington',
-             'WS': 'Wisconsin',
-             'WV': 'Virginie occidentale',
-             'WY': 'Wyoming',
-             'Wa.': 'Washington',
-             'Wash.': 'Washington',
-             'Wash. D.C.': 'District of ColuFederal district',
-             'Wi.': 'Wisconsin',
-             'Wis.': 'Wisconsin',
-             'Wisc.': 'Wisconsin',
-             'Wn.': 'Washington',
-             'Wy.': 'Wyoming',
-             'Wyo.': 'Wyoming'}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_core.py	Thu Dec 19 14:44:44 2013 +0000
@@ -0,0 +1,225 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core
+from nerdy.tokenizer import Token, Sentence
+
+
+class CoreTest(unittest2.TestCase):
+    """ Test of core """
+
+    def test_lexical_source(self):
+        """ Test lexical source """
+        lexicon = {'everyone': 'http://example.com/everyone',
+                   'me': 'http://example.com/me'}
+        source = core.NerdySourceLexical(lexicon)
+        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
+        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
+        self.assertEqual(source.query_word('me everyone'), [])
+        self.assertEqual(source.query_word('toto'), [])
+        # Token
+        token = Token('me', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
+        token = Token('ma', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), [])
+
+    def test_rql_source(self):
+        """ Test rql source """
+        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
+                                       'http://www.cubicweb.org')
+        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
+
+    def test_sparql_source(self):
+        """ Test sparql source """
+        source = core.NerdySourceSparql(u'''SELECT ?uri
+                                            WHERE{
+                                            ?uri rdfs:label "Python"@en .
+                                            ?uri rdf:type ?type}''',
+                                        u'http://dbpedia.org/sparql')
+        self.assertEqual(source.query_word('cubicweb'),
+                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
+
+    def test_nerdy_process(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_multisources(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        # Two sources, not unique
+        nerdy = core.NerdyProcess((source1, source2))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources, unique
+        nerdy = core.NerdyProcess((source1, source2), unique=True)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources in reversed order, unique
+        nerdy = core.NerdyProcess((source2, source1), unique=True)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_add_sources(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        nerdy = core.NerdyProcess((source1,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),])
+        # Two sources, not unique
+        nerdy.add_ner_source(source2)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_preprocess(self):
+        """ Test nerdy process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
+                                          'me': 'http://example.com/me'})
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        nerdy = core.NerdyProcess((source,),
+                                  preprocessors=(preprocessor,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_nerdy_process_add_preprocess(self):
+        """ Test nerdy process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
+                                          'me': 'http://example.com/me'})
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        nerdy = core.NerdyProcess((source,),)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto', None,
+                           Token(word='Toto', start=6, end=10,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=22, end=24,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=39, end=41,
+                                 sentence=Sentence(indice=1, start=34, end=42)))])
+        nerdy.add_preprocessors(preprocessor)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_nerdy_process_chained_word(self):
+        """ Test nerdy process """
+        text = 'Hello everyone me, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'everyone me': 'http://example.com/everyone_me',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone_me', None,
+                           Token(word='everyone me', start=6, end=17,
+                                 sentence=Sentence(indice=0, start=0, end=41))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=29, end=31,
+                                 sentence=Sentence(indice=0, start=0, end=41))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_filter.py	Thu Dec 19 14:44:44 2013 +0000
@@ -0,0 +1,99 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core
+from nerdy.tokenizer import Token, Sentence
+
+
+class FilterTest(unittest2.TestCase):
+    """ Test of filters """
+
+    def test_occurence_filter_min_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        _filter = core.NerdyOccurenceFilter(min_occ=2)
+        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_occurence_filter_max_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        _filter = core.NerdyOccurenceFilter(max_occ=1)
+        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),])
+
+    def test_disambiguation_word_length(self):
+        """ Test occurence filter """
+        text = 'Hello toto tutu. And toto.'
+        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
+                                          'toto': 'http://example.com/toto'})
+        _filter = core.NerdyDisambiguationWordParts()
+        nerdy = core.NerdyProcess((source,), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/toto_tutu', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
+    def test_rules_filter(self):
+        """ Test rules filter """
+        text = 'Hello toto tutu. And toto.'
+        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
+                                          'toto': 'http://example.com/toto'})
+        rules = {'http://example.com/toto': 'http://example.com/tata'}
+        _filter = core.NerdyReplacementRulesFilter(rules)
+        nerdy = core.NerdyProcess((source,), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto_tutu', None,
+                           Token(word='toto tutu', start=6, end=15,
+                                 sentence=Sentence(indice=0, start=0, end=16))),
+                          ('http://example.com/tata', None,
+                           Token(word='toto', start=21, end=25,
+                                 sentence=Sentence(indice=1, start=16, end=26)))])
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_ner_dataio.py	Thu Dec 19 14:44:44 2013 +0000
@@ -0,0 +1,85 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import dataio, core
+
+
+class DataioTest(unittest2.TestCase):
+    """ Test of dataio """
+
+    def test_sparql_query(self):
+        results = dataio.sparql_query(query=u'''SELECT ?uri
+                                                WHERE{
+                                                ?uri rdfs:label "Python"@en .
+                                                ?uri rdf:type ?type}''',
+                                      endpoint=u'http://dbpedia.org/sparql')
+        truth = [{u'uri':
+                  {u'type': u'uri',
+                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
+                 {u'uri':
+                  {u'type': u'uri',
+                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
+        self.assertEqual(results, truth)
+
+    def test_rql_url_query(self):
+        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
+                                       'http://www.cubicweb.org')
+        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
+
+    def test_prettyprint(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
+                                u'this is   <a href="http://example.com/me">me</a> speaking. '
+                                u'And <a href="http://example.com/me">me</a>.'))
+
+    def test_prettyprint_class(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
+                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
+                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
+
+
+class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
+
+    def test_valid(self):
+        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            '<p>coucou</p>'))
+
+    def test_valid_unicode(self):
+        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            u'<p>hé</p>'))
+
+    def test_invalid(self):
+        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            '<p><div>coucou</div></p>'))
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_preprocessor.py	Thu Dec 19 14:44:44 2013 +0000
@@ -0,0 +1,97 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core, tokenizer
+
+
+class PreprocessorTest(unittest2.TestCase):
+    """ Test of preprocessors """
+
+    def test_lowercasefilter(self):
+        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('toto Tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('toto tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+
+    def test_wordsizefilter(self):
+        preprocessor = core.NerdyWordSizeFilterPreprocessor()
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_lowerfirstword(self):
+        preprocessor = core.NerdyLowerFirstWordPreprocessor()
+        sentence = tokenizer.Sentence(0, 0, 20)
+        # Start of the sentence
+        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
+        token2 = tokenizer.Token('us tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        # Not start of the sentence
+        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+
+    def test_stopwordsfilter(self):
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('Us', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        # Split words
+        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_hashtag(self):
+        preprocessor = core.NerdyHashTagPreprocessor()
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
+        token2 = tokenizer.Token('BarackObama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
+        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_tokenizer.py	Thu Dec 19 14:44:44 2013 +0000
@@ -0,0 +1,88 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
+
+
+class TokenizerTest(unittest2.TestCase):
+    """ Test of tokenizer """
+
+    def test_richstringtokenizer(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 18)
+        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
+        self.assertEqual(tokens[0], t1)
+        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
+        self.assertEqual(tokens[16], t2)
+
+    def test_richstringtokenizer_loadtext(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 18)
+        tokenizer.load_text('Hello everyone')
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 3)
+
+    def test_richstringtokenizer_minsize(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=2,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 10)
+        t1 =  Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
+        self.assertEqual(tokens[8], t1)
+
+    def test_richstringtokenizer_maxsize(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=4)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 21)
+        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
+        self.assertEqual(tokens[18], t1)
+
+    def test_richstringtokenizer_sentences(self):
+        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=4)
+        sentences = tokenizer.find_sentences(text)
+        self.assertEqual(len(sentences), 4)
+        self.assertEqual(text[sentences[0].start:sentences[0].end],
+                         'Hello everyone, this is   me speaking.')
+        self.assertEqual(text[sentences[1].start:sentences[1].end],
+                         ' And me !')
+        self.assertEqual(text[sentences[2].start:sentences[2].end],
+                         'Why not me ?')
+        self.assertEqual(text[sentences[3].start:sentences[3].end],
+                         ' Blup')
+
+
+if __name__ == '__main__':
+    unittest2.main()
+