[utils] Create a utils folder, related to #187461
author: vincent.michel@logilab.fr
Thu, 19 Dec 2013 14:44:58 +0000
changeset 369 7019bc0cab44
parent 368 61a56bf04d36
child 370 4a79af983c29
[utils] Create a utils folder, related to #187461
dataio.py
distances.py
minhashing.py
ner/dataio.py
normalize.py
record_linkage/aligner.py
record_linkage/blocking.py
test/test_alignment.py
test/test_blocking.py
test/test_dataio.py
test/test_distances.py
test/test_minhashing.py
test/test_normalize.py
utils/__init__.py
utils/dataio.py
utils/distances.py
utils/minhashing.py
utils/ner_dataio.py
utils/normalize.py
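
The change moves the generic helper modules (dataio, distances, minhashing,
normalize, and the NER IO module) into a new nazca.utils package; judging from
the import updates below, this is a plain relocation and only import paths
change for client code. A minimal sketch of the migration:

    # before this changeset
    from nazca.dataio import parsefile
    from nazca.distances import levenshtein

    # after this changeset
    from nazca.utils.dataio import parsefile
    from nazca.utils.distances import levenshtein
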
--- a/dataio.py	Thu Dec 19 14:44:44 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,224 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from os.path import exists as fileexists
-from os import path as osp
-
-import csv
-import urllib
-
-try:
-    from SPARQLWrapper import SPARQLWrapper, JSON
-    SPARQL_ENABLED = True
-except ImportError:
-    SPARQL_ENABLED = False
-
-
-###############################################################################
-### UTILITY FUNCTIONS #########################################################
-###############################################################################
-def autocast(data, encoding=None):
-    """ Try to convert data to an int, then to a float;
-    otherwise return it as a (possibly decoded) string
-    """
-    try:
-        return int(data)
-    except ValueError:
-        try:
-            return float(data.replace(',', '.'))
-        except ValueError:
-            data = data.strip()
-            if encoding:
-                return data.decode(encoding)
-            return data
-
-
-###############################################################################
-### RQL FUNCTIONS #############################################################
-###############################################################################
-def rqlquery(host, rql, indexes=None, formatopt=None):
-    """ Run the rql query on the given cubicweb host
-    """
-
-    if host.endswith('/'):
-        host = host[:-1]
-
-    indexes = indexes or []
-    filehandle = urllib.urlopen('%(host)s/view?'
-                                'rql=%(rql)s&vid=csvexport'
-                                % {'rql': rql, 'host': host})
-    filehandle.readline()  # Skip the header line
-    return parsefile(filehandle, delimiter=';', indexes=indexes,
-                     formatopt=formatopt)
-
-
-###############################################################################
-### SPARQL FUNCTIONS ##########################################################
-###############################################################################
-def sparqlquery(endpoint, query, indexes=None, autocaste_data=True):
-    """ Run the sparql query on the given endpoint, and wrap the items in the
-    indexes form. If indexes is empty, keep raw output"""
-
-    if not SPARQL_ENABLED:
-        raise ImportError("You have to install the SPARQLWrapper and JSON modules "
-                          "to use this function")
-
-    sparql = SPARQLWrapper(endpoint)
-    sparql.setQuery(query)
-    sparql.setReturnFormat(JSON)
-    rawresults = sparql.query().convert()
-    labels = rawresults['head']['vars']
-    results = []
-    indexes = indexes or []
-    if autocaste_data:
-        transform = autocast
-    else:
-        def transform(*args): return args
-    for raw in rawresults["results"]["bindings"]:
-        data = []
-        if not indexes:
-            data = [transform(raw[label]['value']) for label in labels]
-        else:
-            for il, ind in enumerate(indexes):
-                if isinstance(ind, tuple):
-                    data.append(tuple([transform(raw[labels[i]]['value']) for i in ind]))
-                else:
-                    data.append(transform(raw[labels[il]]['value']))
-        results.append(data)
-    return results
-
-
-###############################################################################
-### FILE FUNCTIONS ############################################################
-###############################################################################
-def parsefile(filename, indexes=None, nbmax=None, delimiter='\t',
-              encoding='utf-8', field_size_limit=None, formatopt=None):
-    """ Parse the file (read at most ``nbmax`` lines if given). Each
-        line is split according to ``delimiter`` and only ``indexes`` are kept
-
-        eg : The file is :
-                1, house, 12, 19, apple
-                2, horse, 21.9, 19, strawberry
-                3, flower, 23, 2.17, cherry
-
-            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',')
-            data = [[1, (12, 19), u'apple', u'house'],
-                    [2, (21.9, 19), u'strawberry', u'horse'],
-                    [3, (23, 2.17), u'cherry', u'flower']]
-
-            By default, all cells are "autocast" (thanks to the
-            ``autocast()`` function), but you can override this with the
-            ``formatopt`` dictionary. Each key is the index to work on, and the
-            value is the function to call. See the following example:
-
-            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',',
-            >>>                  formatopt={2:lambda x:x.decode('utf-8')})
-            data = [[1, (u'12', 19), u'apple', u'house'],
-                    [2, (u'21.9', 19), u'strawberry', u'horse'],
-                    [3, (u'23', 2.17), u'cherry', u'flower']]
-
-    """
-    def formatedoutput(filename):
-        if field_size_limit:
-            csv.field_size_limit(field_size_limit)
-
-        if isinstance(filename, basestring):
-            csvfile = open(filename, 'r')
-        else:
-            csvfile = filename
-        reader = csv.reader(csvfile, delimiter=delimiter)
-        for row in reader:
-            yield [cell.strip() for cell in row]
-        csvfile.close()
-
-
-
-    result = []
-    indexes = indexes or []
-    formatopt = formatopt or {}
-    for ind, row in enumerate(formatedoutput(filename)):
-        row = [formatopt.get(i, lambda x: autocast(x, encoding))(cell)
-               for i, cell in enumerate(row)]
-        data = []
-        if nbmax and ind > nbmax:
-            break
-        if not indexes:
-            data = row
-        else:
-            for ind in indexes:
-                if isinstance(ind, tuple):
-                    data.append(tuple([row[i] for i in ind]))
-                    if '' in data[-1]:
-                        data[-1] = None
-                elif row[ind]:
-                    data.append(row[ind])
-                else:
-                    data.append(None)
-
-        result.append(data)
-    return result
-
-def write_results(matched, alignset, targetset, resultfile):
-    """ Write the given matched dictionary, alignset and targetset to the
-        resultfile
-    """
-    openmode = 'a' if fileexists(resultfile) else 'w'
-    with open(resultfile, openmode) as fobj:
-        if openmode == 'w':
-            fobj.write('aligned;targetted;distance\n')
-        for aligned in matched:
-            for target, dist in matched[aligned]:
-                alignid = alignset[aligned][0]
-                targetid = targetset[target][0]
-                fobj.write('%s;%s;%s\n' %
-                    (alignid.encode('utf-8') if isinstance(alignid, basestring)
-                                             else alignid,
-                     targetid.encode('utf-8') if isinstance(targetid, basestring)
-                                              else targetid,
-                     dist
-                     ))
-
-def split_file(filename, outputdir, nblines=60000):
-    """ Split `filename` into smaller files of ``nblines`` lines each. Files
-        are written into `outputdir`.
-
-        Return the list of created file names
-    """
-    NEW = object()
-
-    def readlines(fobj, nblines):
-        """ yield all lines of the file, and
-        at split-file boundaries, yield a NEW marker
-        """
-        for index, line in enumerate(fobj):
-            if index and index % nblines == 0:
-                yield NEW
-            yield line
-
-    count = 0
-    with open(filename, 'rb') as fobj:
-        outfile = open(osp.join(outputdir, '%s' % count), 'wb')
-        for line in readlines(fobj, nblines):
-            if line is NEW:
-                outfile.close()
-                count += 1
-                outfile = open(osp.join(outputdir, '%s' % count), 'wb')
-                continue
-            outfile.write(line)
-        outfile.close()
-        count += 1
-    return map(str, xrange(count))
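
For reference, a short usage sketch of the relocated dataio module (new path
nazca.utils.dataio per this changeset); the CSV content and the 'myfile.csv'
name are modeled on the parsefile() docstring above:

    # myfile.csv (see the parsefile() docstring):
    #   1, house, 12, 19, apple
    #   2, horse, 21.9, 19, strawberry
    #   3, flower, 23, 2.17, cherry
    from nazca.utils.dataio import autocast, parsefile

    print(autocast('1,5'))  # -> 1.5 (the comma is read as a decimal point)
    data = parsefile('myfile.csv', indexes=[0, (2, 3), 4, 1], delimiter=',')
    # data[0] is expected to be [1, (12, 19), u'apple', u'house']
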
--- a/distances.py	Thu Dec 19 14:44:44 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,456 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from functools import partial
-from math import cos, sqrt, pi #Needed for geographical distance
-try:
-    from dateutil import parser as dateparser
-    DATEUTIL_ENABLED = True
-except ImportError:
-    DATEUTIL_ENABLED = False
-from scipy import matrix, empty
-
-from nazca.normalize import tokenize
-
-
-###############################################################################
-### UTILITY FUNCTIONS #########################################################
-###############################################################################
-def cdist(distance_callback, refset, targetset, matrix_normalized=False,
-          ref_indexes=None, target_indexes=None):
-    """ Compute the metric matrix, given two datasets and a metric
-
-    Parameters
-    ----------
-    refset: a dataset (list of records)
-
-    targetset: a dataset (list of records)
-
-    Returns
-    -------
-
-    A distance matrix, of shape (len(refset), len(targetset))
-    with the distance of each element in it.
-    """
-    ref_indexes = ref_indexes or xrange(len(refset))
-    target_indexes = target_indexes or xrange(len(targetset))
-    distmatrix = empty((len(ref_indexes), len(target_indexes)), dtype='float32')
-    size = distmatrix.shape
-    for i, iref in enumerate(ref_indexes):
-        for j, jref in enumerate(target_indexes):
-            d = 1
-            if refset[iref] and targetset[jref]:
-                d = distance_callback(refset[iref], targetset[jref])
-                if matrix_normalized:
-                    d = 1 - (1.0/(1.0 + d))
-            distmatrix[i, j] = d
-    return distmatrix
-
-def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
-    """ Compute the matrix of distances between all tokens of stra and strb
-        (with function ``distance``). Extra args are given to the distance
-        function
-
-        The distance returned is defined as the max of the min of each row and
-        each column of the distance matrix, see the example below:
-
-                 |  Victor |  Hugo                  Victor | Jean | Hugo
-         Victor  |     0   |    5           Victor |  0    |  6   |  5
-          Jean   |     6   |    4           Hugo   |  5    |  4   |  0
-          Hugo   |     5   |    0
-
-                 --> 4                                --> 0
-
-        Return 4
-    """
-
-    if ' ' not in stra:
-        stra += ' '
-    if ' ' not in strb:
-        strb += ' '
-
-    toka = tokenize(stra, tokenizer)
-    tokb = tokenize(strb, tokenizer)
-    # If not same number of tokens, complete the smallest list with empty strings
-    if len(toka) != len(tokb):
-        mint = toka if len(toka)<len(tokb) else tokb
-        maxt = toka if len(toka)>len(tokb) else tokb
-        mint.extend(['' for i in range(len(maxt)-len(mint))])
-
-    listmatrix = []
-    for i in xrange(len(toka)):
-        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
-    m = matrix(listmatrix)
-    minlist = [m[i,:].min() for i in xrange(m.shape[0])]
-    minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
-    return max(minlist)
-
-
-###############################################################################
-### NUMERICAL DISTANCES #######################################################
-###############################################################################
-def euclidean(a, b):
-    """ Simple euclidean distance
-    """
-    try:
-        return abs(a - b)
-    except TypeError:
-        #a and b may be strings
-        return abs(float(a) - float(b))
-
-
-###############################################################################
-### STRING DISTANCES ##########################################################
-###############################################################################
-def levenshtein(stra, strb, tokenizer=None):
-    """ Compute the Levenshtein distance between stra and strb.
-
-    The Levenshtein distance is defined as the minimal cost to transform stra
-    into strb, where three operations are allowed:
-        - Replace one character of stra with a character of strb
-        - Add one character of strb into stra
-        - Remove one character of stra
-
-        If spaces are found in stra or strb, this function returns
-            _handlespaces(stra, strb, levenshtein)
-    """
-    if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, levenshtein, tokenizer)
-
-    lenb = len(strb)
-    onerowago = None
-    thisrow = range(1, lenb + 1) + [0]
-    for x in xrange(len(stra)):
-        onerowago, thisrow = thisrow, [0]*lenb + [x+1]
-        for y in xrange(lenb):
-            delcost = onerowago[y] + 1
-            addcost = thisrow[y - 1] + 1
-            subcost = onerowago[y - 1] + (stra[x] != strb[y])
-            thisrow[y] = min(delcost, addcost, subcost)
-    return thisrow[lenb - 1]
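
# A quick, hedged sanity check of the function above (import path per this
# changeset); single-token strings take the plain dynamic-programming path,
# strings with spaces go through _handlespaces().
from nazca.utils.distances import levenshtein
print(levenshtein('hello', 'hello'))  # -> 0
print(levenshtein('hello', 'hella'))  # one substitution -> 1
print(levenshtein(u'victor hugo', u'victor hugot'))  # token-wise comparison
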
-
-def soundexcode(word, language='french'):
-    """ Return the Soundex code of the word ``word``.
-        For more information about the Soundex code, see wiki_
-
-        ``language`` can be 'french' or 'english'
-
-        .. _wiki: https://en.wikipedia.org/wiki/Soundex
-
-        Spaces are not handled here; see ``soundex()`` for the word-by-word
-        comparison of multi-word strings.
-    """
-
-    vowels = 'AEHIOUWY'
-    if language.lower() == 'french' :
-        consonnantscode = {'B': '1', 'P': '1',
-                           'C': '2', 'K': '2', 'Q': '2',
-                           'D': '3', 'T': '3',
-                           'L': '4',
-                           'M': '5', 'N': '5',
-                           'R': '6',
-                           'G': '7', 'J': '7',
-                           'X': '8', 'Z': '8', 'S': '8',
-                           'F': '9', 'V': '9'
-                          }
-    elif language.lower() == 'english':
-        consonnantscode = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
-                           'C': '2', 'G': '2', 'J': '2', 'K': '2',
-                           'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
-                           'D': '3', 'T': '3',
-                           'L': '4',
-                           'M': '5', 'N': '5',
-                           'R': '6'
-                          }
-    else:
-        raise NotImplementedError('Soundex code is not supported (yet?) for '
-                                  'this language (%s). '
-                                  'Supported languages are french and english' % language)
-    word = word.strip().upper()
-    code = word[0]
-    # After this loop, ``code`` is the first letter of ``word`` followed by its
-    # consonants, where of consecutive consonants only the first is kept, and
-    # of two identical consonants separated by a W or an H only the first is
-    # kept as well.
-    for i in xrange(1, len(word)):
-        if word[i] in vowels:
-            continue
-        if word[i - 1] not in vowels and \
-           consonnantscode[word[i]] == consonnantscode.get(code[-1], ''):
-            continue
-        if i + 2 < len(word) and word[i + 1] in 'WH' and \
-           consonnantscode[word[i]] == consonnantscode.get(word[i + 2], ''):
-            continue
-        code += word[i]
-        if len(code) > 4:
-            break
-
-    #Replace according to the codes
-    code = code[0] + ''.join([consonnantscode[c] for c in code[1:]])
-    ###First four letters, completed by zeros
-    return code[:4] + '0'*(4 - len(code))
-
-def soundex(stra, strb, language='french', tokenizer=None):
-    """ Return the 0/1 distance between the Soundex codes of stra and strb:
-        0 means they have the same code, 1 means they don't.
-        If spaces are found in stra or strb, the strings are compared word by
-        word through _handlespaces(stra, strb, soundex, language=language)
-    """
-    if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
-
-    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
-             else 1
-
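
# A small, hedged illustration of the two Soundex helpers above; the import
# path assumes the new location introduced by this changeset.
from nazca.utils.distances import soundexcode, soundex
print(soundexcode(u'Hugo', language='french'))        # -> 'H700'
print(soundexcode(u'Hugot', language='french'))       # -> 'H730'
print(soundex(u'Hugo', u'Hugot', language='french'))  # codes differ -> 1
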
-def jaccard(stra, strb, tokenizer=None):
-    """ Return the jaccard distance between stra and strb, considering the
-        token sets of stra and strb. If no tokenizer is given, the default
-        one from ``normalize.tokenize()`` is used.
-
-        J(A, B) = |A \cap B| / |A \cup B|
-        d(A, B) = 1 - J(A, B)
-    """
-    seta = set(tokenize(stra, tokenizer))
-    setb = set(tokenize(strb, tokenizer))
-    return generic_jaccard(seta, setb)
-
-def generic_jaccard(seta, setb):
-    """ Return the jaccard distance between two sets A and B.
-
-        J(A, B) = |A \cap B| / |A \cup B|
-        d(A, B) = 1 - J(A, B)
-    """
-    return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
-
-
-###############################################################################
-### TEMPORAL DISTANCES ########################################################
-###############################################################################
-if DATEUTIL_ENABLED:
-    class FrenchParserInfo(dateparser.parserinfo):
-        """ Inherit from dateutil.parser.parserinfo and translate the
-            English-dependent variables into French.
-        """
-
-        HMS = [(u'h', u'heure', u'heures'),
-               (u'm', u'minute', u'minutes'),
-               (u's', u'seconde', u'secondes'),]
-        JUMP = [u' ', u'.', u',', u';', u'-', u'/', u"'",
-               u'a', u'le', u'et', u'er']
-        MONTHS = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'),
-                  (u'Mar', u'Mars'), (u'Avr', u'Avril'), (u'Mai', u'Mai'),
-                  (u'Jun', u'Juin'), (u'Jui', u'Juillet'),
-                  (u'Aou', u'Aout'), (u'Sep', u'Septembre'),
-                  (u'Oct', u'Octobre'), (u'Nov', u'Novembre'),
-                  (u'Dec', u'Decembre')]
-        PERTAIN = [u'de']
-        WEEKDAYS = [(u'Lun', u'Lundi'),
-                    (u'Mar', u'Mardi'),
-                    (u'Mer', u'Mercredi'),
-                    (u'Jeu', u'Jeudi'),
-                    (u'Ven', u'Vendredi'),
-                    (u'Sam', u'Samedi'),
-                    (u'Dim', u'Dimanche')]
-
-    def temporal(stra, strb, granularity=u'days', parserinfo=FrenchParserInfo,
-                 dayfirst=True, yearfirst=False):
-        """ Return the distance between two strings (read as dates).
-
-            ``granularity`` can be either ``days``, ``months`` or ``years``
-            (note the plural form!)
-            ``parserinfo`` is the dateutil parserinfo class used for parsing
-            (French by default)
-
-            ``dayfirst`` and ``yearfirst`` are used in case of ambiguity, for
-            instance 09/09/09; by default it is read as day/month/year
-
-            Neither stra nor strb may contain accents; clean them beforehand.
-        """
-
-        datea = dateparser.parse(stra, parserinfo=parserinfo(dayfirst,
-                                 yearfirst), fuzzy=True)
-        dateb = dateparser.parse(strb, parserinfo=parserinfo(dayfirst,
-                                 yearfirst), fuzzy=True)
-        diff = datea - dateb
-        if granularity.lower() == 'years':
-            return abs(diff.days/365.25)
-        if granularity.lower() == 'months':
-            return abs(diff.days/30.5)
-        return abs(diff.days)
-
-
-###############################################################################
-### GEOGRAPHICAL DISTANCES ####################################################
-###############################################################################
-def geographical(pointa, pointb, in_radians=False, planet_radius=6371009,
-                 units='m'):
-    """ Return the geographical distance between two points.
-
-        Both points must be tuples (latitude, longitude)
-
-        - in_radians is True if latitude and longitude are given in radians,
-          False otherwise
-        - planet_radius is the planet's radius in meters. By default, it is
-          the Earth's.
-
-        - `units` can be 'm' (meters) or 'km' (kilometers)
-    """
-    pointa = (float(pointa[0]), float(pointa[1]))
-    pointb = (float(pointb[0]), float(pointb[1]))
-
-    difflat = pointa[0] - pointb[0]
-    difflong = pointa[1] - pointb[1]
-    meanlat = (pointa[0] + pointb[0])/2.0
-
-    if not in_radians:
-        difflat *= pi/180.0
-        difflong *= pi/180.0
-        meanlat *= pi/180.0
-
-    coef = 1. if units == 'm' else 0.001
-    return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
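
# A short usage sketch of geographical(); points are (latitude, longitude)
# tuples in degrees, and the import path follows this changeset.
from nazca.utils.distances import geographical
paris = (48.8566, 2.3522)
london = (51.5074, -0.1278)
print(geographical(paris, london, units='km'))  # equirectangular approximation, in km
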
-
-
-###############################################################################
-### BASE PROCESSING ############################################################
-###############################################################################
-class BaseProcessing(object):
-    """ A processing object used to provide an abstraction over the different
-    distance functions, and help build Nazca processes. """
-
-    def __init__(self, ref_attr_index=None, target_attr_index=None,
-                 distance_callback=euclidean, weight=1, matrix_normalized=False):
-        """ Initiate the BaseProcessing
-
-        Parameters
-        ----------
-
-        ref_attr_index: index of the attribute of interest in a record
-                        for the reference dataset
-                        (i.e. attribute to be used for key computation)
-
-        target_attr_index: index of the attribute of interest in a record
-                           for the target dataset
-                           (i.e. attribute to be used for key computation)
-
-        distance_callback: distance callback. Default is euclidean distance.
-
-        weight: weight of the processing in a global distance matrix
-
-        matrix_normalized: Boolean. If matrix_normalized is True,
-                           the distance between two points is changed to
-                           a value between 0 (equal) and 1 (totally different).
-                           To avoid useless computation and scale
-                           problems the following “normalization” is done:
-                                d = 1 - 1/(1 + d(x, y))
-
-        """
-        self.ref_attr_index = ref_attr_index
-        self.target_attr_index = target_attr_index
-        self.distance_callback = distance_callback
-        self.weight = weight
-        self.matrix_normalized = matrix_normalized
-
-    def distance(self, reference_record, target_record):
-        """ Compute the distance between two records
-
-        Parameters
-        ----------
-        reference_record: a record (tuple/list of values) of the reference dataset.
-
-        target_record: a record (tuple/list of values) of the target dataset.
-
-        """
-        refrecord = (reference_record[self.ref_attr_index] if self.ref_attr_index
-                     else reference_record)
-        targetrecord = (target_record[self.target_attr_index] if self.target_attr_index
-                        else target_record)
-        return self.distance_callback(refrecord, targetrecord)
-
-    def cdist(self, refset, targetset, ref_indexes=None, target_indexes=None):
-        """ Compute the metric matrix, given two datasets and a metric
-
-        Parameters
-        ----------
-        refset: a dataset (list of records)
-
-        targetset: a dataset (list of records)
-
-        Returns
-        -------
-
-        A distance matrix, of shape (len(refset), len(targetset))
-        with the distance of each element in it.
-        """
-        return cdist(self.distance, refset, targetset,
-                     matrix_normalized=self.matrix_normalized,
-                     ref_indexes=ref_indexes, target_indexes=target_indexes)
-
-    def pdist(self, dataset):
-        """ Compute the upper triangular distance matrix in a way similar
-        to scipy.spatial.distance.pdist
-
-        Parameters
-        ----------
-        dataset: a dataset (list of records)
-
-        Returns
-        -------
-
-        The values of the upper triangular distance matrix
-        (of shape (len(dataset), len(dataset)) with the distance of each element in it.
-        The values are sorted as row 1, row 2, ...
-        """
-        values = []
-        for i in xrange(len(dataset)):
-            for j in xrange(i+1, len(dataset)):
-                d = 1
-                if dataset[i] and dataset[j]:
-                    d = self.distance(dataset[i], dataset[j])
-                    if self.matrix_normalized:
-                        d = 1 - (1.0/(1.0 + d))
-                values.append(d)
-        return values
-
-
-###############################################################################
-### CONCRETE PROCESSINGS #######################################################
-###############################################################################
-class LevenshteinProcessing(BaseProcessing):
-    """ A processing based on the levenshtein distance.
-    """
-
-    def __init__(self, ref_attr_index=None, target_attr_index=None,
-                 tokenizer=None, weight=1, matrix_normalized=False):
-        distance_callback = partial(levenshtein,
-                                    tokenizer=tokenizer)
-        super(LevenshteinProcessing, self).__init__(ref_attr_index,
-                                                   target_attr_index,
-                                                   distance_callback,
-                                                   weight,matrix_normalized)
-
-
-class GeographicalProcessing(BaseProcessing):
-    """ A processing based on the geographical distance.
-    """
-
-    def __init__(self, ref_attr_index=None, target_attr_index=None,
-                 in_radians=False, planet_radius=6371009, units='m', weight=1, matrix_normalized=False):
-        distance_callback = partial(geographical, in_radians=in_radians,
-                                    planet_radius=planet_radius, units=units)
-        super(GeographicalProcessing, self).__init__(ref_attr_index,
-                                                    target_attr_index,
-                                                    distance_callback,
-                                                    weight,matrix_normalized)
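
A minimal sketch of how these processing classes combine with cdist(), using
the new nazca.utils.distances path and two tiny hand-made datasets (record
layout: [identifier, name]):

    from nazca.utils.distances import LevenshteinProcessing

    refset = [['R1', u'Victor Hugo'], ['R2', u'Albert Camus']]
    targetset = [['T1', u'Victor Hugo'], ['T2', u'Albert Camu']]

    processing = LevenshteinProcessing(ref_attr_index=1, target_attr_index=1)
    matrix = processing.cdist(refset, targetset)
    # matrix[i, j] holds the Levenshtein distance between refset[i][1] and
    # targetset[j][1]; matrix[0, 0] is expected to be 0.
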
--- a/minhashing.py	Thu Dec 19 14:44:44 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,184 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-import cPickle
-
-from random import randint
-from collections import defaultdict
-
-import numpy as np
-from scipy.optimize import bisect
-
-from nazca.normalize import iter_wordgrams
-
-
-def randomhashfunction(zr):
-    """ Return a random hash function mapping x in Z to Z/zr
-        h: x -> (a*x + b) mod zr
-
-    """
-    bound = max(zr - 1, 1)
-    a = randint(1, bound)
-    b = randint(1, bound)
-
-    def hashfunc(x):
-        return ((a*x + b)%zr)
-
-    return hashfunc
-
-
-class Minlsh(object):
-    """ Operate minhashing + locality-sensitive hashing to find similar sentences
-    """
-
-    def __init__(self, verbose=False):
-        self._trained = False
-        self.sigmatrix = None
-        self._verbose = verbose
-
-    def train(self, sentences, k=2, siglen=200):
-        """ Train the minlsh on the given sentences.
-
-            - `k` is the length of the k-wordgrams used
-              (the lower k is, the faster the training is)
-            - `siglen` is the length of each sentence's signature
-
-        """
-
-        rows, shape = self._buildmatrixdocument(sentences, k)
-
-        if self._verbose: print "Matrix built. Computing signatures..."
-
-        self._computesignaturematrix(rows, shape, siglen)
-        self._trained = True
-
-
-    def _buildmatrixdocument(self, sentences, k):
-        """ Return ``(rows, shape)``, a sparse representation of a matrix where:
-
-            - Each sentence is a column
-            - Each row is an element of the universal set
-
-            Each value (r, c) is set to 1 if the element at row r is in the
-            sentence c, 0 otherwise; ``rows[c]`` lists the elements of sentence c.
-
-        """
-
-        rows, universe, sizeofuniverse = [], {}, 0
-        for nb, sent in enumerate(sentences):
-            row = []
-            for w in iter_wordgrams(sent, k):
-                row.append(universe.setdefault(w, sizeofuniverse))
-                if row[-1] == sizeofuniverse:
-                    sizeofuniverse += 1
-            rows.append(row)
-            if self._verbose and nb % 50000 == 0:
-                print nb
-
-        return rows, (len(rows), sizeofuniverse)
-
-    def _computesignaturematrix(self, rows, shape, siglen):
-        """ Compute the signature matrix (stored in ``self.sigmatrix``):
-            each column is the signature of a document, composed of
-            `siglen` numbers.
-
-            The more rows the documents have in common, the closer they are.
-        """
-
-        nrows, ncols = shape
-        sig = np.empty((siglen, nrows))
-        #Generate the random hash functions
-        hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
-        #Compute hashing values just for once.
-        #Avoid multiple recomputations for the same column.
-        hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
-                                for i in  xrange(siglen)])
-
-        docind = 0
-        while rows:
-            doc = rows.pop(0)
-            #Concatenate the needed rows.
-            tmp = np.dstack([hashvalues[:, r] for r in doc])
-            #Take the mininum of hashes
-            sig[:, docind] = np.min(tmp[0], 1)
-            docind += 1
-            if self._verbose and docind % 50000 == 0:
-                print (docind * 100) / nrows
-        self.sigmatrix = sig
-
-    def save(self, savefile):
-        """ Save the training into `savefile` for a future use """
-
-        if not self._trained:
-            print "Not trained, nothing to save"
-            return
-
-        with open(savefile, 'wb') as fobj:
-            pickler = cPickle.Pickler(fobj)
-            pickler.dump(self.sigmatrix)
-
-    def load(self, savefile):
-        """ Load a trained minhashing """
-
-        with open(savefile, 'rb') as fobj:
-            pickler = cPickle.Unpickler(fobj)
-            self.sigmatrix = pickler.load()
-
-        if self.sigmatrix is not None:
-            self._trained = True
-        else:
-            self._trained = False
-
-    def computebandsize(self, threshold, nbrows):
-        """ Compute the bandsize according to the threshold given """
-
-        ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
-        ### bands, and r the number of rows per band. The length of the
-        ### matrix is L = nbrows = b*r, so t ~ (r/L)^(1/r). So, let's
-        ### find the root of f(x) = (x/L)^(1/x) - t.
-        def f(x):
-            y = pow(x/nbrows, 1. /x) - threshold
-            return y
-
-        ## Solve f(x) = 0, with x having values in [1, nbrows]
-        return int(bisect(f, 1, nbrows))
-
-    def predict(self, threshold):
-        """ Return a set of tuples of *possible* similar sentences
-        """
-        if not self._trained:
-            print "Train it before"
-            return
-
-        if not (0 < threshold <= 1):
-            print "Threshold must be in ]0 ; 1]"
-            return
-
-        sig = self.sigmatrix
-        # Threshold is a percentage of similarity
-        # It has to be inverted here (0 is close, 1 is far)
-        threshold = 1 - threshold
-        bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0])
-
-        buckets = defaultdict(set)
-        similars = set()
-        for r in xrange(0, sig.shape[0], bandsize):
-            buckets.clear()
-            for i in xrange(sig.shape[1]):
-                buckets[tuple(sig[r:r+bandsize, i])].add(i)
-            similars.update(set(tuple(v) for v in buckets.itervalues()
-                                         if len(v) > 1))
-        return similars
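
A minimal, hedged usage sketch of the Minlsh class above (module path per this
changeset; the sentences are purely illustrative):

    from nazca.utils.minhashing import Minlsh

    sentences = [u'victor hugo est un ecrivain francais',
                 u'victor hugo est un poete francais',
                 u'albert camus est un philosophe francais']
    minlsh = Minlsh()
    minlsh.train(sentences, k=2, siglen=200)
    print(minlsh.predict(0.3))  # set of tuples of *possibly* similar sentence indices
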
--- a/ner/dataio.py	Thu Dec 19 14:44:44 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,140 +0,0 @@
-# -*- coding: utf-8 -*-
-""" IO for Named Entity Recognition.
-"""
-import json
-import urllib
-import lxml.etree as ET
-
-
-###############################################################################
-### SPARQL UTILITIES ##########################################################
-###############################################################################
-def sparql_query(query, endpoint):
-    """ Execute a query on an endpoint:
-
-    sparql_query(query=u'''SELECT ?uri ?type
-                           WHERE{
-                           ?uri rdfs:label "Python"@en .
-                           ?uri rdf:type ?type}''',
-                           endpoint=u'http://dbpedia.org/sparql')
-    """
-    from SPARQLWrapper import SPARQLWrapper, JSON
-    sparql = SPARQLWrapper(endpoint)
-    sparql.setQuery(query)
-    sparql.setReturnFormat(JSON)
-    try:
-        rawresults = sparql.query().convert()
-        labels = rawresults['head']['vars']
-        return rawresults["results"]["bindings"]
-    except:
-        print 'Error in sparql query'
-        return []
-
-
-###############################################################################
-### RQL UTILITIES #############################################################
-###############################################################################
-def get_cw_cnx(endpoint):
-    """ Get a cnx on a CubicWeb database
-    """
-    from cubicweb import dbapi
-    from cubicweb.cwconfig import CubicWebConfiguration
-    from cubicweb.entities import AnyEntity
-    CubicWebConfiguration.load_cwctl_plugins()
-    config = CubicWebConfiguration.config_for(endpoint)
-    sourceinfo = config.sources()['admin']
-    login = sourceinfo['login']
-    password = sourceinfo['password']
-    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
-    req = cnx.request()
-    return req
-
-def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
-    """ Execute a query on an appid endpoint:
-
-    rql_query('Any X WHERE X label "Python"', 'localhost')
-
-    Additional keyword arguments are passed along to be properly substituted
-    in the execute() call.
-    """
-    if endpoint in _cache_cnx:
-        cnx = _cache_cnx[endpoint]
-    else:
-        cnx = get_cw_cnx(endpoint)
-        _cache_cnx[endpoint] = cnx
-    return cnx.execute(query, kwargs)
-
-def rql_url_query(query, endpoint):
-    """ Execute a query on an url endpoint:
-
-    rql_query('Any X WHERE X label "Python"', 'localhost')
-    """
-    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
-    return json.loads(urllib.urlopen(url).read())
-
-
-###############################################################################
-### OUTPUT UTILITIES ##########################################################
-###############################################################################
-class AbstractNerdyPrettyPrint(object):
-    """ Pretty print the output of a Nerdy process
-    """
-
-    def pprint_text(self, text, named_entities, **kwargs):
-        newtext = u''
-        indice = 0
-        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
-        while indice < len(text):
-            if indice in tindices:
-                uri, t = tindices[indice]
-                words = text[t.start:t.end]
-                fragment = self.pprint_entity(uri, words, **kwargs)
-                if not self.is_valid(newtext+fragment+text[t.end:]):
-                    fragment = words
-                newtext += fragment
-                indice = t.end
-            else:
-                newtext += text[indice]
-                indice += 1
-        return newtext
-
-    def pprint_entity(self, uri, word, **kwargs):
-        """ Pretty print an entity """
-        raise NotImplementedError
-
-    def is_valid(self, newtext):
-        """Override to check the validity of the prettified content at each
-        enrichment step"""
-        return True
-
-
-class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
-    """ Pretty print the output of a Nerdy process
-    """
-
-    def pprint_entity(self, uri, word, **kwargs):
-        """ Pretty print an entity """
-        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
-        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
-
-
-class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
-
-    XHTML_DOC_TEMPLATE = '''\
-<?xml version="1.0" encoding="UTF-8" ?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
-<title>nerdy</title>
-</head>
-<body><div>%s</div></body>
-</html>'''
-
-    def is_valid(self, html):
-        try:
-            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
-                          parser=ET.XMLParser(dtd_validation=True))
-        except ET.XMLSyntaxError:
-            return False
-        return True
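
A short, hedged sketch of the helpers above; the SPARQL query mirrors the one
in the sparql_query() docstring, the example URI is made up, and the module is
assumed to be importable as nazca.utils.ner_dataio after this changeset:

    from nazca.utils.ner_dataio import sparql_query, NerdyHTMLPrettyPrint

    results = sparql_query(u'''SELECT ?uri ?type
                               WHERE{
                               ?uri rdfs:label "Python"@en .
                               ?uri rdf:type ?type}''',
                           endpoint=u'http://dbpedia.org/sparql')

    printer = NerdyHTMLPrettyPrint()
    print(printer.pprint_entity(u'http://example.org/entity/VictorHugo',
                                u'Victor Hugo', html_class='ner'))
    # -> <a href="http://example.org/entity/VictorHugo" class="ner">Victor Hugo</a>
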
--- a/normalize.py	Thu Dec 19 14:44:44 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,415 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-import re
-from string import punctuation
-from warnings import warn
-from unicodedata import normalize as _uninormalize
-from functools import partial
-
-
-FRENCH_STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
-u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
-u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
-u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
-u'elle', u'elles', u'en', u'encore', u'essai', u'est', u'et', u'eu', u'eux', u'fait',
-u'faites', u'fois', u'font', u'force', u'haut', u'hors', u'ici', u'il', u'ils',
-u'je', u'juste', u'la', u'le', u'les', u'leur', u'lui', u'là', u'ma', u'maintenant',
-u'mais', u'me', u'mes', u'moi', u'moins', u'mon', u'mot', u'même', u'ne',
-u'ni', u'nommés', u'nos',
-u'notre', u'nous', u'nouveaux', u'on', u'ou', u'où', u'par', u'parce', u'parole',
-u'pas', u'personnes', u'peut', u'peu', u'pièce', u'plupart', u'pour',
-u'pourquoi', u'quand', u'que', u'quel', u'quelle', u'quelles', u'quels', u'qui',
-u'sa', u'sans', u'se', u'ses', u'seulement', u'si', u'sien', u'son', u'sont', u'sous',
-u'soyez', u'sujet', u'sur', u'ta', u'tandis', u'tellement', u'te', u'tels', u'tes', u'toi',
-u'ton', u'tous', u'tout', u'trop', u'très', u'tu', u'un', u'une', u'valeur', u'voie',
-u'voient', u'vont', u'vos', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
-u'étions', u'été', u'être'])
-
-MANUAL_UNICODE_MAP = {
-    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
-    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
-    u'\u2044': u'/',  # FRACTION SLASH
-    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
-    u'\xa9': u'(c)',  # COPYRIGHT SIGN
-    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
-    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
-    u'\xae': u'(r)',  # REGISTERED SIGN
-    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
-    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
-    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
-    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
-    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
-    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
-    }
-
-
-###############################################################################
-### NORMALIZE FUNCTIONS #######################################################
-###############################################################################
-def unormalize(ustring, substitute=None):
-    """replace diacritical characters with their corresponding ascii characters
-
-    Convert the unicode string to its long normalized form (each unicode
-    character may be transformed into several characters) and keep the first one only.
-    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
-    replace all compatibility characters with their equivalents.
-
-    :type substitute: str
-    :param substitute: replacement character to use if decomposition fails
-
-    :see: Another project about ASCII transliterations of Unicode text
-          http://pypi.python.org/pypi/Unidecode
-    """
-    res = []
-    for letter in ustring[:]:
-        try:
-            replacement = MANUAL_UNICODE_MAP[letter]
-        except KeyError:
-            if isinstance(letter, unicode):
-                replacement = _uninormalize('NFKD', letter)[0]
-            else:
-                replacement = letter
-            if ord(replacement) >= 2 ** 7:
-                if substitute is None:
-                    raise ValueError("can't deal with non-ascii based characters")
-                replacement = substitute
-        res.append(replacement)
-    return u''.join(res)
-
-def lunormalize(sentence, substitute=None):
-    """ Normalize a sentence (i.e. remove accents, lowercase, etc.) """
-    return unormalize(sentence, substitute).lower()
-
-def simplify(sentence, lemmas=None, remove_stopwords=True, stopwords=FRENCH_STOPWORDS):
-    """ Simplify the given sentence:
-        1) If lemmas are given, the sentence is lemmatized
-        2) Set the sentence to lower case
-        3) Remove punctuation
-        4) If remove_stopwords, remove the stop words
-    """
-    if not isinstance(sentence, basestring):
-        return sentence
-
-    if lemmas:
-        sentence = lemmatized(sentence, lemmas)
-    sentence = sentence.lower()
-    cleansent = ''.join([s if s not in punctuation
-                           else ' ' for s in sentence]).strip()
-    #comma followed by a space is replaced by two spaces, keep only one
-    cleansent = cleansent.replace('  ', ' ')
-
-    if not remove_stopwords:
-        return cleansent
-    else:
-        return ' '.join([w for w in cleansent.split(' ') if w not in stopwords])
-
-def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
-    """ Tokenize a sentence.
-        Use ``tokenizer`` if given; otherwise split the sentence on the given
-        ``regexp`` (non-whitespace chunks), keeping apostrophes attached to the
-        part that precedes them.
-
-        In any case, ``tokenizer`` must have a ``tokenize()`` method
-    """
-    if tokenizer:
-        return tokenizer().tokenize(sentence)
-    # XXX Unicode issues prevent using the nltk WordPunctTokenizer here.
-    # Instead, split on whitespace
-    chunks = []
-    for chunk in [t for t in regexp.findall(sentence) if t]:
-        # Deals with '
-        if "'" in chunk:
-            schunks = chunk.split("'")
-            chunks.extend([c+"'" for c in schunks[:-1]])
-            chunks.append(schunks[-1])
-        else:
-            chunks.append(chunk)
-    return chunks
-
-def iter_wordgrams(sentence, k):
-    """ Generator of k-wordgrams on the given sentence
-    """
-    words = sentence.split(' ')
-    #XXX Call tokenizer
-    for r in xrange(len(words)):
-        yield ' '.join(words[r:r + k])
-
-def loadlemmas(filename, encoding='utf-8'):
-    """ Return the default lemmas dictionary
-    """
-    lemmas = {}
-    with open(filename) as fobj:
-        for line in fobj:
-            line = line.decode(encoding).strip().split('\t')
-            if len(line) == 2:
-                lemmas[line[0]] = line[1]
-    return lemmas
-
-def lemmatized(sentence, lemmas, tokenizer=None):
-    """ Return the lemmatized sentence
-    """
-    tokenized_sent = tokenize(sentence, tokenizer)
-    tokenized_sentformated = []
-    for w in tokenized_sent:
-        if w in ".,'" and len(tokenized_sentformated) > 0:
-            tokenized_sentformated[-1] += w
-        elif w not in punctuation:
-            tokenized_sentformated.append(w)
-    return u' '.join([lemmatized_word(w, lemmas) for w in tokenized_sentformated])
-
-def lemmatized_word(word, lemmas):
-    """ Return the lemmatized word
-    """
-    lemma = lemmas.get(word.lower(), word)
-    if '|' in lemma:
-        _words = lemma.split('|')
-        if word.lower() in _words:
-            lemma = word.lower()
-        else:
-            lemma = _words[0]
-    return lemma
-
-def roundstr(number, ndigits=0):
-    """Return a unicode string of ``number`` rounded to a given precision
-        in decimal digits (default 0 digits)
-
-        If ``number`` is not a float, this method casts it to a float. (An
-        exception may be raised if it's not possible)
-    """
-    return format(round(float(number), ndigits), '0.%df' % ndigits)
-
-def rgxformat(string, regexp, output):
-    """ Apply the regexp to the ``string`` and return a formatted string
-    according to ``output``
-
-    e.g.:
-        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
-               r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
-               r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
-               u'%(lastname)s, %(firstname)s (%(birthdate)s -'
-               u'%(deathdate)s)')
-
-     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
-     """
-
-    match = re.match(regexp, string)
-    return output % match.groupdict()
-
-
-###############################################################################
-### NORMALIZER OBJECTS ########################################################
-###############################################################################
-class BaseNormalizer(object):
-    """ A normalizer object used to provide an abstraction over the different
-    normalization functions, and help build Nazca processes. """
-
-    def __init__(self, callback, attr_index=None):
-        """ Initiate the BaseNormalizer
-
-        Parameters
-        ----------
-        callback: normalization callback
-
-        attr_index: index of the attribute of interest in a record
-                    (i.e. attribute to be normalized).
-                    By default, 'attr_index' is None and the whole
-                    record is passed to the callback.
-                    If given, only the attr_index value of the record
-                    is passed to the callback.
-                    Can be an int or a list of ints
-        """
-        self.callback = callback
-        if attr_index:
-            self.attr_index = attr_index if isinstance(attr_index, (tuple, list)) else (attr_index,)
-        else:
-            self.attr_index = attr_index
-
-    def normalize(self, record):
-        """ Normalize a record
-
-        Parameters
-        ----------
-        record: a record (tuple/list of values).
-
-        Returns
-        -------
-
-        record: the normalized record.
-        """
-        if not self.attr_index:
-            return self.callback(record)
-        else:
-            for attr_ind in self.attr_index:
-                record = list(r if ind != attr_ind else self.callback(r)
-                               for ind, r in enumerate(record))
-            return record
-
-    def normalize_dataset(self, dataset, inplace=False):
-        """ Normalize a dataset
-
-        Parameters
-        ----------
-        dataset: a list of records (tuple/list of values).
-
-        inplace: Boolean. If True, normalize the dataset in place.
-
-        Returns
-        -------
-
-        record: the normalized dataset.
-        """
-        if not inplace:
-            dataset = [self.normalize(record) for record in dataset]
-        else:
-            # Change dataset in place
-            for ind, record in enumerate(dataset):
-                dataset[ind] = self.normalize(record)
-        return dataset
-
-
-class UnicodeNormalizer(BaseNormalizer):
-    """ Normalizer that unormalizes unicode strings
-    (i.e. replaces accented characters with ASCII ones)
-    """
-    def __init__(self, attr_index=None, substitute=None):
-        callback = partial(lunormalize, substitute=substitute)
-        super(UnicodeNormalizer, self).__init__(callback, attr_index=attr_index)
-
-
-class SimplifyNormalizer(BaseNormalizer):
-    """ Normalizer that simplifies a string:
-        1) If lemmas are given, the sentence is lemmatized
-        2) Set the sentence to lower case
-        3) Remove punctuation
-        4) If remove_stopwords, remove the stop words
-    """
-    def __init__(self, attr_index=None, lemmas=None, remove_stopwords=True):
-        callback = partial(simplify, lemmas=lemmas, remove_stopwords=remove_stopwords)
-        super(SimplifyNormalizer, self).__init__(callback, attr_index=attr_index)
-
-
-class TokenizerNormalizer(BaseNormalizer):
-    """ Normalizer that tokenizes a string
-        Use ``tokenizer`` if given; otherwise split the string on the given
-        ``regexp`` (non-whitespace chunks).
-        In any case, ``tokenizer`` must have a ``tokenize()`` method
-    """
-    def __init__(self, attr_index=None, tokenizer=None, regexp=re.compile(r"[^\s]+")):
-        callback = partial(tokenize, tokenizer=tokenizer, regexp=regexp)
-        super(TokenizerNormalizer, self).__init__(callback, attr_index=attr_index)
-
-
-class LemmatizerNormalizer(BaseNormalizer):
-    """ Normalizer that lemmatizes a string
-    """
-    def __init__(self, lemmas, attr_index=None, tokenizer=None):
-        callback = partial(lemmatized, lemmas=lemmas, tokenizer=tokenizer)
-        super(LemmatizerNormalizer, self).__init__(callback, attr_index=attr_index)
-
-
-class RoundNormalizer(BaseNormalizer):
-    """Normalizer that rounds a number
-    Return a unicode string of ``number`` rounded to a given precision
-    in decimal digits (default 0 digits)
-
-    If ``number`` is not a float, this method casts it to a float. (An
-    exception may be raised if it's not possible)
-    """
-    def __init__(self, attr_index=None, ndigits=0):
-        callback = partial(roundstr, ndigits=ndigits)
-        super(RoundNormalizer, self).__init__(callback, attr_index=attr_index)
-
-
-class RegexpNormalizer(BaseNormalizer):
-    """Normalizer that normalizes a string based on a regexp
-
-    Apply the regexp to the ``string`` and return a formatted string
-    according to ``output``
-
-    e.g.:
-        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
-               r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
-               r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
-               u'%(lastname)s, %(firstname)s (%(birthdate)s -'
-               u'%(deathdate)s)')
-
-     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
-    """
-    def __init__(self, regexp, output, attr_index=None):
-        callback = partial(rgxformat, regexp=regexp, output=output)
-        super(RegexpNormalizer, self).__init__(callback, attr_index=attr_index)
-
-
-###############################################################################
-### JOIN NORMALIZER ###########################################################
-###############################################################################
-class JoinNormalizer(BaseNormalizer):
-    """Normalizer that joins multiple fields into a single one.
-    The joined field is appended at the end of the new record.
-    """
-    def __init__(self, attr_indexes, join_car=', '):
-        self.attr_indexes = attr_indexes
-        self.join_car = join_car
-
-    def normalize(self, record):
-        """ Normalize a record
-
-        Parameters
-        ----------
-        record: a record (tuple/list of values).
-
-        Returns
-        -------
-
-        record: the normalized record.
-        """
-        _record = [r for ind, r in enumerate(record) if ind not in self.attr_indexes]
-        _record.append(self.join_car.join([r for ind, r in enumerate(record) if ind in self.attr_indexes]))
-        return _record
-
-
-###############################################################################
-### NORMALIZER PIPELINE #######################################################
-###############################################################################
-class NormalizerPipeline(BaseNormalizer):
-    """ Pipeline of Normalizers
-    """
-
-    def __init__(self, normalizers):
-        """ Initiate the NormalizerPipeline
-
-        Parameters
-        ----------
-        normalizers: list (ordered) of Normalizer
-        """
-        self.normalizers = normalizers
-
-    def normalize(self, record):
-        """ Normalize a record
-
-        Parameters
-        ----------
-        record: a record (tuple/list of values).
-
-        Returns
-        -------
-
-        record: the normalized record.
-        """
-        for normalizer in self.normalizers:
-            record = normalizer.normalize(record)
-        return record
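
A minimal sketch chaining the normalizers above into a pipeline (import path
per this changeset; the record layout is purely illustrative):

    from nazca.utils.normalize import (UnicodeNormalizer, SimplifyNormalizer,
                                       NormalizerPipeline)

    record = ['R1', u'Éléonore, la Pièce']
    pipeline = NormalizerPipeline([UnicodeNormalizer(attr_index=1),
                                   SimplifyNormalizer(attr_index=1)])
    print(pipeline.normalize(record))
    # -> ['R1', u'eleonore piece']  (accents stripped, lowercased,
    #    punctuation and French stop words removed)
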
--- a/record_linkage/aligner.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/record_linkage/aligner.py	Thu Dec 19 14:44:58 2013 +0000
@@ -21,7 +21,7 @@
 from scipy import zeros
 from scipy.sparse import lil_matrix
 
-from nazca.dataio import parsefile
+from nazca.utils.dataio import parsefile
 
 
 ###############################################################################
--- a/record_linkage/blocking.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/record_linkage/blocking.py	Thu Dec 19 14:44:58 2013 +0000
@@ -32,8 +32,8 @@
 
 from scipy.spatial import KDTree
 
-from nazca.minhashing import Minlsh
-from nazca.distances import soundexcode
+from nazca.utils.minhashing import Minlsh
+from nazca.utils.distances import soundexcode
 
 
 ###############################################################################
--- a/test/test_alignment.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/test/test_alignment.py	Thu Dec 19 14:44:58 2013 +0000
@@ -21,10 +21,10 @@
 random.seed(6) ### Make sure tests are repeatable
 from os import path
 
-from nazca.normalize import simplify
+from nazca.utils.normalize import simplify
 import nazca.record_linkage.aligner as alig
 import nazca.record_linkage.blocking as blo
-from nazca.distances import LevenshteinProcessing, GeographicalProcessing
+from nazca.utils.distances import LevenshteinProcessing, GeographicalProcessing
 
 
 TESTDIR = path.dirname(__file__)
--- a/test/test_blocking.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/test/test_blocking.py	Thu Dec 19 14:44:58 2013 +0000
@@ -21,14 +21,14 @@
 import random
 random.seed(6) ### Make sure tests are repeatable / Minhashing
 
-from nazca.distances import (levenshtein, soundex, soundexcode,   \
-                             jaccard, euclidean, geographical)
+from nazca.utils.distances import (levenshtein, soundex, soundexcode,   \
+                                       jaccard, euclidean, geographical)
 from nazca.record_linkage.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
                                            MergeBlocking,
                                            NGramBlocking, PipelineBlocking,
                                            SoundexBlocking, KmeansBlocking,
                                            MinHashingBlocking, KdTreeBlocking)
-from nazca.normalize import SimplifyNormalizer, loadlemmas
+from nazca.utils.normalize import SimplifyNormalizer, loadlemmas
 
 
 TESTDIR = path.dirname(__file__)
--- a/test/test_dataio.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/test/test_dataio.py	Thu Dec 19 14:44:58 2013 +0000
@@ -22,7 +22,7 @@
 from os import path
 from tempfile import mkdtemp
 
-from nazca.dataio import sparqlquery, parsefile, autocast, split_file
+from nazca.utils.dataio import sparqlquery, parsefile, autocast, split_file
 
 
 TESTDIR = path.dirname(__file__)
--- a/test/test_distances.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/test/test_distances.py	Thu Dec 19 14:44:58 2013 +0000
@@ -21,9 +21,9 @@
 random.seed(6) ### Make sure tests are repeatable
 from dateutil import parser as dateparser
 
-from nazca.distances import (levenshtein, soundex, soundexcode,\
-                             jaccard, euclidean, geographical,
-                             LevenshteinProcessing)
+from nazca.utils.distances import (levenshtein, soundex, soundexcode,
+                                   jaccard, euclidean, geographical,
+                                   LevenshteinProcessing)
 
 
 class DistancesTest(unittest2.TestCase):
--- a/test/test_minhashing.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/test/test_minhashing.py	Thu Dec 19 14:44:58 2013 +0000
@@ -21,8 +21,8 @@
 import random
 random.seed(6) ### Make sure tests are repeatable
 
-from nazca.normalize import loadlemmas, simplify
-from nazca.minhashing import Minlsh
+from nazca.utils.normalize import loadlemmas, simplify
+from nazca.utils.minhashing import Minlsh
 
 TESTDIR = path.dirname(__file__)
 
--- a/test/test_normalize.py	Thu Dec 19 14:44:44 2013 +0000
+++ b/test/test_normalize.py	Thu Dec 19 14:44:58 2013 +0000
@@ -19,12 +19,12 @@
 import unittest2
 from os import path
 
-from nazca.normalize import (BaseNormalizer, UnicodeNormalizer, JoinNormalizer,
-                             SimplifyNormalizer, TokenizerNormalizer,
-                             LemmatizerNormalizer, RoundNormalizer,
-                             RegexpNormalizer, NormalizerPipeline,
-                             lunormalize, loadlemmas, lemmatized, \
-                             roundstr, rgxformat, tokenize, simplify)
+from nazca.utils.normalize import (BaseNormalizer, UnicodeNormalizer, JoinNormalizer,
+                                   SimplifyNormalizer, TokenizerNormalizer,
+                                   LemmatizerNormalizer, RoundNormalizer,
+                                   RegexpNormalizer, NormalizerPipeline,
+                                   lunormalize, loadlemmas, lemmatized,
+                                   roundstr, rgxformat, tokenize, simplify)
 
 
 TESTDIR = path.dirname(__file__)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/dataio.py	Thu Dec 19 14:44:58 2013 +0000
@@ -0,0 +1,224 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from os.path import exists as fileexists
+from os import path as osp
+
+import csv
+import urllib
+
+try:
+    from SPARQLWrapper import SPARQLWrapper, JSON
+    SPARQL_ENABLED = True
+except ImportError:
+    SPARQL_ENABLED = False
+
+
+###############################################################################
+### UTILITY FUNCTIONS #########################################################
+###############################################################################
+def autocast(data, encoding=None):
+    """ Try to convert data into a specific type
+    in (int, float, str)
+    """
+    try:
+        return int(data)
+    except ValueError:
+        try:
+            return float(data.replace(',', '.'))
+        except ValueError:
+            data = data.strip()
+            if encoding:
+                return data.decode(encoding)
+            return data
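+
+# Minimal usage sketch (illustrative doctest, showing the expected coercions):
+#
+#     >>> autocast('1')
+#     1
+#     >>> autocast('1,2')
+#     1.2
+#     >>> autocast('  hello ')
+#     'hello'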
+
+
+###############################################################################
+### RQL FUNCTIONS #############################################################
+###############################################################################
+def rqlquery(host, rql, indexes=None, formatopt=None):
+    """ Run the rql query on the given cubicweb host
+    """
+
+    if host.endswith('/'):
+        host = host[:-1]
+
+    indexes = indexes or []
+    filehandle = urllib.urlopen('%(host)s/view?'
+                                'rql=%(rql)s&vid=csvexport'
+                                % {'rql': rql, 'host': host})
+    filehandle.readline()  # skip the first line
+    return parsefile(filehandle, delimiter=';', indexes=indexes,
+                     formatopt=formatopt)
+
+
+###############################################################################
+### SPARQL FUNCTIONS ##########################################################
+###############################################################################
+def sparqlquery(endpoint, query, indexes=None, autocaste_data=True):
+    """ Run the sparql query on the given endpoint, and wrap the items in the
+    indexes form. If indexes is empty, keep raw output"""
+
+    if not SPARQL_ENABLED:
+        raise ImportError("You have to install SPARQLWrapper and JSON modules to"
+                          "used this function")
+
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    rawresults = sparql.query().convert()
+    labels = rawresults['head']['vars']
+    results = []
+    indexes = indexes or []
+    if autocaste_data:
+        transform = autocast
+    else:
+        def transform(*args): return args
+    for raw in rawresults["results"]["bindings"]:
+        data = []
+        if not indexes:
+            data = [transform(raw[label]['value']) for label in labels]
+        else:
+            for il, ind in enumerate(indexes):
+                if isinstance(ind, tuple):
+                    data.append(tuple([transform(raw[labels[i]]['value']) for i in ind]))
+                else:
+                    data.append(transform(raw[labels[il]]['value']))
+        results.append(data)
+    return results
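+
+# Minimal usage sketch (illustrative; the endpoint and query below are only
+# examples and require network access plus the SPARQLWrapper package):
+#
+#     >>> results = sparqlquery(u'http://dbpedia.org/sparql',
+#     ...                       u'SELECT ?p WHERE {?p a '
+#     ...                       u'<http://dbpedia.org/ontology/Person>} LIMIT 3')
+#     >>> len(results)
+#     3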
+
+
+###############################################################################
+### FILE FUNCTIONS ############################################################
+###############################################################################
+def parsefile(filename, indexes=None, nbmax=None, delimiter='\t',
+              encoding='utf-8', field_size_limit=None, formatopt=None):
+    """ Parse the file (read ``nbmax`` line at maximum if given). Each
+        line is splitted according ``delimiter`` and only ``indexes`` are kept
+
+        eg : The file is :
+                1, house, 12, 19, apple
+                2, horse, 21.9, 19, stramberry
+                3, flower, 23, 2.17, cherry
+
+            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',')
+            data = [[1, (12, 19), u'apple', u'house'],
+                    [2, (21.9, 19), u'stramberry', u'horse'],
+                    [3, (23, 2.17), u'cherry', u'flower']]
+
+            By default, all cells are "autocast" (thanks to the
+            ``autocast()`` function), but you can overpass it thanks to the
+            ``formatopt`` dictionnary. Each key is the index to work on, and the
+            value is the function to call. See the following example:
+
+            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',',
+            >>>                  formatopt={2:lambda x:x.decode('utf-8')})
+            data = [[1, (u'12', 19), u'apple', u'house'],
+                    [2, (u'21.9', 19), u'stramberry', u'horse'],
+                    [3, (u'23', 2.17), u'cherry', u'flower']]
+
+    """
+    def formatedoutput(filename):
+        if field_size_limit:
+            csv.field_size_limit(field_size_limit)
+
+        if isinstance(filename, basestring):
+            csvfile = open(filename, 'r')
+        else:
+            csvfile = filename
+        reader = csv.reader(csvfile, delimiter=delimiter)
+        for row in reader:
+            yield [cell.strip() for cell in row]
+        csvfile.close()
+
+
+
+    result = []
+    indexes = indexes or []
+    formatopt = formatopt or {}
+    for ind, row in enumerate(formatedoutput(filename)):
+        row = [formatopt.get(i, lambda x: autocast(x, encoding))(cell)
+               for i, cell in enumerate(row)]
+        data = []
+        if nbmax and ind > nbmax:
+            break
+        if not indexes:
+            data = row
+        else:
+            for ind in indexes:
+                if isinstance(ind, tuple):
+                    data.append(tuple([row[i] for i in ind]))
+                    if '' in data[-1]:
+                        data[-1] = None
+                elif row[ind]:
+                    data.append(row[ind])
+                else:
+                    data.append(None)
+
+        result.append(data)
+    return result
+
+def write_results(matched, alignset, targetset, resultfile):
+    """ Given a matched dictionnay, an alignset and a targetset to the
+        resultfile
+    """
+    openmode = 'a' if fileexists(resultfile) else 'w'
+    with open(resultfile, openmode) as fobj:
+        if openmode == 'w':
+            fobj.write('aligned;targetted;distance\n')
+        for aligned in matched:
+            for target, dist in matched[aligned]:
+                alignid = alignset[aligned][0]
+                targetid = targetset[target][0]
+                fobj.write('%s;%s;%s\n' %
+                    (alignid.encode('utf-8') if isinstance(alignid, basestring)
+                                             else alignid,
+                     targetid.encode('utf-8') if isinstance(targetid, basestring)
+                                              else targetid,
+                     dist
+                     ))
+
+def split_file(filename, outputdir, nblines=60000):
+    """ Split `filename` into smaller files of ``nblines`` lines. Files are
+        written into `outputdir`.
+
+        Return the list of file names (relative to ``outputdir``)
+    """
+    NEW = object()
+
+    def readlines(fobj, nblines):
+        """ yield all lines of the file, and
+        at split-file boundaries, yield a NEW marker
+        """
+        for index, line in enumerate(fobj):
+            if index and index % nblines == 0:
+                yield NEW
+            yield line
+
+    count = 0
+    with open(filename, 'rb') as fobj:
+        outfile = open(osp.join(outputdir, '%s' % count), 'wb')
+        for line in readlines(fobj, nblines):
+            if line is NEW:
+                outfile.close()
+                count += 1
+                outfile = open(osp.join(outputdir, '%s' % count), 'wb')
+                continue
+            outfile.write(line)
+        outfile.close()
+        count += 1
+    return map(str, xrange(count))
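+
+# Minimal usage sketch (illustrative; the paths are hypothetical). Splitting a
+# 150000-line file with the default chunk size yields three chunks named '0',
+# '1' and '2' inside ``outputdir``:
+#
+#     >>> split_file('big.csv', '/tmp/chunks')
+#     ['0', '1', '2']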
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/distances.py	Thu Dec 19 14:44:58 2013 +0000
@@ -0,0 +1,456 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from functools import partial
+from math import cos, sqrt, pi #Needed for geographical distance
+try:
+    from dateutil import parser as dateparser
+    DATEUTIL_ENABLED = True
+except ImportError:
+    DATEUTIL_ENABLED = False
+from scipy import matrix, empty
+
+from nazca.utils.normalize import tokenize
+
+
+###############################################################################
+### UTILITY FUNCTIONS #########################################################
+###############################################################################
+def cdist(distance_callback, refset, targetset, matrix_normalized=False,
+          ref_indexes=None, target_indexes=None):
+    """ Compute the metric matrix, given two datasets and a metric
+
+    Parameters
+    ----------
+    refset: a dataset (list of records)
+
+    targetset: a dataset (list of records)
+
+    Returns
+    -------
+
+    A distance matrix, of shape (len(refset), len(targetset))
+    with the distance of each element in it.
+    """
+    ref_indexes = ref_indexes or xrange(len(refset))
+    target_indexes = target_indexes or xrange(len(targetset))
+    distmatrix = empty((len(ref_indexes), len(target_indexes)), dtype='float32')
+    size = distmatrix.shape
+    for i, iref in enumerate(ref_indexes):
+        for j, jref in enumerate(target_indexes):
+            d = 1
+            if refset[iref] and targetset[jref]:
+                d = distance_callback(refset[iref], targetset[jref])
+                if matrix_normalized:
+                    d = 1 - (1.0/(1.0 + d))
+            distmatrix[i, j] = d
+    return distmatrix
+
+def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
+    """ Compute the matrix of distances between all tokens of stra and strb
+        (with function ``distance``). Extra args are given to the distance
+        function
+
+        The distance returned is defined as the max of the min of each row of
+        each distance matrix; see the example below:
+
+                 |  Victor |  Hugo                  Victor | Jean | Hugo
+         Victor  |     0   |    5           Victor |  0    |  6   |  5
+          Jean   |     6   |    4           Hugo   |  5    |  4   |  0
+          Hugo   |     5   |    0
+
+                 --> 4                                --> 0
+
+        Return 4
+    """
+
+    if ' ' not in stra:
+        stra += ' '
+    if ' ' not in strb:
+        strb += ' '
+
+    toka = tokenize(stra, tokenizer)
+    tokb = tokenize(strb, tokenizer)
+    # If not same number of tokens, complete the smallest list with empty strings
+    if len(toka) != len(tokb):
+        mint = toka if len(toka)<len(tokb) else tokb
+        maxt = toka if len(toka)>len(tokb) else tokb
+        mint.extend(['' for i in range(len(maxt)-len(mint))])
+
+    listmatrix = []
+    for i in xrange(len(toka)):
+        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
+    m = matrix(listmatrix)
+    minlist = [m[i,:].min() for i in xrange(m.shape[0])]
+    minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
+    return max(minlist)
+
+
+###############################################################################
+### NUMERICAL DISTANCES #######################################################
+###############################################################################
+def euclidean(a, b):
+    """ Simple euclidian distance
+    """
+    try:
+        return abs(a - b)
+    except TypeError:
+        #a and b may be strings
+        return abs(float(a) - float(b))
+
+
+###############################################################################
+### STRING DISTANCES ##########################################################
+###############################################################################
+def levenshtein(stra, strb, tokenizer=None):
+    """ Compute the Levenshtein distance between stra and strb.
+
+    The Levenshtein distance is defined as the minimal cost to transform stra
+    into strb, where 3 operations are allowed:
+        - Replace one character of stra with a character of strb
+        - Add one character of strb into stra
+        - Remove one character of stra
+
+        If spaces are found in stra or strb, this method returns
+            _handlespaces(stra, strb, levenshtein)
+    """
+    if ' ' in stra or ' ' in strb:
+        return _handlespaces(stra, strb, levenshtein, tokenizer)
+
+    lenb = len(strb)
+    onerowago = None
+    thisrow = range(1, lenb + 1) + [0]
+    for x in xrange(len(stra)):
+        onerowago, thisrow = thisrow, [0]*lenb + [x+1]
+        for y in xrange(lenb):
+            delcost = onerowago[y] + 1
+            addcost = thisrow[y - 1] + 1
+            subcost = onerowago[y - 1] + (stra[x] != strb[y])
+            thisrow[y] = min(delcost, addcost, subcost)
+    return thisrow[lenb - 1]
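+
+# Illustrative doctest (a sketch of the expected behaviour):
+#
+#     >>> levenshtein(u'hello', u'helo')
+#     1
+#     >>> levenshtein(u'Victor Hugo', u'Victor Jean Hugo')  # goes through _handlespaces
+#     4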
+
+def soundexcode(word, language='french'):
+    """ Return the Soundex code of the word ``word``
+        For more information about the soundex code, see
+        https://en.wikipedia.org/wiki/Soundex
+
+        ``language`` can be 'french' or 'english'
+
+        ``word`` is expected to be a single word without spaces; space
+        handling is done by the ``soundex()`` distance below.
+    """
+
+    vowels = 'AEHIOUWY'
+    if language.lower() == 'french' :
+        consonnantscode = {'B': '1', 'P': '1',
+                           'C': '2', 'K': '2', 'Q': '2',
+                           'D': '3', 'T': '3',
+                           'L': '4',
+                           'M': '5', 'N': '5',
+                           'R': '6',
+                           'G': '7', 'J': '7',
+                           'X': '8', 'Z': '8', 'S': '8',
+                           'F': '9', 'V': '9'
+                          }
+    elif language.lower() == 'english':
+        consonnantscode = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
+                           'C': '2', 'G': '2', 'J': '2', 'K': '2',
+                           'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
+                           'D': '3', 'T': '3',
+                           'L': '4',
+                           'M': '5', 'N': '5',
+                           'R': '6'
+                          }
+    else:
+        raise NotImplementedError('Soundex code is not supported (yet ?) for '
+                                  'this language (%s). '
+                                  'Supported languages are french and english' % language)
+    word = word.strip().upper()
+    code = word[0]
+    # After this loop, ``code`` is the first letter of ``word`` followed by
+    # its consonants, keeping only the first of consecutive consonants that
+    # share the same code, and only one of two consonants sharing a code
+    # when they are separated by a W or an H.
+    for i in xrange(1, len(word)):
+        if word[i] in vowels:
+            continue
+        if word[i - 1] not in vowels and \
+           consonnantscode[word[i]] == consonnantscode.get(code[-1], ''):
+            continue
+        if i + 2 < len(word) and word[i + 1] in 'WH' and \
+           consonnantscode[word[i]] == consonnantscode.get(word[i + 2], ''):
+            continue
+        code += word[i]
+        if len(code) > 4:
+            break
+
+    #Replace according to the codes
+    code = code[0] + ''.join([consonnantscode[c] for c in code[1:]])
+    ###First four letters, completed by zeros
+    return code[:4] + '0'*(4 - len(code))
+
+def soundex(stra, strb, language='french', tokenizer=None):
+    """ Return the 1/0 distance between the soundex code of stra and strb.
+        0 means they have the same code, 1 they don't
+    """
+    if ' ' in stra or ' ' in strb:
+        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
+
+    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
+             else 1
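+
+# Illustrative doctest for the two functions above (french mapping):
+#
+#     >>> soundexcode(u'Victor', language='french')
+#     u'V236'
+#     >>> soundex(u'Hugo', u'Hougo', language='french')
+#     0
+#     >>> soundex(u'Hugo', u'Zola', language='french')
+#     1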
+
+def jaccard(stra, strb, tokenizer=None):
+    """ Return the jaccard distance between stra and strb, condering the tokens
+        set of stra and strb. If no tokenizer is given, it use if
+        alignement.normalize.tokenize's default one.
+
+        J(A, B) = (A \cap B)/(A \cup B)
+        d(A, B) = 1 - J(A, B)
+    """
+    seta = set(tokenize(stra, tokenizer))
+    setb = set(tokenize(strb, tokenizer))
+    return generic_jaccard(seta, setb)
+
+def generic_jaccard(seta, setb):
+    """ Return the jaccard distance between two sets A and B.
+
+        J(A, B) = |A \cap B| / |A \cup B|
+        d(A, B) = 1 - J(A, B)
+    """
+    return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
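+
+# Illustrative doctest (sets chosen so that the result is an exact float):
+#
+#     >>> generic_jaccard(set([1, 2, 3]), set([2, 3, 4]))
+#     0.5
+#     >>> jaccard(u'victor hugo', u'victor hugo') == 0.0
+#     True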
+
+
+###############################################################################
+### TEMPORAL DISTANCES ########################################################
+###############################################################################
+if DATEUTIL_ENABLED:
+    class FrenchParserInfo(dateparser.parserinfo):
+        """ Inherit of the dateutil.parser.parserinfo and translate the english
+            dependant variables into french.
+        """
+
+        HMS = [(u'h', u'heure', u'heures'),
+               (u'm', u'minute', u'minutes'),
+               (u's', u'seconde', u'secondes'),]
+        JUMP = [u' ', u'.', u',', u';', u'-', u'/', u"'",
+               u'a', u'le', u'et', u'er']
+        MONTHS = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'),
+                  (u'Mar', u'Mars'), (u'Avr', u'Avril'), (u'Mai', u'Mai'),
+                  (u'Jun', u'Juin'), (u'Jui', u'Juillet'),
+                  (u'Aou', u'Aout'), (u'Sep', u'Septembre'),
+                  (u'Oct', u'Octobre'), (u'Nov', u'Novembre'),
+                  (u'Dec', u'Decembre')]
+        PERTAIN = [u'de']
+        WEEKDAYS = [(u'Lun', u'Lundi'),
+                    (u'Mar', u'Mardi'),
+                    (u'Mer', u'Mercredi'),
+                    (u'Jeu', u'Jeudi'),
+                    (u'Ven', u'Vendredi'),
+                    (u'Sam', u'Samedi'),
+                    (u'Dim', u'Dimanche')]
+
+    def temporal(stra, strb, granularity=u'days', parserinfo=FrenchParserInfo,
+                 dayfirst=True, yearfirst=False):
+        """ Return the distance between two strings (read as dates).
+
+            ``granularity`` can be either ``days``, ``months`` or ``years``
+            (note the plural form!)
+            ``parserinfo`` defines the language of the dates (it defaults to
+            ``FrenchParserInfo``)
+
+            ``dayfirst`` and ``yearfirst`` are used in case of ambiguity, for
+            instance 09/09/09; by default it is read as day/month/year
+
+            Neither stra nor strb may contain accented characters; clean
+            them beforehand.
+        """
+
+        datea = dateparser.parse(stra, parserinfo=parserinfo(dayfirst,
+                                 yearfirst), fuzzy=True)
+        dateb = dateparser.parse(strb, parserinfo=parserinfo(dayfirst,
+                                 yearfirst), fuzzy=True)
+        diff = datea - dateb
+        if granularity.lower() == 'years':
+            return abs(diff.days/365.25)
+        if granularity.lower() == 'months':
+            return abs(diff.days/30.5)
+        return abs(diff.days)
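+
+# Illustrative doctest (assuming the dateutil package is installed):
+#
+#     >>> temporal(u'14 aout 1986', u'14 juillet 1986', granularity=u'days')
+#     31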
+
+
+###############################################################################
+### GEOGRAPHICAL DISTANCES ####################################################
+###############################################################################
+def geographical(pointa, pointb, in_radians=False, planet_radius=6371009,
+                 units='m'):
+    """ Return the geographical distance between two points.
+
+        Both points must be tuples (latitude, longitude)
+
+        - ``in_radians`` is True if latitude and longitude are given in
+          radians, False otherwise
+        - ``planet_radius`` is the planet's radius in meters. By default,
+          it is the Earth's.
+
+        - `units` can be 'm' (meters) or 'km' (kilometers)
+    """
+    pointa = (float(pointa[0]), float(pointa[1]))
+    pointb = (float(pointb[0]), float(pointb[1]))
+
+    difflat = pointa[0] - pointb[0]
+    difflong = pointa[1] - pointb[1]
+    meanlat = (pointa[0] + pointb[0])/2.0
+
+    if not in_radians:
+        difflat *= pi/180.0
+        difflong *= pi/180.0
+        meanlat *= pi/180.0
+
+    coef = 1. if units == 'm' else 0.001
+    return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
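+
+# Illustrative sketch: the approximate Paris/Lyon distance, in kilometers
+# (coordinates are rounded, so only the order of magnitude is checked):
+#
+#     >>> dist = geographical((48.8567, 2.3508), (45.7640, 4.8357), units='km')
+#     >>> 380 < dist < 400
+#     True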
+
+
+###############################################################################
+### BASE PROCESSING ############################################################
+###############################################################################
+class BaseProcessing(object):
+    """ A processing object used to provide an abstraction over the different
+    distance functions, and help build a Nazca process. """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 distance_callback=euclidean, weight=1, matrix_normalized=False):
+        """ Initiate the BaseProcessing
+
+        Parameters
+        ----------
+
+        ref_attr_index: index of the attribute of interest in a record
+                        for the reference dataset
+                        (i.e. attribute to be used for key computation)
+
+        target_attr_index: index of the attribute of interest in a record
+                           for the target dataset
+                           (i.e. attribute to be used for key computation)
+
+        distance_callback: distance callback. Default is euclidean distance.
+
+        weight: weight of the processing in a global distance matrix
+
+        matrix_normalized: Boolean. If matrix_normalized is True,
+                           the distance between two points is changed to
+                           a value between 0 (equal) and 1 (totally different).
+                           To avoid useless computation and scale
+                           problems the following “normalization” is done:
+                                d = 1 - 1/(1 + d(x, y))
+
+        """
+        self.ref_attr_index = ref_attr_index
+        self.target_attr_index = target_attr_index
+        self.distance_callback = distance_callback
+        self.weight = weight
+        self.matrix_normalized = matrix_normalized
+
+    def distance(self, reference_record, target_record):
+        """ Compute the distance between two records
+
+        Parameters
+        ----------
+        reference_record: a record (tuple/list of values) of the reference dataset.
+
+        target_record: a record (tuple/list of values) of the target dataset.
+
+        """
+        refrecord = (reference_record[self.ref_attr_index] if self.ref_attr_index
+                     else reference_record)
+        targetrecord = (target_record[self.target_attr_index] if self.target_attr_index
+                        else target_record)
+        return self.distance_callback(refrecord, targetrecord)
+
+    def cdist(self, refset, targetset, ref_indexes=None, target_indexes=None):
+        """ Compute the metric matrix, given two datasets and a metric
+
+        Parameters
+        ----------
+        refset: a dataset (list of records)
+
+        targetset: a dataset (list of records)
+
+        Returns
+        -------
+
+        A distance matrix, of shape (len(refset), len(targetset))
+        with the distance of each element in it.
+        """
+        return cdist(self.distance, refset, targetset,
+                     matrix_normalized=self.matrix_normalized,
+                     ref_indexes=ref_indexes, target_indexes=target_indexes)
+
+    def pdist(self, dataset):
+        """ Compute the upper triangular matrix in a way similar
+        to scipy.spatial.metric
+
+        Parameters
+        ----------
+        dataset: a dataset (list of records)
+
+        Returns
+        -------
+
+        The values of the upper triangular distance matrix
+        (of shape (len(dataset), len(dataset)) with the distance of each element in it.
+        The values are sorted as row 1, row2, ...
+        """
+        values = []
+        for i in xrange(len(dataset)):
+            for j in xrange(i+1, len(dataset)):
+                d = 1
+                if dataset[i] and dataset[j]:
+                    d = self.distance(dataset[i], dataset[j])
+                    if self.matrix_normalized:
+                        d = 1 - (1.0/(1.0 + d))
+                values.append(d)
+        return values
+
+
+###############################################################################
+### CONCRETE PROCESSINGS #######################################################
+###############################################################################
+class LevenshteinProcessing(BaseProcessing):
+    """ A processing based on the levenshtein distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 tokenizer=None, weight=1, matrix_normalized=False):
+        distance_callback = partial(levenshtein,
+                                    tokenizer=tokenizer)
+        super(LevenshteinProcessing, self).__init__(ref_attr_index,
+                                                   target_attr_index,
+                                                   distance_callback,
+                                                   weight,matrix_normalized)
+
+
+class GeographicalProcessing(BaseProcessing):
+    """ A processing based on the geographical distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 in_radians=False, planet_radius=6371009, units='m', weight=1, matrix_normalized=False):
+        distance_callback = partial(geographical, in_radians=in_radians,
+                                    planet_radius=planet_radius, units=units)
+        super(GeographicalProcessing, self).__init__(ref_attr_index,
+                                                    target_attr_index,
+                                                    distance_callback,
+                                                    weight,matrix_normalized)
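+
+# Minimal usage sketch for the processings above (illustrative; the records
+# are toy data):
+#
+#     >>> refset = [(u'R1', u'victor hugo'), (u'R2', u'paul verlaine')]
+#     >>> targetset = [(u'T1', u'victor hugo'), (u'T2', u'pol verlaine')]
+#     >>> processing = LevenshteinProcessing(ref_attr_index=1, target_attr_index=1)
+#     >>> processing.distance(refset[0], targetset[0])
+#     0
+#     >>> processing.distance(refset[1], targetset[1])
+#     2
+#     >>> processing.cdist(refset, targetset).shape
+#     (2, 2)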
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/minhashing.py	Thu Dec 19 14:44:58 2013 +0000
@@ -0,0 +1,184 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import cPickle
+
+from random import randint
+from collections import defaultdict
+
+import numpy as np
+from scipy.optimize import bisect
+
+from nazca.utils.normalize import iter_wordgrams
+
+
+def randomhashfunction(zr):
+    """ Return a random hash function, mapping x in Z to ZR
+        h:x -> ax + b mod R
+
+    """
+    bound = max(zr - 1, 1)
+    a = randint(1, bound)
+    b = randint(1, bound)
+
+    def hashfunc(x):
+        return ((a*x + b)%zr)
+
+    return hashfunc
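+
+# Illustrative doctest: the returned function always maps into [0, zr):
+#
+#     >>> h = randomhashfunction(64)
+#     >>> 0 <= h(10) < 64
+#     True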
+
+
+class Minlsh(object):
+    """ Operate minhashing + locally-sensitive-hashing to find similar sentences
+    """
+
+    def __init__(self, verbose=False):
+        self._trained = False
+        self.sigmatrix = None
+        self._verbose = verbose
+
+    def train(self, sentences, k=2, siglen=200):
+        """ Train the minlsh on the given sentences.
+
+            - `k` is the length of the k-wordgrams used
+              (the lower k is, the faster is the training)
+            - `siglen` the length of the sentences signature
+
+        """
+
+        rows, shape = self._buildmatrixdocument(sentences, k)
+
+        if self._verbose: print "Training is done. Wait while signaturing"
+
+        self._computesignaturematrix(rows, shape, siglen)
+        self._trained = True
+
+
+    def _buildmatrixdocument(self, sentences, k):
+        """ Return a sparse matrix where :
+
+            - Each sentence is a column
+            - Each row is a element of the universal set
+
+            Each value (r, c) is set to 1 if the element at row r is in the
+            sentence c, 0 otherwise
+
+        """
+
+        rows, universe, sizeofuniverse = [], {}, 0
+        for nb, sent in enumerate(sentences):
+            row = []
+            for w in iter_wordgrams(sent, k):
+                row.append(universe.setdefault(w, sizeofuniverse))
+                if row[-1] == sizeofuniverse:
+                    sizeofuniverse += 1
+            rows.append(row)
+            if self._verbose and nb % 50000 == 0:
+                print nb
+
+        return rows, (len(rows), sizeofuniverse)
+
+    def _computesignaturematrix(self, rows, shape, siglen):
+        """ Return a matrix where each column is the signature the document
+            The signature is composed of `siglen` numbers
+
+            The more the documents have rows in commun, the closer they are.
+        """
+
+        nrows, ncols = shape
+        sig = np.empty((siglen, nrows))
+        #Generate the random hash functions
+        hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
+        #Compute hashing values just for once.
+        #Avoid multiple recomputations for the same column.
+        hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
+                                for i in  xrange(siglen)])
+
+        docind = 0
+        while rows:
+            doc = rows.pop(0)
+            #Concatenate the needed rows.
+            tmp = np.dstack([hashvalues[:, r] for r in doc])
+            #Take the mininum of hashes
+            sig[:, docind] = np.min(tmp[0], 1)
+            docind += 1
+            if self._verbose and docind % 50000 == 0:
+                print (docind * 100) / nrows
+        self.sigmatrix = sig
+
+    def save(self, savefile):
+        """ Save the training into `savefile` for a future use """
+
+        if not self._trained:
+            print "Not trained, nothing to save"
+            return
+
+        with open(savefile, 'wb') as fobj:
+            pickler = cPickle.Pickler(fobj)
+            pickler.dump(self.sigmatrix)
+
+    def load(self, savefile):
+        """ Load a trained minhashing """
+
+        with open(savefile, 'rb') as fobj:
+            pickler = cPickle.Unpickler(fobj)
+            self.sigmatrix = pickler.load()
+
+        if self.sigmatrix is not None:
+            self._trained = True
+        else:
+            self._trained = False
+
+    def computebandsize(self, threshold, nbrows):
+        """ Compute the bandsize according to the threshold given """
+
+        ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
+        ### bands and r the number of rows per band. The signature length is
+        ### L = nbrows = b*r, so t ~ (r/L)^(1/r). So, let's find the root
+        ### of f(x) = (x/L)^(1/x) - t, with x standing for r.
+        def f(x):
+            y = pow(x/nbrows, 1. /x) - threshold
+            return y
+
+        ## Solve f(x) = 0, with x having values in [1, nbrows]
+        return int(bisect(f, 1, nbrows))
+
+    def predict(self, threshold):
+        """ Return a set of tuples of *possible* similar sentences
+        """
+        if not self._trained:
+            print "Train it before"
+            return
+
+        if not (0 < threshold <= 1):
+            print "Threshold must be in ]0 ; 1]"
+            return
+
+        sig = self.sigmatrix
+        # Treshold is a percent of similarity
+        # It should be inverted here (0 is closed, 1 is far)
+        threshold = 1 - threshold
+        bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0])
+
+        buckets = defaultdict(set)
+        similars = set()
+        for r in xrange(0, sig.shape[0], bandsize):
+            buckets.clear()
+            for i in xrange(sig.shape[1]):
+                buckets[tuple(sig[r:r+bandsize, i])].add(i)
+            similars.update(set(tuple(v) for v in buckets.itervalues()
+                                         if len(v) > 1))
+        return similars
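+
+# Minimal usage sketch (illustrative; the sentences are toy data and the
+# exact result depends on the random hash functions):
+#
+#     >>> sentences = [u'le chat mange la souris',
+#     ...              u'le chat mange la souris grise',
+#     ...              u'un avion traverse le ciel']
+#     >>> minlsh = Minlsh()
+#     >>> minlsh.train(sentences, k=2, siglen=200)
+#     >>> minlsh.predict(0.3)
+#     set([(0, 1)])
+#
+# (sentences 0 and 1 share most of their wordgrams; the exact output may vary
+# with the random hash functions)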
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/ner_dataio.py	Thu Dec 19 14:44:58 2013 +0000
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+""" IO for Named Entities Recognition.
+"""
+import json
+import urllib
+import lxml.etree as ET
+
+
+###############################################################################
+### SPARQL UTILITIES ##########################################################
+###############################################################################
+def sparql_query(query, endpoint):
+    """ Execute a query on an endpoint:
+
+    sparql_query(query=u'''SELECT ?uri ?type
+                           WHERE{
+                           ?uri rdfs:label "Python"@en .
+                           ?uri rdf:type ?type}''',
+                           endpoint=u'http://dbpedia.org/sparql')
+    """
+    from SPARQLWrapper import SPARQLWrapper, JSON
+    sparql = SPARQLWrapper(endpoint)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    try:
+        rawresults = sparql.query().convert()
+        labels = rawresults['head']['vars']
+        return rawresults["results"]["bindings"]
+    except Exception:
+        print 'Error in sparql query'
+        return []
+
+
+###############################################################################
+### RQL UTILITIES #############################################################
+###############################################################################
+def get_cw_cnx(endpoint):
+    """ Get a cnx on a CubicWeb database
+    """
+    from cubicweb import dbapi
+    from cubicweb.cwconfig import CubicWebConfiguration
+    from cubicweb.entities import AnyEntity
+    CubicWebConfiguration.load_cwctl_plugins()
+    config = CubicWebConfiguration.config_for(endpoint)
+    sourceinfo = config.sources()['admin']
+    login = sourceinfo['login']
+    password = sourceinfo['password']
+    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
+    req = cnx.request()
+    return req
+
+def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
+    """ Execute a query on an appid endpoint:
+
+    rql_query('Any X WHERE X label "Python"', 'localhost')
+
+    Additional arguments can be passed to be properly substituted
+    in the execute() function.
+    """
+    if endpoint in _cache_cnx:
+        cnx = _cache_cnx[endpoint]
+    else:
+        cnx = get_cw_cnx(endpoint)
+        _cache_cnx[endpoint] = cnx
+    return cnx.execute(query, kwargs)
+
+def rql_url_query(query, endpoint):
+    """ Execute a query on an url endpoint:
+
+    rql_query('Any X WHERE X label "Python"', 'localhost')
+    """
+    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
+    return json.loads(urllib.urlopen(url).read())
+
+
+###############################################################################
+### OUTPUT UTILITIES ##########################################################
+###############################################################################
+class AbstractNerdyPrettyPrint(object):
+    """ Pretty print the output of a Nerdy process
+    """
+
+    def pprint_text(self, text, named_entities, **kwargs):
+        newtext = u''
+        indice = 0
+        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
+        while indice < len(text):
+            if indice in tindices:
+                uri, t = tindices[indice]
+                words = text[t.start:t.end]
+                fragment = self.pprint_entity(uri, words, **kwargs)
+                if not self.is_valid(newtext+fragment+text[t.end:]):
+                    fragment = words
+                newtext += fragment
+                indice = t.end
+            else:
+                newtext += text[indice]
+                indice += 1
+        return newtext
+
+    def pprint_entity(self, uri, word, **kwargs):
+        """ Pretty print an entity """
+        raise NotImplementedError
+
+    def is_valid(self, newtext):
+        """Override to check the validity of the prettified content at each
+        enrichement step"""
+        return True
+
+
+class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
+    """ Pretty print the output of a Nerdy process
+    """
+
+    def pprint_entity(self, uri, word, **kwargs):
+        """ Pretty print an entity """
+        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
+        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
+
+
+class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
+
+    XHTML_DOC_TEMPLATE = '''\
+<?xml version="1.0" encoding="UTF-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
+<title>nerdy</title>
+</head>
+<body><div>%s</div></body>
+</html>'''
+
+    def is_valid(self, html):
+        try:
+            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
+                          parser=ET.XMLParser(dtd_validation=True))
+        except ET.XMLSyntaxError:
+            return False
+        return True
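+
+# Minimal usage sketch (illustrative; the URI is hypothetical):
+#
+#     >>> pprinter = NerdyHTMLPrettyPrint()
+#     >>> pprinter.pprint_entity(u'http://example.org/VictorHugo', u'Victor Hugo',
+#     ...                        html_class=u'ner')
+#     u'<a href="http://example.org/VictorHugo" class="ner">Victor Hugo</a>'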
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/normalize.py	Thu Dec 19 14:44:58 2013 +0000
@@ -0,0 +1,415 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from string import punctuation
+from warnings import warn
+from unicodedata import normalize as _uninormalize
+from functools import partial
+
+
+FRENCH_STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
+u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
+u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
+u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
+u'elle', u'elles', u'en', u'encore', u'essai', u'est', u'et', u'eu', u'eux', u'fait',
+u'faites', u'fois', u'font', u'force', u'haut', u'hors', u'ici', u'il', u'ils',
+u'je', u'juste', u'la', u'le', u'les', u'leur', u'lui', u'là', u'ma', u'maintenant',
+u'mais', u'me', u'mes', u'moi', u'moins', u'mon', u'mot', u'même', u'ne',
+u'ni', u'nommés', u'nos',
+u'notre', u'nous', u'nouveaux', u'on', u'ou', u'où', u'par', u'parce', u'parole',
+u'pas', u'personnes', u'peut', u'peu', u'pièce', u'plupart', u'pour',
+u'pourquoi', u'quand', u'que', u'quel', u'quelle', u'quelles', u'quels', u'qui',
+u'sa', u'sans', u'se', u'ses', u'seulement', u'si', u'sien', u'son', u'sont', u'sous',
+u'soyez', u'sujet', u'sur', u'ta', u'tandis', u'tellement', u'te', u'tels', u'tes', u'toi',
+u'ton', u'tous', u'tout', u'trop', u'très', u'tu', u'un', u'une', u'valeur', u'voie',
+u'voient', u'vont', u'vos', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
+u'étions', u'été', u'être'])
+
+MANUAL_UNICODE_MAP = {
+    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
+    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
+    u'\u2044': u'/',  # FRACTION SLASH
+    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
+    u'\xa9': u'(c)',  # COPYRIGHT SIGN
+    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
+    u'\xae': u'(r)',  # REGISTERED SIGN
+    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
+    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
+    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
+    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
+    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
+    }
+
+
+###############################################################################
+### NORMALIZE FUNCTIONS #######################################################
+###############################################################################
+def unormalize(ustring, substitute=None):
+    """replace diacritical characters with their corresponding ascii characters
+
+    Convert the unicode string to its long normalized form (a unicode character
+    may be transformed into several characters) and keep the first one only.
+    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
+    replace all compatibility characters with their equivalents.
+
+    :type substitute: str
+    :param substitute: replacement character to use if decomposition fails
+
+    :see: Another project about ASCII transliterations of Unicode text
+          http://pypi.python.org/pypi/Unidecode
+    """
+    res = []
+    for letter in ustring[:]:
+        try:
+            replacement = MANUAL_UNICODE_MAP[letter]
+        except KeyError:
+            if isinstance(letter, unicode):
+                replacement = _uninormalize('NFKD', letter)[0]
+            else:
+                replacement = letter
+            if ord(replacement) >= 2 ** 7:
+                if substitute is None:
+                    raise ValueError("can't deal with non-ascii based characters")
+                replacement = substitute
+        res.append(replacement)
+    return u''.join(res)
+
+def lunormalize(sentence, substitute=None):
+    """ Normalize a sentence (ie remove accents, set to lower, etc) """
+    return unormalize(sentence,substitute).lower()
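+
+# Illustrative doctest:
+#
+#     >>> unormalize(u'Œuvre')
+#     u'OEuvre'
+#     >>> lunormalize(u'Éléonore')
+#     u'eleonore'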
+
+def simplify(sentence, lemmas=None, remove_stopwords=True, stopwords=FRENCH_STOPWORDS):
+    """ Simply the given sentence
+        0) If remove_stopwords, then remove the stop words
+        1) If lemmas are given, the sentence is lemmatized
+        2) Set the sentence to lower case
+        3) Remove punctuation
+    """
+    if not isinstance(sentence, basestring):
+        return sentence
+
+    if lemmas:
+        sentence = lemmatized(sentence, lemmas)
+    sentence = sentence.lower()
+    cleansent = ''.join([s if s not in punctuation
+                           else ' ' for s in sentence]).strip()
+    # punctuation followed by a space leaves double spaces; keep only one
+    cleansent = cleansent.replace('  ', ' ')
+
+    if not remove_stopwords:
+        return cleansent
+    else:
+        return ' '.join([w for w in cleansent.split(' ') if w not in stopwords])
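+
+# Illustrative doctest (with the default french stopwords):
+#
+#     >>> simplify(u'Le chat mange la souris.')
+#     u'chat mange souris'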
+
+def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
+    """ Tokenize a sentence.
+        Use ``tokenizer`` if given (it must provide a ``tokenize()`` method);
+        otherwise split the sentence with the given ``regexp``
+        (non-whitespace chunks by default), splitting on apostrophes while
+        keeping them attached to the preceding chunk.
+    """
+    if tokenizer:
+        return tokenizer().tokenize(sentence)
+    # XXX Unicode: could not use the nltk WordPunctTokenizer.
+    # Instead split on whitespaces
+    chunks = []
+    for chunk in [t for t in regexp.findall(sentence) if t]:
+        # Deals with '
+        if "'" in chunk:
+            schunks = chunk.split("'")
+            chunks.extend([c+"'" for c in schunks[:-1]])
+            chunks.append(schunks[-1])
+        else:
+            chunks.append(chunk)
+    return chunks
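+
+# Illustrative doctest (default whitespace regexp, with apostrophe handling):
+#
+#     >>> tokenize(u"l'amour fou")
+#     [u"l'", u'amour', u'fou']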
+
+def iter_wordgrams(sentence, k):
+    """ Generator of k-wordgrams on the given sentence
+    """
+    words = sentence.split(' ')
+    #XXX Call tokenizer
+    for r in xrange(len(words)):
+        yield ' '.join(words[r:r + k])
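+
+# Illustrative doctest:
+#
+#     >>> list(iter_wordgrams(u'le petit chat', 2))
+#     [u'le petit', u'petit chat', u'chat']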
+
+def loadlemmas(filename, encoding='utf-8'):
+    """ Return the default lemmas dictionnary
+    """
+    lemmas = {}
+    with open(filename) as fobj:
+        for line in fobj:
+            line = line.decode(encoding).strip().split('\t')
+            if len(line) == 2:
+                lemmas[line[0]] = line[1]
+    return lemmas
+
+def lemmatized(sentence, lemmas, tokenizer=None):
+    """ Return the lemmatized sentence
+    """
+    tokenized_sent = tokenize(sentence, tokenizer)
+    tokenized_sentformated = []
+    for w in tokenized_sent:
+        if w in ".,'" and len(tokenized_sentformated) > 0:
+            tokenized_sentformated[-1] += w
+        elif w not in punctuation:
+            tokenized_sentformated.append(w)
+    return u' '.join([lemmatized_word(w, lemmas) for w in tokenized_sentformated])
+
+def lemmatized_word(word, lemmas):
+    """ Return the lemmatized word
+    """
+    lemma = lemmas.get(word.lower(), word)
+    if '|' in lemma:
+        _words = lemma.split('|')
+        if word.lower() in _words:
+            lemma = word.lower()
+        else:
+            lemma = _words[0]
+    return lemma
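+
+# Illustrative doctest (the lemmas dictionary below is a toy example; a real
+# one is usually built with ``loadlemmas()``):
+#
+#     >>> lemmas = {u'chevaux': u'cheval'}
+#     >>> lemmatized_word(u'chevaux', lemmas)
+#     u'cheval'
+#     >>> lemmatized(u'les chevaux', lemmas)
+#     u'les cheval'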
+
+def roundstr(number, ndigits=0):
+    """Return an unicode string of ``number`` rounded to a given precision
+        in decimal digits (default 0 digits)
+
+        If ``number`` is not a float, this method casts it to a float. (An
+        exception may be raised if it's not possible)
+    """
+    return format(round(float(number), ndigits), '0.%df' % ndigits)
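+
+# Illustrative doctest:
+#
+#     >>> roundstr('3.1459', 2)
+#     '3.15'
+#     >>> roundstr(14.2)
+#     '14'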
+
+def rgxformat(string, regexp, output):
+    """ Apply the regexp to the ``string`` and return a formatted string
+    according to ``output``
+
+    e.g.:
+        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
+                  r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
+                  r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
+                  u'%(lastname)s, %(firstname)s (%(birthdate)s - '
+                  u'%(deathdate)s)')
+
+    would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
+    """
+
+    match = re.match(regexp, string)
+    return output % match.groupdict()
+
+
+###############################################################################
+### NORMALIZER OBJECTS ########################################################
+###############################################################################
+class BaseNormalizer(object):
+    """ A normalizer object used to provide an abstraction over the different
+    normalization functions, and help build a Nazca process. """
+
+    def __init__(self, callback, attr_index=None):
+        """ Initiate the BaseNormalizer
+
+        Parameters
+        ----------
+        callback: normalization callback
+
+        attr_index: index of the attribute of interest in a record
+                    (i.e. attribute to be normalized).
+                    By default, 'attr_index' is None and the whole
+                    record is passed to the callback.
+                    If given, only the attr_index value(s) of the record
+                    are passed to the callback.
+                    Can be a list/tuple of ints or a single int
+        """
+        self.callback = callback
+        if attr_index is not None:  # handle attr_index == 0 as a valid index
+            self.attr_index = attr_index if isinstance(attr_index, (tuple, list)) else (attr_index,)
+        else:
+            self.attr_index = attr_index
+
+    def normalize(self, record):
+        """ Normalize a record
+
+        Parameters
+        ----------
+        record: a record (tuple/list of values).
+
+        Returns
+        -------
+
+        record: the normalized record.
+        """
+        if not self.attr_index:
+            return self.callback(record)
+        else:
+            for attr_ind in self.attr_index:
+                record = list(r if ind != attr_ind else self.callback(r)
+                               for ind, r in enumerate(record))
+            return record
+
+    def normalize_dataset(self, dataset, inplace=False):
+        """ Normalize a dataset
+
+        Parameters
+        ----------
+        dataset: a list of record (tuple/list of values).
+
+        inplace: Boolean. If True, normalize the dataset in place.
+
+        Returns
+        -------
+
+        record: the normalized dataset.
+        """
+        if not inplace:
+            dataset = [self.normalize(record) for record in dataset]
+        else:
+            # Change dataset in place
+            for ind, record in enumerate(dataset):
+                dataset[ind] = self.normalize(record)
+        return dataset
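+
+# Minimal usage sketch (illustrative): applying a callback to one attribute of
+# each record.
+#
+#     >>> normalizer = BaseNormalizer(lunormalize, attr_index=1)
+#     >>> normalizer.normalize((1, u'Victor Hugo'))
+#     [1, u'victor hugo']
+#     >>> normalizer.normalize_dataset([(1, u'Victor Hugo'), (2, u'Émile Zola')])
+#     [[1, u'victor hugo'], [2, u'emile zola']]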
+
+
+class UnicodeNormalizer(BaseNormalizer):
+    """ Normalizer that unormalize the unicode
+    (i.e. replace accentuating characters by ASCII ones)
+    """
+    def __init__(self, attr_index=None, substitute=None):
+        callback = partial(lunormalize, substitute=substitute)
+        super(UnicodeNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class SimplifyNormalizer(BaseNormalizer):
+    """ Normalizer that simplify a string
+        0) If remove_stopwords, then remove the stop words
+        1) If lemmas are given, the sentence is lemmatized
+        2) Set the sentence to lower case
+        3) Remove punctuation
+    """
+    def __init__(self, attr_index=None, lemmas=None, remove_stopwords=True):
+        callback = partial(simplify, lemmas=lemmas, remove_stopwords=remove_stopwords)
+        super(SimplifyNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class TokenizerNormalizer(BaseNormalizer):
+    """ Normalizer that tokenize a string
+        Use ``tokenizer`` if given, else try to use the nltk WordPunctTokenizer,
+        in case of failure, it just split on spaces.
+        Anyway, tokenizer must have a ``tokenize()`` method
+    """
+    def __init__(self, attr_index=None, tokenizer=None, regexp=re.compile(r"[^\s]+")):
+        callback = partial(tokenize, tokenizer=tokenizer, regexp=regexp)
+        super(TokenizerNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class LemmatizerNormalizer(BaseNormalizer):
+    """ Normalizer that lemmatize a string
+    """
+    def __init__(self, lemmas, attr_index=None, tokenizer=None):
+        callback = partial(lemmatized, lemmas=lemmas, tokenizer=tokenizer)
+        super(LemmatizerNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class RoundNormalizer(BaseNormalizer):
+    """Normalizer that round a string
+    Return an unicode string of ``number`` rounded to a given precision
+    in decimal digits (default 0 digits)
+
+    If ``number`` is not a float, this method casts it to a float. (An
+    exception may be raised if it's not possible)
+    """
+    def __init__(self, attr_index=None, ndigits=0):
+        callback = partial(roundstr, ndigits=ndigits)
+        super(RoundNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class RegexpNormalizer(BaseNormalizer):
+    """Normalizer that normalize a string based on a regexp
+
+     Apply the regexp to the ``string`` and return a formatted string
+    according to ``output``
+
+    eg :
+        format(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
+               r'\[(?P<firstname>\w+) (?p<lastname>\w+) - '
+               r'(?P<birthdate>.*?) / (?<deathdate>.*?)\]',
+               u'%(lastname)s, %(firstname)s (%(birthdate)s -'
+               u'%(deathdate)s)')
+
+     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
+    """
+    def __init__(self, regexp, output, attr_index=None):
+        callback = partial(rgxformat, regexp=regexp, output=output)
+        super(RegexpNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+###############################################################################
+### JOIN NORMALIZER ###########################################################
+###############################################################################
+class JoinNormalizer(BaseNormalizer):
+    """Normalizer that join multiple fields in only one.
+    This new field will be put at the end of the new record.
+    """
+    def __init__(self, attr_indexes, join_car=', '):
+        self.attr_indexes = attr_indexes
+        self.join_car = join_car
+
+    def normalize(self, record):
+        """ Normalize a record
+
+        Parameters
+        ----------
+        record: a record (tuple/list of values).
+
+        Returns
+        -------
+
+        record: the normalized record.
+        """
+        _record = [r for ind, r in enumerate(record) if ind not in self.attr_indexes]
+        _record.append(self.join_car.join([r for ind, r in enumerate(record) if ind in self.attr_indexes]))
+        return _record
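+
+# Illustrative doctest: fields 1 and 2 are joined and appended at the end.
+#
+#     >>> JoinNormalizer((1, 2)).normalize([u'42', u'Victor', u'Hugo'])
+#     [u'42', u'Victor, Hugo']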
+
+
+###############################################################################
+### NORMALIZER PIPELINE #######################################################
+###############################################################################
+class NormalizerPipeline(BaseNormalizer):
+    """ Pipeline of Normalizers
+    """
+
+    def __init__(self, normalizers):
+        """ Initiate the NormalizerPipeline
+
+        Parameters
+        ----------
+        normalizers: list (ordered) of Normalizer
+        """
+        self.normalizers = normalizers
+
+    def normalize(self, record):
+        """ Normalize a record
+
+        Parameters
+        ----------
+        record: a record (tuple/list of values).
+
+        Returns
+        -------
+
+        record: the normalized record.
+        """
+        for normalizer in self.normalizers:
+            record = normalizer.normalize(record)
+        return record
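+
+# Minimal usage sketch (illustrative): chaining two normalizers on attribute 1.
+#
+#     >>> pipeline = NormalizerPipeline([UnicodeNormalizer(attr_index=1),
+#     ...                                SimplifyNormalizer(attr_index=1)])
+#     >>> pipeline.normalize((1, u'Le Chat noir !'))
+#     [1, u'chat noir']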