[refactor] Refactor normalize module and objectify it, see #182001
author Vincent Michel <vincent.michel@logilab.fr>
date Mon, 14 Oct 2013 16:29:16 +0000
changeset 301 50a25080aa33
parent 263 226adc04beff
child 302 8170c858e303
[refactor] Refactor normalize module and objectify it, see #182001

Refactor the distance tests into a new file;
Refactor the findneighbours techniques into Blockings;
Refactor matrix computation and Processings, see #182001;
Define a BaseAligner class with an align() method;
Define an abstract BaseBlocking class with a fit() method, for blocking methods;
Define some concrete blocking methods, e.g. SortedNeighborhood blocking;
Update tests;
Add DeprecationWarning and move the API compatibility tests into a single file;
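As a reading aid, a minimal sketch of how the refactored, object-oriented API introduced by this changeset fits together. The record layout (identifier in column 0, compared attribute in column 1), the toy data and the module paths are assumptions made for illustration, not part of the changeset:

    from nazca.distances import LevenshteinProcessing
    from nazca.aligner import BaseAligner

    refset = [['R1', 'paris'], ['R2', 'london']]      # toy reference records
    targetset = [['T1', 'pariss'], ['T2', 'berlin']]  # toy target records

    # One processing per attribute to compare; several can be combined
    processings = (LevenshteinProcessing(ref_attr_index=1, target_attr_index=1),)
    aligner = BaseAligner(threshold=2, processings=processings)
    # An optional blocking object can be registered with aligner.register_blocking()
    for refid, targetid in aligner.get_aligned_pairs(refset, targetset):
        print refid, targetid    # e.g. R1 T1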
aligner.py
blocking.py
dataio.py
demo.py
distances.py
matrix.py
normalize.py
old_api.py
test/test_alignment.py
test/test_blocking.py
test/test_dataio.py
test/test_distances.py
test/test_minhashing.py
test/test_normalize.py
test/test_old_api.py
--- a/aligner.py	Thu Apr 25 16:09:39 2013 +0200
+++ b/aligner.py	Mon Oct 14 16:29:16 2013 +0000
@@ -15,375 +15,114 @@
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
-from os import listdir
-import os.path as osp
-from shutil import rmtree
-from tempfile import mkdtemp
-import sys
+from collections import defaultdict
 
-from scipy.spatial import KDTree
+from scipy import zeros
 from scipy.sparse import lil_matrix
 
-from nazca.minhashing import Minlsh
-from nazca.dataio import write_results, split_file, parsefile
-import nazca.matrix as m
+from nazca.normalize import NormalizerPipeline
-
 
-def normalize_set(rset, treatments):
-    """ Apply all the normalization functions to the given rset """
-    normalized_set = []
-    for row in rset:
-        row = list(row)
-        normalized_set.append(row)
-        for ind, attribut in enumerate(row):
-            treat = treatments.get(ind)
-            if not attribut or not treat:
-                continue
-            for f in treat.get('normalization', []):
-                farg = f.func_code.co_varnames #List of the arguments of f
-                # A kind of union between the arguments needed by f, and the
-                # provided ones
-                givenargs = dict((arg, treat['norm_params'][arg])
-                                 for arg in farg if arg in treat.get('norm_params', []))
-                attribut = f(attribut, **givenargs)
-            row[ind] = attribut
-    return normalized_set
+###############################################################################
+### ALIGNER OBJECTS ###########################################################
+###############################################################################
+class BaseAligner(object):
+
+    def __init__(self, threshold, processings):
+        self.threshold = threshold
+        self.processings = processings
+        self.normalizers = None
+        self.blocking = None
 
-def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
-    """ Find the neigbhours using kdree
-    """
-    #If an element is None (missing), use instead the identity element.
-    #The identity element is defined as the 0-vector
-    firstelement = alignset[0][indexes[0]]
-    idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
-    idelement = (0,) * idsize
-    # KDTree is expecting a two-dimensional array
-    if idsize == 1:
-        aligntree  = KDTree([(elt[indexes[0]],) or idelement for elt in alignset])
-        targettree = KDTree([(elt[indexes[1]],) or idelement for elt in targetset])
-    else:
-        aligntree  = KDTree([elt[indexes[0]] or idelement for elt in alignset])
-        targettree = KDTree([elt[indexes[1]] or idelement for elt in targetset])
-    extraneighbours = aligntree.query_ball_tree(targettree, threshold)
-    neighbours = []
-    for ind in xrange(len(alignset)):
-        if not extraneighbours[ind]:
-            continue
-        neighbours.append([[ind], extraneighbours[ind]])
-    return neighbours
+    def register_normalizers(self, normalizers):
+        """ Register normalizers to be applied
+        before alignment """
+        self.normalizers = normalizers
 
-def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
-                              kwordsgram=1, siglen=200):
-    """ Find the neigbhours using minhashing
-    """
-    #If an element is None (missing), use instead the identity element.
-    idelement = ''
-    minhasher = Minlsh()
-    minhasher.train([elt[indexes[0]] or idelement for elt in alignset] +
-                    [elt[indexes[1]] or idelement for elt in targetset],
-                    kwordsgram, siglen)
-    rawneighbours = minhasher.predict(threshold)
-    neighbours = []
-    for data in rawneighbours:
-        neighbours.append([[], []])
-        for i in data:
-            if i >= len(alignset):
-                neighbours[-1][1].append(i - len(alignset))
-            else:
-                neighbours[-1][0].append(i)
-        if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
-            neighbours.pop()
-    return neighbours
+    def apply_normalization(self, dataset):
+        if self.normalizers:
+            norm_pipeline = NormalizerPipeline(self.normalizers)
+            return norm_pipeline.normalize_dataset(dataset)
+        return dataset
 
-def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
-                              mode='kmeans', n_clusters=None):
-    """ Find the neigbhours using clustering (kmeans or minibatchkmeans)
-    """
-    from sklearn import cluster
-    #If an element is None (missing), use instead the identity element.
-    #The identity element is defined as the 0-vector
-    idelement = tuple([0 for _ in xrange(len(alignset[0][indexes[0]]))])
-    # We assume here that there are at least 2 elements in the alignset
-    n_clusters = n_clusters or (len(alignset)/10 or len(alignset)/2)
-
-    if mode == 'kmeans':
-        kmeans = cluster.KMeans(n_clusters=n_clusters)
-    else:
-        kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters)
-
-    kmeans.fit([elt[indexes[0]] or idelement for elt in alignset])
-    predicted = kmeans.predict([elt[indexes[1]] or idelement for elt in targetset])
+    def register_blocking(self, blocking):
+        self.blocking = blocking
 
-    neighbours = [[[], []] for _ in xrange(kmeans.n_clusters)]
-    for ind, i in enumerate(predicted):
-        neighbours[i][1].append(ind)
-    for ind, i in enumerate(kmeans.labels_):
-        neighbours[i][0].append(ind)
-    #XXX: Check all lists have one element at least ?
-    return neighbours
-
-def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
-                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
-    """ This function helps to find neighbours from items of alignset and
-        targetset. “Neighbours” are items that are “not so far”, ie having a
-        close label, are located in the same area etc.
-
-        This function handles two types of neighbouring : text and numeric.
-        For text value, you have to use the “minhashing” and for numeric, you
-        can choose from “kdtree“, “kdmeans“ and “minibatch”
-
-        The arguments to give are :
-            - `alignset` and `targetset` are the sets where neighbours have to
-              be found.
-            - `indexes` are the location of items to compare
-            - `mode` is the search type to use
-            - `neighbours_threshold` is the `mode` neighbours_threshold
-
-            - `n_clusters` is used for "kmeans" and "minibatch" methods, and it
-              is the number of clusters to use.
-
-            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
-              is the length of wordsgrams to use, and `siglen` is the length of
-              the minhashing signature matrix.
-
-        return a list of lists, built as the following :
-            [
-                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
-                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
-                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
-                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
-                ...
-            ]
-    """
-
-    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
-    mode = mode.lower()
+    def compute_distance_matrix(self, refset, targetset,
+                                ref_indexes, target_indexes):
+        """ Compute and return the global alignment matrix.
+        For each `processing` a `Distancematrix` is built, then all the
+        matrices are summed with their own weighting and the result is the global
+        alignment matrix, which is returned.
+        """
+        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
+        for processing in self.processings:
+            distmatrix += processing.cdist(refset, targetset,
+                                          ref_indexes, target_indexes)
+        return distmatrix
 
-    if mode not in SEARCHERS:
-        raise NotImplementedError('Unknown mode given')
-    if mode == 'kdtree':
-        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
-    elif mode == 'minhashing':
-        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
-                                         kwordsgram, siglen)
-    elif mode in set(['kmeans', 'minibatch']):
-        try:
-            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
-        except:
-            raise NotImplementedError('Scikit learn does not seem to be installed')
-
-def align(alignset, targetset, threshold, treatments=None, resultfile=None,
-          _applyNormalization=True):
-    """ Try to align the items of alignset onto targetset's ones
-
-        `alignset` and `targetset` are the sets to align. Each set contains
-        lists where the first column is the identifier of the item, and the others are
-        the attributs to align. (Note that the order is important !) Both must
-        have the same number of columns.
-
-        `treatments` is a dictionary of dictionaries.
-        Each key is the indice of the row, and each value is a dictionary
-        that contains the treatments to do on the different attributs.
-        Each dictionary is built as the following:
-
-            treatment = {'normalization': [f1, f2, f3],
-                         'norm_params': {'arg1': arg01, 'arg2': arg02},
-                         'metric': d1,
-                         'metric_params': {'arg1': arg11},
-                         'weighting': w,
-                         'matrix_normalize': True
-                        }
-
-            `normalization` is the list of functions called to normalize the
-            given attribut (in order). Each functions is called with `norm_params`
-            as arguments
-
-            Idem for `distance` and `distance_args`
-
-            `weighting` is the weighting for the current attribut in regard to
-            the others
-
-            `resultfile` (default is None). Write the matched elements in a file.
+    def threshold_matched(self, distmatrix):
+        """ Return the matched elements within a dictionnary,
+        each key being the indice from X, and the corresponding
+        values being a list of couple (indice from Y, distance)
+        """
+        match = defaultdict(list)
+        # if normalized:
+        #     distmatrix /= distmatrix.max()
+        ind = (distmatrix <= self.threshold).nonzero()
+        indrow = ind[0].tolist()
+        indcol = ind[1].tolist()
+        for (i, j) in zip(indrow, indcol):
+            match[i].append((j, distmatrix[i, j]))
+        return match
 
-        Return the distance matrix and the matched list.
-    """
-    treatments = treatments or {}
-
-    if _applyNormalization:
-        ralignset = normalize_set(alignset, treatments)
-        rtargetset = normalize_set(targetset, treatments)
-    else:
-        ralignset = alignset
-        rtargetset = targetset
-
-    items = []
-    for ind, tr in treatments.iteritems():
-        items.append((tr.get('weighting', 1),
-                      [ralignset[i][ind] for i in xrange(len(ralignset))],
-                      [rtargetset[i][ind] for i in xrange(len(rtargetset))],
-                      tr['metric'],
-                      tr.get('matrix_normalized', True),
-                      tr.get('metric_params', {})))
-
-    mat = m.globalalignmentmatrix(items)
-    matched = m.matched(mat, threshold)
-
-    # Write file if asked
-    if resultfile:
-        write_results(matched, alignset, targetset, resultfile)
-
-    return mat, matched
-
-def subalign(alignset, targetset, alignind, targetind, threshold,
-             treatments=None, _applyNormalization=True):
-    """ Compute a subalignment for a list of indices of the alignset and
-    a list of indices for the targetset """
-    mat, matched = align([alignset[i] for i in alignind],
-                         [targetset[i] for i in targetind], threshold,
-                         treatments, _applyNormalization=_applyNormalization)
-    new_matched = {}
-    for k, values in matched.iteritems():
-        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
-    return mat, new_matched
-
-
-def conquer_and_divide_alignment(alignset, targetset, threshold, treatments=None,
-                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
-                                 n_clusters=None, kwordsgram=1, siglen=200,
-                                 get_global_mat=True):
-    """ Full conquer and divide method for alignment.
-    Compute neighbours and merge the different subalignments.
-    """
-    global_matched = {}
-    if get_global_mat:
-        global_mat = lil_matrix((len(alignset), len(targetset)))
-
-    treatments = treatments or {}
-    ralignset = normalize_set(alignset, treatments)
-    rtargetset = normalize_set(targetset, treatments)
-
-    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
-                                              neighbours_threshold, n_clusters,
-                                              kwordsgram, siglen):
-        _, matched = subalign(alignset, targetset, alignind, targetind,
-                                threshold, treatments, _applyNormalization=False)
+    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
+        # Use all indexes if none are given
+        ref_indexes = ref_indexes or xrange(len(refset))
+        target_indexes = target_indexes or xrange(len(targetset))
+        # Apply alignments
+        mat = self.compute_distance_matrix(refset, targetset,
+                                           ref_indexes=ref_indexes,
+                                           target_indexes=target_indexes)
+        matched = self.threshold_matched(mat)
+        # Reapply matched to global indexes
+        new_matched = {}
         for k, values in matched.iteritems():
-            subdict = global_matched.setdefault(k, set())
-            for v, d in values:
-                subdict.add((v, d))
-                # XXX avoid issue in sparse matrix
-                if get_global_mat:
-                    global_mat[k, v] = d or 10**(-10)
-    if get_global_mat:
-        return global_mat, global_matched
-    return global_matched
-
-def alignall(alignset, targetset, threshold, treatments=None,
-             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
-             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
-
-    if not mode:
-        _, matched = align(alignset, targetset, threshold, treatments,
-                           resultfile=None, _applyNormalization=True)
-    else:
-        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
-                                               treatments, indexes, mode,
-                                               neighbours_threshold, n_clusters,
-                                               kwordsgram, siglen,
-                                               get_global_mat=False)
+            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
+        return mat, new_matched
 
-    if not uniq:
-        for alignid in matched:
-            for targetid, _ in matched[alignid]:
-                yield alignset[alignid][0], targetset[targetid][0]
-    else:
-        for alignid in matched:
-            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
-            yield alignset[alignid][0], targetset[bestid][0]
-
-def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
-                       threshold, size=10000, equality_threshold=0.01,
-                       treatments=None, indexes=(1,1), mode='kdtree',
-                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
-                       siglen=200, cache=None):
-
-    """ This function helps you to align *huge* files.
-        It takes your csv files as arguments and split them into smaller ones
-        (files of `size` lines), and runs the alignment on those files.
-
-        `alignformat` and `targetformat` are keyworded arguments given to the
-        nazca.dataio.parsefile function.
-
-        This function returns its own cache. The cache is quite simply a
-        dictionary having align items' id as keys and tuples (target item's id,
-        distance) as value. This dictionary can be regiven to this function to
-        perform another alignment (with different parameters, or just to be
-        sure everything has been caught)
-
-        If the distance of an alignment is below `equality_threshold`, the
-        alignment is considered as perfect, and the corresponding item is
-        removed from the alignset (to speed up the computation).
-    """
-
-    #Split the huge files into smaller ones
-    aligndir = mkdtemp()
-    targetdir = mkdtemp()
-    alignfiles = split_file(alignfile, aligndir, size)
-    targetfiles = split_file(targetfile, targetdir, size)
-
-    #Compute the number of iterations that must be done to achieve the alignement
-    nb_iterations = len(alignfiles) * len(targetfiles)
-    current_it = 0
-
-    cache = cache or {} #Contains the better known alignements
-    #Contains the id of perfectly aligned data
-    doneids = set(_id for _id, (_, dist) in cache.iteritems()
-                          if dist < equality_threshold)
+    def align(self, refset, targetset, get_matrix=True):
+        """ Perform the alignment on the referenceset
+        and the targetset
+        """
+        refset = self.apply_normalization(refset)
+        targetset = self.apply_normalization(targetset)
+        # If no blocking
+        if not self.blocking:
+            return self._get_match(refset, targetset)
+        # Blocking == conquer_and_divide
+        global_matched = {}
+        global_mat = lil_matrix((len(refset), len(targetset)))
+        self.blocking.fit(refset, targetset)
+        for refblock, targetblock in self.blocking.iter_blocks():
+            _, matched = self._get_match(refset, targetset, refblock, targetblock)
+            for k, values in matched.iteritems():
+                subdict = global_matched.setdefault(k, set())
+                for v, d in values:
+                    subdict.add((v, d))
+                    # XXX avoid issue in sparse matrix
+                    if get_matrix:
+                        global_mat[k, v] = d or 10**(-10)
+        return global_mat, global_matched
 
-    try:
-        for alignfile in alignfiles:
-            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
-                        if a[0] not in doneids]
-            for targetfile in targetfiles:
-                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
-                matched = conquer_and_divide_alignment(alignset, targetset,
-                                                       threshold,
-                                                       treatments=treatments,
-                                                       indexes=indexes,
-                                                       mode=mode,
-                                                       neighbours_threshold=neighbours_threshold,
-                                                       n_clusters=n_clusters,
-                                                       kwordsgram=kwordsgram,
-                                                       siglen=siglen,
-                                                       get_global_mat=False)
-                for alignid in matched:
-                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
-                    #Get the better known distance
-                    _, current_dist = cache.get(alignset[alignid][0], (None, None))
-                    if current_dist is None or current_dist > dist:
-                        #If it's better, update the cache
-                        cache[alignset[alignid][0]] = (targetset[bestid][0], dist)
-                        if dist <= equality_threshold:
-                            #If perfect, stop trying to align this one
-                            doneids.add(alignset[alignid][0])
-
-                current_it += 1
-                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
-                                                nb_iterations))
-                sys.stdout.flush()
-                if doneids:
-                    alignset = [a for a in alignset if a[0] not in doneids]
-                if not alignset: #All items have been aligned
-                    #TODO Increment current_it.
-                    #The progress of the alignment process is computed with
-                    #`current_it`. If all items of `alignset` are aligned, we
-                    #stop the alignment process for this `alignset`. If
-                    #`current_it` isn’t incremented, the progress shown will be
-                    #false.
-                    break
-
-    finally:
-        rmtree(aligndir)
-        rmtree(targetdir)
-
-    return cache
+    def get_aligned_pairs(self, refset, targetset, unique=True):
+        """ Get the pairs of aligned elements
+        """
+        global_mat, global_matched = self.align(refset, targetset, False)
+        if unique:
+            for refid in global_matched:
+                bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
+                yield refset[refid][0], targetset[bestid][0]
+        else:
+            for refid in global_matched:
+                for targetid, _ in global_matched[refid]:
+                    yield refset[refid][0], targetset[targetid][0]
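When a blocking object is registered, align() follows the former conquer-and-divide path: blocks are computed first, and only records within the same block are compared. A hedged sketch with toy numeric data (data, thresholds and module paths are illustrative assumptions):

    from nazca.distances import BaseProcessing       # euclidean distance by default
    from nazca.blocking import KdTreeBlocking
    from nazca.aligner import BaseAligner

    refset = [['R1', 1.0], ['R2', 5.0]]
    targetset = [['T1', 1.1], ['T2', 9.0]]

    aligner = BaseAligner(threshold=0.5,
                          processings=(BaseProcessing(ref_attr_index=1,
                                                      target_attr_index=1),))
    aligner.register_blocking(KdTreeBlocking(ref_attr_index=1,
                                             target_attr_index=1,
                                             threshold=0.5))
    matrix, matched = aligner.align(refset, targetset)
    print matched    # {0: set([(0, 0.1...)])}, i.e. R1 matched to T1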
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blocking.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,354 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+""" Blocking techniques.
+
+This module implements a set of blocking techniques used to split
+datasets into smaller subsets that will be aligned in more detail.
+
+Additional information:
+
+   P. Christen, Data Matching, Data-Centric Systems and Applications,
+   Springer, 2012.
+"""
+from functools import partial
+import warnings
+
+from scipy.spatial import KDTree
+
+from nazca.minhashing import Minlsh
+from nazca.distances import soundexcode
+
+
+###############################################################################
+### GENERAL BLOCKING ##########################################################
+###############################################################################
+class BaseBlocking(object):
+    """ An abstract general blocking object that exposes
+    the API that should be common to all blockings object
+    """
+    def __init__(self, ref_attr_index, target_attr_index):
+        """ Build the blocking object
+
+        Parameters
+        ----------
+
+        ref_attr_index: index of the attribute of interest in a record
+                        for the reference dataset
+                        (i.e. attribute to be used for key computation)
+
+        target_attr_index: index of the attribute of interest in a record
+                           for the target dataset
+                           (i.e. attribute to be used for key computation)
+        """
+        self.ref_attr_index = ref_attr_index
+        self.target_attr_index = target_attr_index
+        self.is_fitted = False
+
+    def fit(self, refset, targetset):
+        """ Fit the blocking technique on the reference and target datasets
+
+        Parameters
+        ----------
+        refset: a dataset (list of records)
+
+        targetset: a dataset (list of records)
+        """
+        self._fit(refset, targetset)
+        self.is_fitted = True
+
+    def _fit(self, refset, targetset):
+        raise NotImplementedError
+
+    def iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        return self._iter_blocks()
+
+    def _iter_blocks(self):
+        """ Internal iteration function over blocks
+        """
+        raise NotImplementedError
+
+    def iter_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always (id_reference, id_target)
+                        and are the ids of the records in the corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_blocks():
+            for refid in block1:
+                for targetid in block2:
+                    yield refid, targetid
+
+
+###############################################################################
+### KEY BLOCKING ##############################################################
+###############################################################################
+class KeyBlocking(BaseBlocking):
+    """ This blocking technique is based on a a blocking criteria
+    (or blocking key), that will be used to divide the datasets.
+
+    The main idea here is:
+
+    1 - to create an index of f(x) for each x in the reference set.
+
+    2 - to create an index of f(y) for each y in the target set.
+
+    3 - to iterate on each distinct value of f(x) and to return
+        the identifiers of the records of both sets for this value.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, callback):
+        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.callback = callback
+        self.reference_index = {}
+        self.target_index = {}
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        for rec in refset:
+            key = self.callback(rec[self.ref_attr_index])
+            self.reference_index.setdefault(key, []).append(rec[0])
+        for rec in targetset:
+            key = self.callback(rec[self.target_attr_index])
+            self.target_index.setdefault(key, []).append(rec[0])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        for key, block1 in self.reference_index.iteritems():
+            block2 = self.target_index.get(key)
+            if block1 and block2:
+                yield (block1, block2)
+
+
+class SoundexBlocking(KeyBlocking):
+
+    def __init__(self, ref_attr_index, target_attr_index, language='french'):
+        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
+                                              partial(soundexcode, language=language))
+
+
+###############################################################################
+### SORTKEY BLOCKING ##########################################################
+###############################################################################
+class SortedNeighborhoodBlocking(BaseBlocking):
+    """ This blocking technique is based on a a sorting blocking criteria
+    (or blocking key), that will be used to divide the datasets.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
+        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.key_func = key_func
+        self.window_width = window_width
+        self.sorted_dataset = None
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        self.sorted_dataset = [(r[0], r[self.ref_attr_index], 0) for r in refset]
+        self.sorted_dataset.extend([(r[0], r[self.target_attr_index], 1) for r in targetset])
+        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+        """
+        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
+            # Only keep reference set record
+            if dset == 1:
+                continue
+            block1 = [rid,]
+            minind = max(0, ind - self.window_width)
+            maxind = ind + self.window_width + 1
+            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
+                      if d == 1]
+            if block1 and block2:
+                yield (block1, block2)
+
+
+###############################################################################
+### CLUSTERING-BASED BLOCKINGS ################################################
+###############################################################################
+class KmeansBlocking(BaseBlocking):
+    """ A blocking technique based on Kmeans
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
+        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.n_clusters = n_clusters
+        self.kmeans = None
+        self.predicted = None
+        from sklearn import cluster
+        self.cluster_class = cluster.KMeans
+
+    def _fit(self, refset, targetset):
+        """ Fit the reference dataset.
+        """
+        # If an element is None (missing), use instead the identity element.
+        # The identity element is defined as the 0-vector
+        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
+        # We assume here that there are at least 2 elements in the refset
+        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
+        kmeans = self.cluster_class(n_clusters=n_clusters)
+        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
+        self.kmeans = kmeans
+        # Predict on targetset
+        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
+                                              or idelement for elt in targetset])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
+        for ind, i in enumerate(self.predicted):
+            neighbours[i][1].append(ind)
+        for ind, i in enumerate(self.kmeans.labels_):
+            neighbours[i][0].append(ind)
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+
+###############################################################################
+### KDTREE BLOCKINGS ##########################################################
+###############################################################################
+class KdTreeBlocking(BaseBlocking):
+    """ A blocking technique based on KdTree
+    """
+    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
+        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.threshold = threshold
+        self.reftree = None
+        self.targettree = None
+        self.nb_elements = None
+
+    def _fit(self, refset, targetset):
+        """ Fit the blocking
+        """
+        firstelement = refset[0][self.ref_attr_index]
+        self.nb_elements = len(refset)
+        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
+        idelement = (0,) * idsize
+        # KDTree is expecting a two-dimensional array
+        if idsize == 1:
+            # A one-element tuple is always truthy, so test the value itself
+            # to fall back on the identity element for missing values
+            self.reftree = KDTree([(elt[self.ref_attr_index],)
+                                   if elt[self.ref_attr_index] is not None else idelement
+                                   for elt in refset])
+            self.targettree = KDTree([(elt[self.target_attr_index],)
+                                      if elt[self.target_attr_index] is not None else idelement
+                                      for elt in targetset])
+        else:
+            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
+            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
+        neighbours = []
+        for ind in xrange(self.nb_elements):
+            if not extraneighbours[ind]:
+                continue
+            neighbours.append([[ind], extraneighbours[ind]])
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+
+###############################################################################
+### MINHASHING BLOCKINGS ######################################################
+###############################################################################
+class MinHashingBlocking(BaseBlocking):
+    """ A blocking technique based on MinHashing
+    """
+    def __init__(self, ref_attr_index, target_attr_index,
+                 threshold=0.1, kwordsgram=1, siglen=200):
+        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.threshold = threshold
+        self.kwordsgram = kwordsgram
+        self.siglen = siglen
+        self.minhasher = Minlsh()
+        self.nb_elements = None
+
+    def _fit(self, refset, targetset):
+        """ Find the blocking using minhashing
+        """
+        # If an element is None (missing), use instead the identity element.
+        idelement = ''
+        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
+                             [elt[self.target_attr_index] or idelement for elt in targetset],
+                             self.kwordsgram, self.siglen)
+        self.nb_elements = len(refset)
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        rawneighbours = self.minhasher.predict(self.threshold)
+        neighbours = []
+        for data in rawneighbours:
+            neighbours.append([[], []])
+            for i in data:
+                if i >= self.nb_elements:
+                    neighbours[-1][1].append(i - self.nb_elements)
+                else:
+                    neighbours[-1][0].append(i)
+            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
+                neighbours.pop()
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
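Outside of an aligner, a blocking object can also be used on its own through the fit() / iter_blocks() protocol defined above. A small sketch (toy records and the first-letter key callback are illustrative assumptions; note that KeyBlocking blocks contain record identifiers, i.e. column 0):

    from nazca.blocking import KeyBlocking

    refset = [['R1', 'dupont'], ['R2', 'martin']]
    targetset = [['T1', 'dupond'], ['T2', 'durand']]

    # Block on the first letter of the name (attribute at index 1 in both sets)
    blocking = KeyBlocking(ref_attr_index=1, target_attr_index=1,
                           callback=lambda name: name[0])
    blocking.fit(refset, targetset)
    for refblock, targetblock in blocking.iter_blocks():
        print refblock, targetblock    # ['R1'] ['T1', 'T2']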
--- a/dataio.py	Thu Apr 25 16:09:39 2013 +0200
+++ b/dataio.py	Mon Oct 14 16:29:16 2013 +0000
@@ -28,6 +28,9 @@
     SPARQL_ENABLED = False
 
 
+###############################################################################
+### UTILITY FUNCTIONS #########################################################
+###############################################################################
 def autocasted(data, encoding=None):
     """ Try to convert data into a specific type
     in (int, float, str)
@@ -43,6 +46,10 @@
                 return data.decode(encoding)
             return data
 
+
+###############################################################################
+### RQL FUNCTIONS #############################################################
+###############################################################################
 def rqlquery(host, rql, indexes=None, formatopt=None):
     """ Run the rql query on the given cubicweb host
     """
@@ -58,6 +65,10 @@
     return parsefile(filehandle, delimiter=';', indexes=indexes,
                      formatopt=formatopt);
 
+
+###############################################################################
+### SPARQL FUNCTIONS ##########################################################
+###############################################################################
 def sparqlquery(endpoint, query, indexes=None):
     """ Run the sparql query on the given endpoint, and wrap the items in the
     indexes form. If indexes is empty, keep raw output"""
@@ -87,6 +98,10 @@
         results.append(data)
     return results
 
+
+###############################################################################
+### FILE FUNCTIONS ############################################################
+###############################################################################
 def parsefile(filename, indexes=None, nbmax=None, delimiter='\t',
               encoding='utf-8', field_size_limit=None, formatopt=None):
     """ Parse the file (read ``nbmax`` line at maximum if given). Each
@@ -174,7 +189,6 @@
                      dist
                      ))
 
-
 def split_file(filename, outputdir, nblines=60000):
     """ Split `filename` into smaller files of ``nblines`` lines. Files are
         written into `outputdir`.
--- a/demo.py	Thu Apr 25 16:09:39 2013 +0200
+++ b/demo.py	Mon Oct 14 16:29:16 2013 +0000
@@ -54,10 +54,10 @@
                'metric': ald.levenshtein
               }
 
-    treatments = {1: tr_name}
+    processings = {1: tr_name}
 
     print "Alignment started"
-    align(alignset, targetset, 0.4, treatments,
+    align(alignset, targetset, 0.4, processings,
           dpath('demo0_results'))
 
     print "Done, see the resuls in %s" % dpath('demo0_results')
@@ -77,7 +77,7 @@
                          indexes=[0, 2, (14, 12)], nbmax=1000)
 
 
-    # Let's define the treatments to apply on the location's name
+    # Let's define the processings to apply on the location's name
     tr_name = {'normalization': [aln.simplify], # Simply all the names (remove
                                               #   punctuation, lower case, etc)
                'metric': ald.levenshtein,       # Use the levenshtein distance
@@ -92,14 +92,14 @@
               'weighting': 1
              }
 
-    treatments = {1: tr_name, 2: tr_geo}
+    processings = {1: tr_name, 2: tr_geo}
 
     print "Alignment started"
     align(alignset,           # The dataset to align
           targetset,          # The target dataset
           0.4,                # The maximal distance
                               #   threshold
-          treatments,         # The list of treatments to
+          processings,         # The list of processings to
                               #   apply.
           dpath('demo1_results'))
                               # Filename of the output
@@ -120,7 +120,7 @@
     neighbours = findneighbours(alignset, targetset, indexes=(2, 2),
                                mode='minibatch')
 
-    # Let's define the treatments to apply on the location's name
+    # Let's define the processings to apply on the location's name
     tr_name = {'normalization': [aln.simplify], # Simply all the names (remove
                                               #   punctuation, lower case, etc)
                'metric': ald.levenshtein,     # Use the levenshtein distance
@@ -128,7 +128,7 @@
                                               #   weighting coefficient
               }
 
-    treatments = {1: tr_name}
+    processings = {1: tr_name}
     print "Start computation"
     for ind, (alignid, targetid) in enumerate(neighbours):
         print '%3d' % ind, len(alignid), 'x', len(targetid)
@@ -137,7 +137,7 @@
                               alignid,
                               targetid,
                               0.3,
-                              treatments)
+                              processings)
         write_results(matched, alignset, targetset, dpath('demo2_results'))
     print "Done, see the results in %s" % dpath('demo2_results')
 
@@ -153,7 +153,7 @@
               }
 
     print "Alignment started"
-    results = alignall(alignset, targetset, 0.75, treatments={1: tr_name},
+    results = alignall(alignset, targetset, 0.75, processings={1: tr_name},
                        indexes=(1,1), mode='minhashing', kwordsgram=1, siglen=200,
                        uniq=True)
     dicresults = dict([(a, b) for (a, b) in results])
--- a/distances.py	Thu Apr 25 16:09:39 2013 +0200
+++ b/distances.py	Mon Oct 14 16:29:16 2013 +0000
@@ -15,19 +15,21 @@
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
+from functools import partial
+from math import cos, sqrt, pi #Needed for geographical distance
 try:
     from dateutil import parser as dateparser
     DATEUTIL_ENABLED = True
 except ImportError:
     DATEUTIL_ENABLED = False
-from math import cos, sqrt, pi #Needed for geographical distance
-
-from scipy import matrix
+from scipy import matrix, empty
 
 from nazca.normalize import tokenize
 
 
+###############################################################################
 ### UTILITY FUNCTIONS #########################################################
+###############################################################################
 def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
     """ Compute the matrix of distances between all tokens of stra and strb
         (with function ``distance``). Extra args are given to the distance
@@ -68,7 +70,9 @@
     return max(minlist)
 
 
+###############################################################################
 ### NUMERICAL DISTANCES #######################################################
+###############################################################################
 def euclidean(a, b):
     """ Simple euclidian distance
     """
@@ -79,7 +83,9 @@
         return abs(float(a) - float(b))
 
 
+###############################################################################
 ### STRING DISTANCES ##########################################################
+###############################################################################
 def levenshtein(stra, strb, tokenizer=None):
     """ Compute the Levenshtein distance between stra and strb.
 
@@ -187,14 +193,23 @@
         J(A, B) = (A \cap B)/(A \cup B)
         d(A, B) = 1 - J(A, B)
     """
-
     seta = set(tokenize(stra, tokenizer))
     setb = set(tokenize(strb, tokenizer))
+    return generic_jaccard(seta, setb)
+
+def generic_jaccard(seta, setb):
+    """ Return the jaccard distance between two sets A and B.
+
+        J(A, B) = (A \cap B)/(A \cup B)
+        d(A, B) = 1 - J(A, B)
+    """
     return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
 
 
+###############################################################################
+### TEMPORAL DISTANCES ########################################################
+###############################################################################
 if DATEUTIL_ENABLED:
-    ### TEMPORAL DISTANCES ####################################################
     class FrenchParserInfo(dateparser.parserinfo):
         """ Inherit of the dateutil.parser.parserinfo and translate the english
             dependant variables into french.
@@ -246,7 +261,9 @@
         return abs(diff.days)
 
 
+###############################################################################
 ### GEOGRAPHICAL DISTANCES ####################################################
+###############################################################################
 def geographical(pointa, pointb, in_radians=False, planet_radius=6371009,
                  units='m'):
     """ Return the geographical distance between two points.
@@ -274,3 +291,146 @@
 
     coef = 1. if units == 'm' else 0.001
     return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
+
+
+###############################################################################
+### BASE PROCESSING ############################################################
+###############################################################################
+class BaseProcessing(object):
+    """ A processing object used to provide an abstraction over the different
+    distance functions, and help building Nazca process. """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 distance_callback=euclidean, weight=1, matrix_normalized=False):
+        """ Initiate the BaseProcessing
+
+        Parameters
+        ----------
+
+        ref_attr_index: index of the attribute of interest in a record
+                        for the reference dataset
+                        (i.e. attribute to be used for distance computation)
+
+        target_attr_index: index of the attribute of interest in a record
+                           for the target dataset
+                           (i.e. attribute to be used for distance computation)
+
+        distance_callback: distance callback. Default is euclidean distance.
+
+        weight: weight of the processing in a global distance matrix
+
+        matrix_normalized: Boolean. If matrix_normalized is True,
+                           the distance between two points is changed to
+                           a value between 0 (equal) and 1 (totally different).
+                           To avoid useless computation and scale
+                           problems the following “normalization” is done:
+                                d = 1 - 1/(1 + d(x, y))
+
+        """
+        self.ref_attr_index = ref_attr_index
+        self.target_attr_index = target_attr_index
+        self.distance_callback = distance_callback
+        self.weight = weight
+        self.matrix_normalized = matrix_normalized
+
+    def distance(self, reference_record, target_record):
+        """ Compute the distance between two records
+
+        Parameters
+        ----------
+        reference_record: a record (tuple/list of values) of the reference dataset.
+
+        target_record: a record (tuple/list of values) of the target dataset.
+
+        """
+        # Test against None so that attribute index 0 is handled correctly
+        refrecord = (reference_record[self.ref_attr_index]
+                     if self.ref_attr_index is not None else reference_record)
+        targetrecord = (target_record[self.target_attr_index]
+                        if self.target_attr_index is not None else target_record)
+        return self.distance_callback(refrecord, targetrecord)
+
+    def cdist(self, refset, targetset, ref_indexes=None, target_indexes=None):
+        """ Compute the metric matrix, given two datasets and a metric
+
+        Parameters
+        ----------
+        refset: a dataset (list of records)
+
+        targetset: a dataset (list of records)
+
+        Returns
+        -------
+
+        A distance matrix of shape (len(ref_indexes), len(target_indexes)),
+        with the distance of each pair of records in it.
+        """
+        ref_indexes = ref_indexes or xrange(len(refset))
+        target_indexes = target_indexes or xrange(len(targetset))
+        distmatrix = empty((len(ref_indexes), len(target_indexes)), dtype='float32')
+        size = distmatrix.shape
+        for i, iref in enumerate(ref_indexes):
+            for j, jref in enumerate(target_indexes):
+                d = 1
+                if refset[iref] and targetset[jref]:
+                    d = self.distance(refset[iref], targetset[jref])
+                    if self.matrix_normalized:
+                        d = 1 - (1.0/(1.0 + d))
+                distmatrix[i, j] = d
+        return distmatrix
+
+    def pdist(self, dataset):
+        """ Compute the upper triangular matrix in a way similar
+        to scipy.spatial.metric
+
+        Parameters
+        ----------
+        dataset: a dataset (list of records)
+
+        Returns
+        -------
+
+        The values of the upper triangular distance matrix, of shape
+        (len(dataset), len(dataset)), with the distance of each element in it.
+        The values are sorted as row 1, row 2, ...
+        """
+        values = []
+        for i in xrange(len(dataset)):
+            for j in xrange(i+1, len(dataset)):
+                d = 1
+                if dataset[i] and dataset[j]:
+                    d = self.distance(dataset[i], dataset[j])
+                    if self.matrix_normalized:
+                        d = 1 - (1.0/(1.0 + d))
+                values.append(d)
+        return values
+
+
+###############################################################################
+### CONCRETE PROCESSINGS #######################################################
+###############################################################################
+class LevenshteinProcessing(BaseProcessing):
+    """ A processing based on the levenshtein distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 tokenizer=None, weight=1, matrix_normalized=False):
+        distance_callback = partial(levenshtein,
+                                    tokenizer=tokenizer)
+        super(LevenshteinProcessing, self).__init__(ref_attr_index,
+                                                    target_attr_index,
+                                                    distance_callback,
+                                                    weight, matrix_normalized)
+
+
+class GeographicalProcessing(BaseProcessing):
+    """ A processing based on the geographical distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 in_radians=False, planet_radius=6371009, units='m',
+                 weight=1, matrix_normalized=False):
+        distance_callback = partial(geographical, in_radians=in_radians,
+                                    planet_radius=planet_radius, units=units)
+        super(GeographicalProcessing, self).__init__(ref_attr_index,
+                                                     target_attr_index,
+                                                     distance_callback,
+                                                     weight, matrix_normalized)
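These processing objects subsume the helpers of the matrix module removed below (pdist, cdist, matched). A minimal sketch of the per-processing distance computation, with toy records as an assumption:

    from nazca.distances import LevenshteinProcessing

    refset = [['R1', 'abc'], ['R2', 'abcd']]
    targetset = [['T1', 'abc']]

    processing = LevenshteinProcessing(ref_attr_index=1, target_attr_index=1)
    print processing.distance(refset[1], targetset[0])   # 1, one edit away
    print processing.cdist(refset, targetset)            # 2x1 distance matrix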
--- a/matrix.py	Thu Apr 25 16:09:39 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,135 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from collections import defaultdict
-
-from scipy import empty
-
-import nazca.distances as ds
-
-METRICS = {'euclidean': ds.euclidean, 'levenshtein': ds.levenshtein,
-           'soundex': ds.soundex, 'jaccard': ds.jaccard, 'geographical': ds.geographical}
-
-try:
-    from nazca.distances import temporal
-    METRICS['temporal'] = temporal
-except ImportError:
-    pass
-
-
-def pdist(X, metric='euclidean', matrix_normalized=True, metric_params=None):
-    """ Compute the upper triangular matrix in a way similar
-    to scipy.spatial.metric
-
-    If matrix_normalized is True, the distance between two points is changed to
-    a value between 0 (equal) and 1 (totaly different). To avoid useless
-    computation and scale problems the following “normalization” is done:
-        d = 1 - 1/(1 + d(x, y))
-
-    """
-    metric = metric if not isinstance(metric, basestring) else METRICS.get(metric, ds.euclidean)
-    values = []
-    for i in xrange(len(X)):
-        for j in xrange(i+1, len(X)):
-            d = 1
-            if X[i] and X[j]:
-                d = metric(X[i], X[j], **(metric_params or {}))
-                if matrix_normalized:
-                    d = 1 - (1.0/(1.0 + d))
-            values.append(d)
-    return values
-
-def cdist(X, Y, metric='euclidean', matrix_normalized=True, metric_params=None):
-    """ Compute the metric matrix, given two inputs and a metric
-
-    If matrix_normalized is True, the distance between two points is changed to
-    a value between 0 (equal) and 1 (totaly different). To avoid useless
-    computation and scale problems the following “normalization” is done:
-        d = 1 - 1/(1 + d(x, y))
-
-    """
-    metric = metric if not isinstance(metric, basestring) else METRICS.get(metric, ds.euclidean)
-    distmatrix = empty((len(X), len(Y)), dtype='float32')
-    size = distmatrix.shape
-    for i in xrange(size[0]):
-        for j in xrange(size[1]):
-            d = 1
-            if X[i] and Y[j]:
-                d = metric(X[i], Y[j], **(metric_params or {}))
-                if matrix_normalized:
-                    d = 1 - (1.0/(1.0 + d))
-            distmatrix[i, j] = d
-    return distmatrix
-
-def matched(distmatrix, cutoff=0, normalized=False):
-    """ Return the matched elements within a dictionnary,
-    each key being the indice from X, and the corresponding
-    values being a list of couple (indice from Y, distance)
-    """
-    match = defaultdict(list)
-    if normalized:
-        distmatrix /= distmatrix.max()
-
-    ind = (distmatrix <= cutoff).nonzero()
-    indrow = ind[0].tolist()
-    indcol = ind[1].tolist()
-
-    for (i, j) in zip(indrow, indcol):
-        match[i].append((j, distmatrix[i, j]))
-
-    return match
-
-def globalalignmentmatrix(items):
-    """ Compute and return the global alignment matrix.
-        Let's explain :
-
-        - `items` is a list of tuples where each tuple is built as following :
-
-            `(weighting, input1, input2, distance_function, normalize, args)`
-
-            * `input1` : a list of "things" (names, dates, numbers) to align onto
-                `input2`. If a value is unknown, set it as `None`.
-
-            * `distance_function` : the distance function used to compute the
-                 distance matrix between `input1` and `input2`
-
-            * `weighting` : the weight of the "things" computed, compared
-                 with the others "things" of `items`
-
-            * `normalize` : boolean, if true, the matrix values will be between 0
-                and 1, else the real result of `distance_function` will be
-                stored
-
-            * `args` : a dictionnay of the extra arguments the
-                `distance_function` could take (as language or granularity)
-
-     - For each tuple of `items` a `Distancematrix` is built, then all the
-       matrices are summed with their own weighting and the result is the global
-       alignment matrix, which is returned.
-
-    """
-
-    #Assert all items have the same size
-    size1, size2 = len(items[0][1]), len(items[0][2])
-    for item in items[1:]:
-        assert size1 == len(item[1])
-        assert size2 == len(item[2])
-
-    globalmatrix = items[0][0]*cdist(*items[0][1:])
-    for item in items[1:]:
-        globalmatrix += item[0]*cdist(*item[1:])
-    return globalmatrix
--- a/normalize.py	Thu Apr 25 16:09:39 2013 +0200
+++ b/normalize.py	Mon Oct 14 16:29:16 2013 +0000
@@ -19,9 +19,10 @@
 from string import punctuation
 from warnings import warn
 from unicodedata import normalize as _uninormalize
+from functools import partial
 
 
-STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
+FRENCH_STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
 u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
 u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
 u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
@@ -57,6 +58,9 @@
     }
 
 
+###############################################################################
+### NORMALIZE FUNCTIONS #######################################################
+###############################################################################
 def unormalize(ustring, ignorenonascii=None, substitute=None):
     """replace diacritical characters with their corresponding ascii characters
 
@@ -82,7 +86,10 @@
         try:
             replacement = MANUAL_UNICODE_MAP[letter]
         except KeyError:
-            replacement = _uninormalize('NFKD', letter)[0]
+            if isinstance(letter, unicode):
+                replacement = _uninormalize('NFKD', letter)[0]
+            else:
+                replacement = letter
             if ord(replacement) >= 2 ** 7:
                 if substitute is None:
                     raise ValueError("can't deal with non-ascii based characters")
@@ -94,7 +101,7 @@
     """ Normalize a sentence (ie remove accents, set to lower, etc) """
     return unormalize(sentence, ignorenonascii, substitute).lower()
 
-def simplify(sentence, lemmas=None, remove_stopwords=True):
+def simplify(sentence, lemmas=None, remove_stopwords=True, stopwords=FRENCH_STOPWORDS):
     """ Simply the given sentence
         0) If remove_stopwords, then remove the stop words
         1) If lemmas are given, the sentence is lemmatized
@@ -112,7 +119,7 @@
     if not remove_stopwords:
         return cleansent
     else:
-        return ' '.join([w for w in cleansent.split(' ') if w not in STOPWORDS])
+        return ' '.join([w for w in cleansent.split(' ') if w not in stopwords])
 
 def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
     """ Tokenize a sentence.
@@ -204,3 +211,180 @@
 
     match = re.match(regexp, string)
     return output % match.groupdict()
+
+
+###############################################################################
+### NORMALIZER OBJECTS ########################################################
+###############################################################################
+class BaseNormalizer(object):
+    """ A normalizer object used to provide an abstraction over the different
+    normalization functions, and help building Nazca process. """
+
+    def __init__(self, callback, attr_index=None):
+        """ Initiate the BaseNormalizer
+
+        Parameters
+        ----------
+        callback: normalization callback
+
+        attr_index: index of the attribute of interest in a record
+                    (i.e. the attribute to be normalized).
+                    By default, 'attr_index' is None and the whole
+                    record is passed to the callback.
+                    If given, only the attr_index value(s) of the record
+                    are passed to the callback.
+                    May be an int or a list/tuple of ints.
+        """
+        self.callback = callback
+        if attr_index is not None:
+            self.attr_index = attr_index if isinstance(attr_index, (tuple, list)) else (attr_index,)
+        else:
+            self.attr_index = attr_index
+
+    def normalize(self, record):
+        """ Normalize a record
+
+        Parameters
+        ----------
+        record: a record (tuple/list of values).
+
+        Returns
+        -------
+
+        record: the normalized record.
+        """
+        if self.attr_index is None:
+            return self.callback(record)
+        else:
+            for attr_ind in self.attr_index:
+                record = list(r if ind != attr_ind else self.callback(r)
+                               for ind, r in enumerate(record))
+            return record
+
+    def normalize_dataset(self, dataset, inplace=False):
+        """ Normalize a dataset
+
+        Parameters
+        ----------
+        dataset: a list of record (tuple/list of values).
+
+        inplace: Boolean. If True, normalize the dataset in place.
+
+        Returns
+        -------
+
+        dataset: the normalized dataset.
+        """
+        if not inplace:
+            dataset = [self.normalize(record) for record in dataset]
+        else:
+            # Change dataset in place
+            for ind, record in enumerate(dataset):
+                dataset[ind] = self.normalize(record)
+        return dataset
+
+
+class UnicodeNormalizer(BaseNormalizer):
+    """ Normalizer that unormalize the unicode
+    (i.e. replace accentuating characters by ASCII ones)
+    """
+    def __init__(self, attr_index=None, ignorenonascii=None, substitute=None):
+        callback = partial(lunormalize, ignorenonascii=ignorenonascii, substitute=substitute)
+        super(UnicodeNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class SimplifyNormalizer(BaseNormalizer):
+    """ Normalizer that simplify a string
+        0) If remove_stopwords, then remove the stop words
+        1) If lemmas are given, the sentence is lemmatized
+        2) Set the sentence to lower case
+        3) Remove punctuation
+    """
+    def __init__(self, attr_index=None, lemmas=None, remove_stopwords=True):
+        callback = partial(simplify, lemmas=lemmas, remove_stopwords=remove_stopwords)
+        super(SimplifyNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class TokenizerNormalizer(BaseNormalizer):
+    """ Normalizer that tokenize a string
+        Use ``tokenizer`` if given, else try to use the nltk WordPunctTokenizer,
+        in case of failure, it just split on spaces.
+        Anyway, tokenizer must have a ``tokenize()`` method
+    """
+    def __init__(self, attr_index=None, tokenizer=None, regexp=re.compile(r"[^\s]+")):
+        callback = partial(tokenize, tokenizer=tokenizer, regexp=regexp)
+        super(TokenizerNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class LemmatizerNormalizer(BaseNormalizer):
+    """ Normalizer that lemmatize a string
+    """
+    def __init__(self, lemmas, attr_index=None, tokenizer=None):
+        callback = partial(lemmatized, lemmas=lemmas, tokenizer=tokenizer)
+        super(LemmatizerNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class RoundNormalizer(BaseNormalizer):
+    """Normalizer that round a string
+    Return an unicode string of ``number`` rounded to a given precision
+    in decimal digits (default 0 digits)
+
+    If ``number`` is not a float, this method casts it to a float. (An
+    exception may be raised if it's not possible)
+    """
+    def __init__(self, attr_index=None, ndigits=0):
+        callback = partial(roundstr, ndigits=ndigits)
+        super(RoundNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+class RegexpNormalizer(BaseNormalizer):
+    """Normalizer that normalize a string based on a regexp
+
+     Apply the regexp to the ``string`` and return a formatted string
+    according to ``output``
+
+    eg :
+        format(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
+               r'\[(?P<firstname>\w+) (?p<lastname>\w+) - '
+               r'(?P<birthdate>.*?) / (?<deathdate>.*?)\]',
+               u'%(lastname)s, %(firstname)s (%(birthdate)s -'
+               u'%(deathdate)s)')
+
+     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
+    """
+    def __init__(self, regexp, output, attr_index=None):
+        callback = partial(rgxformat, regexp=regexp, output=output)
+        super(RegexpNormalizer, self).__init__(callback, attr_index=attr_index)
+
+
+###############################################################################
+### NORMALIZER PIPELINE #######################################################
+###############################################################################
+class NormalizerPipeline(BaseNormalizer):
+    """ Pipeline of Normalizers
+    """
+
+    def __init__(self, normalizers):
+        """ Initiate the NormalizerPipeline
+
+        Parameters
+        ----------
+        normalizers: ordered list of Normalizer objects
+        """
+        self.normalizers = normalizers
+
+    def normalize(self, record):
+        """ Normalize a record
+
+        Parameters
+        ----------
+        record: a record (tuple/list of values).
+
+        Returns
+        -------
+
+        record: the normalized record.
+        """
+        for normalizer in self.normalizers:
+            record = normalizer.normalize(record)
+        return record
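
As an illustration of the normalizer objects defined above, here is a minimal sketch (not part of the patch itself; the records are toy data, and only the second attribute, attr_index=1, is normalized):

    from nazca.normalize import (UnicodeNormalizer, SimplifyNormalizer,
                                 NormalizerPipeline)

    records = [('R1', u'Un nuage flotta dans le grand ciel bleu.'),
               ('R2', u"Pour quelle occasion vous êtes-vous apprêtée ?")]

    # Chain two normalizers on attribute 1: strip accents and lowercase,
    # then remove punctuation and French stopwords.
    pipeline = NormalizerPipeline([UnicodeNormalizer(attr_index=1),
                                   SimplifyNormalizer(attr_index=1)])
    normalized = pipeline.normalize_dataset(records)
    # e.g. the first record becomes ['R1', u'nuage flotta grand ciel bleu']
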
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/old_api.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,431 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from os import listdir
+import os.path as osp
+from shutil import rmtree
+from tempfile import mkdtemp
+import sys
+import warnings
+from functools import partial
+
+from scipy.sparse import lil_matrix
+
+from nazca.dataio import write_results, split_file, parsefile
+from nazca.normalize import BaseNormalizer, NormalizerPipeline
+from nazca.blocking import (KmeansBlocking, MiniBatchKmeansBlocking,
+                            KdTreeBlocking, MinHashingBlocking)
+from nazca.distances import GeographicalProcessing
+from nazca.aligner import BaseAligner
+
+
+# Backward compatibility. These functions now use the BaseAligner internally.
+# They may be removed in a later release.
+
+
+###############################################################################
+### NORMALIZE FUNCTIONS #######################################################
+###############################################################################
+# Backward compatibility. These functions now use the NormalizerPipeline internally.
+# They may be removed in a later release.
+
+def normalize_set(rset, processings):
+    """ Apply all the normalization functions to the given rset """
+    warnings.warn(DeprecationWarning('This function will be removed '
+                                     'in the next release. '
+                                     'You should rather use the BaseNormalizer '
+                                     'object of the normalize module'))
+    normalizers = []
+    for ind, processing in processings.iteritems():
+        for normalizer in extract_normalization_from_treatment(processing, ind):
+            normalizers.append(normalizer)
+    # Create pipeline
+    pipeline = NormalizerPipeline(normalizers)
+    return pipeline.normalize_dataset(rset)
+
+def extract_normalization_from_treatment(processing, ind):
+    """ Extract normalization from processing.
+    This function is used for backward compatibility with
+    the old function-based API """
+    warnings.warn(DeprecationWarning('This function will be removed '
+                                     'in the next release. '
+                                     'You should rather use the BaseNormalizer '
+                                     'object of the normalize module'))
+    for f in processing.get('normalization', []):
+        farg = f.func_code.co_varnames #List of the arguments of f
+        # A kind of union between the arguments needed by f, and the
+        # provided ones
+        givenargs = dict((arg, processing['norm_params'][arg])
+                         for arg in farg if arg in processing.get('norm_params', []))
+        callback = f
+        if givenargs:
+            callback = partial(callback, **givenargs)
+        yield BaseNormalizer(callback=callback, attr_index=ind)
+
+def extract_treatment_from_treatment(processing, ind):
+    """ Extract Treatment object from processing dict.
+    This is only for backward compatibility with the old API.
+    """
+    if processing['metric'] == 'geographical':
+        return GeographicalProcessing(ind, ind,
+                                     matrix_normalized=processing.get('matrix_normalized', False),
+                                     **processing.get('metric_params', {}))
+
+
+###############################################################################
+### ALIGNER ###################################################################
+###############################################################################
+def align(alignset, targetset, threshold, processings=None, resultfile=None,
+          _applyNormalization=True):
+    """ Try to align the items of alignset onto targetset's ones
+
+        `alignset` and `targetset` are the sets to align. Each set contains
+        lists where the first column is the identifier of the item, and the
+        others are the attributes to align (note that the order is
+        important!). Both must have the same number of columns.
+
+        `processings` is a dictionary of dictionaries.
+        Each key is the index of the attribute to process, and each value is
+        a dictionary that describes the processings to apply to that
+        attribute. Each dictionary is built as follows:
+
+            processing = {'normalization': [f1, f2, f3],
+                         'norm_params': {'arg1': arg01, 'arg2': arg02},
+                         'metric': d1,
+                         'metric_params': {'arg1': arg11},
+                         'weighting': w,
+                         'matrix_normalized': True
+                        }
+
+            `normalization` is the list of functions called to normalize the
+            given attribute (in order). Each function is called with `norm_params`
+            as arguments
+
+            Likewise for `metric` and `metric_params`
+
+            `weighting` is the weighting for the current attribute with regard
+            to the others
+
+            `resultfile` (default is None). Write the matched elements in a file.
+
+        Return the distance matrix and the matched list.
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    processings = processings or {}
+    # Get the normalizers
+    normalizers = []
+    for ind, processing in processings.iteritems():
+        for normalizer in extract_normalization_from_treatment(processing, ind):
+            normalizers.append(normalizer)
+    # Cleanup processings
+    for t in processings.itervalues():
+        if 'normalization' in t:
+            t.pop('normalization')
+        if 'norm_params' in t:
+            t.pop('norm_params')
+    # Build aligner
+    processings = [extract_treatment_from_treatment(t, ind) for ind, t in processings.iteritems()]
+    aligner = BaseAligner(threshold, processings)
+    aligner.register_normalizers(normalizers)
+    # Align
+    return aligner.align(alignset, targetset)
+
+def subalign(alignset, targetset, alignind, targetind, threshold,
+             processings=None, _applyNormalization=True):
+    """ Compute a subalignment for a list of indices of the alignset and
+    a list of indices for the targetset """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    mat, matched = align([alignset[i] for i in alignind],
+                         [targetset[i] for i in targetind], threshold,
+                         processings, _applyNormalization=_applyNormalization)
+    new_matched = {}
+    for k, values in matched.iteritems():
+        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
+    return mat, new_matched
+
+def conquer_and_divide_alignment(alignset, targetset, threshold, processings=None,
+                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
+                                 n_clusters=None, kwordsgram=1, siglen=200,
+                                 get_global_mat=True):
+    """ Full conquer and divide method for alignment.
+    Compute neighbours and merge the different subalignments.
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    global_matched = {}
+    if get_global_mat:
+        global_mat = lil_matrix((len(alignset), len(targetset)))
+
+    processings = processings or {}
+    ralignset = normalize_set(alignset, processings)
+    rtargetset = normalize_set(targetset, processings)
+
+    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
+                                              neighbours_threshold, n_clusters,
+                                              kwordsgram, siglen):
+        _, matched = subalign(alignset, targetset, alignind, targetind,
+                                threshold, processings, _applyNormalization=False)
+        for k, values in matched.iteritems():
+            subdict = global_matched.setdefault(k, set())
+            for v, d in values:
+                subdict.add((v, d))
+                # XXX avoid issue in sparse matrix
+                if get_global_mat:
+                    global_mat[k, v] = d or 10**(-10)
+    if get_global_mat:
+        return global_mat, global_matched
+    return global_matched
+
+def alignall(alignset, targetset, threshold, processings=None,
+             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
+             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    if not mode:
+        _, matched = align(alignset, targetset, threshold, processings,
+                           resultfile=None, _applyNormalization=True)
+    else:
+        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
+                                               processings, indexes, mode,
+                                               neighbours_threshold, n_clusters,
+                                               kwordsgram, siglen,
+                                               get_global_mat=False)
+
+    if not uniq:
+        for alignid in matched:
+            for targetid, _ in matched[alignid]:
+                yield alignset[alignid][0], targetset[targetid][0]
+    else:
+        for alignid in matched:
+            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
+            yield alignset[alignid][0], targetset[bestid][0]
+
+def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
+                       threshold, size=10000, equality_threshold=0.01,
+                       processings=None, indexes=(1,1), mode='kdtree',
+                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
+                       siglen=200, cache=None):
+    """ This function helps you to align *huge* files.
+        It takes your csv files as arguments, splits them into smaller ones
+        (files of `size` lines), and runs the alignment on those files.
+
+        `alignformat` and `targetformat` are keyword arguments given to the
+        nazca.dataio.parsefile function.
+
+        This function returns its own cache. The cache is simply a
+        dictionary having align items' ids as keys and tuples (target item's id,
+        distance) as values. This dictionary can be passed back to this function
+        to perform another alignment (with different parameters, or just to be
+        sure everything has been caught).
+
+        If the distance of an alignment is below `equality_threshold`, the
+        alignment is considered as perfect, and the corresponding item is
+        removed from the alignset (to speed up the computation).
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    #Split the huge files into smaller ones
+    aligndir = mkdtemp()
+    targetdir = mkdtemp()
+    alignfiles = split_file(alignfile, aligndir, size)
+    targetfiles = split_file(targetfile, targetdir, size)
+
+    #Compute the number of iterations that must be done to achieve the alignment
+    nb_iterations = len(alignfiles) * len(targetfiles)
+    current_it = 0
+
+    cache = cache or {} #Contains the best known alignments
+    #Contains the id of perfectly aligned data
+    doneids = set(_id for _id, (_, dist) in cache.iteritems()
+                          if dist < equality_threshold)
+
+    try:
+        for alignfile in alignfiles:
+            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
+                        if a[0] not in doneids]
+            for targetfile in targetfiles:
+                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
+                matched = conquer_and_divide_alignment(alignset, targetset,
+                                                       threshold,
+                                                       processings=processings,
+                                                       indexes=indexes,
+                                                       mode=mode,
+                                                       neighbours_threshold=neighbours_threshold,
+                                                       n_clusters=n_clusters,
+                                                       kwordsgram=kwordsgram,
+                                                       siglen=siglen,
+                                                       get_global_mat=False)
+                for alignid in matched:
+                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
+                    #Get the best known distance
+                    _, current_dist = cache.get(alignset[alignid][0], (None, None))
+                    if current_dist is None or current_dist > dist:
+                        #If it's better, update the cache
+                        cache[alignset[alignid][0]] = (targetset[bestid][0], dist)
+                        if dist <= equality_threshold:
+                            #If perfect, stop trying to align this one
+                            doneids.add(alignset[alignid][0])
+
+                current_it += 1
+                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
+                                                nb_iterations))
+                sys.stdout.flush()
+                if doneids:
+                    alignset = [a for a in alignset if a[0] not in doneids]
+                if not alignset: #All items have been aligned
+                    #TODO Increment current_it.
+                    #The progress of the alignment process is computed with
+                    #`current_it`. If all items of `alignset` are aligned, we
+                    #stop the alignment process for this `alignset`. If
+                    #`current_it` isn't incremented, the progress shown will be
+                    #wrong.
+                    break
+
+    finally:
+        rmtree(aligndir)
+        rmtree(targetdir)
+
+    return cache
+
+
+
+
+
+
+
+###############################################################################
+### CLUSTERING-BASED BLOCKINGS FUNCTIONS ######################################
+###############################################################################
+# Backward compatibility. Now, use the BlockingObject inside the functions.
+# Perhaps these functions may be removed later...
+def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
+                              mode='kmeans', n_clusters=None):
+    """ Find the neigbhours using clustering (kmeans or minibatchkmeans)
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the KmeansBlocking '
+                                     'object of the blocking module'))
+    if mode == 'kmeans':
+        blocking = KmeansBlocking(ref_attr_index=indexes[0],
+                                  target_attr_index=indexes[1],
+                                  n_clusters=n_clusters)
+    elif mode == 'minibatch':
+        blocking = MiniBatchKmeansBlocking(ref_attr_index=indexes[0],
+                                           target_attr_index=indexes[1],
+                                           n_clusters=n_clusters)
+    else:
+        raise ValueError("Mode should be 'kmeans' or 'minibatch'")
+    # Fit blocking object
+    blocking.fit(alignset, targetset)
+    return list(blocking.iter_blocks())
+
+def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
+    """ Find the neigbhours using kdree
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the KdTreeBlocking '
+                                     'object of the blocking module'))
+    blocking = KdTreeBlocking(ref_attr_index=indexes[0],
+                              target_attr_index=indexes[1],
+                              threshold=threshold)
+    blocking.fit(alignset, targetset)
+    return list(blocking.iter_blocks())
+
+def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
+                              kwordsgram=1, siglen=200):
+    """ Find the neigbhours using minhashing
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the '
+                                     'MinHashingBlocking '
+                                     'object of the blocking module'))
+    blocking = MinHashingBlocking(ref_attr_index=indexes[0],
+                                  target_attr_index=indexes[1],
+                                  threshold=threshold, kwordsgram=kwordsgram,
+                                  siglen=siglen)
+    blocking.fit(alignset, targetset)
+    return list(blocking.iter_blocks())
+
+def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
+                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
+    """ This function helps to find neighbours from items of alignset and
+        targetset. “Neighbours” are items that are “not so far”, i.e. having a
+        close label, located in the same area, etc.
+
+        This function handles two types of neighbouring: text and numeric.
+        For text values, use “minhashing”; for numeric values, you
+        can choose from “kdtree”, “kmeans” and “minibatch”
+
+        The arguments to give are :
+            - `alignset` and `targetset` are the sets where neighbours have to
+              be found.
+            - `indexes` are the location of items to compare
+            - `mode` is the search type to use
+            - `neighbours_threshold` is the threshold used by the chosen `mode`
+              to decide whether two items are neighbours
+
+            - `n_clusters` is used for "kmeans" and "minibatch" methods, and it
+              is the number of clusters to use.
+
+            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
+              is the length of wordsgrams to use, and `siglen` is the length of
+              the minhashing signature matrix.
+
+        Return a list of lists, built as follows:
+            [
+                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
+                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
+                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
+                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
+                ...
+            ]
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the '
+                                     'BaseBlocking '
+                                     'objects of the blocking module'))
+    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
+    mode = mode.lower()
+
+    if mode not in SEARCHERS:
+        raise NotImplementedError('Unknown mode given')
+    if mode == 'kdtree':
+        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
+    elif mode == 'minhashing':
+        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
+                                         kwordsgram, siglen)
+    elif mode in set(['kmeans', 'minibatch']):
+        try:
+            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
+        except:
+            raise NotImplementedError('Scikit learn does not seem to be installed')
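
To make the mapping from the deprecated functions to the new objects concrete, here is a sketch showing the same alignment done both ways (the coordinates are toy values of the kind used in the tests, and the nazca.old_api module path is assumed from the file layout above):

    from nazca.old_api import alignall                      # deprecated path
    from nazca.aligner import BaseAligner                   # new object-based path
    from nazca.blocking import KdTreeBlocking
    from nazca.distances import GeographicalProcessing

    refset = [['V1', 'label1', (6.14, 48.67)],
              ['V2', 'label2', (6.2, 49.0)]]
    targetset = [['T1', 'labelt1', (6.17, 48.7)],
                 ['T2', 'labelt2', (5.3, 48.2)]]

    # Deprecated API: emits a DeprecationWarning and delegates to BaseAligner.
    processings = {2: {'metric': 'geographical', 'matrix_normalized': False,
                       'metric_params': {'units': 'km', 'in_radians': False}}}
    old_pairs = list(alignall(refset, targetset, threshold=30,
                              processings=processings, indexes=(2, 2),
                              neighbours_threshold=0.3))

    # New API: the processings and the blocking are explicit objects.
    aligner = BaseAligner(threshold=30,
                          processings=(GeographicalProcessing(2, 2, units='km'),))
    aligner.register_blocking(KdTreeBlocking(ref_attr_index=2,
                                             target_attr_index=2,
                                             threshold=0.3))
    new_pairs = list(aligner.get_aligned_pairs(refset, targetset, unique=False))
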
--- a/test/test_alignment.py	Thu Apr 25 16:09:39 2013 +0200
+++ b/test/test_alignment.py	Mon Oct 14 16:29:16 2013 +0000
@@ -17,366 +17,92 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import unittest2
-import sys, os
 import random
 random.seed(6) ### Make sure tests are repeatable
-import numpy as np
-import shutil
-from contextlib import contextmanager
-
 from os import path
-from tempfile import mkdtemp
-from dateutil import parser as dateparser
 
-from nazca.distances import (levenshtein, soundex, soundexcode,   \
-                             jaccard, euclidean, geographical)
-from nazca.normalize import (lunormalize, loadlemmas, lemmatized, \
-                                 roundstr, rgxformat, tokenize, simplify)
-import nazca.matrix as am
-from nazca.minhashing import Minlsh
-from nazca.dataio import parsefile, autocasted, split_file
+from nazca.normalize import simplify
 import nazca.aligner as alig
+import nazca.blocking as blo
+from nazca.distances import GeographicalProcessing
 
 
 TESTDIR = path.dirname(__file__)
 
-@contextmanager
-def tempdir():
-    try:
-        temp = mkdtemp()
-        yield temp
-    finally:
-        try:
-            shutil.rmtree(temp)
-        except:
-            pass
-
-class DistancesTest(unittest2.TestCase):
-    def test_levenshtein(self):
-        self.assertEqual(levenshtein('niche', 'chiens'), 5)
-        self.assertEqual(levenshtein('bonjour', 'bonjour !'), 1)
-        self.assertEqual(levenshtein('bon', 'bonjour'), 4)
-        self.assertEqual(levenshtein('Victor Hugo', 'Hugo Victor'), 0)
-
-        #Test symetry
-        self.assertEqual(levenshtein('Victor Hugo', 'Vitor Wugo'),
-                         levenshtein('Vitor Wugo', 'Victor Hugo'))
-
-    def test_soundex(self):
-        ##     Test extracted from Wikipedia en :
-        #Using this algorithm :
-        #both "Robert" and "Rupert" return the same string "R163"
-        #while "Rubin" yields "R150".
-        #
-        # "Ashcraft" and "Ashcroft" both yield "A261" and not "A226"
-        #(the chars 's' and 'c' in the name would receive a single number
-        #of 2 and not 22 since an 'h' lies in between them).
-        #
-        # "Tymczak" yields "T522" not "T520"
-        #(the chars 'z' and 'k' in the name are coded as 2 twice since a vowel
-        #lies in between them).
-        #
-        #"Pfister" yields "P236" not "P123" (the first two letters have the same
-        #number and are coded once as 'P').
-
-        self.assertEqual(soundexcode('Robert', 'english'), 'R163')
-        self.assertEqual(soundexcode('Rubert', 'english'), 'R163')
-        self.assertEqual(soundexcode('Rubin', 'english'), 'R150')
-        self.assertEqual(soundexcode('Ashcraft', 'english'), 'A261')
-        self.assertEqual(soundexcode('Tymczak', 'english'), 'T522')
-        self.assertEqual(soundexcode('Pfister', 'english'), 'P236')
-
-        self.assertEqual(soundex('Rubert', 'Robert', 'english'), 0)
-        self.assertEqual(soundex('Rubin', 'Robert', 'english'), 1)
-
-    def test_jaccard(self):
-        #The jaccard indice between two words is the ratio of the number of
-        #identical letters and the total number of letters
-        #Each letter is counted once only
-        #The distance is 1 - jaccard_indice
-
-        self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
-        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 1, 2)
-        self.assertAlmostEqual(jaccard(u'sacré rubert', u'sacré hubert'), 0.667, 2)
-
-        #Test symetry
-        self.assertEqual(jaccard('orange', 'morange'),
-                         jaccard('morange', 'orange'))
-
-    def test_temporal(self):
-        #Test the distance between two dates. The distance can be given in
-        #``days``, ``months`` or ``years``
-        try:
-            from nazca.distances import temporal
-        except ImportError:
-            return
-        self.assertEqual(temporal('14 aout 1991', '14/08/1991'), 0)
-        self.assertEqual(temporal('14 aout 1991', '08/14/1991'), 0)
-        self.assertEqual(temporal('14 aout 1991', '08/15/1992'), 367)
-        #Test a case of ambiguity
-        self.assertEqual(temporal('1er mai 2012', '01/05/2012'), 0)
-        self.assertEqual(temporal('1er mai 2012', '05/01/2012', dayfirst=False), 0)
-        #Test the different granularities available
-        self.assertAlmostEqual(temporal('14 aout 1991', '08/15/1992', 'years'), 1.0, 1)
-        self.assertAlmostEqual(temporal('1991', '1992', 'years'), 1.0, 1)
-        self.assertAlmostEqual(temporal('13 mars', '13 mai', 'months'), 2.0, 1)
-        self.assertAlmostEqual(temporal('13 march', '13 may', 'months',
-                                        parserinfo=dateparser.parserinfo), 2.0, 1)
-
-        #Test fuzzyness
-        self.assertEqual(temporal('Jean est né le 1er octobre 1958',
-                                  'Le 01-10-1958, Jean est né'), 0)
-
-        #Test symetry
-        self.assertEqual(temporal('14-08-1991', '15/08/1992'),
-                         temporal('15/08/1992', '14/08/1991'))
-
-    def test_euclidean(self):
-        self.assertEqual(euclidean(10, 11), 1)
-        self.assertEqual(euclidean(-10, 11), 21)
-        self.assertEqual(euclidean('-10', '11'), 21)
-
-        #Test symetry
-        self.assertEqual(euclidean(10, 11),
-                         euclidean(11, 10))
-
-    def test_geographical(self):
-        paris = (48.856578, 2.351828)
-        london = (51.504872, -0.07857)
-        dist_parislondon = geographical(paris, london, in_radians=False)
-
-        self.assertAlmostEqual(dist_parislondon, 341564, 0)
-
-
-class NormalizerTestCase(unittest2.TestCase):
-    def setUp(self):
-        self.lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
-
-    def test_unormalize(self):
-        self.assertEqual(lunormalize(u'bépoèàÀêùï'),
-                                     u'bepoeaaeui')
-
-    def test_simplify(self):
-        self.assertEqual(simplify(u"J'aime les frites, les pommes et les" \
-                                  u" scoubidous !", self.lemmas),
-                         u"aimer frites pomme scoubidou")
-
-    def test_tokenize(self):
-        self.assertEqual(tokenize(u"J'aime les frites !"),
-                         [u"J'", u'aime', u'les', u'frites', u'!',])
-
-    def test_lemmatizer(self):
-        self.assertEqual(lemmatized(u'sacré rubert', self.lemmas), u'sacré rubert')
-        self.assertEqual(lemmatized(u"J'aime les frites !", self.lemmas),
-                         u'je aimer le frite')
-        self.assertEqual(lemmatized(u", J'aime les frites", self.lemmas),
-                         u'je aimer le frite')
-
-    def test_round(self):
-        self.assertEqual(roundstr(3.14159, 2), '3.14')
-        self.assertEqual(roundstr(3.14159), '3')
-        self.assertEqual(roundstr('3.14159', 3), '3.142')
-
-    def test_format(self):
-        string = u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]'
-        regex  = r'\[(?P<firstname>\w+) (?P<lastname>\w+) - ' \
-                 r'(?P<birthdate>.*) \/ (?P<deathdate>.*?)\]'
-        output = u'%(lastname)s, %(firstname)s (%(birthdate)s - %(deathdate)s)'
-        self.assertEqual(rgxformat(string, regex, output),
-                         u'Hugo, Victor (26 fev 1802 - 22 mai 1885)')
-
-        string = u'http://perdu.com/42/supertop/cool'
-        regex  = r'http://perdu.com/(?P<id>\d+).*'
-        output = u'%(id)s'
-        self.assertEqual(rgxformat(string, regex, output),
-                         u'42')
-
-class MatrixTestCase(unittest2.TestCase):
-
-    def setUp(self):
-        self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
-        self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
-        self.distance = levenshtein
-        self.matrix = am.cdist(self.input1, self.input2, self.distance, False)
-
-    def test_matrixconstruction(self):
-        d = self.distance
-        i1, i2 = self.input1, self.input2
-        m = self.matrix
-
-        for i in xrange(len(i1)):
-            for j in xrange(len(i2)):
-                self.assertAlmostEqual(m[i, j], d(i1[i], i2[j]), 4)
-
-    def test_matched(self):
-        d = self.distance
-        i1, i2 = self.input1, self.input2
-        m = self.matrix
-
-        #Only the element 1 of input1 has *exactly* matched with the element 1
-        #of input2
-        self.assertEqual(am.matched(m), {1: [(1, 0)]})
-
-        #Victor Hugo --> Victor Wugo
-        #Albert Camus --> Albert Camus, Albert Camu
-        self.assertEqual(am.matched(m, cutoff=2),
-                        {0: [(0, d(i1[0], i2[0]))], 1: [(1, d(i1[1], i2[1])),
-                                                        (2, d(i1[1], i2[2]))]})
-
-    def test_operation(self):
-        m = self.matrix
-        self.assertTrue((3 * m == m * 3).all())
-        self.assertTrue(((m - 0.5*m) == (0.5 * m)).all())
-        self.assertTrue(((m + 10*m - m * 3) == (8 * m)).all())
-
-    def test_pdist(self):
-        _input = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
-        d = self.distance
-        pdist = am.pdist(_input, self.distance, False)
-        self.assertEqual(pdist, [6, 6, 1])
-
-
-class MinLSHTest(unittest2.TestCase):
-    def test_all(self):
-        sentences = [u"Un nuage flotta dans le grand ciel bleu.",
-                     u"Des grands nuages noirs flottent dans le ciel.",
-                     u"Je n'aime pas ce genre de bandes dessinées tristes.",
-                     u"J'aime les bandes dessinées de genre comiques.",
-                     u"Pour quelle occasion vous êtes-vous apprêtée ?",
-                     u"Je les vis ensemble à plusieurs occasions.",
-                     u"Je les ai vus ensemble à plusieurs occasions.",
-                    ]
-        minlsh = Minlsh()
-        lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
-        # XXX Should works independantly of the seed. Unstability due to the bands number ?
-        minlsh.train((simplify(s, lemmas, remove_stopwords=True) for s in sentences), 1, 200)
-        self.assertEqual(minlsh.predict(0.4), set([(0, 1), (2, 3), (5,6)]))
-
-
-class DataIOTestCase(unittest2.TestCase):
-    def test_parser(self):
-        data = parsefile(path.join(TESTDIR, 'data', 'file2parse'),
-                         [0, (2, 3), 4, 1], delimiter=',')
-        self.assertEqual(data, [[1, (12, 19), u'apple', u'house'],
-                                [2, (21.9, 19), u'stramberry', u'horse'],
-                                [3, (23, 2.17), u'cherry', u'flower']])
-
-        data = parsefile(path.join(TESTDIR, 'data', 'file2parse'),
-                         [0, (2, 3), 4, 1], delimiter=',', formatopt={2:str})
-        self.assertEqual(data, [[1, ('12', 19), u'apple', u'house'],
-                                [2, ('21.9', 19), u'stramberry', u'horse'],
-                                [3, ('23', 2.17), u'cherry', u'flower']])
-
-    def test_autocasted(self):
-        self.assertEqual(autocasted('1'), 1)
-        self.assertEqual(autocasted('1.'), 1.)
-        self.assertEqual(autocasted('1,'), 1.)
-        self.assertEqual(autocasted('1,2'), 1.2)
-        self.assertEqual(autocasted('1,2X'), '1,2X')
-        self.assertEqual(autocasted(u'tété'), u'tété')
-        self.assertEqual(autocasted('tété', encoding='utf-8'), u'tété')
-
-    def test_split_file(self):
-        NBLINES = 190
-        with tempdir() as outputdir:
-            file2split = path.join(TESTDIR, 'data', 'file2split')
-            files = split_file(file2split, outputdir, nblines=NBLINES)
-
-            alllines = []
-            nbfiles = len(files)
-            for num, localpath in enumerate(sorted(files)):
-                fullpath = path.join(outputdir, localpath)
-                with open(fullpath) as fobj:
-                    lines = fobj.readlines()
-                # All files, except the last one, must be NBLINES-length.
-                if num < nbfiles - 1:
-                    self.assertEqual(len(lines), NBLINES)
-                alllines.extend(lines)
-
-            with open(file2split) as fobj:
-                self.assertEqual(fobj.readlines(), alllines)
 
 class AlignerTestCase(unittest2.TestCase):
 
-    def test_findneighbours_kdtree(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.2, 48.9)],
+    def test_align(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        # Creation of the aligner object
+        processings = (GeographicalProcessing(2, 2, units='km'),)
+        aligner = alig.BaseAligner(threshold=30, processings=processings)
+        mat, matched = aligner.align(refset, targetset)
+        true_matched = [(0,0), (0, 2), (1,2), (3,1)]
+        for k, values in matched.iteritems():
+            for v, distance in values:
+                self.assertIn((k,v), true_matched)
+
+    def test_neighbours_align(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
                      ['T2', 'labelt2', (5.3, 48.2)],
                      ['T3', 'labelt3', (6.25, 48.91)],
                      ]
-        neighbours = alig.findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
-        self.assertEqual(neighbours, [[[0], [0, 2]], [[1], [0, 2]], [[2], [1]], [[3], [1]]])
-
-    def test_normalize_set(self):
-        treatments = {1: {'normalization': [simplify,]}}
-
-        alignlist = [['Label1', u"Un nuage flotta dans le grand ciel bleu."],
-                     ['Label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
-                     ['Label3', u"Je les vis ensemble à plusieurs occasions."],
-                     ['Label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
-                     ['Label5', u"Ensemble et à plusieurs occasions, je les vis."],
-                    ]
-        aligntuple = [tuple(l) for l in alignlist]
-
-        normalizedlist = alig.normalize_set(alignlist, treatments)
-        normalizedtuple = alig.normalize_set(aligntuple, treatments)
-
-        self.assertListEqual(normalizedlist, normalizedtuple)
-        self.assertListEqual(normalizedlist,
-                        [['Label1', u"nuage flotta grand ciel bleu"],
-                         ['Label2', u"occasion êtes apprêtée"],
-                         ['Label3', u"vis ensemble à plusieurs occasions"],
-                         ['Label4', u"n aime genre bandes dessinées tristes"],
-                         ['Label5', u"ensemble à plusieurs occasions vis"],
-                        ])
+        # Creation of the aligner object
+        true_matched = set([(0,0), (0, 2), (1,2), (3,1)])
+        processings = (GeographicalProcessing(2, 2, units='km'),)
+        aligner = alig.BaseAligner(threshold=30, processings=processings)
+        blocking = blo.KdTreeBlocking(ref_attr_index=2,
+                                      target_attr_index=2,
+                                      threshold=0.3)
+        blocking.fit(refset, targetset)
+        predict_matched = set()
+        for alignind, targetind in blocking.iter_blocks():
+            mat, matched = aligner._get_match(refset, targetset, alignind, targetind)
+            for k, values in matched.iteritems():
+                for v, distance in values:
+                    predict_matched.add((k, v))
+        self.assertEqual(true_matched, predict_matched)
 
-    def test_findneighbours_minhashing(self):
-        lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
-        treatments = {2: {'normalization': [simplify,], 'norm_params': {'lemmas': lemmas}}}
-        alignset = [['V1', 'label1', u"Un nuage flotta dans le grand ciel bleu."],
-                    ['V2', 'label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
-                    ['V3', 'label3', u"Je les vis ensemble à plusieurs occasions."],
-                    ['V4', 'label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
-                    ['V5', 'label5', u"Ensemble et à plusieurs occasions, je les vis."],
-                    ]
-        targetset = [['T1', 'labelt1', u"Des grands nuages noirs flottent dans le ciel."],
-                     ['T2', 'labelt2', u"Je les ai vus ensemble à plusieurs occasions."],
-                     ['T3', 'labelt3', u"J'aime les bandes dessinées de genre comiques."],
-                     ]
-        alignset = alig.normalize_set(alignset, treatments)
-        targetset = alig.normalize_set(targetset, treatments)
-        neighbours = alig.findneighbours_minhashing(alignset, targetset, indexes=(2, 2), threshold=0.4)
-        for align in ([[2, 4], [1]], [[0], [0]], [[3], [2]]):
-            self.assertIn(align, neighbours)
-
-    def test_findneighbours_clustering(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.2, 48.9)],
+    def test_divide_and_conquer_align(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
                      ['T2', 'labelt2', (5.3, 48.2)],
                      ['T3', 'labelt3', (6.25, 48.91)],
                      ]
-        try:
-            import sklearn as skl
-        except:
-            print 'Scikit learn does not seem to be installed - Skipping test'
-            return
-        if int(skl.__version__.split('-')[0].split('.')[1])<11:
-            print 'Scikit learn version is too old - Skipping test'
-            return
-        neighbours = alig.findneighbours_clustering(alignset, targetset, indexes=(2, 2))
-        for neighbour in neighbours:
-            self.assertIn(neighbour, [[[0, 1], [0, 2]], [[2, 3], [1]]])
+        # Creation of the aligner object
+        true_matched = set([(0,0), (0, 2), (1,2), (3,1)])
+        processings = (GeographicalProcessing(2, 2, units='km'),)
+        aligner = alig.BaseAligner(threshold=30, processings=processings)
+        aligner.register_blocking(blo.KdTreeBlocking(ref_attr_index=2,
+                                                     target_attr_index=2,
+                                                     threshold=0.3))
+        global_mat, global_matched = aligner.align(refset, targetset)
+        predict_matched = set()
+        for k, values in global_matched.iteritems():
+            for v, distance in values:
+                predict_matched.add((k, v))
+        self.assertEqual(true_matched, predict_matched)
 
-    def test_align(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+    def test_alignall(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
                     ['V2', 'label2', (6.2, 49)],
                     ['V3', 'label3', (5.1, 48)],
                     ['V4', 'label4', (5.2, 48.1)],
@@ -385,114 +111,21 @@
                      ['T2', 'labelt2', (5.3, 48.2)],
                      ['T3', 'labelt3', (6.25, 48.91)],
                      ]
-        treatments = {2: {'metric': 'geographical', 'matrix_normalized':False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-        mat, matched = alig.align(alignset, targetset, 30, treatments)
-        true_matched = [(0,0), (0, 2), (1,2), (3,1)]
-        for k, values in matched.iteritems():
-            for v, distance in values:
-                self.assertIn((k,v), true_matched)
-
-    def test_neighbours_align(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        true_matched = set([(0,0), (0, 2), (1,2), (3,1)])
-        neighbours = alig.findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
-        treatments = {2: {'metric': 'geographical', 'matrix_normalized':False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-        predict_matched = set()
-        for alignind, targetind in neighbours:
-            mat, matched = alig.subalign(alignset, targetset, alignind, targetind, 30, treatments)
-            for k, values in matched.iteritems():
-                for v, distance in values:
-                    predict_matched.add((k, v))
-        self.assertEqual(predict_matched, true_matched)
-
-    def test_divide_and_conquer_align(self):
-        true_matched = set([(0,0), (0, 2), (1,2), (3,1)])
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        treatments = {2: {'metric': 'geographical', 'matrix_normalized':False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-        global_mat, global_matched = alig.conquer_and_divide_alignment(alignset, targetset,
-                                                                       threshold=30,
-                                                                       treatments=treatments,
-                                                                       indexes=(2,2),
-                                                                       neighbours_threshold=0.3)
-        predict_matched = set()
-        for k, values in global_matched.iteritems():
-            for v, distance in values:
-                predict_matched.add((k, v))
-        self.assertEqual(predict_matched, true_matched)
-
-    def test_alignall(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        all_matched = set([('V1','T1'), ('V1', 'T3'), ('V2','T3'), ('V4','T2')])
-        uniq_matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
-        treatments = {2: {'metric': 'geographical', 'matrix_normalized': False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-
-        predict_uniq_matched = set(alig.alignall(alignset, targetset,
-                                                 threshold=30,
-                                                 treatments=treatments,
-                                                 indexes=(2,2),
-                                                 neighbours_threshold=0.3,
-                                                 uniq=True))
-        predict_matched = set(alig.alignall(alignset, targetset,
-                                            threshold=30,
-                                            treatments=treatments,
-                                            indexes=(2,2),
-                                            neighbours_threshold=0.3,
-                                            uniq=False))
-
-        self.assertEqual(predict_matched, all_matched)
-        self.assertEqual(predict_uniq_matched, uniq_matched)
-
-    def test_alignall_iterative(self):
-        matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
-        treatments = {2: {'metric': 'geographical', 'matrix_normalized': False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-
-        _format={'indexes': [0, 1, (2, 3)]}
-        alignements = alig.alignall_iterative(path.join(TESTDIR, 'data',
-                                                        'alignfile.csv'),
-                                              path.join(TESTDIR, 'data',
-                                                        'targetfile.csv'),
-                                              _format, _format, threshold=30,
-                                              size=2, #very small files ;)
-                                              treatments=treatments,
-                                              indexes=(2,2),
-                                              neighbours_threshold=0.3)
-
-        predict_matched = set([(a, t) for (a, (t, _)) in
-                               alignements.iteritems()])
-        self.assertEqual(predict_matched, matched)
-
-
-
-
+        all_matched = [('V1','T1'), ('V1', 'T3'), ('V2','T3'), ('V4','T2')]
+        uniq_matched = [('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')]
+        processings = (GeographicalProcessing(2, 2, units='km'),)
+        aligner = alig.BaseAligner(threshold=30, processings=processings)
+        aligner.register_blocking(blo.KdTreeBlocking(ref_attr_index=2,
+                                                     target_attr_index=2,
+                                                     threshold=0.3))
+        unimatched = list(aligner.get_aligned_pairs(refset, targetset, unique=True))
+        matched = list(aligner.get_aligned_pairs(refset, targetset, unique=False))
+        self.assertEqual(len(matched), len(all_matched))
+        for m in all_matched:
+            self.assertIn(m, matched)
+        self.assertEqual(len(unimatched), len(uniq_matched))
+        for m in uniq_matched:
+            self.assertIn(m, unimatched)
 
 
 if __name__ == '__main__':
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_blocking.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,203 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+from os import path
+from functools import partial
+import random
+random.seed(6) ### Make sure tests are repeatable / Minhashing
+
+from nazca.distances import (levenshtein, soundex, soundexcode,
+                             jaccard, euclidean, geographical)
+from nazca.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
+                            SoundexBlocking, KmeansBlocking,
+                            MinHashingBlocking, KdTreeBlocking)
+from nazca.normalize import SimplifyNormalizer, loadlemmas
+
+
+TESTDIR = path.dirname(__file__)
+
+SOUNDEX_REFSET = (('a1', 'smith'),
+                  ('a2', 'neighan'),
+                  ('a3', 'meier'),
+                  ('a4', 'smithers'),
+                  ('a5', 'nguyen'),
+                  ('a6', 'faulkner'),
+                  ('a7', 'sandy'))
+SOUNDEX_TARGETSET = (('b1', 'meier'),
+                     ('b2', 'meier'),
+                     ('b3', 'smith'),
+                     ('b4', 'nguyen'),
+                     ('b5', 'fawkner'),
+                     ('b6', 'santi'),
+                     ('b7', 'cain'))
+SOUNDEX_PAIRS = (('a3', 'b1'),
+                 ('a3', 'b2'),
+                 ('a2', 'b4'),
+                 ('a5', 'b4'),
+                 ('a1', 'b3'),
+                 ('a1', 'b6'),
+                 ('a7', 'b3'),
+                 ('a7', 'b6'),)
+
+
+class KeyBlockingTest(unittest2.TestCase):
+
+    def test_keyblocking_blocks(self):
+        blocking = KeyBlocking(ref_attr_index=1, target_attr_index=1,
+                               callback=partial(soundexcode, language='english'))
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        blocks = list(blocking.iter_blocks())
+        self.assertEqual(len(blocks), 3)
+        self.assertIn((['a1', 'a7'], ['b3', 'b6']), blocks)
+        self.assertIn((['a2', 'a5'], ['b4']), blocks)
+        self.assertIn((['a3'], ['b1', 'b2']), blocks)
+
+    def test_keyblocking_couples(self):
+        blocking = KeyBlocking(ref_attr_index=1, target_attr_index=1,
+                               callback=partial(soundexcode, language='english'))
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        pairs = list(blocking.iter_pairs())
+        self.assertEqual(len(pairs), 8)
+        for pair in SOUNDEX_PAIRS:
+            self.assertIn(pair, pairs)
+
+    def test_soundex_blocks(self):
+        blocking = SoundexBlocking(ref_attr_index=1, target_attr_index=1,
+                                   language='english')
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        blocks = list(blocking.iter_blocks())
+        self.assertEqual(len(blocks), 3)
+        self.assertIn((['a1', 'a7'], ['b3', 'b6']), blocks)
+        self.assertIn((['a2', 'a5'], ['b4']), blocks)
+        self.assertIn((['a3'], ['b1', 'b2']), blocks)
+
+    def test_soundex_couples(self):
+        blocking = SoundexBlocking(ref_attr_index=1, target_attr_index=1,
+                                   language='english')
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        pairs = list(blocking.iter_pairs())
+        self.assertEqual(len(pairs), 8)
+        for pair in SOUNDEX_PAIRS:
+            self.assertIn(pair, pairs)
+
+
+class SortedNeighborhoodBlockingTest(unittest2.TestCase):
+
+    def test_sorted_neighborhood_blocks(self):
+        blocking = SortedNeighborhoodBlocking(ref_attr_index=1, target_attr_index=1,
+                                              window_width=1)
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        blocks = list(blocking.iter_blocks())
+        true_blocks = [(['a6'], ['b7', 'b5']), (['a3'], ['b5', 'b1']),
+                       (['a2'], ['b2']), (['a5'], ['b4']), (['a7'], ['b4', 'b6']),
+                       (['a1'], ['b6', 'b3']), (['a4'], ['b3'])]
+        self.assertEqual(len(blocks), len(true_blocks))
+        for block in true_blocks:
+            self.assertIn(block, blocks)
+
+    def test_sorted_neighborhood_keyfunc(self):
+        """ Test sort reversing values
+        """
+        blocking = SortedNeighborhoodBlocking(ref_attr_index=1, target_attr_index=1,
+                                              key_func=lambda x:x[::-1], window_width=1)
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        blocks = list(blocking.iter_blocks())
+        true_blocks = [(['a1'], ['b3']), (['a2'], ['b6']), (['a5'], ['b4']), (['a3'], ['b7', 'b1']),
+                       (['a6'], ['b2', 'b5']), (['a4'], ['b5'])]
+        self.assertEqual(len(blocks), len(true_blocks))
+        for block in true_blocks:
+            self.assertIn(block, blocks)
+
+
+class KmeansBlockingTest(unittest2.TestCase):
+
+    def test_clustering_blocking_kmeans(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        targetset = [['T1', 'labelt1', (6.2, 48.9)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        try:
+            import sklearn as skl
+        except ImportError:
+            self.skipTest('Scikit learn does not seem to be installed')
+        if int(skl.__version__.split('-')[0].split('.')[1]) <= 11:
+            self.skipTest('Scikit learn version is too old - Skipping test')
+        blocking = KmeansBlocking(ref_attr_index=2, target_attr_index=2)
+        blocking.fit(refset, targetset)
+        # Blocks
+        blocks = list(blocking.iter_blocks())
+        self.assertEqual(len(blocks), 2)
+        self.assertIn(([0, 1], [0, 2]), blocks)
+        self.assertIn(([2, 3], [1]), blocks)
+        # Pairs
+        pairs = list(blocking.iter_pairs())
+        self.assertEqual(len(pairs), 6)
+        for pair in ((0, 0), (0, 2), (1, 0), (1, 2), (2, 1), (3, 1)):
+            self.assertIn(pair, pairs)
+
+
+class MinHashingBlockingTest(unittest2.TestCase):
+
+    def test_minhashing(self):
+        refset = [['V1', 'label1', u"Un nuage flotta dans le grand ciel bleu."],
+                  ['V2', 'label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
+                  ['V3', 'label3', u"Je les vis ensemble à plusieurs occasions."],
+                  ['V4', 'label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
+                  ['V5', 'label5', u"Ensemble et à plusieurs occasions, je les vis."],
+                  ]
+        targetset = [['T1', 'labelt1', u"Des grands nuages noirs flottent dans le ciel."],
+                     ['T2', 'labelt2', u"Je les ai vus ensemble à plusieurs occasions."],
+                     ['T3', 'labelt3', u"J'aime les bandes dessinées de genre comiques."],
+                     ]
+        lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
+        normalizer = SimplifyNormalizer(attr_index=2, lemmas=lemmas)
+        refset = normalizer.normalize_dataset(refset)
+        targetset = normalizer.normalize_dataset(targetset)
+        blocking = MinHashingBlocking(threshold=0.4, ref_attr_index=2, target_attr_index=2)
+        blocking.fit(refset, targetset)
+        blocks = list(blocking.iter_blocks())
+        for align in (([2, 4], [1]), ([0], [0]), ([3], [2])):
+            self.assertIn(align, blocks)
+
+
+class KdTreeBlockingTest(unittest2.TestCase):
+
+    def test_kdtree(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        targetset = [['T1', 'labelt1', (6.2, 48.9)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        blocking = KdTreeBlocking(threshold=0.3, ref_attr_index=2, target_attr_index=2)
+        blocking.fit(refset, targetset)
+        blocks = list(blocking.iter_blocks())
+        self.assertEqual([([0], [0, 2]), ([1], [0, 2]), ([2], [1]), ([3], [1])], blocks)
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_dataio.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,88 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest2
+import shutil
+from contextlib import contextmanager
+from os import path
+from tempfile import mkdtemp
+
+from nazca.dataio import parsefile, autocasted, split_file
+
+
+TESTDIR = path.dirname(__file__)
+
+@contextmanager
+def tempdir():
+    try:
+        temp = mkdtemp()
+        yield temp
+    finally:
+        try:
+            shutil.rmtree(temp)
+        except:
+            pass
+
+
+class DataIOTestCase(unittest2.TestCase):
+    def test_parser(self):
+        data = parsefile(path.join(TESTDIR, 'data', 'file2parse'),
+                         [0, (2, 3), 4, 1], delimiter=',')
+        self.assertEqual([[1, (12, 19), u'apple', u'house'],
+                          [2, (21.9, 19), u'stramberry', u'horse'],
+                          [3, (23, 2.17), u'cherry', u'flower']], data)
+
+        data = parsefile(path.join(TESTDIR, 'data', 'file2parse'),
+                         [0, (2, 3), 4, 1], delimiter=',', formatopt={2:str})
+        self.assertEqual([[1, ('12', 19), u'apple', u'house'],
+                          [2, ('21.9', 19), u'stramberry', u'horse'],
+                          [3, ('23', 2.17), u'cherry', u'flower']], data)
+
+    def test_autocasted(self):
+        self.assertEqual(autocasted('1'), 1)
+        self.assertEqual(autocasted('1.'), 1.)
+        self.assertEqual(autocasted('1,'), 1.)
+        self.assertEqual(autocasted('1,2'), 1.2)
+        self.assertEqual(autocasted('1,2X'), '1,2X')
+        self.assertEqual(autocasted(u'tété'), u'tété')
+        self.assertEqual(autocasted('tété', encoding='utf-8'), u'tété')
+
+    def test_split_file(self):
+        NBLINES = 190
+        with tempdir() as outputdir:
+            file2split = path.join(TESTDIR, 'data', 'file2split')
+            files = split_file(file2split, outputdir, nblines=NBLINES)
+
+            alllines = []
+            nbfiles = len(files)
+            for num, localpath in enumerate(sorted(files)):
+                fullpath = path.join(outputdir, localpath)
+                with open(fullpath) as fobj:
+                    lines = fobj.readlines()
+                # All files except the last one must contain exactly NBLINES lines.
+                if num < nbfiles - 1:
+                    self.assertEqual(len(lines), NBLINES)
+                alllines.extend(lines)
+
+            with open(file2split) as fobj:
+                self.assertEqual(alllines, fobj.readlines())
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_distances.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,160 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest2
+import random
+random.seed(6) ### Make sure tests are repeatable
+from dateutil import parser as dateparser
+
+from nazca.distances import (levenshtein, soundex, soundexcode,
+                             jaccard, euclidean, geographical,
+                             LevenshteinProcessing)
+
+
+class DistancesTest(unittest2.TestCase):
+    def test_levenshtein(self):
+        self.assertEqual(levenshtein('niche', 'chiens'), 5)
+        self.assertEqual(levenshtein('bonjour', 'bonjour !'), 1)
+        self.assertEqual(levenshtein('bon', 'bonjour'), 4)
+        self.assertEqual(levenshtein('Victor Hugo', 'Hugo Victor'), 0)
+
+        # Test symmetry
+        self.assertEqual(levenshtein('Victor Hugo', 'Vitor Wugo'),
+                         levenshtein('Vitor Wugo', 'Victor Hugo'))
+
+    def test_soundex(self):
+        ## Test cases taken from the English Wikipedia article on Soundex:
+        # Using this algorithm,
+        # both "Robert" and "Rupert" return the same string "R163",
+        # while "Rubin" yields "R150".
+        #
+        # "Ashcraft" and "Ashcroft" both yield "A261" and not "A226"
+        # (the chars 's' and 'c' in the name receive a single code
+        # of 2 and not 22, since an 'h' lies between them).
+        #
+        # "Tymczak" yields "T522", not "T520"
+        # (the chars 'z' and 'k' in the name are both coded as 2, since a vowel
+        # lies between them).
+        #
+        # "Pfister" yields "P236", not "P123" (the first two letters have the same
+        # number and are coded once as 'P').
+
+        self.assertEqual(soundexcode('Robert', 'english'), 'R163')
+        self.assertEqual(soundexcode('Rubert', 'english'), 'R163')
+        self.assertEqual(soundexcode('Rubin', 'english'), 'R150')
+        self.assertEqual(soundexcode('Ashcraft', 'english'), 'A261')
+        self.assertEqual(soundexcode('Tymczak', 'english'), 'T522')
+        self.assertEqual(soundexcode('Pfister', 'english'), 'P236')
+
+        self.assertEqual(soundex('Rubert', 'Robert', 'english'), 0)
+        self.assertEqual(soundex('Rubin', 'Robert', 'english'), 1)
+
+    def test_jaccard(self):
+        # The Jaccard index of two strings is the ratio of the number of
+        # common words to the total number of distinct words
+        # (each word is counted only once).
+        # The distance is 1 - jaccard_index.
+
+        self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
+        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 1, 2)
+        self.assertAlmostEqual(jaccard(u'sacré rubert', u'sacré hubert'), 0.667, 2)
+
+        # Test symmetry
+        self.assertEqual(jaccard('orange', 'morange'),
+                         jaccard('morange', 'orange'))
+
+    def test_temporal(self):
+        #Test the distance between two dates. The distance can be given in
+        #``days``, ``months`` or ``years``
+        try:
+            from nazca.distances import temporal
+        except ImportError:
+            self.skipTest('temporal distance is not available')
+        self.assertEqual(temporal('14 aout 1991', '14/08/1991'), 0)
+        self.assertEqual(temporal('14 aout 1991', '08/14/1991'), 0)
+        self.assertEqual(temporal('14 aout 1991', '08/15/1992'), 367)
+        #Test a case of ambiguity
+        self.assertEqual(temporal('1er mai 2012', '01/05/2012'), 0)
+        self.assertEqual(temporal('1er mai 2012', '05/01/2012', dayfirst=False), 0)
+        #Test the different granularities available
+        self.assertAlmostEqual(temporal('14 aout 1991', '08/15/1992', 'years'), 1.0, 1)
+        self.assertAlmostEqual(temporal('1991', '1992', 'years'), 1.0, 1)
+        self.assertAlmostEqual(temporal('13 mars', '13 mai', 'months'), 2.0, 1)
+        self.assertAlmostEqual(temporal('13 march', '13 may', 'months',
+                                        parserinfo=dateparser.parserinfo), 2.0, 1)
+
+        # Test fuzziness
+        self.assertEqual(temporal('Jean est né le 1er octobre 1958',
+                                  'Le 01-10-1958, Jean est né'), 0)
+
+        # Test symmetry
+        self.assertEqual(temporal('14-08-1991', '15/08/1992'),
+                         temporal('15/08/1992', '14/08/1991'))
+
+    def test_euclidean(self):
+        self.assertEqual(euclidean(10, 11), 1)
+        self.assertEqual(euclidean(-10, 11), 21)
+        self.assertEqual(euclidean('-10', '11'), 21)
+
+        # Test symmetry
+        self.assertEqual(euclidean(10, 11),
+                         euclidean(11, 10))
+
+    def test_geographical(self):
+        paris = (48.856578, 2.351828)
+        london = (51.504872, -0.07857)
+        dist_parislondon = geographical(paris, london, in_radians=False)
+
+        self.assertAlmostEqual(dist_parislondon, 341564, 0)
+
+
+class MatrixTestCase(unittest2.TestCase):
+
+    def setUp(self):
+        self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
+        self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
+        self.distance = levenshtein
+        processing = LevenshteinProcessing()
+        self.matrix = processing.cdist(self.input1, self.input2)
+
+    def test_matrixconstruction(self):
+        d = self.distance
+        i1, i2 = self.input1, self.input2
+        m = self.matrix
+
+        for i in xrange(len(i1)):
+            for j in xrange(len(i2)):
+                self.assertAlmostEqual(m[i, j], d(i1[i], i2[j]), 4)
+
+    def test_operation(self):
+        m = self.matrix
+        self.assertTrue((3 * m == m * 3).all())
+        self.assertTrue(((m - 0.5*m) == (0.5 * m)).all())
+        self.assertTrue(((m + 10*m - m * 3) == (8 * m)).all())
+
+    def test_pdist(self):
+        _input = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
+        d = self.distance
+        processing = LevenshteinProcessing()
+        pdist = processing.pdist(_input)
+        self.assertEqual([6, 6, 1], pdist)
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_minhashing.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,50 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest2
+from os import path
+import random
+random.seed(6) ### Make sure tests are repeatable
+
+from nazca.normalize import loadlemmas, simplify
+from nazca.minhashing import Minlsh
+
+TESTDIR = path.dirname(__file__)
+
+
+
+class MinLSHTest(unittest2.TestCase):
+    def test_all(self):
+        sentences = [u"Un nuage flotta dans le grand ciel bleu.",
+                     u"Des grands nuages noirs flottent dans le ciel.",
+                     u"Je n'aime pas ce genre de bandes dessinées tristes.",
+                     u"J'aime les bandes dessinées de genre comiques.",
+                     u"Pour quelle occasion vous êtes-vous apprêtée ?",
+                     u"Je les vis ensemble à plusieurs occasions.",
+                     u"Je les ai vus ensemble à plusieurs occasions.",
+                    ]
+        minlsh = Minlsh()
+        lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
+        # XXX Should work independently of the seed. Instability due to the number of bands?
+        minlsh.train((simplify(s, lemmas, remove_stopwords=True) for s in sentences), 1, 200)
+        self.assertEqual(set([(0, 1), (2, 3), (5,6)]), minlsh.predict(0.4))
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_normalize.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,189 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest2
+from os import path
+
+from nazca.normalize import (BaseNormalizer, UnicodeNormalizer,
+                             SimplifyNormalizer, TokenizerNormalizer,
+                             LemmatizerNormalizer, RoundNormalizer,
+                             RegexpNormalizer, NormalizerPipeline,
+                             lunormalize, loadlemmas, lemmatized,
+                             roundstr, rgxformat, tokenize, simplify)
+
+
+TESTDIR = path.dirname(__file__)
+
+
+class NormalizerFunctionTestCase(unittest2.TestCase):
+    def setUp(self):
+        self.lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
+
+    def test_unormalize(self):
+        self.assertEqual(lunormalize(u'bépoèàÀêùï'),
+                                     u'bepoeaaeui')
+
+    def test_simplify(self):
+        self.assertEqual(simplify(u"J'aime les frites, les pommes et les" \
+                                  u" scoubidous !", self.lemmas),
+                         u"aimer frites pomme scoubidou")
+
+    def test_tokenize(self):
+        self.assertEqual(tokenize(u"J'aime les frites !"),
+                         [u"J'", u'aime', u'les', u'frites', u'!',])
+
+    def test_lemmatizer(self):
+        self.assertEqual(lemmatized(u'sacré rubert', self.lemmas), u'sacré rubert')
+        self.assertEqual(lemmatized(u"J'aime les frites !", self.lemmas),
+                         u'je aimer le frite')
+        self.assertEqual(lemmatized(u", J'aime les frites", self.lemmas),
+                         u'je aimer le frite')
+
+    def test_round(self):
+        self.assertEqual(roundstr(3.14159, 2), '3.14')
+        self.assertEqual(roundstr(3.14159), '3')
+        self.assertEqual(roundstr('3.14159', 3), '3.142')
+
+    def test_format(self):
+        string = u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]'
+        regex  = r'\[(?P<firstname>\w+) (?P<lastname>\w+) - ' \
+                 r'(?P<birthdate>.*) \/ (?P<deathdate>.*?)\]'
+        output = u'%(lastname)s, %(firstname)s (%(birthdate)s - %(deathdate)s)'
+        self.assertEqual(rgxformat(string, regex, output),
+                         u'Hugo, Victor (26 fev 1802 - 22 mai 1885)')
+
+        string = u'http://perdu.com/42/supertop/cool'
+        regex  = r'http://perdu.com/(?P<id>\d+).*'
+        output = u'%(id)s'
+        self.assertEqual(rgxformat(string, regex, output),
+                         u'42')
+
+
+class NormalizerObjectTestCase(unittest2.TestCase):
+    def setUp(self):
+        self.lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
+
+    def test_normalizer(self):
+        normalizer = BaseNormalizer(lunormalize)
+        self.assertEqual(normalizer.normalize(u'bépoèàÀêùï'), u'bepoeaaeui')
+
+    def test_normalizer_record(self):
+        normalizer = BaseNormalizer(lunormalize, attr_index=1)
+        record = ('a1', u'bépoèàÀêùï')
+        self.assertEqual(normalizer.normalize(record), ['a1',u'bepoeaaeui'])
+
+    def test_normalizer_dataset(self):
+        normalizer = BaseNormalizer(lunormalize, attr_index=1)
+        dataset = [('a1', u'bépoèàÀêùï'), ('a2', u'tàtà')]
+        results = normalizer.normalize_dataset(dataset)
+        self.assertEqual([['a1', u'bepoeaaeui'], ['a2', u'tata']], results)
+        self.assertNotEqual(results, dataset)
+
+    def test_normalizer_dataset_inplace(self):
+        normalizer = BaseNormalizer(lunormalize, attr_index=1)
+        dataset = [('a1', u'bépoèàÀêùï'), ('a2', u'tàtà')]
+        normalizer.normalize_dataset(dataset, inplace=True)
+        self.assertEqual([['a1', u'bepoeaaeui'], ['a2', u'tata']], dataset)
+
+    def test_unormalize(self):
+        normalizer = UnicodeNormalizer()
+        self.assertEqual(normalizer.normalize(u'bépoèàÀêùï'), u'bepoeaaeui')
+
+    def test_unormalize_record(self):
+        normalizer = UnicodeNormalizer(attr_index=1)
+        record = ('a1', u'bépoèàÀêùï')
+        self.assertEqual(['a1',u'bepoeaaeui'], normalizer.normalize(record))
+
+    def test_simplify(self):
+        normalizer = SimplifyNormalizer(lemmas=self.lemmas)
+        self.assertEqual(normalizer.normalize(u"J'aime les frites, les pommes et les scoubidous !")
+                         , u"aimer frites pomme scoubidou")
+
+    def test_simplify_record(self):
+        normalizer = SimplifyNormalizer(attr_index=1, lemmas=self.lemmas)
+        self.assertEqual(['a1', u"aimer frites pomme scoubidou"],
+                         normalizer.normalize(['a1', u"J'aime les frites, les pommes "
+                                               "et les scoubidous !"]))
+
+    def test_tokenize(self):
+        normalizer = TokenizerNormalizer()
+        self.assertEqual([u"J'", u'aime', u'les', u'frites', u'!',],
+                         normalizer.normalize(u"J'aime les frites !"))
+
+    def test_tokenize_record(self):
+        normalizer = TokenizerNormalizer(attr_index=1)
+        self.assertEqual(['a1', [u"J'", u'aime', u'les', u'frites', u'!',]],
+                         normalizer.normalize(['a1', u"J'aime les frites !"]))
+
+    def test_lemmatizer(self):
+        normalizer = LemmatizerNormalizer(self.lemmas)
+        self.assertEqual(normalizer.normalize(u'sacré rubert'), u'sacré rubert')
+        self.assertEqual(normalizer.normalize(u"J'aime les frites !"), u'je aimer le frite')
+        self.assertEqual(normalizer.normalize(u", J'aime les frites"), u'je aimer le frite')
+
+    def test_lemmatizer_record(self):
+        normalizer = LemmatizerNormalizer(self.lemmas, attr_index=1)
+        self.assertEqual(['a1', u'sacré rubert'],
+                         normalizer.normalize(['a1', u'sacré rubert']))
+        self.assertEqual(['a1', u'je aimer le frite'],
+                         normalizer.normalize(['a1', u"J'aime les frites !"]))
+        self.assertEqual(['a1', u'je aimer le frite'],
+                         normalizer.normalize(['a1', u", J'aime les frites"]))
+
+    def test_round(self):
+        normalizer = RoundNormalizer()
+        self.assertEqual(normalizer.normalize(3.14159), '3')
+        normalizer = RoundNormalizer(ndigits=2)
+        self.assertEqual(normalizer.normalize(3.14159), '3.14')
+        normalizer = RoundNormalizer(ndigits=3)
+        self.assertEqual(normalizer.normalize(3.14159), '3.142')
+
+    def test_round_record(self):
+        normalizer = RoundNormalizer(attr_index=1)
+        self.assertEqual(['a1', '3'], normalizer.normalize(['a1', 3.14159]))
+        normalizer = RoundNormalizer(attr_index=1, ndigits=2)
+        self.assertEqual(['a1', '3.14'], normalizer.normalize(['a1', 3.14159]))
+        normalizer = RoundNormalizer(attr_index=1, ndigits=3)
+        self.assertEqual(['a1', '3.142'], normalizer.normalize(['a1', 3.14159]))
+
+    def test_regexp(self):
+        normalizer = RegexpNormalizer(r'http://perdu.com/(?P<id>\d+).*', u'%(id)s')
+        self.assertEqual(normalizer.normalize(u'http://perdu.com/42/supertop/cool'), u'42')
+
+    def test_regexp_record(self):
+        normalizer = RegexpNormalizer(r'http://perdu.com/(?P<id>\d+).*', u'%(id)s', attr_index=1)
+        self.assertEqual(['a1', u'42'],
+                         normalizer.normalize(['a1', u'http://perdu.com/42/supertop/cool']))
+
+
+class NormalizerPipelineTestCase(unittest2.TestCase):
+
+    def test_normalizer(self):
+        regexp = r'(?P<id>\d+);{["]?(?P<firstname>.+[^"])["]?};{(?P<surname>.*)};{};{};(?P<date>.*)'
+        output = u'%(id)s\t%(firstname)s\t%(surname)s\t%(date)s'
+        n1 = RegexpNormalizer(regexp, output)
+        n2 = BaseNormalizer(callback=lambda x: x.split('\t'))
+        n3 = UnicodeNormalizer(attr_index=(1, 2, 3))
+        pipeline = NormalizerPipeline((n1, n2, n3))
+        r1 = u'1111;{"Toto tàtà"};{Titi};{};{};'
+        self.assertEqual(['1111', 'toto tata', 'titi', u''], pipeline.normalize(r1))
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_old_api.py	Mon Oct 14 16:29:16 2013 +0000
@@ -0,0 +1,250 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest2
+import random
+random.seed(6) ### Make sure tests are repeatable
+from os import path
+
+from nazca.normalize import loadlemmas, simplify
+from nazca.old_api import (normalize_set,
+                           findneighbours_clustering,
+                           findneighbours_kdtree,
+                           findneighbours_minhashing,
+                           align, subalign,
+                           conquer_and_divide_alignment,
+                           alignall, alignall_iterative)
+
+
+TESTDIR = path.dirname(__file__)
+
+
+# Backward compatibility layer: these functions now rely on the BaseAligner internally.
+# They may be removed in a later release.
+
+
+###############################################################################
+### NORMALIZE TESTS ###########################################################
+###############################################################################
+class NormalizerFunctionTestCase(unittest2.TestCase):
+    def test_normalize_set(self):
+        processings = {1: {'normalization': [simplify,]}}
+
+        alignlist = [['Label1', u"Un nuage flotta dans le grand ciel bleu."],
+                     ['Label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
+                     ['Label3', u"Je les vis ensemble à plusieurs occasions."],
+                     ['Label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
+                     ['Label5', u"Ensemble et à plusieurs occasions, je les vis."],
+                    ]
+        aligntuple = [tuple(l) for l in alignlist]
+
+        normalizedlist = normalize_set(alignlist, processings)
+        normalizedtuple = normalize_set(aligntuple, processings)
+
+        self.assertListEqual(normalizedlist, normalizedtuple)
+        self.assertListEqual(normalizedlist,
+                        [['Label1', u"nuage flotta grand ciel bleu"],
+                         ['Label2', u"occasion êtes apprêtée"],
+                         ['Label3', u"vis ensemble à plusieurs occasions"],
+                         ['Label4', u"n aime genre bandes dessinées tristes"],
+                         ['Label5', u"ensemble à plusieurs occasions vis"],
+                        ])
+
+
+###############################################################################
+### ALIGNER TESTS #############################################################
+###############################################################################
+class AlignerFunctionsTestCase(unittest2.TestCase):
+
+    def test_align(self):
+        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+                    ['V2', 'label2', (6.2, 49)],
+                    ['V3', 'label3', (5.1, 48)],
+                    ['V4', 'label4', (5.2, 48.1)],
+                    ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        processings = {2: {'metric': 'geographical', 'matrix_normalized':False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+        mat, matched = align(alignset, targetset, 30, processings)
+        true_matched = [(0,0), (0, 2), (1,2), (3,1)]
+        for k, values in matched.iteritems():
+            for v, distance in values:
+                self.assertIn((k,v), true_matched)
+
+    def test_neighbours_align(self):
+        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+                    ['V2', 'label2', (6.2, 49)],
+                    ['V3', 'label3', (5.1, 48)],
+                    ['V4', 'label4', (5.2, 48.1)],
+                    ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        true_matched = set([(0,0), (0, 2), (1,2), (3,1)])
+        neighbours = findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
+        processings = {2: {'metric': 'geographical', 'matrix_normalized':False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+        predict_matched = set()
+        for alignind, targetind in neighbours:
+            mat, matched = subalign(alignset, targetset, alignind, targetind, 30, processings)
+            for k, values in matched.iteritems():
+                for v, distance in values:
+                    predict_matched.add((k, v))
+        self.assertEqual(true_matched, predict_matched)
+
+    def test_divide_and_conquer_align(self):
+        true_matched = set([(0,0), (0, 2), (1,2), (3,1)])
+        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+                    ['V2', 'label2', (6.2, 49)],
+                    ['V3', 'label3', (5.1, 48)],
+                    ['V4', 'label4', (5.2, 48.1)],
+                    ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        processings = {2: {'metric': 'geographical', 'matrix_normalized':False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+        global_mat, global_matched = conquer_and_divide_alignment(alignset, targetset,
+                                                                  threshold=30,
+                                                                  processings=processings,
+                                                                  indexes=(2,2),
+                                                                  neighbours_threshold=0.3)
+        predict_matched = set()
+        for k, values in global_matched.iteritems():
+            for v, distance in values:
+                predict_matched.add((k, v))
+        self.assertEqual(true_matched, predict_matched)
+
+    def test_alignall(self):
+        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+                    ['V2', 'label2', (6.2, 49)],
+                    ['V3', 'label3', (5.1, 48)],
+                    ['V4', 'label4', (5.2, 48.1)],
+                    ]
+        targetset = [['T1', 'labelt1', (6.17, 48.7)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        all_matched = set([('V1','T1'), ('V1', 'T3'), ('V2','T3'), ('V4','T2')])
+        uniq_matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
+        processings = {2: {'metric': 'geographical', 'matrix_normalized': False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+
+        predict_uniq_matched = set(alignall(alignset, targetset,
+                                            threshold=30,
+                                            processings=processings,
+                                            indexes=(2,2),
+                                            neighbours_threshold=0.3,
+                                            uniq=True))
+        predict_matched = set(alignall(alignset, targetset,
+                                       threshold=30,
+                                       processings=processings,
+                                       indexes=(2,2),
+                                       neighbours_threshold=0.3,
+                                       uniq=False))
+
+        self.assertEqual(all_matched, predict_matched)
+        self.assertEqual(uniq_matched, predict_uniq_matched)
+
+    def test_alignall_iterative(self):
+        matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
+        processings = {2: {'metric': 'geographical', 'matrix_normalized': False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+
+        _format={'indexes': [0, 1, (2, 3)]}
+        alignements = alignall_iterative(path.join(TESTDIR, 'data',
+                                                   'alignfile.csv'),
+                                         path.join(TESTDIR, 'data',
+                                                   'targetfile.csv'),
+                                         _format, _format, threshold=30,
+                                         size=2, #very small files ;)
+                                         processings=processings,
+                                         indexes=(2,2),
+                                         neighbours_threshold=0.3)
+        predict_matched = set([(a, t) for (a, (t, _)) in
+                               alignements.iteritems()])
+        self.assertEqual(matched, predict_matched)
+
+
+###############################################################################
+### NEIGHBOUR TESTS ###########################################################
+###############################################################################
+class NeighboursFunctionsTest(unittest2.TestCase):
+    # For backward compatibility
+
+    def test_findneighbours_kdtree(self):
+        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+                    ['V2', 'label2', (6.2, 49)],
+                    ['V3', 'label3', (5.1, 48)],
+                    ['V4', 'label4', (5.2, 48.1)],
+                    ]
+        targetset = [['T1', 'labelt1', (6.2, 48.9)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        neighbours = findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
+        self.assertEqual([([0], [0, 2]), ([1], [0, 2]), ([2], [1]), ([3], [1])], neighbours)
+
+    def test_findneighbours_minhashing(self):
+        lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
+        processings = {2: {'normalization': [simplify,], 'norm_params': {'lemmas': lemmas}}}
+        alignset = [['V1', 'label1', u"Un nuage flotta dans le grand ciel bleu."],
+                    ['V2', 'label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
+                    ['V3', 'label3', u"Je les vis ensemble à plusieurs occasions."],
+                    ['V4', 'label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
+                    ['V5', 'label5', u"Ensemble et à plusieurs occasions, je les vis."],
+                    ]
+        targetset = [['T1', 'labelt1', u"Des grands nuages noirs flottent dans le ciel."],
+                     ['T2', 'labelt2', u"Je les ai vus ensemble à plusieurs occasions."],
+                     ['T3', 'labelt3', u"J'aime les bandes dessinées de genre comiques."],
+                     ]
+        alignset = normalize_set(alignset, processings)
+        targetset = normalize_set(targetset, processings)
+        neighbours = findneighbours_minhashing(alignset, targetset, indexes=(2, 2), threshold=0.4)
+        for align in (([2, 4], [1]), ([0], [0]), ([3], [2])):
+            self.assertIn(align, neighbours)
+
+    def test_findneighbours_clustering(self):
+        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
+                    ['V2', 'label2', (6.2, 49)],
+                    ['V3', 'label3', (5.1, 48)],
+                    ['V4', 'label4', (5.2, 48.1)],
+                    ]
+        targetset = [['T1', 'labelt1', (6.2, 48.9)],
+                     ['T2', 'labelt2', (5.3, 48.2)],
+                     ['T3', 'labelt3', (6.25, 48.91)],
+                     ]
+        try:
+            import sklearn as skl
+        except ImportError:
+            self.skipTest('Scikit learn does not seem to be installed')
+        if int(skl.__version__.split('-')[0].split('.')[1]) <= 11:
+            self.skipTest('Scikit learn version is too old - Skipping test')
+        neighbours = findneighbours_clustering(alignset, targetset, indexes=(2, 2))
+        for neighbour in neighbours:
+            self.assertIn(neighbour, [([0, 1], [0, 2]), ([2, 3], [1])])
+
+
+if __name__ == '__main__':
+    unittest2.main()
+