[nazca] Create a record linkage directory, related to #187461
author Vincent Michel <vincent.michel@logilab.fr>
Thu, 19 Dec 2013 14:41:02 +0000
changeset 365 4491e020bca2
parent 364 a9a447c59ced
child 366 8978092150e7
[nazca] Create a record linkage directory, related to #187461
aligner.py
blocking.py
old_api.py
record_linkage/__init__.py
record_linkage/aligner.py
record_linkage/blocking.py
record_linkage/old_api.py
test/test_alignment.py
test/test_blocking.py
test/test_old_api.py
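
The modules above are moved into a new record_linkage sub-package (tests are moved accordingly).
Assuming the package is importable as nazca.record_linkage, as the file list suggests (the shipped
layout may differ), downstream code would only need its import paths updated. A hypothetical
before/after sketch:

    # before this changeset
    from nazca.aligner import BaseAligner
    from nazca.blocking import SoundexBlocking

    # after this changeset (assumed path, inferred from the file list above)
    from nazca.record_linkage.aligner import BaseAligner
    from nazca.record_linkage.blocking import SoundexBlocking
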
--- a/aligner.py	Tue Oct 22 16:04:05 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,324 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import time
-import logging
-from collections import defaultdict
-
-from scipy import zeros
-from scipy.sparse import lil_matrix
-
-from nazca.dataio import parsefile
-
-
-###############################################################################
-### UTILITY FUNCTIONS #########################################################
-###############################################################################
-def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
-    """ Return the aligned pairs
-    """
-    if unique:
-        for refid in global_matched:
-            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
-            ref_record = refset[refid]
-            target_record = targetset[bestid]
-            distance = global_mat[refid, bestid] if global_mat is not None else None
-            yield (ref_record[0], refid), (target_record[0], bestid), distance
-    else:
-        for refid in global_matched:
-            for targetid, _ in global_matched[refid]:
-                ref_record = refset[refid]
-                target_record = targetset[targetid]
-                distance = global_mat[refid, targetid] if global_mat is not None else None
-                yield (ref_record[0], refid), (target_record[0], targetid), distance
-
-
-###############################################################################
-### BASE ALIGNER OBJECT #######################################################
-###############################################################################
-class BaseAligner(object):
-
-    def __init__(self, threshold, processings, normalize_matrix=False):
-        self.threshold = threshold
-        self.processings = processings
-        self.normalize_matrix = normalize_matrix
-        self.ref_normalizer = None
-        self.target_normalizer = None
-        self.blocking = None
-        self.alignments_done = 0
-        self.pairs_found = 0
-        self.nb_comparisons = 0
-        self.nb_blocks = 0
-        self.refset_size = None
-        self.targetset_size = None
-        self.time = None
-        self.logger = logging.getLogger('nazca.aligner')
-
-    def register_ref_normalizer(self, normalizer):
-        """ Register normalizers to be applied
-        before alignment """
-        self.ref_normalizer = normalizer
-
-    def register_target_normalizer(self, normalizer):
-        """ Register normalizers to be applied
-        before alignment """
-        self.target_normalizer = normalizer
-
-    def register_blocking(self, blocking):
-        self.blocking = blocking
-
-    def apply_normalization(self, dataset, normalizer):
-        if normalizer:
-            return normalizer.normalize_dataset(dataset)
-        return dataset
-
-    def compute_distance_matrix(self, refset, targetset,
-                                ref_indexes, target_indexes):
-        """ Compute and return the global alignment matrix.
-        For each `processing`, a distance matrix is built; all the matrices
-        are then summed with their respective weightings, and the result is
-        the global alignment matrix, which is returned.
-        """
-        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
-        for processing in self.processings:
-            distmatrix += processing.cdist(refset, targetset,
-                                          ref_indexes, target_indexes)
-        return distmatrix
-
-    def threshold_matched(self, distmatrix):
-        """ Return the matched elements within a dictionnary,
-        each key being the indice from X, and the corresponding
-        values being a list of couple (indice from Y, distance)
-        """
-        match = defaultdict(list)
-        if self.normalize_matrix:
-            distmatrix /= distmatrix.max()
-        ind = (distmatrix <= self.threshold).nonzero()
-        indrow = ind[0].tolist()
-        indcol = ind[1].tolist()
-        for (i, j) in zip(indrow, indcol):
-            match[i].append((j, distmatrix[i, j]))
-        return match
-
-    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
-        # Build items
-        items = []
-        ref_indexes = ref_indexes or xrange(len(refset))
-        target_indexes = target_indexes or xrange(len(targetset))
-        # Apply alignments
-        mat = self.compute_distance_matrix(refset, targetset,
-                                           ref_indexes=ref_indexes,
-                                           target_indexes=target_indexes)
-        matched = self.threshold_matched(mat)
-        # Reapply matched to global indexes
-        new_matched = {}
-        for k, values in matched.iteritems():
-            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
-        return mat, new_matched
-
-    def align(self, refset, targetset, get_matrix=True):
-        """ Perform the alignment on the referenceset
-        and the targetset
-        """
-        start_time = time.time()
-        refset = self.apply_normalization(refset, self.ref_normalizer)
-        targetset = self.apply_normalization(targetset, self.target_normalizer)
-        self.refset_size = len(refset)
-        self.targetset_size = len(targetset)
-        # If no blocking
-        if not self.blocking:
-            return self._get_match(refset, targetset)
-        # Blocking == conquer_and_divide
-        global_matched = {}
-        global_mat = lil_matrix((len(refset), len(targetset)))
-        self.blocking.fit(refset, targetset)
-        for refblock, targetblock in self.blocking.iter_blocks():
-            self.nb_blocks += 1
-            ref_index = [r[0] for r in refblock]
-            target_index = [r[0] for r in targetblock]
-            self.nb_comparisons += len(ref_index)*len(target_index)
-            _, matched = self._get_match(refset, targetset, ref_index, target_index)
-            for k, values in matched.iteritems():
-                subdict = global_matched.setdefault(k, set())
-                for v, d in values:
-                    subdict.add((v, d))
-                    self.alignments_done += 1
-                    if get_matrix:
-                        # XXX avoid issue in sparse matrix
-                        global_mat[k, v] = d or 10**(-10)
-        self.time = time.time() - start_time
-        return global_mat, global_matched
-
-    def get_aligned_pairs(self, refset, targetset, unique=True):
-        """ Get the pairs of aligned elements
-        """
-        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
-            self.pairs_found += 1
-            yield pair
-        self.log_infos()
-
-    def align_from_files(self, reffile, targetfile,
-                         ref_indexes=None, target_indexes=None,
-                         ref_encoding=None, target_encoding=None,
-                         ref_separator='\t', target_separator='\t',
-                         get_matrix=True):
-        """ Align data from files
-
-        Parameters
-        ----------
-
-        reffile: name of the reference file
-
-        targetfile: name of the target file
-
-        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
-                      be used to read the files.
-
-        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
-                         be used to read the files.
-
-        ref_separator: separator of the reference file
-
-        target_separator: separator of the target file
-        """
-        refset = parsefile(reffile, indexes=ref_indexes,
-                           encoding=ref_encoding, delimiter=ref_separator)
-        targetset = parsefile(targetfile, indexes=target_indexes,
-                              encoding=target_encoding, delimiter=target_separator)
-        return self.align(refset, targetset, get_matrix=get_matrix)
-
-    def get_aligned_pairs_from_files(self, reffile, targetfile,
-                         ref_indexes=None, target_indexes=None,
-                         ref_encoding=None, target_encoding=None,
-                         ref_separator='\t', target_separator='\t',
-                         unique=True):
-        """ Get the pairs of aligned elements
-        """
-        refset = parsefile(reffile, indexes=ref_indexes,
-                           encoding=ref_encoding, delimiter=ref_separator)
-        targetset = parsefile(targetfile, indexes=target_indexes,
-                              encoding=target_encoding, delimiter=target_separator)
-        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
-            yield pair
-
-    def log_infos(self):
-        """ Display some info on the aligner process
-        """
-        self.logger.info('Computation time : %s' % self.time)
-        self.logger.info('Size reference set : %s' % self.refset_size)
-        self.logger.info('Size target set : %s' % self.targetset_size)
-        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
-        self.logger.info('Alignments done : %s' % self.alignments_done)
-        self.logger.info('Pairs found : %s' % self.pairs_found)
-        self.logger.info('Ratio reference set/alignments done : %s'
-                         % (self.alignments_done/float(self.refset_size)))
-        self.logger.info('Ratio target set/alignments done : %s'
-                         % (self.alignments_done/float(self.targetset_size)))
-        self.logger.info('Ratio reference set/pairs found : %s'
-                         % (self.pairs_found/float(self.refset_size)))
-        self.logger.info('Ratio target set/pairs found : %s'
-                         % (self.pairs_found/float(self.targetset_size)))
-        self.logger.info('Maximum comparisons : %s'
-                         % (self.refset_size * self.targetset_size))
-        self.logger.info('Number of blocks : %s' % self.nb_blocks)
-        if self.nb_blocks:
-            self.logger.info('Ratio comparisons/block : %s'
-                             % (float(self.nb_comparisons)/self.nb_blocks))
-        self.logger.info('Blocking reduction : %s'
-                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
-
-
-###############################################################################
-### PIPELINE ALIGNER OBJECT ##################################################
-###############################################################################
-class PipelineAligner(object):
-    """ This pipeline will perform iterative alignments, removing each time
-    the aligned results from the previous aligner.
-    """
-
-    def __init__(self, aligners):
-        self.aligners = aligners
-        self.pairs = {}
-        self.nb_comparisons = 0
-        self.nb_blocks = 0
-        self.alignments_done = 0
-        self.pairs_found = 0
-        self.refset_size = None
-        self.targetset_size = None
-        self.time = None
-        self.logger = logging.getLogger('nazca.aligner')
-
-    def get_aligned_pairs(self, refset, targetset, unique=True):
-        """ Get the pairs of aligned elements
-        """
-        start_time = time.time()
-        ref_index = range(len(refset))
-        target_index = range(len(targetset))
-        self.refset_size = len(refset)
-        self.targetset_size = len(targetset)
-        global_matched = {}
-        global_mat = lil_matrix((len(refset), len(targetset)))
-        seen_refset = set()
-        # Iteration over aligners
-        for ind_aligner, aligner in enumerate(self.aligners):
-            # Perform alignment
-            _refset = [refset[i] for i in ref_index]
-            _targetset = [targetset[i] for i in target_index]
-            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
-                self.pairs_found += 1
-                pair = ((pair[0][0], ref_index[pair[0][1]]),
-                        (pair[1][0], target_index[pair[1][1]]))
-                yield pair
-                seen_refset.add(pair[0][1])
-            # Store stats
-            self.nb_blocks += aligner.nb_blocks
-            self.nb_comparisons += aligner.nb_comparisons
-            # Update indexes if necessary
-            # For now, we remove all the reference set that are already matched
-            if ind_aligner < len(self.aligners) - 1:
-                # There are other aligners after this one
-                ref_index = [i for i in ref_index if i not in seen_refset]
-        self.time = time.time() - start_time
-        self.log_infos()
-
-    def log_infos(self):
-        """ Display some info on the aligner process
-        """
-        self.logger.info('Computation time : %s' % self.time)
-        self.logger.info('Size reference set : %s' % self.refset_size)
-        self.logger.info('Size target set : %s' % self.targetset_size)
-        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
-        self.logger.info('Alignments done : %s' % self.alignments_done)
-        self.logger.info('Pairs found : %s' % self.pairs_found)
-        self.logger.info('Ratio reference set/alignments done : %s'
-                         % (self.alignments_done/float(self.refset_size)))
-        self.logger.info('Ratio target set/alignments done : %s'
-                         % (self.alignments_done/float(self.targetset_size)))
-        self.logger.info('Ratio reference set/pairs found : %s'
-                         % (self.pairs_found/float(self.refset_size)))
-        self.logger.info('Ratio target set/pairs found : %s'
-                         % (self.pairs_found/float(self.targetset_size)))
-        self.logger.info('Maximum comparisons : %s'
-                         % (self.refset_size * self.targetset_size))
-        self.logger.info('Number of blocks : %s' % self.nb_blocks)
-        if self.nb_blocks:
-            self.logger.info('Ratio comparisons/block : %s'
-                             % (float(self.nb_comparisons)/self.nb_blocks))
-        self.logger.info('Blocking reduction : %s'
-                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
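
For context, a minimal usage sketch of the BaseAligner API removed above. Records are lists whose
first column is an identifier; LevenshteinProcessing and its keyword arguments are assumptions about
nazca.distances that are not confirmed by this changeset, and the threshold value is arbitrary:

    from nazca.distances import LevenshteinProcessing   # assumed processing class
    from nazca.aligner import BaseAligner               # nazca.record_linkage.aligner after this changeset

    refset = [['R1', u'Victor Hugo'], ['R2', u'Albert Camus']]
    targetset = [['T1', u'victor hugo'], ['T2', u'a. camus']]
    processings = [LevenshteinProcessing(ref_attr_index=1, target_attr_index=1)]
    aligner = BaseAligner(threshold=0.5, processings=processings)
    # each yielded item is ((ref_id, ref_index), (target_id, target_index), distance)
    for (ref_id, ref_ind), (target_id, target_ind), distance in aligner.get_aligned_pairs(refset, targetset):
        print ref_id, target_id, distance
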
--- a/blocking.py	Tue Oct 22 16:04:05 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,666 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-
-""" Blocking techniques.
-
-This module implements a set of blocking techniques used to split
-datasets into smaller subsets that will be aligned in more detail.
-
-Additional information:
-
-   P. Christen, Data Matching, Data-Centric Systems and Applications,
-   Springer, 2012.
-
-"""
-from functools import partial
-import warnings
-
-from scipy.spatial import KDTree
-
-from nazca.minhashing import Minlsh
-from nazca.distances import soundexcode
-
-
-###############################################################################
-### GENERAL BLOCKING ##########################################################
-###############################################################################
-class BaseBlocking(object):
-    """ An abstract general blocking object that exposes
-    the API that should be common to all blocking objects
-    """
-    def __init__(self, ref_attr_index, target_attr_index):
-        """ Build the blocking object
-
-        Parameters
-        ----------
-
-        ref_attr_index: index of the attribute of interest in a record
-                        for the reference dataset
-                        (i.e. attribute to be used for key computation)
-
-        target_attr_index: index of the attribute of interest in a record
-                           for the target dataset
-                           (i.e. attribute to be used for key computation)
-        """
-        self.ref_attr_index = ref_attr_index
-        self.target_attr_index = target_attr_index
-        self.refids = None
-        self.targetids = None
-        self.is_fitted = False
-
-    def _fit(self, refset, targetset):
-        raise NotImplementedError
-
-    def _iter_blocks(self):
-        """ Internal iteration function over blocks
-        """
-        raise NotImplementedError
-
-    def _cleanup(self):
-        """ Internal cleanup blocking for further use (e.g. in pipeline)
-        """
-        raise NotImplementedError
-
-    def fit(self, refset, targetset):
-        """ Fit the blocking technique on the reference and target datasets
-
-        Parameters
-        ----------
-        refset: a dataset (list of records)
-
-        targetset: a dataset (list of records)
-        """
-        self._fit(refset, targetset)
-        # Keep ids for blocks building
-        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
-        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
-        self.is_fitted = True
-
-    def iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the pair (index, id) of the record in the
-                          corresponding dataset.
-        """
-        assert self.is_fitted
-        return self._iter_blocks()
-
-    def iter_indice_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the record in the
-                          corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self._iter_blocks():
-            yield [r[0] for r in block1], [r[0] for r in block2]
-
-    def iter_id_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the ids of the record in the
-                          corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self._iter_blocks():
-            yield [r[1] for r in block1], [r[1] for r in block2]
-
-    def iter_pairs(self):
-        """ Iterator over the different possible pairs.
-
-        Returns
-        -------
-
-        (pair1, pair2): The pairs are always ((ind_reference, id_reference),
-                                              (ind_target, id_target))
-                        and are the ids of the record in the corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self.iter_blocks():
-            for val1 in block1:
-                for val2 in block2:
-                    yield val1, val2
-
-    def iter_indice_pairs(self):
-        """ Iterator over the different possible pairs.
-
-        Returns
-        -------
-
-        (pair1, pair2): The pairs are always (ind_reference, ind_target)
-                        and are the indexes of the records in the corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self.iter_indice_blocks():
-            for val1 in block1:
-                for val2 in block2:
-                    yield val1, val2
-
-    def iter_id_pairs(self):
-        """ Iterator over the different possible pairs.
-
-        Returns
-        -------
-
-        (pair1, pair2): The pairs are always (id_reference, id_target)
-                        and are the ids of the record in the corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self.iter_id_blocks():
-            for val1 in block1:
-                for val2 in block2:
-                    yield val1, val2
-
-    def cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.is_fitted = False
-        self._cleanup()
-
-
-###############################################################################
-### KEY BLOCKING ##############################################################
-###############################################################################
-class KeyBlocking(BaseBlocking):
-    """ This blocking technique is based on a a blocking criteria
-    (or blocking key), that will be used to divide the datasets.
-
-    The main idea here is:
-
-    1 - to create an index of f(x) for each x in the reference set.
-
-    2 - to create an index of f(y) for each y in the target set.
-
-    3 - to iterate on each distinct value of f(x) and to return
-        the identifiers of the records of both sets for this value.
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
-        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.callback = callback
-        self.ignore_none = ignore_none
-        self.reference_index = {}
-        self.target_index = {}
-
-    def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
-        for ind, rec in enumerate(refset):
-            key = self.callback(rec[self.ref_attr_index])
-            if not key and self.ignore_none:
-                continue
-            self.reference_index.setdefault(key, []).append((ind, rec[0]))
-        for ind, rec in enumerate(targetset):
-            key = self.callback(rec[self.target_attr_index])
-            if not key and self.ignore_none:
-                continue
-            self.target_index.setdefault(key, []).append((ind, rec[0]))
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the records in the
-                          corresponding dataset.
-        """
-        for key, block1 in self.reference_index.iteritems():
-            block2 = self.target_index.get(key)
-            if block1 and block2:
-                yield (block1, block2)
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.reference_index = {}
-        self.target_index = {}
-
-
-class SoundexBlocking(KeyBlocking):
-
-    def __init__(self, ref_attr_index, target_attr_index, language='french',):
-        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
-                                              partial(soundexcode, language=language))
-
-
-###############################################################################
-### BIGRAM BLOCKING ###########################################################
-###############################################################################
-class NGramBlocking(BaseBlocking):
-    """ This blocking technique is based on a a n-gram key.
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
-        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.ngram_size = ngram_size
-        self.depth = depth
-        self.reference_index = {}
-        self.target_index = {}
-
-    def _fit_dataset(self, dataset, cur_index, attr_index):
-        """ Fit a dataset
-        """
-        for ind, r in enumerate(dataset):
-            cur_dict = cur_index
-            text = r[attr_index]
-            for i in range(self.depth):
-                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
-                if i < self.depth - 1:
-                    cur_dict = cur_dict.setdefault(ngram, {})
-            cur_dict.setdefault(ngram, []).append((ind, r[0]))
-
-    def _fit(self, refset, targetset):
-        """ Fit the two sets (reference set and target set)
-        """
-        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
-        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
-
-    def _iter_dict(self, ref_cur_dict, target_cur_dict):
-        """ Iterative function used to create blocks from dicts
-        """
-        for key, sub_dict in ref_cur_dict.iteritems():
-            if key in target_cur_dict:
-                if isinstance(sub_dict, dict):
-                    # There is another dict layer
-                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
-                        yield block1, block2
-                else:
-                    # This is a list
-                    yield sub_dict, target_cur_dict[key]
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the records in the
-                          corresponding dataset.
-        """
-        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
-            if block1 and block2:
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.reference_index = {}
-        self.target_index = {}
-
-
-###############################################################################
-### SORTKEY BLOCKING ##########################################################
-###############################################################################
-class SortedNeighborhoodBlocking(BaseBlocking):
-    """ This blocking technique is based on a a sorting blocking criteria
-    (or blocking key), that will be used to divide the datasets.
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
-        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.key_func = key_func
-        self.window_width = window_width
-        self.sorted_dataset = None
-
-    def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
-        self.sorted_dataset = [((ind, r[0]), r[self.ref_attr_index], 0)
-                               for ind, r in enumerate(refset)]
-        self.sorted_dataset.extend([((ind, r[0]), r[self.target_attr_index], 1)
-                                    for ind, r in enumerate(targetset)])
-        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-        """
-        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
-            # Only keep reference set record
-            if dset == 1:
-                continue
-            block1 = [rid,]
-            minind = (ind - self.window_width)
-            minind = minind if minind >=0 else 0
-            maxind = (ind + self.window_width + 1)
-            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
-                      if d == 1]
-            if block1 and block2:
-                yield (block1, block2)
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.sorted_dataset = None
-
-
-###############################################################################
-### MERGE BLOCKING ############################################################
-###############################################################################
-class MergeBlocking(BaseBlocking):
-    """ This blocking technique keep only one appearance of one given values,
-    and removes all the other records having this value.
-    The merge is based on a score function
-
-    E.g.
-      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
-      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
-
-    could become (with a score function based on the population, i.e. the third value):
-
-      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
-
-    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, score_func):
-        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.score_func = score_func
-        self.merged_dataset = None
-        self.other_dataset = None
-        if ref_attr_index is None and target_attr_index is None:
-            raise ValueError('At least one of ref_attr_index or target_attr_index '
-                             'should not be None')
-
-    def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
-        if self.ref_attr_index is not None:
-            # Merge refset
-            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
-            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
-        else:
-            # Merge targetset
-            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
-            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
-
-    def _merge_dataset(self, dataset, attr_index):
-        """ Merge a dataset
-        """
-        merged_dataset_dict = {}
-        for ind, record in enumerate(dataset):
-            score = self.score_func(record)
-            if record[attr_index] not in merged_dataset_dict:
-                # Create new entry
-                merged_dataset_dict[record[attr_index]] = (ind, record, score)
-            elif (record[attr_index] in merged_dataset_dict
-                  and merged_dataset_dict[record[attr_index]][2] < score):
-                # Change current score
-                merged_dataset_dict[record[attr_index]] = (ind, record, score)
-        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.itervalues()]
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-        """
-        if self.ref_attr_index is not None:
-            yield self.merged_dataset, self.other_dataset
-        else:
-            # self.target_attr_index is not None
-            yield self.other_dataset, self.merged_dataset
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.merged_dataset = None
-        self.other_dataset = None
-
-
-###############################################################################
-### CLUSTERING-BASED BLOCKINGS ################################################
-###############################################################################
-class KmeansBlocking(BaseBlocking):
-    """ A blocking technique based on Kmeans
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
-        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.n_clusters = n_clusters
-        self.kmeans = None
-        self.predicted = None
-        from sklearn import cluster
-        self.cluster_class = cluster.KMeans
-
-    def _fit(self, refset, targetset):
-        """ Fit the reference dataset.
-        """
-        # If an element is None (missing), use instead the identity element.
-        # The identity element is defined as the 0-vector
-        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
-        # We assume here that there are at least 2 elements in the refset
-        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
-        kmeans =  self.cluster_class(n_clusters=n_clusters)
-        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
-        self.kmeans = kmeans
-        # Predict on targetset
-        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
-                                              or idelement for elt in targetset])
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the records in the
-                          corresponding dataset.
-        """
-        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
-        for ind, li in enumerate(self.predicted):
-            neighbours[li][1].append(self.targetids[ind])
-        for ind, li in enumerate(self.kmeans.labels_):
-            neighbours[li][0].append(self.refids[ind])
-        for block1, block2 in neighbours:
-            if len(block1) and len(block2):
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.kmeans = None
-        self.predicted = None
-
-
-###############################################################################
-### KDTREE BLOCKINGS ##########################################################
-###############################################################################
-class KdTreeBlocking(BaseBlocking):
-    """ A blocking technique based on KdTree
-    """
-    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
-        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.threshold = threshold
-        self.reftree = None
-        self.targettree = None
-        self.nb_elements = None
-
-    def _fit(self, refset, targetset):
-        """ Fit the blocking
-        """
-        firstelement = refset[0][self.ref_attr_index]
-        self.nb_elements = len(refset)
-        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
-        idelement = (0,) * idsize
-        # KDTree is expecting a two-dimensional array
-        if idsize == 1:
-            self.reftree = KDTree([(elt[self.ref_attr_index],) if elt[self.ref_attr_index] is not None else idelement for elt in refset])
-            self.targettree = KDTree([(elt[self.target_attr_index],) if elt[self.target_attr_index] is not None else idelement for elt in targetset])
-        else:
-            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
-            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the records in the
-                          corresponding dataset.
-        """
-        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
-        neighbours = []
-        for ind in xrange(self.nb_elements):
-            if not extraneighbours[ind]:
-                continue
-            _ref = [self.refids[ind],]
-            _target = [self.targetids[v] for v in extraneighbours[ind]]
-            neighbours.append((_ref, _target))
-        for block1, block2 in neighbours:
-            if len(block1) and len(block2):
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.reftree = None
-        self.targettree = None
-        self.nb_elements = None
-
-
-###############################################################################
-### MINHASHING BLOCKINGS ######################################################
-###############################################################################
-class MinHashingBlocking(BaseBlocking):
-    """ A blocking technique based on MinHashing
-    """
-    def __init__(self, ref_attr_index, target_attr_index,
-                 threshold=0.1, kwordsgram=1, siglen=200):
-        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.threshold = threshold
-        self.kwordsgram = kwordsgram
-        self.siglen = siglen
-        self.minhasher = Minlsh()
-        self.nb_elements = None
-
-    def _fit(self, refset, targetset):
-        """ Find the blocking using minhashing
-        """
-        # If an element is None (missing), use instead the identity element.
-        idelement = ''
-        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
-                        [elt[self.target_attr_index] or idelement for elt in targetset],
-                        self.kwordsgram, self.siglen)
-        self.nb_elements = len(refset)
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the records in the
-                          corresponding dataset.
-        """
-        rawneighbours = self.minhasher.predict(self.threshold)
-        neighbours = []
-        for data in rawneighbours:
-            neighbours.append([[], []])
-            for i in data:
-                if i >= self.nb_elements:
-                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
-                else:
-                    neighbours[-1][0].append(self.refids[i])
-            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
-                neighbours.pop()
-        for block1, block2 in neighbours:
-            if len(block1) and len(block2):
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.minhasher = Minlsh()
-        self.nb_elements = None
-
-
-###############################################################################
-### BLOCKING PIPELINE #########################################################
-###############################################################################
-class PipelineBlocking(BaseBlocking):
-    """ Pipeline multiple blocking techniques
-    """
-
-    def __init__(self, blockings, collect_stats=False):
-        """ Build the blocking object
-
-        Parameters
-        ----------
-
-        blockings: ordered list of blocking objects
-
-        collect_stats: if True, collect statistics on the block sizes produced
-                       at each level of the pipeline
-        """
-        self.blockings = blockings
-        self.stored_blocks = []
-        self.collect_stats = collect_stats
-        self.stats = {}
-
-    def _fit(self, refset, targetset):
-        """ Internal fit of the pipeline """
-        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
-
-    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
-        """ Recursive fit of the blockings.
-        Blocks are stored in the stored_blocks attribute.
-        """
-        if ind < len(self.blockings) - 1:
-            # There are other blockings after this one
-            blocking = self.blockings[ind]
-            blocking.cleanup()
-            blocking.fit([refset[i] for i in ref_index],
-                         [targetset[i] for i in target_index])
-            for block1, block2 in blocking.iter_indice_blocks():
-                ind_block1 = [ref_index[i] for i in block1]
-                ind_block2 = [target_index[i] for i in block2]
-                if self.collect_stats:
-                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
-                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
-        else:
-            # This is the final blocking
-            blocking = self.blockings[ind]
-            blocking.cleanup()
-            blocking.fit([refset[i] for i in ref_index],
-                         [targetset[i] for i in target_index])
-            for block1, block2 in blocking.iter_blocks():
-                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
-                ind_block2 = [(target_index[i], _id) for i, _id in block2]
-                if self.collect_stats:
-                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
-                self.stored_blocks.append((ind_block1, ind_block2))
-
-    def _iter_blocks(self):
-        """ Internal iteration function over blocks
-        """
-        for block1, block2 in self.stored_blocks:
-            if block1 and block2:
-                yield block1, block2
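
For context, a minimal sketch chaining two of the blockings removed above through PipelineBlocking.
The attribute indexes, soundex language and sample records are illustrative assumptions:

    from nazca.blocking import (SoundexBlocking, SortedNeighborhoodBlocking,
                                PipelineBlocking)        # nazca.record_linkage.blocking after this changeset

    refset = [['R1', u'Paris'], ['R2', u'Lyon']]
    targetset = [['T1', u'Paris'], ['T2', u'Lion']]
    blocking = PipelineBlocking([SoundexBlocking(ref_attr_index=1, target_attr_index=1, language='french'),
                                 SortedNeighborhoodBlocking(ref_attr_index=1, target_attr_index=1,
                                                            window_width=5)])
    blocking.fit(refset, targetset)
    for refblock, targetblock in blocking.iter_blocks():
        # each block is a list of (index, id) couples from the corresponding dataset
        print refblock, targetblock
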
--- a/old_api.py	Tue Oct 22 16:04:05 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,432 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from os import listdir
-import os.path as osp
-from shutil import rmtree
-from tempfile import mkdtemp
-import sys
-import warnings
-from functools import partial
-
-from scipy.sparse import lil_matrix
-
-from nazca.dataio import write_results, split_file, parsefile
-from nazca.normalize import BaseNormalizer, NormalizerPipeline
-from nazca.blocking import KmeansBlocking, KdTreeBlocking, MinHashingBlocking
-from nazca.distances import GeographicalProcessing
-from nazca.aligner import BaseAligner
-
-
-# Backward compatibility. Now, use the BaseAligner inside the functions.
-# Perhaps these functions may be removed later...
-
-
-###############################################################################
-### NORMALIZE FUNCTIONS #######################################################
-###############################################################################
-# Backward compatibility. Now, use the NormalizerPipeline inside the functions.
-# Perhaps these functions may be removed later...
-
-def normalize_set(rset, processings):
-    """ Apply all the normalization functions to the given rset """
-    warnings.warn(DeprecationWarning('This function will be removed '
-                                     'in the next release. '
-                                     'You should rather use the BaseNormalizer '
-                                     'object of the normalize module'))
-    normalizers = []
-    for ind, processing in processings.iteritems():
-        for normalizer in extract_normalization_from_treatment(processing, ind):
-            normalizers.append(normalizer)
-    # Create pipeline
-    pipeline = NormalizerPipeline(normalizers)
-    return pipeline.normalize_dataset(rset)
-
-def extract_normalization_from_treatment(processing, ind):
-    """ Extract normalization from processing.
-    This function is used for backward compatibility with
-    the old function-based API """
-    warnings.warn(DeprecationWarning('This function will be removed '
-                                     'in the next release. '
-                                     'You should rather use the BaseNormalizer '
-                                     'object of the normalize module'))
-    for f in processing.get('normalization', []):
-        farg = f.func_code.co_varnames #List of the arguments of f
-        # A kind of union between the arguments needed by f, and the
-        # provided ones
-        givenargs = dict((arg, processing['norm_params'][arg])
-                         for arg in farg if arg in processing.get('norm_params', []))
-        callback = f
-        if givenargs:
-            callback = partial(callback, **givenargs)
-        yield BaseNormalizer(callback=callback, attr_index=ind)
-
-def extract_treatment_from_treatment(processing, ind):
-    """ Extract Treatment object from processing dict.
-    This is only for backward compatibility with the old API.
-    """
-    if processing['metric'] == 'geographical':
-        return GeographicalProcessing(ind, ind,
-                                     matrix_normalized=processing.get('matrix_normalized', False),
-                                     **processing.get('metric_params', {}))
-
-
-###############################################################################
-### ALIGNER ###################################################################
-###############################################################################
-def align(alignset, targetset, threshold, processings=None, resultfile=None,
-          _applyNormalization=True):
-    """ Try to align the items of alignset onto targetset's ones
-
-        `alignset` and `targetset` are the sets to align. Each set contains
-        lists where the first column is the identifier of the item,
-        and the others are the attributes to align.
-        (Note that the order is important!) Both must
-        have the same number of columns.
-
-        `processings` is a dictionary of dictionaries.
-        Each key is the index of an attribute (column), and each value is a dictionary
-        that describes the processings to apply to that attribute.
-        Each dictionary is built as the following:
-
-            processing = {'normalization': [f1, f2, f3],
-                         'norm_params': {'arg1': arg01, 'arg2': arg02},
-                         'metric': d1,
-                         'metric_params': {'arg1': arg11},
-                         'weighting': w,
-                         'matrix_normalized': True
-                        }
-
-            `normalization` is the list of functions called to normalize the
-            given attribute (in order). Each function is called with `norm_params`
-            as arguments.
-
-            Idem for `metric` and `metric_params`.
-
-            `weighting` is the weighting of the current attribute relative to
-            the others.
-
-            `resultfile` (default is None): if given, write the matched elements to this file.
-
-        Return the distance matrix and the matched list.
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    processings = processings or {}
-    # Get the normalizers
-    normalizers = []
-    for ind, processing in processings.iteritems():
-        for normalizer in extract_normalization_from_treatment(processing, ind):
-            normalizers.append(normalizer)
-    # Cleanup processings
-    for t in processings.itervalues():
-        if 'normalization' in t:
-            t.pop('normalization')
-        if 'norm_params' in t:
-            t.pop('norm_params')
-    # Build aligner
-    processings = [extract_treatment_from_treatment(t, ind) for ind, t in processings.iteritems()]
-    aligner = BaseAligner(threshold, processings)
-    aligner.register_ref_normalizer(normalizers)
-    aligner.register_target_normalizer(normalizers)
-    # Align
-    return aligner.align(alignset, targetset)
-
-def subalign(alignset, targetset, alignind, targetind, threshold,
-             processings=None, _applyNormalization=True):
-    """ Compute a subalignment for a list of indices of the alignset and
-    a list of indices for the targetset """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    mat, matched = align([alignset[i[0]] for i in alignind],
-                         [targetset[i[0]] for i in targetind], threshold,
-                         processings, _applyNormalization=_applyNormalization)
-    new_matched = {}
-    for k, values in matched.iteritems():
-        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
-    return mat, new_matched
-
-def conquer_and_divide_alignment(alignset, targetset, threshold, processings=None,
-                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
-                                 n_clusters=None, kwordsgram=1, siglen=200,
-                                 get_global_mat=True):
-    """ Full conquer and divide method for alignment.
-    Compute neighbours and merge the different subalignments.
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    global_matched = {}
-    if get_global_mat:
-        global_mat = lil_matrix((len(alignset), len(targetset)))
-
-    processings = processings or {}
-    ralignset = normalize_set(alignset, processings)
-    rtargetset = normalize_set(targetset, processings)
-
-    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
-                                              neighbours_threshold, n_clusters,
-                                              kwordsgram, siglen):
-        _, matched = subalign(alignset, targetset, alignind, targetind,
-                                threshold, processings, _applyNormalization=False)
-        for k, values in matched.iteritems():
-            subdict = global_matched.setdefault(k, set())
-            for v, d in values:
-                subdict.add((v, d))
-                # XXX avoid issue in sparse matrix
-                if get_global_mat:
-                    global_mat[k[0], v[0]] = d or 10**(-10)
-    if get_global_mat:
-        return global_mat, global_matched
-    return global_matched
-
-def alignall(alignset, targetset, threshold, processings=None,
-             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
-             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    if not mode:
-        _, matched = align(alignset, targetset, threshold, processings,
-                           resultfile=None, _applyNormalization=True)
-    else:
-        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
-                                               processings, indexes, mode,
-                                               neighbours_threshold, n_clusters,
-                                               kwordsgram, siglen,
-                                               get_global_mat=False)
-
-    if not uniq:
-        for alignid in matched:
-            for targetid, _ in matched[alignid]:
-                yield alignset[alignid[0]][0], targetset[targetid[0]][0]
-    else:
-        for alignid in matched:
-            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
-            yield alignset[alignid[0]][0], targetset[bestid[0]][0]
-
-def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
-                       threshold, size=10000, equality_threshold=0.01,
-                       processings=None, indexes=(1,1), mode='kdtree',
-                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
-                       siglen=200, cache=None):
-    """ This function helps you to align *huge* files.
-        It takes your csv files as arguments and splits them into smaller ones
-        (files of `size` lines), and runs the alignment on those files.
-
-        `alignformat` and `targetformat` are keyword arguments given to the
-        nazca.dataio.parsefile function.
-
-        This function returns its own cache. The cache is quite simply a
-        dictionary having align items' ids as keys and tuples (target item's id,
-        distance) as values. This dictionary can be passed back to this function to
-        perform another alignment (with different parameters, or just to be
-        sure everything has been caught).
-
-        If the distance of an alignment is below `equality_threshold`, the
-        alignment is considered as perfect, and the corresponding item is
-        removed from the alignset (to speed up the computation).
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    # Split the huge files into smaller ones
-    aligndir = mkdtemp()
-    targetdir = mkdtemp()
-    alignfiles = split_file(alignfile, aligndir, size)
-    targetfiles = split_file(targetfile, targetdir, size)
-
-    # Compute the number of iterations that must be done to achieve the alignment
-    nb_iterations = len(alignfiles) * len(targetfiles)
-    current_it = 0
-
-    cache = cache or {} # Contains the best known alignments
-    # Contains the ids of perfectly aligned data
-    doneids = set(_id for _id, (_, dist) in cache.iteritems()
-                          if dist < equality_threshold)
-
-    try:
-        for alignfile in alignfiles:
-            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
-                        if a[0] not in doneids]
-            for targetfile in targetfiles:
-                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
-                matched = conquer_and_divide_alignment(alignset, targetset,
-                                                       threshold,
-                                                       processings=processings,
-                                                       indexes=indexes,
-                                                       mode=mode,
-                                                       neighbours_threshold=neighbours_threshold,
-                                                       n_clusters=n_clusters,
-                                                       kwordsgram=kwordsgram,
-                                                       siglen=siglen,
-                                                       get_global_mat=False)
-                for alignid in matched:
-                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
-                    #Get the better known distance
-                    _, current_dist = cache.get(alignset[alignid[0]][0], (None, None))
-                    if current_dist is None or current_dist > dist:
-                        #If it's better, update the cache
-                        cache[alignset[alignid[0]][0]] = (targetset[bestid[0]][0], dist)
-                        if dist <= equality_threshold:
-                            #If perfect, stop trying to align this one
-                            doneids.add(alignset[alignid][0])
-
-                current_it += 1
-                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
-                                                nb_iterations))
-                sys.stdout.flush()
-                if doneids:
-                    alignset = [a for a in alignset if a[0] not in doneids]
-                if not alignset: #All items have been aligned
-                    #TODO Increment current_it.
-                    #The progress of the alignment process is computed with
-                    #`current_it`. If all items of `alignset` are aligned, we
-                    #stop the alignment process for this `alignset`. If
-                    #`current_it` isn’t incremented, the progress shown will be
-                    #false.
-                    break
-
-    finally:
-        rmtree(aligndir)
-        rmtree(targetdir)
-
-    return cache
-
-
-
-
-
-
-
-###############################################################################
-### CLUSTERING-BASED BLOCKINGS FUNCTIONS ######################################
-###############################################################################
-# Backward compatibility. Now, use the BlockingObject inside the functions.
-# Perhaps these functions may be removed later...
-def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
-                              mode='kmeans', n_clusters=None):
-    """ Find the neigbhours using clustering (kmeans or minibatchkmeans)
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the KmeansBlocking '
-                                     'object of the blocking module'))
-    if mode == 'kmeans':
-        blocking = KmeansBlocking(ref_attr_index=indexes[0],
-                                  target_attr_index=indexes[1],
-                                  n_clusters=n_clusters)
-    elif mode == 'minibatch':
-        blocking = MiniBatchKmeansBlocking(ref_attr_index=indexes[0],
-                                           target_attr_index=indexes[1],
-                                           n_clusters=n_clusters)
-    else:
-        raise ValueError("Mode should be 'kmeans' or 'minibatch'")
-    # Fit blocking object
-    blocking.fit(alignset, targetset)
-    return list(blocking.iter_blocks())
-
-def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
-    """ Find the neigbhours using kdree
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the KdTreeBlocking '
-                                     'object of the blocking module'))
-    blocking = KdTreeBlocking(ref_attr_index=indexes[0],
-                              target_attr_index=indexes[1],
-                              threshold=threshold)
-    blocking.fit(alignset, targetset)
-    return list(blocking.iter_blocks())
-
-def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
-                              kwordsgram=1, siglen=200):
-    """ Find the neigbhours using minhashing
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the '
-                                     'MinHashingBlocking '
-                                     'object of the blocking module'))
-    blocking = MinHashingBlocking(ref_attr_index=indexes[0],
-                                  target_attr_index=indexes[1],
-                                  threshold=threshold, kwordsgram=kwordsgram,
-                                  siglen=siglen)
-    blocking.fit(alignset, targetset)
-    return list(blocking.iter_blocks())
-
-def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
-                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
-    """ This function helps to find neighbours from items of alignset and
-        targetset. "Neighbours" are items that are "not so far", i.e. having a
-        close label, being located in the same area, etc.
-
-        This function handles two types of neighbouring: textual and numeric.
-        For textual values, you have to use "minhashing"; for numeric values,
-        you can choose from "kdtree", "kmeans" and "minibatch".
-
-        The arguments to give are :
-            - `alignset` and `targetset` are the sets where neighbours have to
-              be found.
-            - `indexes` are the location of items to compare
-            - `mode` is the search type to use
-            - `neighbours_threshold` is the `mode` neighbours_threshold
-
-            - `n_clusters` is used for "kmeans" and "minibatch" methods, and it
-              is the number of clusters to use.
-
-            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
-              is the length of wordsgrams to use, and `siglen` is the length of
-              the minhashing signature matrix.
-
-        Return a list of lists, built as follows:
-            [
-                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
-                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
-                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
-                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
-                ...
-            ]
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the '
-                                     'BaseBlocking '
-                                     'objects of the blocking module'))
-    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
-    mode = mode.lower()
-
-    if mode not in SEARCHERS:
-        raise NotImplementedError('Unknown mode given')
-    if mode == 'kdtree':
-        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
-    elif mode == 'minhashing':
-        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
-                                         kwordsgram, siglen)
-    elif mode in set(['kmeans', 'minibatch']):
-        try:
-            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
-        except:
-            raise NotImplementedError('Scikit learn does not seem to be installed')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/record_linkage/aligner.py	Thu Dec 19 14:41:02 2013 +0000
@@ -0,0 +1,324 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import time
+import logging
+from collections import defaultdict
+
+from scipy import zeros
+from scipy.sparse import lil_matrix
+
+from nazca.dataio import parsefile
+
+
+###############################################################################
+### UTILITY FUNCTIONS #########################################################
+###############################################################################
+def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
+    """ Return the aligned pairs
+    """
+    if unique:
+        for refid in global_matched:
+            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
+            ref_record = refset[refid]
+            target_record = targetset[bestid]
+            distance = global_mat[refid, bestid] if global_mat is not None else None
+            yield (ref_record[0], refid), (target_record[0], bestid), distance
+    else:
+        for refid in global_matched:
+            for targetid, _ in global_matched[refid]:
+                ref_record = refset[refid]
+                target_record = targetset[targetid]
+                distance = global_mat[refid, targetid] if global_mat is not None else None
+                yield (ref_record[0], refid), (target_record[0], targetid), distance
+
+
+###############################################################################
+### BASE ALIGNER OBJECT #######################################################
+###############################################################################
+class BaseAligner(object):
+
+    def __init__(self, threshold, processings, normalize_matrix=False):
+        self.threshold = threshold
+        self.processings = processings
+        self.normalize_matrix = normalize_matrix
+        self.ref_normalizer = None
+        self.target_normalizer = None
+        self.blocking = None
+        self.alignments_done = 0
+        self.pairs_found = 0
+        self.nb_comparisons = 0
+        self.nb_blocks = 0
+        self.refset_size = None
+        self.targetset_size = None
+        self.time = None
+        self.logger = logging.getLogger('nazca.aligner')
+
+    def register_ref_normalizer(self, normalizer):
+        """ Register normalizers to be applied
+        before alignment """
+        self.ref_normalizer = normalizer
+
+    def register_target_normalizer(self, normalizer):
+        """ Register normalizers to be applied
+        before alignment """
+        self.target_normalizer = normalizer
+
+    def register_blocking(self, blocking):
+        self.blocking = blocking
+
+    def apply_normalization(self, dataset, normalizer):
+        if normalizer:
+            return normalizer.normalize_dataset(dataset)
+        return dataset
+
+    def compute_distance_matrix(self, refset, targetset,
+                                ref_indexes, target_indexes):
+        """ Compute and return the global alignment matrix.
+        For each `processing` a `Distancematrix` is built, then all the
+        matrices are summed with their own weighting and the result is the global
+        alignment matrix, which is returned.
+        """
+        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
+        for processing in self.processings:
+            distmatrix += processing.cdist(refset, targetset,
+                                          ref_indexes, target_indexes)
+        return distmatrix
+
+    def threshold_matched(self, distmatrix):
+        """ Return the matched elements within a dictionnary,
+        each key being the indice from X, and the corresponding
+        values being a list of couple (indice from Y, distance)
+        """
+        match = defaultdict(list)
+        if self.normalize_matrix:
+            distmatrix /= distmatrix.max()
+        ind = (distmatrix <= self.threshold).nonzero()
+        indrow = ind[0].tolist()
+        indcol = ind[1].tolist()
+        for (i, j) in zip(indrow, indcol):
+            match[i].append((j, distmatrix[i, j]))
+        return match
+
+    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
+        # Build items
+        items = []
+        ref_indexes = ref_indexes or xrange(len(refset))
+        target_indexes = target_indexes or xrange(len(targetset))
+        # Apply alignments
+        mat = self.compute_distance_matrix(refset, targetset,
+                                           ref_indexes=ref_indexes,
+                                           target_indexes=target_indexes)
+        matched = self.threshold_matched(mat)
+        # Reapply matched to global indexes
+        new_matched = {}
+        for k, values in matched.iteritems():
+            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
+        return mat, new_matched
+
+    def align(self, refset, targetset, get_matrix=True):
+        """ Perform the alignment on the referenceset
+        and the targetset
+        """
+        start_time = time.time()
+        refset = self.apply_normalization(refset, self.ref_normalizer)
+        targetset = self.apply_normalization(targetset, self.target_normalizer)
+        self.refset_size = len(refset)
+        self.targetset_size = len(targetset)
+        # If no blocking
+        if not self.blocking:
+            return self._get_match(refset, targetset)
+        # Blocking == conquer_and_divide
+        global_matched = {}
+        global_mat = lil_matrix((len(refset), len(targetset)))
+        self.blocking.fit(refset, targetset)
+        for refblock, targetblock in self.blocking.iter_blocks():
+            self.nb_blocks += 1
+            ref_index = [r[0] for r in refblock]
+            target_index = [r[0] for r in targetblock]
+            self.nb_comparisons += len(ref_index)*len(target_index)
+            _, matched = self._get_match(refset, targetset, ref_index, target_index)
+            for k, values in matched.iteritems():
+                subdict = global_matched.setdefault(k, set())
+                for v, d in values:
+                    subdict.add((v, d))
+                    self.alignments_done += 1
+                    if get_matrix:
+                        # XXX store a tiny non-zero value so the entry is kept in the sparse matrix
+                        global_mat[k, v] = d or 10**(-10)
+        self.time = time.time() - start_time
+        return global_mat, global_matched
+
+    def get_aligned_pairs(self, refset, targetset, unique=True):
+        """ Get the pairs of aligned elements
+        """
+        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
+        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
+            self.pairs_found += 1
+            yield pair
+        self.log_infos()
+
+    def align_from_files(self, reffile, targetfile,
+                         ref_indexes=None, target_indexes=None,
+                         ref_encoding=None, target_encoding=None,
+                         ref_separator='\t', target_separator='\t',
+                         get_matrix=True):
+        """ Align data from files
+
+        Parameters
+        ----------
+
+        reffile: name of the reference file
+
+        targetfile: name of the target file
+
+        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
+                      be used to read the files.
+
+        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
+                         be used to read the files.
+
+        ref_separator: separator of the reference file
+
+        target_separator: separator of the target file
+        """
+        refset = parsefile(reffile, indexes=ref_indexes,
+                           encoding=ref_encoding, delimiter=ref_separator)
+        targetset = parsefile(targetfile, indexes=target_indexes,
+                              encoding=target_encoding, delimiter=target_separator)
+        return self.align(refset, targetset, get_matrix=get_matrix)
+
+    def get_aligned_pairs_from_files(self, reffile, targetfile,
+                         ref_indexes=None, target_indexes=None,
+                         ref_encoding=None, target_encoding=None,
+                         ref_separator='\t', target_separator='\t',
+                         unique=True):
+        """ Get the pairs of aligned elements
+        """
+        refset = parsefile(reffile, indexes=ref_indexes,
+                           encoding=ref_encoding, delimiter=ref_separator)
+        targetset = parsefile(targetfile, indexes=target_indexes,
+                              encoding=target_encoding, delimiter=target_separator)
+        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
+        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
+            yield pair
+
+    def log_infos(self):
+        """ Display some info on the aligner process
+        """
+        self.logger.info('Computation time : %s' % self.time)
+        self.logger.info('Size reference set : %s' % self.refset_size)
+        self.logger.info('Size target set : %s' % self.targetset_size)
+        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
+        self.logger.info('Alignments done : %s' % self.alignments_done)
+        self.logger.info('Pairs found : %s' % self.pairs_found)
+        self.logger.info('Ratio reference set/alignments done : %s'
+                         % (self.alignments_done/float(self.refset_size)))
+        self.logger.info('Ratio target set/alignments done : %s'
+                         % (self.alignments_done/float(self.targetset_size)))
+        self.logger.info('Ratio reference set/pairs found : %s'
+                         % (self.pairs_found/float(self.refset_size)))
+        self.logger.info('Ratio target set/pairs found : %s'
+                         % (self.pairs_found/float(self.targetset_size)))
+        self.logger.info('Maximum comparisons : %s'
+                         % (self.refset_size * self.targetset_size))
+        self.logger.info('Number of blocks : %s' % self.nb_blocks)
+        if self.nb_blocks:
+            self.logger.info('Ratio comparisons/block : %s'
+                             % (float(self.nb_comparisons)/self.nb_blocks))
+        self.logger.info('Blocking reduction : %s'
+                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
+
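+# Illustrative usage sketch (not part of the original changeset): records are
+# lists whose first column is an identifier and whose other columns carry the
+# attributes to compare. The processing is assumed to behave like
+# nazca.distances.GeographicalProcessing (imported by old_api.py below); the
+# threshold unit depends on that processing.
+#
+#     from nazca.distances import GeographicalProcessing
+#     from nazca.record_linkage.aligner import BaseAligner
+#
+#     refset = [['R1', (48.8566, 2.3522)], ['R2', (43.2965, 5.3698)]]
+#     targetset = [['T1', (48.8570, 2.3510)], ['T2', (43.3000, 5.3700)]]
+#     aligner = BaseAligner(threshold=500, processings=(GeographicalProcessing(1, 1),))
+#     for (refid, _), (targetid, _), dist in aligner.get_aligned_pairs(refset, targetset):
+#         print refid, targetid, dist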
+
+###############################################################################
+### PIPELINE ALIGNER OBJECT ##################################################
+###############################################################################
+class PipelineAligner(object):
+    """ This pipeline will perform iterative alignments, removing each time
+    the aligned results from the previous aligner.
+    """
+
+    def __init__(self, aligners):
+        self.aligners = aligners
+        self.pairs = {}
+        self.nb_comparisons = 0
+        self.nb_blocks = 0
+        self.alignments_done = 0
+        self.pairs_found = 0
+        self.refset_size = None
+        self.targetset_size = None
+        self.time = None
+        self.logger = logging.getLogger('nazca.aligner')
+
+    def get_aligned_pairs(self, refset, targetset, unique=True):
+        """ Get the pairs of aligned elements
+        """
+        start_time = time.time()
+        ref_index = range(len(refset))
+        target_index = range(len(targetset))
+        self.refset_size = len(refset)
+        self.targetset_size = len(targetset)
+        global_matched = {}
+        global_mat = lil_matrix((len(refset), len(targetset)))
+        seen_refset = set()
+        # Iteration over aligners
+        for ind_aligner, aligner in enumerate(self.aligners):
+            # Perform alignment
+            _refset = [refset[i] for i in ref_index]
+            _targetset = [targetset[i] for i in target_index]
+            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
+                self.pairs_found += 1
+                pair = ((pair[0][0], ref_index[pair[0][1]]),
+                        (pair[1][0], target_index[pair[1][1]]))
+                yield pair
+                seen_refset.add(pair[0][1])
+            # Store stats
+            self.nb_blocks += aligner.nb_blocks
+            self.nb_comparisons += aligner.nb_comparisons
+            # Update indexes if necessary
+            # For now, we remove all the reference records that are already matched
+            if ind_aligner < len(self.aligners) - 1:
+                # There are other aligners after this one
+                ref_index = [i for i in ref_index if i not in seen_refset]
+        self.time = time.time() - start_time
+        self.log_infos()
+
+    def log_infos(self):
+        """ Display some info on the aligner process
+        """
+        self.logger.info('Computation time : %s' % self.time)
+        self.logger.info('Size reference set : %s' % self.refset_size)
+        self.logger.info('Size target set : %s' % self.targetset_size)
+        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
+        self.logger.info('Alignments done : %s' % self.alignments_done)
+        self.logger.info('Pairs found : %s' % self.pairs_found)
+        self.logger.info('Ratio reference set/alignments done : %s'
+                         % (self.alignments_done/float(self.refset_size)))
+        self.logger.info('Ratio target set/alignments done : %s'
+                         % (self.alignments_done/float(self.targetset_size)))
+        self.logger.info('Ratio reference set/pairs found : %s'
+                         % (self.pairs_found/float(self.refset_size)))
+        self.logger.info('Ratio target set/pairs found : %s'
+                         % (self.pairs_found/float(self.targetset_size)))
+        self.logger.info('Maximum comparisons : %s'
+                         % (self.refset_size * self.targetset_size))
+        self.logger.info('Number of blocks : %s' % self.nb_blocks)
+        if self.nb_blocks:
+            self.logger.info('Ratio comparisons/block : %s'
+                             % (float(self.nb_comparisons)/self.nb_blocks))
+        self.logger.info('Blocking reduction : %s'
+                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
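+
+
+# Illustrative usage sketch (not part of the original changeset): chain two
+# aligners so that the second one only sees the reference records left
+# unmatched by the first. `processings`, `refset` and `targetset` are assumed
+# to be built as in the BaseAligner example above.
+#
+#     strict = BaseAligner(threshold=100, processings=processings)
+#     loose = BaseAligner(threshold=1000, processings=processings)
+#     pipeline = PipelineAligner((strict, loose))
+#     for (refid, _), (targetid, _) in pipeline.get_aligned_pairs(refset, targetset):
+#         print refid, targetid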
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/record_linkage/blocking.py	Thu Dec 19 14:41:02 2013 +0000
@@ -0,0 +1,666 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+""" Blocking techniques.
+
+This module implements a set of blocking techniques used to split
+datasets in smaller subsets that will be aligned in more details.
+
+Additional information:
+
+   P. Christen, Data Matching, Data-Centric Systems and Applications,
+
+
+"""
+from functools import partial
+import warnings
+
+from scipy.spatial import KDTree
+
+from nazca.minhashing import Minlsh
+from nazca.distances import soundexcode
+
+
+###############################################################################
+### GENERAL BLOCKING ##########################################################
+###############################################################################
+class BaseBlocking(object):
+    """ An abstract general blocking object that exposes
+    the API that should be common to all blocking objects
+    """
+    def __init__(self, ref_attr_index, target_attr_index):
+        """ Build the blocking object
+
+        Parameters
+        ----------
+
+        ref_attr_index: index of the attribute of interest in a record
+                        for the reference dataset
+                        (i.e. attribute to be used for key computation)
+
+        target_attr_index: index of the attribute of interest in a record
+                           for the target dataset
+                           (i.e. attribute to be used for key computation)
+        """
+        self.ref_attr_index = ref_attr_index
+        self.target_attr_index = target_attr_index
+        self.refids = None
+        self.targetids = None
+        self.is_fitted = False
+
+    def _fit(self, refset, targetset):
+        raise NotImplementedError
+
+    def _iter_blocks(self):
+        """ Internal iteration function over blocks
+        """
+        raise NotImplementedError
+
+    def _cleanup(self):
+        """ Internal cleanup blocking for further use (e.g. in pipeline)
+        """
+        raise NotImplementedError
+
+    def fit(self, refset, targetset):
+        """ Fit the blocking technique on the reference and target datasets
+
+        Parameters
+        ----------
+        refset: a dataset (list of records)
+
+        targetset: a dataset (list of records)
+        """
+        self._fit(refset, targetset)
+        # Keep ids for blocks building
+        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
+        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
+        self.is_fitted = True
+
+    def iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the (index, id) pairs of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        return self._iter_blocks()
+
+    def iter_indice_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self._iter_blocks():
+            yield [r[0] for r in block1], [r[0] for r in block2]
+
+    def iter_id_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the ids of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self._iter_blocks():
+            yield [r[1] for r in block1], [r[1] for r in block2]
+
+    def iter_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always ((ind_reference, id_reference),
+                                              (ind_target, id_target)),
+                        i.e. the index and the id of the records in the corresponding datasets.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_blocks():
+            for val1 in block1:
+                for val2 in block2:
+                    yield val1, val2
+
+    def iter_indice_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always (ind_reference, ind_target),
+                        i.e. the indexes of the records in the corresponding datasets.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_indice_blocks():
+            for val1 in block1:
+                for val2 in block2:
+                    yield val1, val2
+
+    def iter_id_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always (id_reference, id_target),
+                        i.e. the ids of the records in the corresponding datasets.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_id_blocks():
+            for val1 in block1:
+                for val2 in block2:
+                    yield val1, val2
+
+    def cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.is_fitted = True
+        self._cleanup()
+
+
+###############################################################################
+### KEY BLOCKING ##############################################################
+###############################################################################
+class KeyBlocking(BaseBlocking):
+    """ This blocking technique is based on a a blocking criteria
+    (or blocking key), that will be used to divide the datasets.
+
+    The main idea here is:
+
+    1 - to create an index of f(x) for each x in the reference set.
+
+    2 - to create an index of f(y) for each y in the target set.
+
+    3 - to iterate on each distinct value of f(x) and to return
+        the identifiers of the records of both sets for this value.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
+        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.callback = callback
+        self.ignore_none = ignore_none
+        self.reference_index = {}
+        self.target_index = {}
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        for ind, rec in enumerate(refset):
+            key = self.callback(rec[self.ref_attr_index])
+            if not key and self.ignore_none:
+                continue
+            self.reference_index.setdefault(key, []).append((ind, rec[0]))
+        for ind, rec in enumerate(targetset):
+            key = self.callback(rec[self.target_attr_index])
+            if not key and self.ignore_none:
+                continue
+            self.target_index.setdefault(key, []).append((ind, rec[0]))
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        for key, block1 in self.reference_index.iteritems():
+            block2 = self.target_index.get(key)
+            if block1 and block2:
+                yield (block1, block2)
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.reference_index = {}
+        self.target_index = {}
+
+
+class SoundexBlocking(KeyBlocking):
+
+    def __init__(self, ref_attr_index, target_attr_index, language='french',):
+        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
+                                              partial(soundexcode, language=language))
+
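+# Illustrative usage sketch (not part of the original changeset): block records
+# on the soundex code of the name stored in column 1 (default language is french).
+#
+#     refset = [['R1', 'Dupont'], ['R2', 'Durand']]
+#     targetset = [['T1', 'Dupond'], ['T2', 'Martin']]
+#     blocking = SoundexBlocking(ref_attr_index=1, target_attr_index=1)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_blocks():
+#         print refblock, targetblock   # e.g. [(0, 'R1')] [(0, 'T1')]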
+
+###############################################################################
+### BIGRAM BLOCKING ###########################################################
+###############################################################################
+class NGramBlocking(BaseBlocking):
+    """ This blocking technique is based on a a n-gram key.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
+        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.ngram_size = ngram_size
+        self.depth = depth
+        self.reference_index = {}
+        self.target_index = {}
+
+    def _fit_dataset(self, dataset, cur_index, attr_index):
+        """ Fit a dataset
+        """
+        for ind, r in enumerate(dataset):
+            cur_dict = cur_index
+            text = r[attr_index]
+            for i in range(self.depth):
+                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
+                if i < self.depth - 1:
+                    cur_dict = cur_dict.setdefault(ngram, {})
+            cur_dict.setdefault(ngram, []).append((ind, r[0]))
+
+    def _fit(self, refset, targetset):
+        """ Fit the two sets (reference set and target set)
+        """
+        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
+        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
+
+    def _iter_dict(self, ref_cur_dict, target_cur_dict):
+        """ Iterative function used to create blocks from dicts
+        """
+        for key, sub_dict in ref_cur_dict.iteritems():
+            if key in target_cur_dict:
+                if isinstance(sub_dict, dict):
+                    # There is another dict layer
+                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
+                        yield block1, block2
+                else:
+                    # This is a list
+                    yield sub_dict, target_cur_dict[key]
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
+            if block1 and block2:
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.reference_index = {}
+        self.target_index = {}
+
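+# Illustrative usage sketch (not part of the original changeset): records
+# sharing their first `depth` n-grams on column 1 ('pa' then 'ri' for 'paris')
+# end up in the same block.
+#
+#     refset = [['R1', 'paris'], ['R2', 'london']]
+#     targetset = [['T1', 'paris 8eme'], ['T2', 'lisbon']]
+#     blocking = NGramBlocking(ref_attr_index=1, target_attr_index=1, ngram_size=2, depth=2)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_id_blocks():
+#         print refblock, targetblock   # e.g. ['R1'] ['T1']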
+
+###############################################################################
+### SORTKEY BLOCKING ##########################################################
+###############################################################################
+class SortedNeighborhoodBlocking(BaseBlocking):
+    """ This blocking technique is based on a a sorting blocking criteria
+    (or blocking key), that will be used to divide the datasets.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
+        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.key_func = key_func
+        self.window_width = window_width
+        self.sorted_dataset = None
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        self.sorted_dataset = [((ind, r[0]), r[self.ref_attr_index], 0)
+                               for ind, r in enumerate(refset)]
+        self.sorted_dataset.extend([((ind, r[0]), r[self.target_attr_index], 1)
+                                    for ind, r in enumerate(targetset)])
+        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+        """
+        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
+            # Only keep reference set record
+            if dset == 1:
+                continue
+            block1 = [rid,]
+            minind = (ind - self.window_width)
+            minind = minind if minind >=0 else 0
+            maxind = (ind + self.window_width + 1)
+            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
+                      if d == 1]
+            if block1 and block2:
+                yield (block1, block2)
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.sorted_dataset = None
+
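+# Illustrative usage sketch (not part of the original changeset): both datasets
+# are sorted together on column 1, and each reference record is compared with
+# the target records falling inside the sliding window.
+#
+#     refset = [['R1', 'abc'], ['R2', 'xyz']]
+#     targetset = [['T1', 'abd'], ['T2', 'xyy']]
+#     blocking = SortedNeighborhoodBlocking(ref_attr_index=1, target_attr_index=1,
+#                                           window_width=1)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_blocks():
+#         print refblock, targetblock   # e.g. [(0, 'R1')] [(0, 'T1')]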
+
+###############################################################################
+### MERGE BLOCKING ############################################################
+###############################################################################
+class MergeBlocking(BaseBlocking):
+    """ This blocking technique keep only one appearance of one given values,
+    and removes all the other records having this value.
+    The merge is based on a score function
+
+    E.g.
+      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
+      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
+
+    could become (with a score function based on the population, i.e. the third value):
+
+      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
+
+    !!! WARNING !!! This is only done on ONE set (the one with a non-None attr index)
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, score_func):
+        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.score_func = score_func
+        self.merged_dataset = None
+        self.other_dataset = None
+        if ref_attr_index is None and target_attr_index is None:
+            raise ValueError('At least one of ref_attr_index or target_attr_index '
+                             'should not be None')
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        if self.ref_attr_index is not None:
+            # Merge refset
+            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
+            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
+        else:
+            # Merge targetset
+            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
+            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
+
+    def _merge_dataset(self, dataset, attr_index):
+        """ Merge a dataset
+        """
+        merged_dataset_dict = {}
+        for ind, record in enumerate(dataset):
+            score = self.score_func(record)
+            if record[attr_index] not in merged_dataset_dict:
+                # Create new entry
+                merged_dataset_dict[record[attr_index]] = (ind, record, score)
+            elif (record[attr_index] in merged_dataset_dict
+                  and merged_dataset_dict[record[attr_index]][2] < score):
+                # Change current score
+                merged_dataset_dict[record[attr_index]] = (ind, record, score)
+        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.itervalues()]
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+        """
+        if self.ref_attr_index is not None:
+            yield self.merged_dataset, self.other_dataset
+        else:
+            # self.target_attr_index is not None
+            yield self.other_dataset, self.merged_dataset
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.merged_dataset = None
+        self.other_dataset = None
+
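+# Illustrative usage sketch (not part of the original changeset): merge the
+# reference records sharing the same label (column 1), keeping the one with the
+# highest population (column 2), and compare the survivors with the whole target set.
+#
+#     refset = [['http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898],
+#               ['http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100]]
+#     targetset = [['T1', 'Paris']]
+#     blocking = MergeBlocking(ref_attr_index=1, target_attr_index=None,
+#                              score_func=lambda r: r[2])
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_blocks():
+#         print refblock, targetblock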
+
+###############################################################################
+### CLUSTERING-BASED BLOCKINGS ################################################
+###############################################################################
+class KmeansBlocking(BaseBlocking):
+    """ A blocking technique based on Kmeans
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
+        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.n_clusters = n_clusters
+        self.kmeans = None
+        self.predicted = None
+        from sklearn import cluster
+        self.cluster_class = cluster.KMeans
+
+    def _fit(self, refset, targetset):
+        """ Fit the reference dataset.
+        """
+        # If an element is None (missing), use instead the identity element.
+        # The identity element is defined as the 0-vector
+        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
+        # We assume here that there are at least 2 elements in the refset
+        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
+        kmeans =  self.cluster_class(n_clusters=n_clusters)
+        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
+        self.kmeans = kmeans
+        # Predict on targetset
+        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
+                                              or idelement for elt in targetset])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
+        for ind, li in enumerate(self.predicted):
+            neighbours[li][1].append(self.targetids[ind])
+        for ind, li in enumerate(self.kmeans.labels_):
+            neighbours[li][0].append(self.refids[ind])
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.kmeans = None
+        self.predicted = None
+
+
+###############################################################################
+### KDTREE BLOCKINGS ##########################################################
+###############################################################################
+class KdTreeBlocking(BaseBlocking):
+    """ A blocking technique based on KdTree
+    """
+    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
+        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.threshold = threshold
+        self.reftree = None
+        self.targettree = None
+        self.nb_elements = None
+
+    def _fit(self, refset, targetset):
+        """ Fit the blocking
+        """
+        firstelement = refset[0][self.ref_attr_index]
+        self.nb_elements = len(refset)
+        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
+        idelement = (0,) * idsize
+        # KDTree is expecting a two-dimensional array
+        if idsize == 1:
+            self.reftree = KDTree([(elt[self.ref_attr_index],) if elt[self.ref_attr_index] is not None
+                                   else idelement for elt in refset])
+            self.targettree = KDTree([(elt[self.target_attr_index],) if elt[self.target_attr_index] is not None
+                                      else idelement for elt in targetset])
+        else:
+            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
+            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
+        neighbours = []
+        for ind in xrange(self.nb_elements):
+            if not extraneighbours[ind]:
+                continue
+            _ref = [self.refids[ind],]
+            _target = [self.targetids[v] for v in extraneighbours[ind]]
+            neighbours.append((_ref, _target))
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.reftree = None
+        self.targettree = None
+        self.nb_elements = None
+
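+# Illustrative usage sketch (not part of the original changeset): block on the
+# 2D coordinates stored in column 1; records closer than `threshold` (in the
+# coordinate units, here degrees) fall in the same block.
+#
+#     refset = [['R1', (48.8566, 2.3522)], ['R2', (43.2965, 5.3698)]]
+#     targetset = [['T1', (48.8570, 2.3510)], ['T2', (51.5074, -0.1278)]]
+#     blocking = KdTreeBlocking(ref_attr_index=1, target_attr_index=1, threshold=0.1)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_id_blocks():
+#         print refblock, targetblock   # e.g. ['R1'] ['T1']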
+
+###############################################################################
+### MINHASHING BLOCKINGS ######################################################
+###############################################################################
+class MinHashingBlocking(BaseBlocking):
+    """ A blocking technique based on MinHashing
+    """
+    def __init__(self, ref_attr_index, target_attr_index,
+                 threshold=0.1, kwordsgram=1, siglen=200):
+        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.threshold = threshold
+        self.kwordsgram = kwordsgram
+        self.siglen = siglen
+        self.minhasher = Minlsh()
+        self.nb_elements = None
+
+    def _fit(self, refset, targetset):
+        """ Find the blocking using minhashing
+        """
+        # If an element is None (missing), use instead the identity element.
+        idelement = ''
+        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
+                        [elt[self.target_attr_index] or idelement for elt in targetset],
+                        self.kwordsgram, self.siglen)
+        self.nb_elements = len(refset)
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        rawneighbours = self.minhasher.predict(self.threshold)
+        neighbours = []
+        for data in rawneighbours:
+            neighbours.append([[], []])
+            for i in data:
+                if i >= self.nb_elements:
+                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
+                else:
+                    neighbours[-1][0].append(self.refids[i])
+            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
+                neighbours.pop()
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.minhasher = Minlsh()
+        self.nb_elements = None
+
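+# Illustrative usage sketch (not part of the original changeset): block on a
+# textual attribute; records whose word k-grams are similar enough (estimated
+# Jaccard similarity above `threshold`) fall in the same block. The result is
+# probabilistic by nature.
+#
+#     refset = [['R1', 'victor hugo les miserables'], ['R2', 'emile zola germinal']]
+#     targetset = [['T1', 'les miserables victor hugo'], ['T2', 'marcel proust']]
+#     blocking = MinHashingBlocking(ref_attr_index=1, target_attr_index=1, threshold=0.4)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_id_blocks():
+#         print refblock, targetblock   # e.g. ['R1'] ['T1']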
+
+###############################################################################
+### BLOCKING PIPELINE #########################################################
+###############################################################################
+class PipelineBlocking(BaseBlocking):
+    """ Pipeline multiple blocking techniques
+    """
+
+    def __init__(self, blockings, collect_stats=False):
+        """ Build the blocking object
+
+        Parameters
+        ----------
+
+        blockings: ordered list of blocking objects
+
+        collect_stats: if True, collect the sizes of the blocks produced by
+                       each blocking step in the `stats` attribute
+        """
+        self.blockings = blockings
+        self.stored_blocks = []
+        self.collect_stats = collect_stats
+        self.stats = {}
+
+    def _fit(self, refset, targetset):
+        """ Internal fit of the pipeline """
+        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
+
+    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
+        """ Recursive fit of the blockings.
+        Blocks are stored in the stored_blocks attribute.
+        """
+        if ind < len(self.blockings) - 1:
+            # There are other blockings after this one
+            blocking = self.blockings[ind]
+            blocking.cleanup()
+            blocking.fit([refset[i] for i in ref_index],
+                         [targetset[i] for i in target_index])
+            for block1, block2 in blocking.iter_indice_blocks():
+                ind_block1 = [ref_index[i] for i in block1]
+                ind_block2 = [target_index[i] for i in block2]
+                if self.collect_stats:
+                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
+                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
+        else:
+            # This is the final blocking
+            blocking = self.blockings[ind]
+            blocking.cleanup()
+            blocking.fit([refset[i] for i in ref_index],
+                         [targetset[i] for i in target_index])
+            for block1, block2 in blocking.iter_blocks():
+                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
+                ind_block2 = [(target_index[i], _id) for i, _id in block2]
+                if self.collect_stats:
+                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
+                self.stored_blocks.append((ind_block1, ind_block2))
+
+    def _iter_blocks(self):
+        """ Internal iteration function over blocks
+        """
+        for block1, block2 in self.stored_blocks:
+            if block1 and block2:
+                yield block1, block2
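+
+
+# Illustrative usage sketch (not part of the original changeset): first split
+# the datasets with a cheap soundex key, then refine each block with minhashing.
+# `refset` and `targetset` are assumed to carry a name in column 1.
+#
+#     blocking = PipelineBlocking([SoundexBlocking(ref_attr_index=1, target_attr_index=1),
+#                                  MinHashingBlocking(ref_attr_index=1, target_attr_index=1,
+#                                                     threshold=0.4)],
+#                                 collect_stats=True)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_blocks():
+#         print refblock, targetblock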
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/record_linkage/old_api.py	Thu Dec 19 14:41:02 2013 +0000
@@ -0,0 +1,432 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from os import listdir
+import os.path as osp
+from shutil import rmtree
+from tempfile import mkdtemp
+import sys
+import warnings
+from functools import partial
+
+from scipy.sparse import lil_matrix
+
+from nazca.dataio import write_results, split_file, parsefile
+from nazca.normalize import BaseNormalizer, NormalizerPipeline
+from nazca.distances import GeographicalProcessing
+from nazca.record_linkage.aligner import BaseAligner
+from nazca.record_linkage.blocking import KmeansBlocking, KdTreeBlocking, MinHashingBlocking
+
+
+# Backward compatibility. Now, use the BaseAligner inside the functions.
+# Perhaps these functions may be removed later...
+
+
+###############################################################################
+### NORMALIZE FUNCTIONS #######################################################
+###############################################################################
+# Backward compatibility. Now, use the NormalizerPipeline inside the functions.
+# Perhaps these functions may be removed later...
+
+def normalize_set(rset, processings):
+    """ Apply all the normalization functions to the given rset """
+    warnings.warn(DeprecationWarning('This function will be removed '
+                                     'in the next release.'
+                                     'You should rather use the BaseNormalizer '
+                                     'object of the normalize module'))
+    normalizers = []
+    for ind, processing in processings.iteritems():
+        for normalizer in extract_normalization_from_treatment(processing, ind):
+            normalizers.append(normalizer)
+    # Create pipeline
+    pipeline = NormalizerPipeline(normalizers)
+    return pipeline.normalize_dataset(rset)
+
+def extract_normalization_from_treatment(processing, ind):
+    """ Extract normalization from processing.
+    This function is used for backward compatibility with
+    the old function-based API """
+    warnings.warn(DeprecationWarning('This function will be removed '
+                                     'in the next release.'
+                                     'You should rather use the BaseNormalizer '
+                                     'object of the normalize module'))
+    for f in processing.get('normalization', []):
+        farg = f.func_code.co_varnames #List of the arguments of f
+        # A kind of union between the arguments needed by f, and the
+        # provided ones
+        givenargs = dict((arg, processing['norm_params'][arg])
+                         for arg in farg if arg in processing.get('norm_params', []))
+        callback = f
+        if givenargs:
+            callback = partial(callback, **givenargs)
+        yield BaseNormalizer(callback=callback, attr_index=ind)
+
+def extract_treatment_from_treatment(processing, ind):
+    """ Extract Treatment object from processing dict.
+    This is only for backward compatibility with the old API.
+    """
+    if processing['metric'] == 'geographical':
+        return GeographicalProcessing(ind, ind,
+                                     matrix_normalized=processing.get('matrix_normalized', False),
+                                     **processing.get('metric_params', {}))
+
+
+###############################################################################
+### ALIGNER ###################################################################
+###############################################################################
+def align(alignset, targetset, threshold, processings=None, resultfile=None,
+          _applyNormalization=True):
+    """ Try to align the items of alignset onto targetset's ones
+
+        `alignset` and `targetset` are the sets to align. Each set contains
+        lists where the first column is the identifier of the item, and the
+        others are the attributes to align. (Note that the order is important!)
+        Both must have the same number of columns.
+
+        `processings` is a dictionary of dictionaries.
+        Each key is the index of an attribute (column), and each value is a
+        dictionary describing the processing to apply to that attribute.
+        Each dictionary is built as follows:
+
+            processing = {'normalization': [f1, f2, f3],
+                         'norm_params': {'arg1': arg01, 'arg2': arg02},
+                         'metric': d1,
+                         'metric_params': {'arg1': arg11},
+                         'weighting': w,
+                         'matrix_normalized': True
+                        }
+
+            `normalization` is the list of functions called to normalize the
+            given attribute (in order). Each function is called with `norm_params`
+            as keyword arguments
+
+            Idem for `metric` and `metric_params`
+
+            `weighting` is the weighting of the current attribute with regard to
+            the others
+
+            `resultfile` (default None): if given, write the matched elements to this file.
+
+        Return the distance matrix and the matched list.
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    processings = processings or {}
+    # Get the normalizers
+    normalizers = []
+    for ind, processing in processings.iteritems():
+        for normalizer in extract_normalization_from_treatment(processing, ind):
+            normalizers.append(normalizer)
+    # Cleanup processings
+    for t in processings.itervalues():
+        if 'normalization' in t:
+            t.pop('normalization')
+        if 'norm_params' in t:
+            t.pop('norm_params')
+    # Build aligner
+    processings = [extract_treatment_from_treatment(t, ind) for ind, t in processings.iteritems()]
+    aligner = BaseAligner(threshold, processings)
+    aligner.register_ref_normalizer(normalizers)
+    aligner.register_target_normalizer(normalizers)
+    # Align
+    return aligner.align(alignset, targetset)
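+
+# A minimal, hypothetical usage sketch for the deprecated `align` function.
+# The record sets and the threshold below are made up, and 'geographical' is
+# used because it is the only metric that extract_treatment_from_treatment
+# converts in this backward-compatibility module.
+#
+#     from nazca.record_linkage.old_api import align
+#     refset = [['R1', (48.8566, 2.3522)], ['R2', (45.7640, 4.8357)]]
+#     targetset = [['T1', (48.8570, 2.3530)], ['T2', (45.7600, 4.8400)]]
+#     processings = {1: {'metric': 'geographical'}}
+#     global_mat, global_matched = align(refset, targetset, threshold=500,
+#                                        processings=processings)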
+
+def subalign(alignset, targetset, alignind, targetind, threshold,
+             processings=None, _applyNormalization=True):
+    """ Compute a subalignment for a list of indices of the alignset and
+    a list of indices for the targetset """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    mat, matched = align([alignset[i[0]] for i in alignind],
+                         [targetset[i[0]] for i in targetind], threshold,
+                         processings, _applyNormalization=_applyNormalization)
+    new_matched = {}
+    for k, values in matched.iteritems():
+        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
+    return mat, new_matched
+
+def conquer_and_divide_alignment(alignset, targetset, threshold, processings=None,
+                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
+                                 n_clusters=None, kwordsgram=1, siglen=200,
+                                 get_global_mat=True):
+    """ Full conquer and divide method for alignment.
+    Compute neighbours and merge the different subalignments.
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    global_matched = {}
+    if get_global_mat:
+        global_mat = lil_matrix((len(alignset), len(targetset)))
+
+    processings = processings or {}
+    ralignset = normalize_set(alignset, processings)
+    rtargetset = normalize_set(targetset, processings)
+
+    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
+                                              neighbours_threshold, n_clusters,
+                                              kwordsgram, siglen):
+        _, matched = subalign(alignset, targetset, alignind, targetind,
+                                threshold, processings, _applyNormalization=False)
+        for k, values in matched.iteritems():
+            subdict = global_matched.setdefault(k, set())
+            for v, d in values:
+                subdict.add((v, d))
+                # XXX avoid issue in sparse matrix
+                if get_global_mat:
+                    global_mat[k[0], v[0]] = d or 10**(-10)
+    if get_global_mat:
+        return global_mat, global_matched
+    return global_matched
+
+def alignall(alignset, targetset, threshold, processings=None,
+             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
+             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
+    """ Align alignset onto targetset and yield the matched (alignset id,
+    targetset id) pairs; if `uniq`, yield only the best match per item. """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    if not mode:
+        _, matched = align(alignset, targetset, threshold, processings,
+                           resultfile=None, _applyNormalization=True)
+    else:
+        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
+                                               processings, indexes, mode,
+                                               neighbours_threshold, n_clusters,
+                                               kwordsgram, siglen,
+                                               get_global_mat=False)
+
+    if not uniq:
+        for alignid in matched:
+            for targetid, _ in matched[alignid]:
+                yield alignset[alignid[0]][0], targetset[targetid[0]][0]
+    else:
+        for alignid in matched:
+            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
+            yield alignset[alignid[0]][0], targetset[bestid[0]][0]
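+
+# Hypothetical sketch: `alignall` is a generator, so the matched pairs are
+# obtained by iterating over it; with `uniq=True` only the closest target is
+# kept for each alignset item. The sets, threshold and processings reuse the
+# made-up values from the `align` sketch above.
+#
+#     pairs = list(alignall(refset, targetset, threshold=500,
+#                           processings=processings, mode='kdtree',
+#                           uniq=True))
+#     # each element of `pairs` is an (alignset id, targetset id) tuple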
+
+def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
+                       threshold, size=10000, equality_threshold=0.01,
+                       processings=None, indexes=(1,1), mode='kdtree',
+                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
+                       siglen=200, cache=None):
+    """ This function helps you to align *huge* files.
+        It takes your csv files as arguments and split them into smaller ones
+        (files of `size` lines), and runs the alignment on those files.
+
+        `alignformat` and `targetformat` are keyworded arguments given to the
+        nazca.dataio.parsefile function.
+
+        This function returns its own cache. The cache is quite simply a
+        dictionary having align items' id as keys and tuples (target item's id,
+        distance) as value. This dictionary can be regiven to this function to
+        perform another alignment (with different parameters, or just to be
+        sure everything has been caught)
+
+        If the distance of an alignment is below `equality_threshold`, the
+        alignment is considered as perfect, and the corresponding item is
+        removed from the alignset (to speed up the computation).
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the BaseAligner '
+                                     'object of the aligner module'))
+    # Split the huge files into smaller ones
+    aligndir = mkdtemp()
+    targetdir = mkdtemp()
+    alignfiles = split_file(alignfile, aligndir, size)
+    targetfiles = split_file(targetfile, targetdir, size)
+
+    # Compute the number of iterations needed to achieve the alignment
+    nb_iterations = len(alignfiles) * len(targetfiles)
+    current_it = 0
+
+    cache = cache or {} # Contains the best known alignments
+    # Contains the ids of perfectly aligned data
+    doneids = set(_id for _id, (_, dist) in cache.iteritems()
+                          if dist < equality_threshold)
+
+    try:
+        for alignfile in alignfiles:
+            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
+                        if a[0] not in doneids]
+            for targetfile in targetfiles:
+                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
+                matched = conquer_and_divide_alignment(alignset, targetset,
+                                                       threshold,
+                                                       processings=processings,
+                                                       indexes=indexes,
+                                                       mode=mode,
+                                                       neighbours_threshold=neighbours_threshold,
+                                                       n_clusters=n_clusters,
+                                                       kwordsgram=kwordsgram,
+                                                       siglen=siglen,
+                                                       get_global_mat=False)
+                for alignid in matched:
+                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
+                    # Get the best known distance
+                    _, current_dist = cache.get(alignset[alignid[0]][0], (None, None))
+                    if current_dist is None or current_dist > dist:
+                        # If it's better, update the cache
+                        cache[alignset[alignid[0]][0]] = (targetset[bestid[0]][0], dist)
+                        if dist <= equality_threshold:
+                            # If perfect, stop trying to align this one
+                            doneids.add(alignset[alignid[0]][0])
+
+                current_it += 1
+                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
+                                                nb_iterations))
+                sys.stdout.flush()
+                if doneids:
+                    alignset = [a for a in alignset if a[0] not in doneids]
+                if not alignset: # All items have been aligned
+                    # TODO Increment current_it.
+                    # The progress of the alignment process is computed from
+                    # `current_it`. If all items of `alignset` are aligned, we
+                    # stop the alignment process for this `alignset`, and if
+                    # `current_it` is not incremented, the reported progress
+                    # will be wrong.
+                    break
+
+    finally:
+        rmtree(aligndir)
+        rmtree(targetdir)
+
+    return cache
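+
+# Hypothetical sketch of the cache round-trip described in the docstring: the
+# returned dictionary maps an alignset identifier to a (targetset identifier,
+# distance) tuple and can be passed back in to refine a previous run. The file
+# names are placeholders and the empty dicts stand for whatever keyword
+# arguments nazca.dataio.parsefile expects for the actual files.
+#
+#     cache = alignall_iterative('align.csv', 'target.csv', {}, {},
+#                                threshold=500, size=5000)
+#     cache = alignall_iterative('align.csv', 'target.csv', {}, {},
+#                                threshold=500, size=5000, cache=cache)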
+
+
+###############################################################################
+### CLUSTERING-BASED BLOCKINGS FUNCTIONS ######################################
+###############################################################################
+# Backward compatibility: these functions now use the blocking objects
+# internally. They may be removed in a later release.
+def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
+                              mode='kmeans', n_clusters=None):
+    """ Find the neigbhours using clustering (kmeans or minibatchkmeans)
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the KmeansBlocking '
+                                     'object of the blocking module'))
+    if mode == 'kmeans':
+        blocking = KmeansBlocking(ref_attr_index=indexes[0],
+                                  target_attr_index=indexes[1],
+                                  n_clusters=n_clusters)
+    elif mode == 'minibatch':
+        blocking = MiniBatchKmeansBlocking(ref_attr_index=indexes[0],
+                                           target_attr_index=indexes[1],
+                                           n_clusters=n_clusters)
+    else:
+        raise ValueError("Mode should be 'kmeans' or 'minibatch'")
+    # Fit blocking object
+    blocking.fit(alignset, targetset)
+    return list(blocking.iter_blocks())
+
+def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
+    """ Find the neigbhours using kdree
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the KdTreeBlocking '
+                                     'object of the blocking module'))
+    blocking = KdTreeBlocking(ref_attr_index=indexes[0],
+                              target_attr_index=indexes[1],
+                              threshold=threshold)
+    blocking.fit(alignset, targetset)
+    return list(blocking.iter_blocks())
+
+def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
+                              kwordsgram=1, siglen=200):
+    """ Find the neigbhours using minhashing
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the '
+                                     'MinHashingBlocking '
+                                     'object of the blocking module'))
+    blocking = MinHashingBlocking(ref_attr_index=indexes[0],
+                                  target_attr_index=indexes[1],
+                                  threshold=threshold, kwordsgram=kwordsgram,
+                                  siglen=siglen)
+    blocking.fit(alignset, targetset)
+    return list(blocking.iter_blocks())
+
+def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
+                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
+    """ This function helps to find neighbours from items of alignset and
+        targetset. “Neighbours” are items that are “not so far”, ie having a
+        close label, are located in the same area etc.
+
+        This function handles two types of neighbouring : text and numeric.
+        For text value, you have to use the “minhashing” and for numeric, you
+        can choose from “kdtree“, “kdmeans“ and “minibatch”
+
+        The arguments are:
+            - `alignset` and `targetset` are the sets in which neighbours
+              have to be found.
+            - `indexes` are the column indexes of the attributes to compare
+            - `mode` is the search type to use
+            - `neighbours_threshold` is the distance threshold used by the
+              chosen `mode`
+
+            - `n_clusters` is used for "kmeans" and "minibatch" methods, and it
+              is the number of clusters to use.
+
+            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
+              is the length of wordsgrams to use, and `siglen` is the length of
+              the minhashing signature matrix.
+
+        Return a list of lists, built as follows:
+            [
+                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
+                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
+                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
+                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
+                ...
+            ]
+    """
+    warnings.warn(DeprecationWarning('This function will be removed in the next '
+                                     'release.'
+                                     ' You should rather use the '
+                                     'BaseBlocking '
+                                     'objects of the blocking module'))
+    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
+    mode = mode.lower()
+
+    if mode not in SEARCHERS:
+        raise NotImplementedError('Unknown mode given')
+    if mode == 'kdtree':
+        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
+    elif mode == 'minhashing':
+        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
+                                         kwordsgram, siglen)
+    elif mode in set(['kmeans', 'minibatch']):
+        try:
+            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
+        except ImportError:
+            raise NotImplementedError('scikit-learn does not seem to be installed')
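+
+# Hypothetical sketch tying the blocking helpers together: each block returned
+# by `findneighbours` pairs alignset entries with targetset entries and can be
+# fed to `subalign`. The data, threshold and processings are the made-up
+# values from the sketches above.
+#
+#     blocks = findneighbours(refset, targetset, indexes=(1, 1), mode='kdtree',
+#                             neighbours_threshold=0.1)
+#     for alignind, targetind in blocks:
+#         mat, matched = subalign(refset, targetset, alignind, targetind,
+#                                 threshold=500, processings=processings)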
--- a/test/test_alignment.py	Tue Oct 22 16:04:05 2013 +0200
+++ b/test/test_alignment.py	Thu Dec 19 14:41:02 2013 +0000
@@ -22,8 +22,8 @@
 from os import path
 
 from nazca.normalize import simplify
-import nazca.aligner as alig
-import nazca.blocking as blo
+import nazca.record_linkage.aligner as alig
+import nazca.record_linkage.blocking as blo
 from nazca.distances import LevenshteinProcessing, GeographicalProcessing
 
 
--- a/test/test_blocking.py	Tue Oct 22 16:04:05 2013 +0200
+++ b/test/test_blocking.py	Thu Dec 19 14:41:02 2013 +0000
@@ -23,11 +23,11 @@
 
 from nazca.distances import (levenshtein, soundex, soundexcode,   \
                              jaccard, euclidean, geographical)
-from nazca.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
-                            MergeBlocking,
-                            NGramBlocking, PipelineBlocking,
-                            SoundexBlocking, KmeansBlocking,
-                            MinHashingBlocking, KdTreeBlocking)
+from nazca.record_linkage.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
+                                           MergeBlocking,
+                                           NGramBlocking, PipelineBlocking,
+                                           SoundexBlocking, KmeansBlocking,
+                                           MinHashingBlocking, KdTreeBlocking)
 from nazca.normalize import SimplifyNormalizer, loadlemmas
 
 
--- a/test/test_old_api.py	Tue Oct 22 16:04:05 2013 +0200
+++ b/test/test_old_api.py	Thu Dec 19 14:41:02 2013 +0000
@@ -22,13 +22,13 @@
 from os import path
 
 from nazca.normalize import loadlemmas, simplify
-from nazca.old_api import (normalize_set,
-                           findneighbours_clustering,
-                           findneighbours_kdtree,
-                           findneighbours_minhashing,
-                           align, subalign,
-                           conquer_and_divide_alignment,
-                           alignall, alignall_iterative)
+from nazca.record_linkage.old_api import (normalize_set,
+                                          findneighbours_clustering,
+                                          findneighbours_kdtree,
+                                          findneighbours_minhashing,
+                                          align, subalign,
+                                          conquer_and_divide_alignment,
+                                          alignall, alignall_iterative)
 
 
 TESTDIR = path.dirname(__file__)