autopep8
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Mon, 03 Sep 2018 10:17:09 +0200
changeset 539 7cd2b9a4d60c
parent 538 98d6979ec07d
child 540 03c8777469f5
autopep8
nazca/__pkginfo__.py
nazca/data/__init__.py
nazca/data/countries.py
nazca/data/stopwords.py
nazca/ner/__init__.py
nazca/ner/filters.py
nazca/ner/preprocessors.py
nazca/ner/sources.py
nazca/rl/aligner.py
nazca/rl/blocking.py
nazca/utils/dataio.py
nazca/utils/distances.py
nazca/utils/minhashing.py
nazca/utils/normalize.py
nazca/utils/tokenizer.py
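
This changeset is a mechanical PEP 8 clean-up of the listed modules, produced with autopep8 (operator and comment spacing, long-line wrapping, argument re-indentation). The sketch below shows one way such a pass can be reproduced with the autopep8 Python API on a single file; the exact options used for this commit are not recorded in the changeset, so the aggressiveness level and line length below are assumptions. An equivalent command-line form would be along the lines of "autopep8 --in-place --recursive nazca/".

# Minimal sketch, not the exact invocation used for this commit.
# Assumes autopep8 is installed; the 'aggressive' and 'max_line_length'
# values are guesses based on the kinds of fixes visible in the diff.
import autopep8

path = 'nazca/utils/distances.py'  # any of the files listed above
with open(path) as f:
    source = f.read()

# fix_code() returns the reformatted source as a string.
fixed = autopep8.fix_code(source, options={'aggressive': 1,
                                           'max_line_length': 79})

with open(path, 'w') as f:
    f.write(fixed)
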
--- a/nazca/__pkginfo__.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/__pkginfo__.py	Mon Sep 03 10:17:09 2018 +0200
@@ -26,7 +26,7 @@
 numversion = (0, 9, 0)
 version = '.'.join([str(num) for num in numversion])
 
-license = 'LGPL' # 2.1 or later
+license = 'LGPL'  # 2.1 or later
 description = "Python library for data alignment"
 web = "https://www.logilab.org/project/nazca"
 author = "Logilab"
--- a/nazca/data/__init__.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/data/__init__.py	Mon Sep 03 10:17:09 2018 +0200
@@ -4,4 +4,5 @@
 import os.path as osp
 
 HERE = osp.join(osp.abspath(osp.dirname(__file__)))
-FRENCH_LEMMAS = dict([t.strip().split('\t') for t in open(osp.join(HERE, 'french_lemmas.txt'))])
+FRENCH_LEMMAS = dict([t.strip().split('\t')
+                      for t in open(osp.join(HERE, 'french_lemmas.txt'))])
--- a/nazca/data/countries.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/data/countries.py	Mon Sep 03 10:17:09 2018 +0200
@@ -991,4 +991,4 @@
                         u'\xceles de la Mer \xc9g\xe9e m\xe9ridionale': u'Gr\xe8ce',
                         u'\xceles de la Mer \xc9g\xe9e septentrionale': u'Gr\xe8ce',
                         u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis': u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis'
-                                                }
+                        }
--- a/nazca/data/stopwords.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/data/stopwords.py	Mon Sep 03 10:17:09 2018 +0200
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+# flake8: noqa
 """
 Stopwords in different languages.
 """
--- a/nazca/ner/__init__.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/ner/__init__.py	Mon Sep 03 10:17:09 2018 +0200
@@ -11,7 +11,8 @@
     """ High-level process for Named Entities Recognition
     """
 
-    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
+    def __init__(self, ner_sources, preprocessors=None,
+                 filters=None, unique=False):
         """ Initialise the class.
 
         :tokenizer: an instance of tokenizer
@@ -50,7 +51,7 @@
         named_entities = []
         for token in tokens:
             if token.start < last_stop:
-                continue # this token overlaps with a previous match
+                continue  # this token overlaps with a previous match
             word = token.word
             # Applies preprocessors
             # XXX Preprocessors may be sources dependant
--- a/nazca/ner/filters.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/ner/filters.py	Mon Sep 03 10:17:09 2018 +0200
@@ -19,6 +19,7 @@
     """ A filter based on the number of occurence of
     named entities in the results.
     """
+
     def __init__(self, min_occ=None, max_occ=None):
         self.min_occ = min_occ
         self.max_occ = max_occ
@@ -26,8 +27,8 @@
     def __call__(self, named_entities):
         uris = [u for u, p, t in named_entities]
         counts = dict([(u, uris.count(u)) for u in set(uris)])
-        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
-                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
+        return [n for n in named_entities if not ((self.min_occ and counts[n[0]] < self.min_occ)
+                                                  or (self.max_occ and counts[n[0]] > self.max_occ))]
 
 
 class NerRDFTypeFilter(object):
@@ -40,6 +41,7 @@
                                 'http://dbpedia.org/ontology/Place'))
 
     """
+
     def __init__(self, endpoint, accepted_types):
         self.endpoint = endpoint
         self.accepted_types = accepted_types
@@ -72,6 +74,7 @@
           Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
           by 'http://example.com/toto_tutu'
     """
+
     def __call__(self, named_entities):
         # Create parts dictionnary
         parts = {}
@@ -92,7 +95,8 @@
 class NerReplacementRulesFilter(object):
     """ Allow to define replacement rules for Named Entities
     """
-    def __init__(self,rules):
+
+    def __init__(self, rules):
         self.rules = rules
 
     def __call__(self, named_entities):
--- a/nazca/ner/preprocessors.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/ner/preprocessors.py	Mon Sep 03 10:17:09 2018 +0200
@@ -22,13 +22,14 @@
 class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
     """ Remove token based on the size of the word
     """
+
     def __init__(self, min_size=None, max_size=None):
         self.min_size = min_size
         self.max_size = max_size
 
     def __call__(self, token):
-        if ((self.min_size and len(token.word)<self.min_size)
-            or (self.max_size and len(token.word)>self.max_size)):
+        if ((self.min_size and len(token.word) < self.min_size)
+                or (self.max_size and len(token.word) > self.max_size)):
             return None
         return token
 
@@ -44,12 +45,13 @@
 class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
     """ Lower the first word of each sentence if it is a stopword.
     """
+
     def __init__(self, lang='en'):
         self.lang = lang
 
     def __call__(self, token):
         if (token.start == token.sentence.start and
-            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
+                token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
             word = token.word[0].lower() + token.word[1:]
             return Token(word, token.start, token.end, token.sentence)
         return token
@@ -58,13 +60,15 @@
 class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
     """ Remove stopwords
     """
+
     def __init__(self, split_words=False, lang='en'):
         self.split_words = split_words
         self.lang = lang
 
     def __call__(self, token):
         stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
-        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
+        if self.split_words and not [
+                w for w in token.word.split() if w.lower() not in stopwords]:
             return None
         if not self.split_words and token.word.lower() in stopwords:
             return None
@@ -74,6 +78,7 @@
 class NerHashTagPreprocessor(AbstractNerPreprocessor):
     """ Cleanup hashtag
     """
+
     def __call__(self, token):
         if token.word.startswith('@'):
             # XXX Split capitalize letter ?
--- a/nazca/ner/sources.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/ner/sources.py	Mon Sep 03 10:17:09 2018 +0200
@@ -12,7 +12,8 @@
     """ High-level source for Named Entities Recognition
     """
 
-    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
+    def __init__(self, endpoint, query, name=None,
+                 use_cache=True, preprocessors=None):
         """ Initialise the class.
         """
         self.endpoint = endpoint
@@ -51,6 +52,7 @@
 class NerSourceLexicon(AbstractNerSource):
     """ Source based on a (pre-computed) dictionnary of words (token, uri)
     """
+
     def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
         self.lexicon = lexicon
         self.name = name
@@ -60,7 +62,7 @@
 
     def query_word(self, word):
         uri = self.lexicon.get(word)
-        return [uri,] if uri else []
+        return [uri, ] if uri else []
 
 
 class NerSourceLocalRql(AbstractNerSource):
@@ -68,7 +70,8 @@
     Local RQL version
     """
 
-    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
+    def __init__(self, session, query, name=None,
+                 use_cache=True, preprocessors=None):
         """ Initialise the class.
         """
         self.query = query
@@ -81,7 +84,8 @@
     def query_word(self, word):
         """ Query a word for a Named Entities Recognition process
         """
-        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
+        return [r[0]
+                for r in self.session.execute(self.query, dict(word=word))]
 
 
 class NerSourceRql(AbstractNerSource):
@@ -94,9 +98,11 @@
         """
         if self.endpoint.startswith('http://'):
             # url
-            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
+            return [r[0]
+                    for r in rqlquery(self.endpoint, self.query % {'word': word})]
         else:
-            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
+            return [r[0]
+                    for r in rqlquery(self.endpoint, self.query, word=word)]
 
 
 class NerSourceSparql(AbstractNerSource):
@@ -107,18 +113,19 @@
    >>> ner_source = NerSourceSparql('''SELECT ?uri
                                          WHERE{
                                          ?uri rdfs:label "%(word)s"@en}''',
-			                 'http://dbpedia.org/sparql')
+                                         'http://dbpedia.org/sparql')
    >>> print ner_source.recognize_token('Victor Hugo')
-		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
-		     'http://dbpedia.org/resource/Victor_Hugo',
-		     'http://dbpedia.org/class/yago/VictorHugo',
-		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+                ... ['http://dbpedia.org/resource/Category:Victor_Hugo',
+                     'http://dbpedia.org/resource/Victor_Hugo',
+                     'http://dbpedia.org/class/yago/VictorHugo',
+                     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+                     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+                     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
 
     """
 
     def query_word(self, word):
         """ Query a word for a Named Entities Recognition process
         """
-        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
+        return [r[0] for r in sparqlquery(
+            self.endpoint, self.query % {'word': word})]
--- a/nazca/rl/aligner.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/rl/aligner.py	Mon Sep 03 10:17:09 2018 +0200
@@ -29,22 +29,25 @@
 ###############################################################################
 ### UTILITY FUNCTIONS #########################################################
 ###############################################################################
-def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
+def iter_aligned_pairs(refset, targetset, global_mat,
+                       global_matched, unique=True):
     """ Return the aligned pairs
     """
     if unique:
         for refid in global_matched:
-            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
+            bestid, _ = sorted(global_matched[refid], key=lambda x: x[1])[0]
             ref_record = refset[refid]
             target_record = targetset[bestid]
-            distance = global_mat[refid, bestid] if global_mat is not None else None
+            distance = global_mat[refid,
+                                  bestid] if global_mat is not None else None
             yield (ref_record[0], refid), (target_record[0], bestid), distance
     else:
         for refid in global_matched:
             for targetid, _ in global_matched[refid]:
                 ref_record = refset[refid]
                 target_record = targetset[targetid]
-                distance = global_mat[refid, targetid] if global_mat is not None else None
+                distance = global_mat[refid,
+                                      targetid] if global_mat is not None else None
                 yield (ref_record[0], refid), (target_record[0], targetid), distance
 
 
@@ -95,10 +98,13 @@
         matrices are summed with their own weighting and the result is the global
         alignment matrix, which is returned.
         """
-        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
+        distmatrix = zeros(
+            (len(ref_indexes),
+             len(target_indexes)),
+            dtype='float32')
         for processing in self.processings:
             distmatrix += processing.cdist(refset, targetset,
-                                          ref_indexes, target_indexes)
+                                           ref_indexes, target_indexes)
         return distmatrix
 
     def threshold_matched(self, distmatrix):
@@ -116,7 +122,8 @@
             match[i].append((j, distmatrix[i, j]))
         return match
 
-    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
+    def _get_match(self, refset, targetset, ref_indexes=None,
+                   target_indexes=None):
         # Build items
         items = []
         ref_indexes = ref_indexes or xrange(len(refset))
@@ -129,7 +136,8 @@
         # Reapply matched to global indexes
         new_matched = {}
         for k, values in iteritems(matched):
-            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
+            new_matched[ref_indexes[k]] = [
+                (target_indexes[i], d) for i, d in values]
         return mat, new_matched
 
     def align(self, refset, targetset, get_matrix=True):
@@ -152,8 +160,9 @@
             self.nb_blocks += 1
             ref_index = [r[0] for r in refblock]
             target_index = [r[0] for r in targetblock]
-            self.nb_comparisons += len(ref_index)*len(target_index)
-            _, matched = self._get_match(refset, targetset, ref_index, target_index)
+            self.nb_comparisons += len(ref_index) * len(target_index)
+            _, matched = self._get_match(
+                refset, targetset, ref_index, target_index)
             for k, values in iteritems(matched):
                 subdict = global_matched.setdefault(k, set())
                 for v, d in values:
@@ -165,13 +174,16 @@
         self.time = time.time() - start_time
         return global_mat, global_matched
 
-    def get_aligned_pairs(self, refset, targetset, unique=True, use_distance=True):
+    def get_aligned_pairs(self, refset, targetset,
+                          unique=True, use_distance=True):
         """ Get the pairs of aligned elements
         """
         if not refset or not targetset:
             return
-        global_mat, global_matched = self.align(refset, targetset, get_matrix=use_distance)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
+        global_mat, global_matched = self.align(
+            refset, targetset, get_matrix=use_distance)
+        for pair in iter_aligned_pairs(
+                refset, targetset, global_mat, global_matched, unique):
             self.pairs_found += 1
             yield pair
         self.log_infos()
@@ -207,10 +219,10 @@
         return self.align(refset, targetset, get_matrix=get_matrix)
 
     def get_aligned_pairs_from_files(self, reffile, targetfile,
-                         ref_indexes=None, target_indexes=None,
-                         ref_encoding=None, target_encoding=None,
-                         ref_separator='\t', target_separator='\t',
-                         unique=True):
+                                     ref_indexes=None, target_indexes=None,
+                                     ref_encoding=None, target_encoding=None,
+                                     ref_separator='\t', target_separator='\t',
+                                     unique=True):
         """ Get the pairs of aligned elements
         """
         refset = parsefile(reffile, indexes=ref_indexes,
@@ -219,8 +231,10 @@
                               encoding=target_encoding, delimiter=target_separator)
         if not refset or not targetset:
             return
-        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
+        global_mat, global_matched = self.align(
+            refset, targetset, get_matrix=False)
+        for pair in iter_aligned_pairs(
+                refset, targetset, global_mat, global_matched, unique):
             yield pair
 
     def log_infos(self):
@@ -233,21 +247,21 @@
         self.logger.info('Alignments done : %s' % self.alignments_done)
         self.logger.info('Pairs found : %s' % self.pairs_found)
         self.logger.info('Ratio reference set/alignments done : %s'
-                         % (self.alignments_done/float(self.refset_size)))
+                         % (self.alignments_done / float(self.refset_size)))
         self.logger.info('Ratio target set/alignments done : %s'
-                         % (self.alignments_done/float(self.targetset_size)))
+                         % (self.alignments_done / float(self.targetset_size)))
         self.logger.info('Ratio reference set/pairs found : %s'
-                         % (self.pairs_found/float(self.refset_size)))
+                         % (self.pairs_found / float(self.refset_size)))
         self.logger.info('Ratio target set/pairs found : %s'
-                         % (self.pairs_found/float(self.targetset_size)))
+                         % (self.pairs_found / float(self.targetset_size)))
         self.logger.info('Maximum comparisons : %s'
                          % (self.refset_size * self.targetset_size))
         self.logger.info('Number of blocks : %s' % self.nb_blocks)
         if self.nb_blocks:
             self.logger.info('Ratio comparisons/block : %s'
-                             % (float(self.nb_comparisons)/self.nb_blocks))
+                             % (float(self.nb_comparisons) / self.nb_blocks))
         self.logger.info('Blocking reduction : %s'
-                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
+                         % (self.nb_comparisons / float(self.refset_size * self.targetset_size)))
 
 
 ###############################################################################
@@ -315,18 +329,18 @@
         self.logger.info('Alignments done : %s' % self.alignments_done)
         self.logger.info('Pairs found : %s' % self.pairs_found)
         self.logger.info('Ratio reference set/alignments done : %s'
-                         % (self.alignments_done/float(self.refset_size)))
+                         % (self.alignments_done / float(self.refset_size)))
         self.logger.info('Ratio target set/alignments done : %s'
-                         % (self.alignments_done/float(self.targetset_size)))
+                         % (self.alignments_done / float(self.targetset_size)))
         self.logger.info('Ratio reference set/pairs found : %s'
-                         % (self.pairs_found/float(self.refset_size)))
+                         % (self.pairs_found / float(self.refset_size)))
         self.logger.info('Ratio target set/pairs found : %s'
-                         % (self.pairs_found/float(self.targetset_size)))
+                         % (self.pairs_found / float(self.targetset_size)))
         self.logger.info('Maximum comparisons : %s'
                          % (self.refset_size * self.targetset_size))
         self.logger.info('Number of blocks : %s' % self.nb_blocks)
         if self.nb_blocks:
             self.logger.info('Ratio comparisons/block : %s'
-                             % (float(self.nb_comparisons)/self.nb_blocks))
+                             % (float(self.nb_comparisons) / self.nb_blocks))
         self.logger.info('Blocking reduction : %s'
-                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
+                         % (self.nb_comparisons / float(self.refset_size * self.targetset_size)))
--- a/nazca/rl/blocking.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/rl/blocking.py	Mon Sep 03 10:17:09 2018 +0200
@@ -44,6 +44,7 @@
     """ An abstract general blocking object that exposes
     the API that should be common to all blockings object
     """
+
     def __init__(self, ref_attr_index, target_attr_index):
         """ Build the blocking object
 
@@ -203,7 +204,8 @@
         the identifiers of the records of the both sets for this value.
     """
 
-    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
+    def __init__(self, ref_attr_index, target_attr_index,
+                 callback, ignore_none=False):
         super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
         self.callback = callback
         self.ignore_none = ignore_none
@@ -260,7 +262,8 @@
     """ This blocking technique is based on a a n-gram key.
     """
 
-    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
+    def __init__(self, ref_attr_index, target_attr_index,
+                 ngram_size=2, depth=2):
         super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
         self.ngram_size = ngram_size
         self.depth = depth
@@ -274,7 +277,7 @@
             cur_dict = cur_index
             text = r[attr_index]
             for i in range(self.depth):
-                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
+                ngram = text[i * self.ngram_size:(i + 1) * self.ngram_size]
                 if i < self.depth - 1:
                     cur_dict = cur_dict.setdefault(ngram, {})
             cur_dict.setdefault(ngram, []).append((ind, r[0]))
@@ -292,7 +295,8 @@
             if key in target_cur_dict:
                 if isinstance(sub_dict, dict):
                     # There is another dict layer
-                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
+                    for block1, block2 in self._iter_dict(
+                            sub_dict, target_cur_dict[key]):
                         yield block1, block2
                 else:
                     # This is a list
@@ -308,7 +312,8 @@
                           and containts the indexes of the record in the
                           corresponding dataset.
         """
-        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
+        for block1, block2 in self._iter_dict(
+                self.reference_index, self.target_index):
             if block1 and block2:
                 yield block1, block2
 
@@ -327,8 +332,13 @@
     (or blocking key), that will be used to divide the datasets.
     """
 
-    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
-        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
+    def __init__(self, ref_attr_index, target_attr_index,
+                 key_func=lambda x: x, window_width=20):
+        super(
+            SortedNeighborhoodBlocking,
+            self).__init__(
+            ref_attr_index,
+            target_attr_index)
         self.key_func = key_func
         self.window_width = window_width
         self.sorted_dataset = None
@@ -349,9 +359,9 @@
             # Only keep reference set record
             if dset == 1:
                 continue
-            block1 = [rid,]
+            block1 = [rid, ]
             minind = (ind - self.window_width)
-            minind = minind if minind >=0 else 0
+            minind = minind if minind >= 0 else 0
             maxind = (ind + self.window_width + 1)
             block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
                       if d == 1]
@@ -397,11 +407,14 @@
         """
         if self.ref_attr_index is not None:
             # Merge refset
-            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
-            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
+            self.merged_dataset = self._merge_dataset(
+                refset, self.ref_attr_index)
+            self.other_dataset = [(ind, r[0])
+                                  for ind, r in enumerate(targetset)]
         else:
             # Merge targetset
-            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
+            self.merged_dataset = self._merge_dataset(
+                targetset, self.target_attr_index)
             self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
 
     def _merge_dataset(self, dataset, attr_index):
@@ -417,7 +430,8 @@
                   and merged_dataset_dict[record[attr_index]][2] < score):
                 # Change current score
                 merged_dataset_dict[record[attr_index]] = (ind, record, score)
-        return [(ind, r[0]) for ind, r, score in itervalues(merged_dataset_dict)]
+        return [(ind, r[0])
+                for ind, r, score in itervalues(merged_dataset_dict)]
 
     def _iter_blocks(self):
         """ Iterator over the different possible blocks.
@@ -455,14 +469,15 @@
         """
         # If an element is None (missing), use instead the identity element.
         # The identity element is defined as the 0-vector
-        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
+        idelement = tuple(
+            [0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
         # We assume here that there are at least 2 elements in the refset
-        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
+        n_clusters = self.n_clusters or (len(refset) / 10 or len(refset) / 2)
         try:
-            kmeans =  self.cluster_class(n_clusters=n_clusters)
+            kmeans = self.cluster_class(n_clusters=n_clusters)
         except TypeError:
             # Try older API version of sklearn
-            kmeans =  self.cluster_class(k=n_clusters)
+            kmeans = self.cluster_class(k=n_clusters)
         kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
         self.kmeans = kmeans
         # Predict on targetset
@@ -479,7 +494,8 @@
                           and containts the indexes of the record in the
                           corresponding dataset.
         """
-        n_clusters = self.kmeans.n_clusters if hasattr(self.kmeans, 'n_clusters') else self.kmeans.k
+        n_clusters = self.kmeans.n_clusters if hasattr(
+            self.kmeans, 'n_clusters') else self.kmeans.k
         neighbours = [[[], []] for _ in xrange(n_clusters)]
         for ind, li in enumerate(self.predicted):
             neighbours[li][1].append(self.targetids[ind])
@@ -502,6 +518,7 @@
 class KdTreeBlocking(BaseBlocking):
     """ A blocking technique based on KdTree
     """
+
     def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
         super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
         self.threshold = threshold
@@ -514,15 +531,20 @@
         """
         firstelement = refset[0][self.ref_attr_index]
         self.nb_elements = len(refset)
-        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
+        idsize = len(firstelement) if isinstance(
+            firstelement, (tuple, list)) else 1
         idelement = (0,) * idsize
         # KDTree is expecting a two-dimensional array
         if idsize == 1:
-            self.reftree  = KDTree([(elt[self.ref_attr_index],) or idelement for elt in refset])
-            self.targettree = KDTree([(elt[self.target_attr_index],) or idelement for elt in targetset])
+            self.reftree = KDTree(
+                [(elt[self.ref_attr_index],) or idelement for elt in refset])
+            self.targettree = KDTree(
+                [(elt[self.target_attr_index],) or idelement for elt in targetset])
         else:
-            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
-            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
+            self.reftree = KDTree(
+                [elt[self.ref_attr_index] or idelement for elt in refset])
+            self.targettree = KDTree(
+                [elt[self.target_attr_index] or idelement for elt in targetset])
 
     def _iter_blocks(self):
         """ Iterator over the different possible blocks.
@@ -534,12 +556,13 @@
                           and containts the indexes of the record in the
                           corresponding dataset.
         """
-        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
+        extraneighbours = self.reftree.query_ball_tree(
+            self.targettree, self.threshold)
         neighbours = []
         for ind in xrange(self.nb_elements):
             if not extraneighbours[ind]:
                 continue
-            _ref = [self.refids[ind],]
+            _ref = [self.refids[ind], ]
             _target = [self.targetids[v] for v in extraneighbours[ind]]
             neighbours.append((_ref, _target))
         for block1, block2 in neighbours:
@@ -560,9 +583,14 @@
 class MinHashingBlocking(BaseBlocking):
     """ A blocking technique based on MinHashing
     """
+
     def __init__(self, ref_attr_index, target_attr_index,
                  threshold=0.1, kwordsgram=1, siglen=200):
-        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
+        super(
+            MinHashingBlocking,
+            self).__init__(
+            ref_attr_index,
+            target_attr_index)
         self.threshold = threshold
         self.kwordsgram = kwordsgram
         self.siglen = siglen
@@ -575,8 +603,9 @@
         # If an element is None (missing), use instead the identity element.
         idelement = ''
         self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
-                        [elt[self.target_attr_index] or idelement for elt in targetset],
-                        self.kwordsgram, self.siglen)
+                             [elt[self.target_attr_index]
+                                 or idelement for elt in targetset],
+                             self.kwordsgram, self.siglen)
         self.nb_elements = len(refset)
 
     def _iter_blocks(self):
@@ -595,7 +624,8 @@
             neighbours.append([[], []])
             for i in data:
                 if i >= self.nb_elements:
-                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
+                    neighbours[-1][1].append(self.targetids[i -
+                                                            self.nb_elements])
                 else:
                     neighbours[-1][0].append(self.refids[i])
             if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
@@ -633,7 +663,10 @@
 
     def _fit(self, refset, targetset):
         """ Internal fit of the pipeline """
-        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
+        self._recursive_fit(
+            refset, targetset, range(
+                len(refset)), range(
+                len(targetset)), 0)
 
     def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
         """ Recursive fit of the blockings.
@@ -649,8 +682,11 @@
                 ind_block1 = [ref_index[i] for i in block1]
                 ind_block2 = [target_index[i] for i in block2]
                 if self.collect_stats:
-                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
-                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
+                    self.stats.setdefault(
+                        ind, []).append(
+                        (len(block1), len(block2)))
+                self._recursive_fit(
+                    refset, targetset, ind_block1, ind_block2, ind + 1)
         else:
             # This is the final blocking
             blocking = self.blockings[ind]
@@ -661,7 +697,9 @@
                 ind_block1 = [(ref_index[i], _id) for i, _id in block1]
                 ind_block2 = [(target_index[i], _id) for i, _id in block2]
                 if self.collect_stats:
-                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
+                    self.stats.setdefault(
+                        ind, []).append(
+                        (len(block1), len(block2)))
                 self.stored_blocks.append((ind_block1, ind_block2))
 
     def _iter_blocks(self):
--- a/nazca/utils/dataio.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/utils/dataio.py	Mon Sep 03 10:17:09 2018 +0200
@@ -74,7 +74,9 @@
     req = cnx.request()
     return req
 
-def rqlquery(host, rql, indexes=None, formatopt=None, autocast_data=True, _cache_cnx={}, **kwargs):
+
+def rqlquery(host, rql, indexes=None, formatopt=None,
+             autocast_data=True, _cache_cnx={}, **kwargs):
     """ Run the rql query on the given cubicweb host
     Additional arguments can be passed to be properly substitued
     in the execute() function for appid accces.
@@ -87,9 +89,9 @@
         filehandle = urllib.urlopen('%(host)s/view?'
                                     'rql=%(rql)s&vid=csvexport'
                                     % {'rql': rql, 'host': host})
-        filehandle.readline()#Skip the first line
+        filehandle.readline()  # Skip the first line
         return parsefile(filehandle, delimiter=';', indexes=indexes,
-                         formatopt=formatopt, autocast_data=autocast_data);
+                         formatopt=formatopt, autocast_data=autocast_data)
     else:
         # By appid
         if host in _cache_cnx:
@@ -119,14 +121,17 @@
         except ValueError:
             # Bad json
             rawresults = sparql.query()
-            return json.loads(codecs.escape_decode(rawresults.response.read())[0])
+            return json.loads(codecs.escape_decode(
+                rawresults.response.read())[0])
     except Exception as err:
         if raise_on_error:
             raise RuntimeError('Error in sparql query', err)
         else:
             return []
 
-def sparqlquery(endpoint, query, indexes=None, autocast_data=True, raise_on_error=False):
+
+def sparqlquery(endpoint, query, indexes=None,
+                autocast_data=True, raise_on_error=False):
     """ Run the sparql query on the given endpoint, and wrap the items in the
     indexes form. If indexes is empty, keep raw output"""
     results = []
@@ -146,12 +151,14 @@
         else:
             for il, ind in enumerate(indexes):
                 if isinstance(ind, tuple):
-                    data.append(tuple([transform(raw[labels[i]]['value']) for i in ind]))
+                    data.append(
+                        tuple([transform(raw[labels[i]]['value']) for i in ind]))
                 else:
                     data.append(transform(raw[labels[il]]['value']))
         results.append(data)
     return results
 
+
 def sparqljson(endpoint, query, lang_order=('fr', 'en'), raise_on_error=False):
     """ Execute and format the results of a sparql query.
     Sort the litterals using lang_order.
@@ -172,9 +179,14 @@
                 data[k] = v['value']
             else:
                 # Literal - Use lang
-                data_lang.setdefault(k, []).append((v['value'], v.get('xml:lang')))
-    keyfunc = lambda x: lang_order.index(x[1]) if x[1] in lang_order else len(lang_order)
-    data.update(dict([(k, sorted(v, key=keyfunc)[0][0]) for k, v in iteritems(data_lang)]))
+                data_lang.setdefault(
+                    k, []).append(
+                    (v['value'], v.get('xml:lang')))
+
+    def keyfunc(x): return lang_order.index(
+        x[1]) if x[1] in lang_order else len(lang_order)
+    data.update(dict([(k, sorted(v, key=keyfunc)[0][0])
+                      for k, v in iteritems(data_lang)]))
     return data
 
 
@@ -222,12 +234,11 @@
             yield [cell.strip() for cell in row]
         csvfile.close()
 
-
     # Autocast if asked
     if autocast_data:
-        deffunc = lambda x: autocast(x, encoding)
+        def deffunc(x): return autocast(x, encoding)
     else:
-        deffunc = lambda x: x
+        def deffunc(x): return x
     result = []
     indexes = indexes or []
     formatopt = formatopt or {}
@@ -253,6 +264,7 @@
         result.append(data)
     return result
 
+
 def write_results(matched, alignset, targetset, resultfile):
     """ Given a matched dictionnay, an alignset and a targetset to the
         resultfile
@@ -266,12 +278,13 @@
                 alignid = alignset[aligned][0]
                 targetid = targetset[target][0]
                 fobj.write('%s;%s;%s\n' %
-                    (alignid.encode('utf-8') if isinstance(alignid, basestring)
-                                             else alignid,
-                     targetid.encode('utf-8') if isinstance(targetid, basestring)
-                                              else targetid,
-                     dist
-                     ))
+                           (alignid.encode('utf-8') if isinstance(alignid, basestring)
+                            else alignid,
+                            targetid.encode('utf-8') if isinstance(targetid, basestring)
+                            else targetid,
+                            dist
+                            ))
+
 
 def split_file(filename, outputdir, nblines=60000):
     """ Split `filename` into smaller files of ``nblines`` lines. Files are
@@ -321,7 +334,7 @@
                 uri, t = tindices[indice]
                 words = text[t.start:t.end]
                 fragment = self.pprint_entity(uri, words, **kwargs)
-                if not self.is_valid(newtext+fragment+text[t.end:]):
+                if not self.is_valid(newtext + fragment + text[t.end:]):
                     fragment = words
                 newtext += fragment
                 indice = t.end
@@ -369,7 +382,7 @@
     def is_valid(self, html):
         try:
             etree.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
-                          parser=etree.XMLParser(dtd_validation=True))
+                             parser=etree.XMLParser(dtd_validation=True))
         except etree.XMLSyntaxError:
             return False
         return True
--- a/nazca/utils/distances.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/utils/distances.py	Mon Sep 03 10:17:09 2018 +0200
@@ -17,7 +17,7 @@
 
 import difflib
 from functools import partial
-from math import cos, sqrt, pi #Needed for geographical distance
+from math import cos, sqrt, pi  # Needed for geographical distance
 try:
     from dateutil import parser as dateparser
     DATEUTIL_ENABLED = True
@@ -49,7 +49,10 @@
     """
     ref_indexes = ref_indexes or xrange(len(refset))
     target_indexes = target_indexes or xrange(len(targetset))
-    distmatrix = empty((len(ref_indexes), len(target_indexes)), dtype='float32')
+    distmatrix = empty(
+        (len(ref_indexes),
+         len(target_indexes)),
+        dtype='float32')
     size = distmatrix.shape
     for i, iref in enumerate(ref_indexes):
         for j, jref in enumerate(target_indexes):
@@ -57,10 +60,11 @@
             if refset[iref] and targetset[jref]:
                 d = distance_callback(refset[iref], targetset[jref])
                 if matrix_normalized:
-                    d = 1 - (1.0/(1.0 + d))
+                    d = 1 - (1.0 / (1.0 + d))
             distmatrix[i, j] = d
     return distmatrix
 
+
 def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
     """ Compute the matrix of distances between all tokens of stra and strb
         (with function ``distance``). Extra args are given to the distance
@@ -86,18 +90,20 @@
 
     toka = tokenize(stra, tokenizer)
     tokb = tokenize(strb, tokenizer)
-    # If not same number of tokens, complete the smallest list with empty strings
+    # If not same number of tokens, complete the smallest list with empty
+    # strings
     if len(toka) != len(tokb):
-        mint = toka if len(toka)<len(tokb) else tokb
-        maxt = toka if len(toka)>len(tokb) else tokb
-        mint.extend(['' for i in range(len(maxt)-len(mint))])
+        mint = toka if len(toka) < len(tokb) else tokb
+        maxt = toka if len(toka) > len(tokb) else tokb
+        mint.extend(['' for i in range(len(maxt) - len(mint))])
 
     listmatrix = []
     for i in xrange(len(toka)):
-        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
+        listmatrix.append([distance(toka[i], tokb[j], **kwargs)
+                           for j in xrange(len(tokb))])
     m = matrix(listmatrix)
-    minlist = [m[i,:].min() for i in xrange(m.shape[0])]
-    minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
+    minlist = [m[i, :].min() for i in xrange(m.shape[0])]
+    minlist.extend([m[:, i].min() for i in xrange(m.shape[1])])
     return max(minlist)
 
 
@@ -110,7 +116,7 @@
     try:
         return abs(a - b)
     except TypeError:
-        #a and b may be strings
+        # a and b may be strings
         return abs(float(a) - float(b))
 
 
@@ -120,7 +126,8 @@
 def exact_match(a, b):
     """ The simplest distance, defined as 0 if both values are equal, 1 elsewise.
     """
-    return 0 if a==b else 1
+    return 0 if a == b else 1
+
 
 def levenshtein(stra, strb, tokenizer=None):
     """ Compute the Levenshtein distance between stra and strb.
@@ -141,7 +148,7 @@
     onerowago = None
     thisrow = range(1, lenb + 1) + [0]
     for x in xrange(len(stra)):
-        onerowago, thisrow = thisrow, [0]*lenb + [x+1]
+        onerowago, thisrow = thisrow, [0] * lenb + [x + 1]
         for y in xrange(lenb):
             delcost = onerowago[y] + 1
             addcost = thisrow[y - 1] + 1
@@ -149,6 +156,7 @@
             thisrow[y] = min(delcost, addcost, subcost)
     return thisrow[lenb - 1]
 
+
 def soundexcode(word, language='french'):
     """ Return the Soundex code of the word ``word``
         For more information about soundex code see wiki_
@@ -162,7 +170,7 @@
     """
 
     vowels = 'AEHIOUWY'
-    if language.lower() == 'french' :
+    if language.lower() == 'french':
         consonnantscode = {'B': '1', 'P': '1',
                            'C': '2', 'K': '2', 'Q': '2',
                            'D': '3', 'T': '3',
@@ -172,7 +180,7 @@
                            'G': '7', 'J': '7',
                            'X': '8', 'Z': '8', 'S': '8',
                            'F': '9', 'V': '9'
-                          }
+                           }
     elif language.lower() == 'english':
         consonnantscode = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
                            'C': '2', 'G': '2', 'J': '2', 'K': '2',
@@ -181,14 +189,14 @@
                            'L': '4',
                            'M': '5', 'N': '5',
                            'R': '6'
-                          }
+                           }
     else:
         raise NotImplementedError('Soundex code is not supported (yet ?) for'
                                   'this language (%s). '
                                   'Supported languages are french and english' % language)
     word = word.strip().upper()
     code = word[0]
-    #After this ``for`` code is
+    # After this ``for`` code is
     # the first letter of ``word`` followed by all the consonnants of word,
     # where from consecutive consonnants, only the first is kept,
     # and from two identical consonnants separated by a W or a H, only the first
@@ -206,18 +214,22 @@
         if len(code) > 4:
             break
 
-    #Replace according to the codes
+    # Replace according to the codes
     code = code[0] + ''.join([consonnantscode[c] for c in code[1:]])
-    ###First four letters, completed by zeros
-    return code[:4] + '0'*(4 - len(code))
+    # First four letters, completed by zeros
+    return code[:4] + '0' * (4 - len(code))
+
 
 def soundex(stra, strb, language='french', tokenizer=None):
     """ Return the 1/0 distance between the soundex code of stra and strb.
         0 means they have the same code, 1 they don't
     """
     if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
-    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) else 1
+        return _handlespaces(stra, strb, soundex,
+                             tokenizer=tokenizer, language=language)
+    return 0 if (soundexcode(stra, language) ==
+                 soundexcode(strb, language)) else 1
+
 
 def jaccard(stra, strb, tokenizer=None):
     """ Return the jaccard distance between stra and strb, condering the tokens
@@ -231,13 +243,15 @@
     setb = set(tokenize(strb, tokenizer))
     return generic_jaccard(seta, setb)
 
+
 def generic_jaccard(seta, setb):
     """ Return the jaccard distance between two sets A and B.
 
         J(A, B) = (A \cap B)/(A \cup B)
         d(A, B) = 1 - J(A, B)
     """
-    return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
+    return 1.0 - 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
+
 
 def difflib_match(stra, strb):
     """ Approximate matching.
@@ -263,9 +277,9 @@
 
         HMS = [(u'h', u'heure', u'heures'),
                (u'm', u'minute', u'minutes'),
-                    (u's', u'seconde', u'seconde'),]
+               (u's', u'seconde', u'seconde'), ]
         JUMP = [u' ', u'.', u',', u';', u'-', u'/', u"'",
-               u'a', u'le', u'et', u'er']
+                u'a', u'le', u'et', u'er']
         MONTHS = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'),
                   (u'Mar', u'Mars'), (u'Avr', u'Avril'), (u'Mai', u'Mai'),
                   (u'Jun', u'Juin'), (u'Jui', u'Juillet'),
@@ -296,14 +310,14 @@
         """
 
         datea = dateparser.parse(stra, parserinfo=parserinfo(dayfirst,
-                                 yearfirst), fuzzy=True)
+                                                             yearfirst), fuzzy=True)
         dateb = dateparser.parse(strb, parserinfo=parserinfo(dayfirst,
-                                 yearfirst), fuzzy=True)
+                                                             yearfirst), fuzzy=True)
         diff = datea - dateb
         if granularity.lower() == 'years':
-            return abs(diff.days/365.25)
+            return abs(diff.days / 365.25)
         if granularity.lower() == 'months':
-            return abs(diff.days/30.5)
+            return abs(diff.days / 30.5)
         return abs(diff.days)
 
 
@@ -330,19 +344,20 @@
 
     difflat = pointa[0] - pointb[0]
     difflong = pointa[1] - pointb[1]
-    meanlat = (pointa[0] + pointb[0])/2.0
+    meanlat = (pointa[0] + pointb[0]) / 2.0
 
     if not in_radians:
-        difflat *= pi/180.0
-        difflong *= pi/180.0
-        meanlat *= pi/180.0
+        difflat *= pi / 180.0
+        difflong *= pi / 180.0
+        meanlat *= pi / 180.0
 
     coef = 1. if units == 'm' else 0.001
-    return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
+    return coef * planet_radius * \
+        sqrt(difflat**2 + (cos(meanlat) * difflong)**2)
 
 
 ###############################################################################
-### BASE PROCESSING ############################################################
+### BASE PROCESSING ######################################################
 ###############################################################################
 class BaseProcessing(object):
     """ A processing object used to provide an abstraction over the different
@@ -438,18 +453,18 @@
         """
         values = []
         for i in xrange(len(dataset)):
-            for j in xrange(i+1, len(dataset)):
+            for j in xrange(i + 1, len(dataset)):
                 d = 1
                 if dataset[i] and dataset[j]:
                     d = self.distance(dataset[i], dataset[j])
                     if self.matrix_normalized:
-                        d = 1 - (1.0/(1.0 + d))
+                        d = 1 - (1.0 / (1.0 + d))
                 values.append(d)
         return values
 
 
 ###############################################################################
-### CONCRETE PROCESSINGS #######################################################
+### CONCRETE PROCESSINGS #################################################
 ###############################################################################
 class ExactMatchProcessing(BaseProcessing):
     """ A processing based on the exact match (1 if a==b, 0 elsewise)
@@ -462,6 +477,7 @@
                                                    exact_match,
                                                    weight, matrix_normalized)
 
+
 class LevenshteinProcessing(BaseProcessing):
     """ A processing based on the levenshtein distance.
     """
@@ -471,9 +487,9 @@
         distance_callback = partial(levenshtein,
                                     tokenizer=tokenizer)
         super(LevenshteinProcessing, self).__init__(ref_attr_index,
-                                                   target_attr_index,
-                                                   distance_callback,
-                                                   weight,matrix_normalized)
+                                                    target_attr_index,
+                                                    distance_callback,
+                                                    weight, matrix_normalized)
 
 
 class GeographicalProcessing(BaseProcessing):
@@ -485,9 +501,9 @@
         distance_callback = partial(geographical, in_radians=in_radians,
                                     planet_radius=planet_radius, units=units)
         super(GeographicalProcessing, self).__init__(ref_attr_index,
-                                                    target_attr_index,
-                                                    distance_callback,
-                                                    weight, matrix_normalized)
+                                                     target_attr_index,
+                                                     distance_callback,
+                                                     weight, matrix_normalized)
 
 
 class SoundexProcessing(BaseProcessing):
@@ -496,7 +512,8 @@
 
     def __init__(self, ref_attr_index=None, target_attr_index=None,
                  tokenizer=None, weight=1, language='french', matrix_normalized=False):
-        distance_callback = partial(soundex, language=language, tokenizer=tokenizer)
+        distance_callback = partial(
+            soundex, language=language, tokenizer=tokenizer)
         super(SoundexProcessing, self).__init__(ref_attr_index,
                                                 target_attr_index,
                                                 distance_callback,
@@ -541,6 +558,6 @@
                                         parserinfo=parserinfo,
                                         dayfirst=dayfirst, yearfirst=yearfirst)
             super(TemporalProcessing, self).__init__(ref_attr_index,
-                                                    target_attr_index,
-                                                    distance_callback,
-                                                    weight, matrix_normalized)
+                                                     target_attr_index,
+                                                     distance_callback,
+                                                     weight, matrix_normalized)
--- a/nazca/utils/minhashing.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/utils/minhashing.py	Mon Sep 03 10:17:09 2018 +0200
@@ -39,10 +39,11 @@
     b = randint(1, bound)
 
     def hashfunc(x):
-        return ((a*x + b)%zr)
+        return ((a * x + b) % zr)
 
     return hashfunc
 
+
 def count_vectorizer_func(sentences, min_n, max_n):
     """ Perform a tokenization using scikit learn
     """
@@ -107,8 +108,8 @@
         """
         words = sentence.split(' ')
         for r in xrange(len(words)):
-            for width in range(1, k+1):
-                if r+width<=len(words):
+            for width in range(1, k + 1):
+                if r + width <= len(words):
                     yield ' '.join(words[r:r + width])
 
     def _buildmatrixdocument(self, sentences, k):
@@ -144,19 +145,19 @@
 
         nrows, ncols = shape
         sig = np.empty((siglen, nrows))
-        #Generate the random hash functions
+        # Generate the random hash functions
         hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
-        #Compute hashing values just for once.
-        #Avoid multiple recomputations for the same column.
+        # Compute hashing values just for once.
+        # Avoid multiple recomputations for the same column.
         hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
-                                for i in  xrange(siglen)])
+                               for i in xrange(siglen)])
 
         docind = 0
         while rows:
             doc = rows.pop(0)
-            #Concatenate the needed rows.
+            # Concatenate the needed rows.
             tmp = np.dstack([hashvalues[:, r] for r in doc])
-            #Take the mininum of hashes
+            # Take the mininum of hashes
             sig[:, docind] = np.min(tmp[0], 1)
             docind += 1
             if self._verbose and docind % 50000 == 0:
@@ -189,15 +190,15 @@
     def computebandsize(self, threshold, nbrows):
         """ Compute the bandsize according to the threshold given """
 
-        ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
-        ### bands, and r the number of rows per band. And nbrows (the length
-        ### of the matrix is nbrows = b*r, so t ~ (r/L)^(1/r). So, let's
-        ### find the root of f(x) = (x/L)^(1/r) - t.
+        # t ~ (1/b)^(1/r), where t is the threshold, b the number of
+        # bands, and r the number of rows per band. And nbrows (the length
+        # of the matrix is nbrows = b*r, so t ~ (r/L)^(1/r). So, let's
+        # find the root of f(x) = (x/L)^(1/r) - t.
         def f(x):
-            y = pow(x/nbrows, 1. /x) - threshold
+            y = pow(x / nbrows, 1. / x) - threshold
             return y
 
-        ## Solve f(x) = 0, with x having values in [1, nbrows]
+        # Solve f(x) = 0, with x having values in [1, nbrows]
         return int(bisect(f, 1, nbrows))
 
     def predict(self, threshold, minclustersize=2):
@@ -222,7 +223,7 @@
         for r in xrange(0, sig.shape[0], bandsize):
             buckets.clear()
             for i in xrange(sig.shape[1]):
-                buckets[tuple(sig[r:r+bandsize, i])].add(i)
+                buckets[tuple(sig[r:r + bandsize, i])].add(i)
             similars.update(set(tuple(v) for v in itervalues(buckets)
                                 if len(v) >= minclustersize))
         return similars
--- a/nazca/utils/normalize.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/utils/normalize.py	Mon Sep 03 10:17:09 2018 +0200
@@ -25,22 +25,22 @@
 
 
 FRENCH_STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
-u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
-u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
-u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
-u'elle', u'elles', u'en', u'encore', u'essai', u'est', u'et', u'eu', u'eux', u'fait',
-u'faites', u'fois', u'font', u'force', u'haut', u'hors', u'ici', u'il', u'ils',
-u'je', u'juste', u'la', u'le', u'les', u'leur', u'lui', u'là', u'ma', u'maintenant',
-u'mais', u'me', u'mes', u'moi', u'moins', u'mon', u'mot', u'même', u'ne',
-u'ni', u'nommés', u'nos',
-u'notre', u'nous', u'nouveaux', u'on', u'ou', u'où', u'par', u'parce', u'parole',
-u'pas', u'personnes', u'peut', u'peu', u'pièce', u'plupart', u'pour',
-u'pourquoi', u'quand', u'que', u'quel', u'quelle', u'quelles', u'quels', u'qui',
-u'sa', u'sans', u'se', u'ses', u'seulement', u'si', u'sien', u'son', u'sont', u'sous',
-u'soyez', u'sujet', u'sur', u'ta', u'tandis', u'tellement', u'te', u'tels', u'tes', u'toi',
-u'ton', u'tous', u'tout', u'trop', u'très', u'tu', u'un', u'une', u'valeur', u'voie',
-u'voient', u'vont', u'vos', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
-u'étions', u'été', u'être'])
+                        u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
+                        u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
+                        u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
+                        u'elle', u'elles', u'en', u'encore', u'essai', u'est', u'et', u'eu', u'eux', u'fait',
+                        u'faites', u'fois', u'font', u'force', u'haut', u'hors', u'ici', u'il', u'ils',
+                        u'je', u'juste', u'la', u'le', u'les', u'leur', u'lui', u'là', u'ma', u'maintenant',
+                        u'mais', u'me', u'mes', u'moi', u'moins', u'mon', u'mot', u'même', u'ne',
+                        u'ni', u'nommés', u'nos',
+                        u'notre', u'nous', u'nouveaux', u'on', u'ou', u'où', u'par', u'parce', u'parole',
+                        u'pas', u'personnes', u'peut', u'peu', u'pièce', u'plupart', u'pour',
+                        u'pourquoi', u'quand', u'que', u'quel', u'quelle', u'quelles', u'quels', u'qui',
+                        u'sa', u'sans', u'se', u'ses', u'seulement', u'si', u'sien', u'son', u'sont', u'sous',
+                        u'soyez', u'sujet', u'sur', u'ta', u'tandis', u'tellement', u'te', u'tels', u'tes', u'toi',
+                        u'ton', u'tous', u'tout', u'trop', u'très', u'tu', u'un', u'une', u'valeur', u'voie',
+                        u'voient', u'vont', u'vos', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
+                        u'étions', u'été', u'être'])
 
 MANUAL_UNICODE_MAP = {
     u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
@@ -51,14 +51,14 @@
     u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
     u'\xe6': u'ae',   # LATIN SMALL LETTER AE
     u'\xae': u'(r)',  # REGISTERED SIGN
-    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
-    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
+    u'\u0153': u'oe',  # LATIN SMALL LIGATURE OE
+    u'\u0152': u'OE',  # LATIN CAPITAL LIGATURE OE
     u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
     u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
     u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
     u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
     u'\u2019': u"'",  # RIGHT SINGLE QUOTATION MARK
-    }
+}
 
 
 ###############################################################################
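
MANUAL_UNICODE_MAP lists the characters whose ASCII replacement cannot be recovered from Unicode decomposition alone (ligatures, sharp s, stroked o, angle quotes). The sketch below illustrates the general pattern such a table complements; to_ascii() and the shortened map are hypothetical stand-ins, not the module's unormalize()/lunormalize() shown in the next hunk:

# -*- coding: utf-8 -*-
import unicodedata

MANUAL_MAP = {u'\u0153': u'oe', u'\u0152': u'OE',
              u'\xdf': u'ss', u'\xd8': u'O', u'\xf8': u'o'}


def to_ascii(text, substitute=u'?'):
    out = []
    for char in text:
        if char in MANUAL_MAP:
            out.append(MANUAL_MAP[char])
            continue
        # NFKD splits e.g. u'é' into u'e' plus a combining accent;
        # dropping the non-ASCII part keeps the base letter.
        base = unicodedata.normalize('NFKD', char)
        base = base.encode('ascii', 'ignore').decode('ascii')
        out.append(base if base else substitute)
    return u''.join(out)


print(to_ascii(u'cœur déjà Ø'))  # -> coeur deja O
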
@@ -89,16 +89,20 @@
                 replacement = letter
             if ord(replacement) >= 2 ** 7:
                 if substitute is None:
-                    raise ValueError("can't deal with non-ascii based characters")
+                    raise ValueError(
+                        "can't deal with non-ascii based characters")
                 replacement = substitute
         res.append(replacement)
     return u''.join(res)
 
+
 def lunormalize(sentence, substitute=None):
     """ Normalize a sentence (ie remove accents, set to lower, etc) """
-    return unormalize(sentence,substitute).lower()
+    return unormalize(sentence, substitute).lower()
+
 
-def simplify(sentence, lemmas=None, remove_stopwords=True, stopwords=FRENCH_STOPWORDS):
+def simplify(sentence, lemmas=None, remove_stopwords=True,
+             stopwords=FRENCH_STOPWORDS):
     """ Simply the given sentence
         0) If remove_stopwords, then remove the stop words
         1) If lemmas are given, the sentence is lemmatized
@@ -112,14 +116,16 @@
         sentence = lemmatized(sentence, lemmas)
     sentence = sentence.lower()
     cleansent = ''.join([s if s not in punctuation
-                           else ' ' for s in sentence]).strip()
-    #comma followed by a space is replaced by two spaces, keep only one
+                         else ' ' for s in sentence]).strip()
+    # a comma followed by a space turns into two spaces; keep only one
     cleansent = cleansent.replace('  ', ' ')
 
     if not remove_stopwords:
         return cleansent
     else:
-        return ' '.join([w for w in cleansent.split(' ') if w not in stopwords])
+        return ' '.join(
+            [w for w in cleansent.split(' ') if w not in stopwords])
+
 
 def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
     """ Tokenize a sentence.
@@ -137,12 +143,13 @@
         # Deals with '
         if "'" in chunk:
             schunks = chunk.split("'")
-            chunks.extend([c+"'" for c in schunks[:-1]])
+            chunks.extend([c + "'" for c in schunks[:-1]])
             chunks.append(schunks[-1])
         else:
             chunks.append(chunk)
     return chunks
 
+
 def lemmatized(sentence, lemmas, tokenizer=None):
     """ Return the lemmatized sentence
     """
@@ -153,7 +160,9 @@
             tokenized_sentformated[-1] += w
         elif w not in punctuation:
             tokenized_sentformated.append(w)
-    return u' '.join([lemmatized_word(w, lemmas) for w in tokenized_sentformated])
+    return u' '.join([lemmatized_word(w, lemmas)
+                      for w in tokenized_sentformated])
+
 
 def lemmatized_word(word, lemmas):
     """ Return the lemmatized word
@@ -167,6 +176,7 @@
             lemma = _words[0]
     return lemma
 
+
 def roundstr(number, ndigits=0):
     """Return an unicode string of ``number`` rounded to a given precision
         in decimal digits (default 0 digits)
@@ -176,6 +186,7 @@
     """
     return format(round(float(number), ndigits), '0.%df' % ndigits)
 
+
 def rgxformat(string, regexp, output):
     """ Apply the regexp to the ``string`` and return a formatted string
     according to ``output``
@@ -217,7 +228,8 @@
         """
         self.callback = callback
         if attr_index is not None:
-            self.attr_index = attr_index if isinstance(attr_index, (tuple, list)) else (attr_index,)
+            self.attr_index = attr_index if isinstance(
+                attr_index, (tuple, list)) else (attr_index,)
         else:
             self.attr_index = attr_index
 
@@ -238,7 +250,7 @@
         else:
             for attr_ind in self.attr_index:
                 record = list(r if ind != attr_ind else self.callback(r)
-                               for ind, r in enumerate(record))
+                              for ind, r in enumerate(record))
             return record
 
     def normalize_dataset(self, dataset, inplace=False):
@@ -268,9 +280,14 @@
     """ Normalizer that unormalize the unicode
     (i.e. replace accentuating characters by ASCII ones)
     """
+
     def __init__(self, attr_index=None, substitute=None):
         callback = partial(lunormalize, substitute=substitute)
-        super(UnicodeNormalizer, self).__init__(callback, attr_index=attr_index)
+        super(
+            UnicodeNormalizer,
+            self).__init__(
+            callback,
+            attr_index=attr_index)
 
 
 class SimplifyNormalizer(BaseNormalizer):
@@ -280,9 +297,17 @@
         2) Set the sentence to lower case
         3) Remove punctuation
     """
+
     def __init__(self, attr_index=None, lemmas=None, remove_stopwords=True):
-        callback = partial(simplify, lemmas=lemmas, remove_stopwords=remove_stopwords)
-        super(SimplifyNormalizer, self).__init__(callback, attr_index=attr_index)
+        callback = partial(
+            simplify,
+            lemmas=lemmas,
+            remove_stopwords=remove_stopwords)
+        super(
+            SimplifyNormalizer,
+            self).__init__(
+            callback,
+            attr_index=attr_index)
 
 
 class TokenizerNormalizer(BaseNormalizer):
@@ -291,17 +316,28 @@
         in case of failure, it just split on spaces.
         Anyway, tokenizer must have a ``tokenize()`` method
     """
-    def __init__(self, attr_index=None, tokenizer=None, regexp=re.compile(r"[^\s]+")):
+
+    def __init__(self, attr_index=None, tokenizer=None,
+                 regexp=re.compile(r"[^\s]+")):
         callback = partial(tokenize, tokenizer=tokenizer, regexp=regexp)
-        super(TokenizerNormalizer, self).__init__(callback, attr_index=attr_index)
+        super(
+            TokenizerNormalizer,
+            self).__init__(
+            callback,
+            attr_index=attr_index)
 
 
 class LemmatizerNormalizer(BaseNormalizer):
     """ Normalizer that lemmatize a string
     """
+
     def __init__(self, lemmas, attr_index=None, tokenizer=None):
         callback = partial(lemmatized, lemmas=lemmas, tokenizer=tokenizer)
-        super(LemmatizerNormalizer, self).__init__(callback, attr_index=attr_index)
+        super(
+            LemmatizerNormalizer,
+            self).__init__(
+            callback,
+            attr_index=attr_index)
 
 
 class RoundNormalizer(BaseNormalizer):
@@ -312,6 +348,7 @@
     If ``number`` is not a float, this method casts it to a float. (An
     exception may be raised if it's not possible)
     """
+
     def __init__(self, attr_index=None, ndigits=0):
         callback = partial(roundstr, ndigits=ndigits)
         super(RoundNormalizer, self).__init__(callback, attr_index=attr_index)
@@ -332,6 +369,7 @@
 
      would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
     """
+
     def __init__(self, regexp, output, attr_index=None):
         callback = partial(rgxformat, regexp=regexp, output=output)
         super(RegexpNormalizer, self).__init__(callback, attr_index=attr_index)
@@ -344,6 +382,7 @@
     """Normalizer that join multiple fields in only one.
     This new field will be put at the end of the new record.
     """
+
     def __init__(self, attr_indexes, join_car=', '):
         self.attr_indexes = attr_indexes
         self.join_car = join_car
@@ -360,8 +399,10 @@
 
         record: the normalized record.
         """
-        _record = [r for ind, r in enumerate(record) if ind not in self.attr_indexes]
-        _record.append(self.join_car.join([r for ind, r in enumerate(record) if ind in self.attr_indexes]))
+        _record = [r for ind, r in enumerate(
+            record) if ind not in self.attr_indexes]
+        _record.append(self.join_car.join(
+            [r for ind, r in enumerate(record) if ind in self.attr_indexes]))
         return _record
 
 
--- a/nazca/utils/tokenizer.py	Tue Aug 21 15:50:28 2018 +0200
+++ b/nazca/utils/tokenizer.py	Mon Sep 03 10:17:09 2018 +0200
@@ -44,11 +44,13 @@
         indice = 0
         while indice < len(words):
             # Choose the current sentence of the first word
-            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
+            current_sentence = [
+                s for s in sentences if s.start <= words[indice].start()][-1]
             # Sliding windows over the different words for each sentence
             remaining = len(words) - indice
-            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
-                _words = words[indice:indice+length]
+            for length in range(
+                    min(self.token_max_size, remaining), self.token_min_size - 1, -1):
+                _words = words[indice:indice + length]
                 if _words[-1].start() > current_sentence.end:
                     # The last word is not in the same sentence anymore, split
                     continue