[blocking] Add a NGram blocking technique, for blocking on the N first letters of a key, see #182023
authorVincent Michel <vincent.michel@logilab.fr>
Tue, 08 Oct 2013 13:10:21 +0000
changeset 302 8170c858e303
parent 301 50a25080aa33
child 303 52ba47aa143f
[blocking] Add a NGram blocking technique, for blocking on the N first letters of a key, see #182023
blocking.py
test/test_blocking.py
--- a/blocking.py	Mon Oct 14 16:29:16 2013 +0000
+++ b/blocking.py	Tue Oct 08 13:10:21 2013 +0000
@@ -167,6 +167,66 @@
 
 
 ###############################################################################
+### BIGRAM BLOCKING ###########################################################
+###############################################################################
+class NGramBlocking(BaseBlocking):
+    """ This blocking technique is based on a a n-gram key.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
+        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.ngram_size = ngram_size
+        self.depth = depth
+        self.reference_index = {}
+        self.target_index = {}
+
+    def _fit_dataset(self, dataset, cur_index, attr_index):
+        """ Fit a dataset
+        """
+        for r in dataset:
+            cur_dict = cur_index
+            text = r[attr_index]
+            for i in range(self.depth):
+                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
+                if i < self.depth - 1:
+                    cur_dict = cur_dict.setdefault(ngram, {})
+            cur_dict.setdefault(ngram, []).append(r[0])
+
+    def _fit(self, refset, targetset):
+        """ Fit the two sets (reference set and target set)
+        """
+        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
+        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
+
+    def _iter_dict(self, ref_cur_dict, target_cur_dict):
+        """ Iterative function used to create blocks from dicts
+        """
+        for key, sub_dict in ref_cur_dict.iteritems():
+            if key in target_cur_dict:
+                if isinstance(sub_dict, dict):
+                    # There is another dict layer
+                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
+                        yield block1, block2
+                else:
+                    # This is a list
+                    yield sub_dict, target_cur_dict[key]
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and containts the indexes of the record in the
+                          corresponding dataset.
+        """
+        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
+            if block1 and block2:
+                yield block1, block2
+
+
+###############################################################################
 ### SORTKEY BLOCKING ##########################################################
 ###############################################################################
 class SortedNeighborhoodBlocking(BaseBlocking):
--- a/test/test_blocking.py	Mon Oct 14 16:29:16 2013 +0000
+++ b/test/test_blocking.py	Tue Oct 08 13:10:21 2013 +0000
@@ -24,6 +24,7 @@
 from nazca.distances import (levenshtein, soundex, soundexcode,   \
                              jaccard, euclidean, geographical)
 from nazca.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
+                            NGramBlocking,
                             SoundexBlocking, KmeansBlocking,
                             MinHashingBlocking, KdTreeBlocking)
 from nazca.normalize import SimplifyNormalizer, loadlemmas
@@ -96,6 +97,17 @@
             self.assertIn(pair, pairs)
 
 
+class NGramBlockingTest(unittest2.TestCase):
+
+    def test_keyblocking_blocks(self):
+        blocking = NGramBlocking(ref_attr_index=1, target_attr_index=1)
+        blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
+        blocks = list(blocking.iter_blocks())
+        self.assertIn((['a3'], ['b1', 'b2']), blocks)
+        self.assertIn((['a5'], ['b4']), blocks)
+        self.assertIn((['a1', 'a4'], ['b3']), blocks)
+
+
 class SortedNeighborhoodBlockingTest(unittest2.TestCase):
 
     def test_sorted_neighborhood_blocks(self):