[distances] Add an ExactMatchProcessing, closes #252734
author Vincent Michel <vincent.michel@logilab.fr>
Tue, 27 May 2014 15:13:22 +0200
changeset 441 99863e609845
parent 440 56114a122a56
child 442 1615cd653f1d
[distances] Add an ExactMatchProcessing, closes #252734
test/test_distances.py
utils/distances.py
--- a/test/test_distances.py	Mon May 26 17:21:21 2014 +0200
+++ b/test/test_distances.py	Tue May 27 15:13:22 2014 +0200
@@ -28,6 +28,7 @@
 from nazca.utils.distances import (levenshtein, soundex, soundexcode,
                                    difflib_match,
                                    jaccard, euclidean, geographical,
+                                   ExactMatchProcessing,
                                    LevenshteinProcessing, SoundexProcessing,
                                    JaccardProcessing, DifflibProcessing, TemporalProcessing)
 
@@ -136,6 +137,15 @@
         self.assertAlmostEqual(dist_parislondon, 341564, 0)
 
 
+class ExactMatchTestCase(unittest.TestCase):
+
+    def test_pdist(self):
+        processing = ExactMatchProcessing()
+        _input = ['Victor Hugo', 'Victo Hugo', 'Victor Hugo']
+        pdist = processing.pdist(_input)
+        self.assertEqual([1, 0., 1], pdist)  # presumably pairs (0,1), (0,2), (1,2) -- TODO confirm pdist ordering
+
+
 class LevenshteinTestCase(unittest.TestCase):
 
     def setUp(self):
--- a/utils/distances.py	Mon May 26 17:21:21 2014 +0200
+++ b/utils/distances.py	Tue May 27 15:13:22 2014 +0200
@@ -117,6 +117,11 @@
 ###############################################################################
 ### STRING DISTANCES ##########################################################
 ###############################################################################
+def exact_match(a, b):
+    """ The simplest distance, defined as 0 if both values are equal, 1 otherwise.
+    """
+    return 0 if a==b else 1
+
 def levenshtein(stra, strb, tokenizer=None):
     """ Compute the Levenshtein distance between stra and strb.
 
@@ -440,6 +445,17 @@
 ###############################################################################
 ### CONCRETE PROCESSINGS #######################################################
 ###############################################################################
+class ExactMatchProcessing(BaseProcessing):
+    """ A processing based on the exact match distance (0 if a==b, 1 otherwise)
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 tokenizer=None, weight=1, matrix_normalized=False):  # NOTE: tokenizer is accepted but not forwarded
+        super(ExactMatchProcessing, self).__init__(ref_attr_index,
+                                                   target_attr_index,
+                                                   exact_match,
+                                                   weight, matrix_normalized)
+
 class LevenshteinProcessing(BaseProcessing):
     """ A processing based on the levenshtein distance.
     """