Provides Processings for all the distances, closes #248557
author Vincent Michel <vincent.michel@logilab.fr>
Fri, 23 May 2014 15:16:33 +0200
changeset 439 b5371dba546e
parent 438 5689a4cc4915
child 440 56114a122a56
Provides Processings for all the distances, closes #248557
test/test_distances.py
utils/distances.py
--- a/test/test_distances.py	Mon Jun 09 13:51:36 2014 +0000
+++ b/test/test_distances.py	Fri May 23 15:16:33 2014 +0200
@@ -28,7 +28,8 @@
 from nazca.utils.distances import (levenshtein, soundex, soundexcode,
                                    difflib_match,
                                    jaccard, euclidean, geographical,
-                                   LevenshteinProcessing, SoundexProcessing)
+                                   LevenshteinProcessing, SoundexProcessing,
+                                   JaccardProcessing, DifflibProcessing, TemporalProcessing)
 
 
 class DistancesTest(unittest.TestCase):
@@ -176,6 +177,37 @@
         self.assertEqual([0, 1, 1], pdist)
 
 
+class JaccardTestCase(unittest.TestCase):
+
+    def test_pdist(self):
+        processing = JaccardProcessing()
+        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
+        pdist = processing.pdist(_input)
+        results = [0.666, 1, 0.666]
+        for ind, value in enumerate(pdist):
+            self.assertAlmostEqual(results[ind], value, 2)
+
+
+class DifflibTestCase(unittest.TestCase):
+
+    def test_pdist(self):
+        processing = DifflibProcessing()
+        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
+        pdist = processing.pdist(_input)
+        results = [0.099, 0.238, 0.14]
+        for ind, value in enumerate(pdist):
+            self.assertAlmostEqual(results[ind], value, 2)
+
+
+class TemporalTestCase(unittest.TestCase):
+
+    def test_pdist(self):
+        processing = TemporalProcessing()
+        _input = ['14 aout 1991', '08/14/1991', '08/15/1992']
+        pdist = processing.pdist(_input)
+        self.assertEqual([0., 367, 367], pdist)
+
+
 if __name__ == '__main__':
     unittest.main()
 
--- a/utils/distances.py	Mon Jun 09 13:51:36 2014 +0000
+++ b/utils/distances.py	Fri May 23 15:16:33 2014 +0200
@@ -478,3 +478,45 @@
                                                 target_attr_index,
                                                 distance_callback,
                                                 weight, matrix_normalized)
+
+
+class JaccardProcessing(BaseProcessing):
+    """ A processing based on the jaccard distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 tokenizer=None, weight=1, matrix_normalized=False):
+        distance_callback = partial(jaccard, tokenizer=tokenizer)
+        super(JaccardProcessing, self).__init__(ref_attr_index,
+                                                target_attr_index,
+                                                distance_callback,
+                                                weight, matrix_normalized)
+
+
+class DifflibProcessing(BaseProcessing):
+    """ A processing based on the difflib distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 weight=1, matrix_normalized=False):
+        super(DifflibProcessing, self).__init__(ref_attr_index,
+                                                target_attr_index,
+                                                difflib_match,
+                                                weight, matrix_normalized)
+
+
+class TemporalProcessing(BaseProcessing):
+    """ A processing based on the temporal distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 granularity=u'days', parserinfo=FrenchParserInfo,
+                 dayfirst=True, yearfirst=False,
+                 weight=1, matrix_normalized=False):
+        distance_callback = partial(temporal, granularity=granularity,
+                                    parserinfo=parserinfo,
+                                    dayfirst=dayfirst, yearfirst=yearfirst)
+        super(TemporalProcessing, self).__init__(ref_attr_index,
+                                                target_attr_index,
+                                                distance_callback,
+                                                weight, matrix_normalized)