[distances] add a `SoundexProcessing`, related to #234919
author Katia Saurfelt <katia.saurfelt@logilab.fr>
Tue, 08 Apr 2014 11:54:46 +0000
changeset 417 96fc22a46fad
parent 415 dd4a0f979759
child 419 c8d11a09128a
[distances] add a `SoundexProcessing`, related to #234919
test/test_distances.py
utils/distances.py
--- a/test/test_distances.py	Mon Apr 07 17:13:27 2014 +0200
+++ b/test/test_distances.py	Tue Apr 08 11:54:46 2014 +0000
@@ -28,7 +28,7 @@
 from nazca.utils.distances import (levenshtein, soundex, soundexcode,
                                    difflib_match,
                                    jaccard, euclidean, geographical,
-                                   LevenshteinProcessing)
+                                   LevenshteinProcessing, SoundexProcessing)
 
 
 class DistancesTest(unittest.TestCase):
@@ -135,7 +135,7 @@
         self.assertAlmostEqual(dist_parislondon, 341564, 0)
 
 
-class MatrixTestCase(unittest.TestCase):
+class LevenshteinTestCase(unittest.TestCase):
 
     def setUp(self):
         self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
@@ -167,6 +167,15 @@
         self.assertEqual([6, 6, 1], pdist)
 
 
+class SoundexTestCase(unittest.TestCase):
+
+    def test_pdist(self):
+        processing = SoundexProcessing()
+        _input = [u'Robert Ugo', u'Rubert Ugo', 'Rubert Pugo']
+        pdist = processing.pdist(_input)
+        self.assertEqual([0, 1, 1], pdist)
+
+
 if __name__ == '__main__':
     unittest.main()
 
--- a/utils/distances.py	Mon Apr 07 17:13:27 2014 +0200
+++ b/utils/distances.py	Tue Apr 08 11:54:46 2014 +0000
@@ -212,9 +212,7 @@
     """
     if ' ' in stra or ' ' in strb:
         return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
-
-    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
-             else 1
+    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) else 1
 
 def jaccard(stra, strb, tokenizer=None):
     """ Return the jaccard distance between stra and strb, condering the tokens
@@ -466,4 +464,16 @@
         super(GeographicalProcessing, self).__init__(ref_attr_index,
                                                     target_attr_index,
                                                     distance_callback,
-                                                    weight,matrix_normalized)
+                                                    weight, matrix_normalized)
+
+class SoundexProcessing(BaseProcessing):
+    """ A processing based on the soundex distance.
+    """
+
+    def __init__(self, ref_attr_index=None, target_attr_index=None,
+                 tokenizer=None, weight=1, language='french', matrix_normalized=False):
+        distance_callback = partial(soundex, language=language, tokenizer=tokenizer)
+        super(SoundexProcessing, self).__init__(ref_attr_index,
+                                                target_attr_index,
+                                                distance_callback,
+                                                weight, matrix_normalized)