[distances] Add jaccard distance (related #128982)
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 17 Oct 2012 12:34:28 +0200
changeset 12 9ef85db30fb5
parent 11 c4d63951cc16
child 13 2f19c16c3610
[distances] Add jaccard distance (related #128982)
distances.py
test/test_alignment.py
--- a/distances.py	Wed Oct 17 12:05:02 2012 +0200
+++ b/distances.py	Wed Oct 17 12:34:28 2012 +0200
@@ -83,3 +83,17 @@
         1 means they have the same code, 0 they don't
     """
     return 1 if soundexcode(stra, language) == soundexcode(strb, language) else 0
+
+def jaccard(stra, strb):
+    """ Return the jaccard distance between stra and strb, considering the
+    sets of letters of stra and strb
+
+    J(A, B) = |A \cap B| / |A \cup B|
+    d(A, B) = 1 - J(A, B)
+    """
+
+    seta = set(stra)
+    setb = set(strb)
+
+    jab = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
+    return 1.0 - jab
--- a/test/test_alignment.py	Wed Oct 17 12:05:02 2012 +0200
+++ b/test/test_alignment.py	Wed Oct 17 12:34:28 2012 +0200
@@ -38,7 +38,8 @@
 """
 
 from cubicweb.devtools import testlib
-from cubes.alignment.distances import (levenshtein, soundex, soundexcode)
+from cubes.alignment.distances import (levenshtein, soundex, soundexcode, \
+                                       jaccard)
 
 class DistancesTest(testlib.CubicWebTC):
     def test_levenshtein(self):
@@ -73,6 +74,15 @@
         self.assertEqual(soundex('Rubert', 'Robert', 'english'), 1)
         self.assertEqual(soundex('Rubin', 'Robert', 'english'), 0)
 
+    def test_jaccard(self):
+        #The Jaccard index between two words is the ratio of the number of
+        #letters they share to the number of distinct letters found in either
+        #Each letter is counted only once
+        #The distance is 1 - jaccard_index
+
+        self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
+        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 0.166, 2)
+        self.assertAlmostEqual(jaccard('rubert', 'robert'), 0.333, 2)
 
 if __name__ == '__main__':
     from logilab.common.testlib import unittest_main