author Simon Chabot Wed, 17 Oct 2012 12:34:28 +0200 changeset 12 9ef85db30fb5 parent 11 c4d63951cc16 child 13 2f19c16c3610
[distances] Add jaccard distance (related #128982)
 distances.py file | annotate | diff | comparison | revisions test/test_alignment.py file | annotate | diff | comparison | revisions
```--- a/distances.py	Wed Oct 17 12:05:02 2012 +0200
+++ b/distances.py	Wed Oct 17 12:34:28 2012 +0200
@@ -83,3 +83,17 @@
1 means they have the same code, 0 they don't
"""
return 1 if soundexcode(stra, language) == soundexcode(strb, language) else 0
+
+def jaccard(stra, strb):
+    """ Return the jaccard distance between stra and strb, considering the
+    sets of letters of stra and strb
+
+    J(A, B) = |A \cap B| / |A \cup B|
+    d(A, B) = 1 - J(A, B)
+    """
+
+    seta = set(stra)
+    setb = set(strb)
+
+    jab = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
+    return 1.0 - jab```
```--- a/test/test_alignment.py	Wed Oct 17 12:05:02 2012 +0200
+++ b/test/test_alignment.py	Wed Oct 17 12:34:28 2012 +0200
@@ -38,7 +38,8 @@
"""

from cubicweb.devtools import testlib
-from cubes.alignment.distances import (levenshtein, soundex, soundexcode)
+from cubes.alignment.distances import (levenshtein, soundex, soundexcode, \
+                                       jaccard)

class DistancesTest(testlib.CubicWebTC):
def test_levenshtein(self):
@@ -73,6 +74,15 @@
self.assertEqual(soundex('Rubert', 'Robert', 'english'), 1)
self.assertEqual(soundex('Rubin', 'Robert', 'english'), 0)

+    def test_jaccard(self):
+        #The jaccard index between two words is the ratio of the number of
+        #letters they share to the number of distinct letters in both words
+        #Each letter is counted once only
+        #The distance is 1 - jaccard_index
+
+        self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
+        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 0.166, 2)
+        self.assertAlmostEqual(jaccard('rubert', 'robert'), 0.333, 2)

if __name__ == '__main__':
from logilab.common.testlib import unittest_main```