[distances] For the jaccard distance, consider the set of tokens
author Simon Chabot <simon.chabot@logilab.fr>
Tue, 06 Nov 2012 10:48:06 +0100
changeset 66 221a299d9fb8
parent 65 5efd92896bbb
child 67 e37a33ec5b41
[distances] For the jaccard distance, consider the set of tokens. Considering the set of tokens instead of letters is much more accurate. E.g., before this changeset, the jaccard distance between “silence” and “license” was zero. *** [Test] The Jaccard implementation changed, so the tests are changed too.
distances.py
test/test_alignment.py
--- a/distances.py	Tue Oct 30 16:41:53 2012 +0100
+++ b/distances.py	Tue Nov 06 10:48:06 2012 +0100
@@ -20,6 +20,8 @@
 
 from scipy import matrix
 
+from alignement.normalize import tokenize
+
 def levenshtein(stra, strb):
     """ Compute the Levenshtein distance between stra and strb.
 
@@ -154,16 +156,17 @@
     return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
              else 1
 
-def jaccard(stra, strb):
-    """ Return the jaccard distance between stra and strb, condering the letters
-    set of stra and strb
+def jaccard(stra, strb, tokenizer = None):
+    """ Return the jaccard distance between stra and strb, condering the tokens
+        set of stra and strb. If no tokenizer is given, it use if
+        alignement.normalize.tokenize's default one.
 
-    J(A, B) = (A \cap B) / (A \cup B)
-    d(A, B) = 1 - J(A, B)
+        J(A, B) = (A \cap B) / (A \cup B)
+        d(A, B) = 1 - J(A, B)
     """
 
-    seta = set(stra)
-    setb = set(strb)
+    seta = set(tokenize(stra, tokenizer))
+    setb = set(tokenize(strb, tokenizer))
 
     jacc = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
     return 1.0 - jacc
--- a/test/test_alignment.py	Tue Oct 30 16:41:53 2012 +0100
+++ b/test/test_alignment.py	Tue Nov 06 10:48:06 2012 +0100
@@ -96,8 +96,8 @@
         #The distance is 1 - jaccard_indice
 
         self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
-        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 0.166, 2)
-        self.assertAlmostEqual(jaccard('rubert', 'robert'), 0.333, 2)
+        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 1, 2)
+        self.assertAlmostEqual(jaccard('sacré rubert', 'sacré hubert'), 0.5, 2)
 
         #Test symetry
         self.assertEqual(jaccard('orange', 'morange'),