author Simon Chabot Tue, 06 Nov 2012 10:48:06 +0100 changeset 66 221a299d9fb8 parent 65 5efd92896bbb child 67 e37a33ec5b41
[distances] For the jaccard distance, consider the set of tokens. Considering the set of tokens instead of letters is much more accurate. E.g., before this changeset, the jaccard distance between “silence” and “license” was zero. *** [Test] The jaccard implementation changed, so the tests are changed too.
 distances.py file | annotate | diff | comparison | revisions test/test_alignment.py file | annotate | diff | comparison | revisions
```--- a/distances.py	Tue Oct 30 16:41:53 2012 +0100
+++ b/distances.py	Tue Nov 06 10:48:06 2012 +0100
@@ -20,6 +20,8 @@

from scipy import matrix

+from alignement.normalize import tokenize
+
def levenshtein(stra, strb):
""" Compute the Levenshtein distance between stra and strb.

@@ -154,16 +156,17 @@
return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
else 1

-def jaccard(stra, strb):
-    """ Return the jaccard distance between stra and strb, condering the letters
-    set of stra and strb
+def jaccard(stra, strb, tokenizer = None):
+    """ Return the jaccard distance between stra and strb, condering the tokens
+        set of stra and strb. If no tokenizer is given, it use if
+        alignement.normalize.tokenize's default one.

-    J(A, B) = (A \cap B) / (A \cup B)
-    d(A, B) = 1 - J(A, B)
+        J(A, B) = (A \cap B) / (A \cup B)
+        d(A, B) = 1 - J(A, B)
"""

-    seta = set(stra)
-    setb = set(strb)
+    seta = set(tokenize(stra, tokenizer))
+    setb = set(tokenize(strb, tokenizer))

jacc = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
return 1.0 - jacc```
```--- a/test/test_alignment.py	Tue Oct 30 16:41:53 2012 +0100
+++ b/test/test_alignment.py	Tue Nov 06 10:48:06 2012 +0100
@@ -96,8 +96,8 @@
#The distance is 1 - jaccard_indice

self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
-        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 0.166, 2)
-        self.assertAlmostEqual(jaccard('rubert', 'robert'), 0.333, 2)
+        self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 1, 2)
+        self.assertAlmostEqual(jaccard('sacré rubert', 'sacré hubert'), 0.5, 2)

#Test symetry
self.assertEqual(jaccard('orange', 'morange'),```