[distances] Add difflib match distance, closes #234655
author Vincent Michel <vincent.michel@logilab.fr>
Mon, 07 Apr 2014 13:56:17 +0000
changeset 414 68d1ae13c3b0
parent 413 0ef0ac00ae61
child 415 dd4a0f979759
[distances] Add difflib match distance, closes #234655
test/test_distances.py
utils/distances.py
--- a/test/test_distances.py	Tue Mar 25 14:31:08 2014 +0000
+++ b/test/test_distances.py	Mon Apr 07 13:56:17 2014 +0000
@@ -26,11 +26,19 @@
 from dateutil import parser as dateparser
 
 from nazca.utils.distances import (levenshtein, soundex, soundexcode,
+                                   difflib_match,
                                    jaccard, euclidean, geographical,
                                    LevenshteinProcessing)
 
 
 class DistancesTest(unittest.TestCase):
+
+    def test_difflib_match(self):  # exact values: round(0.125, 2) differs between py2 and py3
+        self.assertAlmostEqual(difflib_match('Victor Hugo', 'Victor Hugo'), 0.)
+        self.assertAlmostEqual(difflib_match('Victor Hugo', 'Victor Wugo'), 2. / 22)
+        self.assertAlmostEqual(difflib_match('niche', 'chiens'), 5. / 11)
+        self.assertAlmostEqual(difflib_match('bonjour', 'bonjour !'), 0.125)
+
     def test_levenshtein(self):
         self.assertEqual(levenshtein('niche', 'chiens'), 5)
         self.assertEqual(levenshtein('bonjour', 'bonjour !'), 1)
--- a/utils/distances.py	Tue Mar 25 14:31:08 2014 +0000
+++ b/utils/distances.py	Mon Apr 07 13:56:17 2014 +0000
@@ -15,6 +15,7 @@
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
+import difflib
 from functools import partial
 from math import cos, sqrt, pi #Needed for geographical distance
 try:
@@ -235,6 +236,18 @@
     """
     return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
 
+def difflib_match(stra, strb):
+    """ Approximate matching distance between two sequences.
+    Extract of the difflib.SequenceMatcher documentation:
+    '[...] The basic algorithm predates, and is a little fancier than, an algorithm
+    published in the late 1980's by Ratcliff and Obershelp under the
+    hyperbolic name "gestalt pattern matching"[...]'
+
+    Return 1 - difflib.SequenceMatcher(None, stra, strb).ratio(): 0.0 means the
+    sequences are identical; a value smaller than 0.4 means they are close matches.
+    """
+    return 1.0 - difflib.SequenceMatcher(None, stra, strb).ratio()
+
 
 ###############################################################################
 ### TEMPORAL DISTANCES ########################################################