[Distance] Spaces are correctly supported by distances functions
authorSimon Chabot <simon.chabot@logilab.fr>
Fri, 19 Oct 2012 14:44:26 +0200
changeset 32 47efa28c96a3
parent 31 1410ccfe08fd
child 33 0d95c1cfde36
[Distance] Spaces are correctly supported by distances functions In fact, the problem was 'Victor Hugo' and 'Hugo Victor' had a big distance when we'd like in this case to have a small one (even a zero one !) So, the approach followed was : Construct a distance matrix : | Victor | Hugo Victor | 0 | 5 Hugo | 5 | 0 And return the minimun of the minimun of each row. In fact, we return the maximun of the minimum of the previous matrix, and the its transpose, to handle the following case : | Victor | Hugo | Jean | Victor | 0 | 5 | 6 |--> min of each row : 0 Hugo | 5 | 0 | 4 | | Victor | Hugo | Victor | 0 | 5 | Hugo | 5 | 0 |--> min of each row : 4 Jean | 6 | 4 | Return the max, ie : 4.
distances.py
test/test_alignment.py
--- a/distances.py	Fri Oct 19 11:38:54 2012 +0200
+++ b/distances.py	Fri Oct 19 14:44:26 2012 +0200
@@ -16,6 +16,7 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from dateutil import parser as dateparser
+from scipy import matrix
 
 def levenshtein(stra, strb):
     """ Compute the Levenshtein distance between stra and strb.
@@ -25,8 +26,14 @@
         - Replace one character of stra into a character of strb
         - Add one character of strb into stra
         - Remove one character of strb
+
+        If spaces are found in stra or strb, this method returns
+            _handlespaces(stra, strb), levenshtein)
     """
 
+    if ' ' in (stra + strb):
+        return _handlespaces(stra, strb, levenshtein)
+
     lena = len(stra)
     lenb = len(strb)
     onerowago = None
@@ -40,6 +47,43 @@
             thisrow[y] = min(delcost, addcost, subcost)
     return thisrow[lenb - 1]
 
+def _handlespaces(stra, strb, distance, **args):
+    """ Compute the matrix of distances between all tokens of stra and strb
+        (with function ``distance``). Extra args are given to the distance
+        function
+
+        The distance returned is defined as the max of the min of each rows of
+        each distance matrix, see the example above :
+
+                 |  Victor |  Hugo                  Victor | Jean | Hugo
+         Victor  |     0   |    5           Victor |  0    |  6   |  5
+          Jean   |     6   |    4           Hugo   |  5    |  4   |  0
+          Hugo   |     5   |    0
+
+                 --> 4                                --> 0
+
+        Return 4
+    """
+
+    if ' ' not in stra:
+        stra += ' '
+    if ' ' not in strb:
+        strb += ' '
+
+    toka, tokb = stra.split(' '), strb.split(' ')
+
+    listmatrix = []
+    for i in xrange(len(toka)):
+        listmatrix.append([])
+        for j in xrange(len(tokb)):
+            listmatrix[-1].append(distance(toka[i], tokb[j], **args))
+    m = matrix(listmatrix)
+    minlist = [m[i,:].min() for i in xrange(m.shape[0])]
+    minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
+
+    return max(minlist)
+
+
 def soundexcode(word, language = 'french'):
     """ Return the Soundex code of the word ``word``
         For more information about soundex code see wiki_
@@ -47,12 +91,11 @@
         ``language`` can be 'french' or 'english'
 
         .:: wiki_ : https://en.wikipedia.org/wiki/Soundex
+
+        If spaces are found in stra or strb, this method returns
+            _handlespaces(stra, strb), soundex, language = language)
     """
 
-    if ' ' in word:
-        words = word.split(' ')
-        return ' '.join([soundexcode(w.strip(), language) for w in words])
-
     vowels = 'AEHIOUWY'
     if language.lower() == 'french' :
         consonnantscode = { 'B' : '1', 'P' : '1',
@@ -103,6 +146,9 @@
     """ Return the 1/0 distance between the soundex code of stra and strb.
         0 means they have the same code, 1 they don't
     """
+    if ' ' in (stra + strb):
+        return _handlespaces(stra, strb, soundex, language = language)
+
     return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
              else 1
 
--- a/test/test_alignment.py	Fri Oct 19 11:38:54 2012 +0200
+++ b/test/test_alignment.py	Fri Oct 19 14:44:26 2012 +0200
@@ -51,8 +51,9 @@
 class DistancesTest(unittest2.TestCase):
     def test_levenshtein(self):
         self.assertEqual(levenshtein('niche', 'chiens'), 5)
-        self.assertEqual(levenshtein('bonjour', 'bonjour !'), 2)
+        self.assertEqual(levenshtein('bonjour', 'bonjour !'), 1)
         self.assertEqual(levenshtein('bon', 'bonjour'), 4)
+        self.assertEqual(levenshtein('Victor Hugo', 'Hugo Victor'), 0)
 
         #Test symetry
         self.assertEqual(levenshtein('Victor Hugo', 'Vitor Wugo'),
@@ -197,7 +198,7 @@
 
         #Victor Hugo --> Victor Wugo
         #Albert Camus --> Albert Camus, Albert Camu
-        self.assertEqual(m.matched(cutoff = 0.1, normalized = True),
+        self.assertEqual(m.matched(cutoff = 0.2, normalized = True),
                         {0: [(0, d(i1[0], i2[0]))], 1: [(1, d(i1[1], i2[1])), 
                                                        (2, d(i1[1], i2[2]))]})