author Simon Chabot Thu, 18 Oct 2012 16:35:18 +0200 changeset 20 16f66a0aaa0e parent 19 fa22f8965c4a child 21 2f077077b266
[Matrix] Compute a distance matrix
 matrix.py file | annotate | diff | comparison | revisions
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix.py	Thu Oct 18 16:35:18 2012 +0200
@@ -0,0 +1,66 @@
+# -*- coding:utf-8 -*-
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from cubes.alignment.distances import (levenshtein, soundex, \
+                                       jaccard, temporal, euclidean)
+from collections import defaultdict
+from scipy.sparse import lil_matrix
+from scipy import where
+
+class Distancematrix(object):
+    """ Construct and compute a matrix of distance given a distance function.
+
+        Given :
+            input1 = ['Victor Hugo', 'Albert Camus']
+            input2 = ['Victor Wugo', 'Albert Camus']
+            distance = 'levensthein'
+
+            constructs the following matrix :
+                 +----+----+
+                 | 1  | 11 |
+                 +----+----+
+                 | 11 | 0  |
+                 +----+----+
+    """
+
+    def __init__(self, input1, input2, distance):
+        self.input1 = input1
+        self.input2 = input2
+        self.distance = distance
+        self._matrix = lil_matrix((len(input1), len(input2)), dtype='float32')
+        self._compute()
+
+    def _compute(self):
+       for i in xrange(len(self.input1)):
+           # The matrix is an upper triangular one, so don't compute the
+           # bottom
+           for j in xrange(i, len(self.input2)):
+               self._matrix[i,j] = self.distance(self.input1[i], self.input2[j])
+
+    def __getitem__(self, index):
+        (i, j) = index
+        if i <= j:
+            return self._matrix[index]
+        return self._matrix[j, i]
+
+    def matched(self, cutoff = 0):
+        match = defaultdict(list)
+        for i in xrange(len(self.input1)):
+            for j in xrange(i, len(self.input2)):
+                if self._matrix[i, j] <= cutoff:
+                    match[i].append(j)
+        return match