author Simon Chabot Tue, 20 Nov 2012 15:06:33 +0100 changeset 154 f30b1c1bd109 parent 153 b0d53d47cb08 child 155 90412c2a4d9e
 matrix.py file | annotate | diff | comparison | revisions
```--- a/matrix.py	Tue Nov 20 10:59:05 2012 +0100
+++ b/matrix.py	Tue Nov 20 15:06:33 2012 +0100
@@ -23,21 +23,6 @@

import alignment.distances as ds

-""" Construct and compute a matrix of distance given a distance function.
-
-    Given :
-        input1 = ['Victor Hugo', 'Albert Camus']
-        input2 = ['Victor Wugo', 'Albert Camus']
-        distance = levenshtein
-
-        constructs the following matrix :
-             +----+----+
-             | 1  | 11 |
-             +----+----+
-             | 11 | 0  |
-             +----+----+
-"""
-
METRICS = {'euclidean': ds.euclidean, 'levenshtein': ds.levenshtein,
'soundex': ds.soundex, 'jaccard': ds.jaccard,
'temporal': ds.temporal, 'geographical': ds.geographical}
@@ -46,8 +31,12 @@
def pdist(X, metric='euclidean', matrix_normalized=True, metric_params=None):
""" Compute the upper triangular matrix in a way similar
to scipy.spatial.metric
-    XXX Comment on normalization
-
+
+    If matrix_normalized is True, the distance between two points is changed to
+    a value between 0 (equal) and 1 (totally different). To avoid useless
+    computation and scale problems the following “normalization” is done:
+        d = 1 - 1/(1 + d(x, y))
+
"""
metric = metric if not isinstance(metric, basestring) else METRICS.get(metric, ds.euclidean)
values = []
@@ -63,7 +52,12 @@

def cdist(X, Y, metric='euclidean', matrix_normalized=True, metric_params=None):
""" Compute the metric matrix, given two inputs and a metric
-    XXX Comment on normalization
+
+    If matrix_normalized is True, the distance between two points is changed to
+    a value between 0 (equal) and 1 (totally different). To avoid useless
+    computation and scale problems the following “normalization” is done:
+        d = 1 - 1/(1 + d(x, y))
+
"""
metric = metric if not isinstance(metric, basestring) else METRICS.get(metric, ds.euclidean)
distmatrix = empty((len(X), len(Y)), dtype='float32')```