author Simon Chabot Thu, 08 Nov 2012 15:53:27 +0100 changeset 99 a8b6d9262c20 parent 98 20f79a013965 child 100 5c0462ab311d
[aligner] Handle any dimension for clustering and kdtree
 aligner.py file | annotate | diff | comparison | revisions
--- a/aligner.py	Thu Nov 08 15:38:05 2012 +0100
+++ b/aligner.py	Thu Nov 08 15:53:27 2012 +0100
@@ -45,12 +45,15 @@
if mode not in SEARCHERS:
raise NotImplementedError('Unknown mode given')

+    # If an element is None (missing), use the identity element instead.
+    # The identity element is defined as the 0-vector.
+    idelement = tuple([0 for _ in xrange(len(alignset[0][indexes[0]]))])
##### KDTree #######
if mode == 'kdtree':
from scipy.spatial import KDTree
-        # XXX : If there are more than 2 dimensions ??
-        aligntree  = KDTree([elt[indexes[0]] or (0, 0) for elt in alignset])
-        targettree = KDTree([elt[indexes[1]] or (0, 0) for elt in targetset])
+
+        aligntree  = KDTree([elt[indexes[0]] or idelement for elt in alignset])
+        targettree = KDTree([elt[indexes[1]] or idelement for elt in targetset])
extraneighbours = aligntree.query_ball_tree(targettree, threshold)
neighbours = []
for ind in xrange(len(alignset)):
@@ -63,8 +66,9 @@
elif mode == 'minhashing':
from alignment.minhashing import Minlsh
minhasher = Minlsh()
-        minhasher.train([elt[indexes[0]] or '' for elt in alignset] +
-                        [elt[indexes[1]] or '' for elt in targetset],
+        idelement = ''
+        minhasher.train([elt[indexes[0]] or idelement for elt in alignset] +
+                        [elt[indexes[1]] or idelement for elt in targetset],
k)
rawneighbours = minhasher.findsimilarsentences(threshold)
neighbours = []
@@ -88,9 +92,10 @@
kmeans = cluster.KMeans(n_clusters=n_clusters)
else:
kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters)
+
# XXX : If there are more than 2 dimensions ??
-        kmeans.fit([elt[indexes[0]] or (0, 0) for elt in alignset])
-        predicted = kmeans.predict([elt[indexes[1]] or (0, 0) for elt in targetset])
+        kmeans.fit([elt[indexes[0]] or idelement for elt in alignset])
+        predicted = kmeans.predict([elt[indexes[1]] or idelement for elt in targetset])

clusters = [[[], []] for _ in xrange(kmeans.n_clusters)]
for ind, i in enumerate(predicted):