[aligner] Instead of returning 1xN matrices, return MxN ones
author Simon Chabot <simon.chabot@logilab.fr>
Thu, 08 Nov 2012 14:43:32 +0100
changeset 96 0a63b9d4024e
parent 95 8a13d62cd8ff
child 97 f70fb53a019e
[aligner] Instead of returning 1xN matrices, return MxN ones
aligner.py
demo.py
--- a/aligner.py	Thu Nov 08 13:25:20 2012 +0100
+++ b/aligner.py	Thu Nov 08 14:43:32 2012 +0100
@@ -51,7 +51,16 @@
         # XXX : If there are more than 2 dimensions ??
         aligntree  = KDTree([elt[indexes[0]] or (0, 0) for elt in alignset])
         targettree = KDTree([elt[indexes[1]] or (0, 0) for elt in targetset])
-        return aligntree.query_ball_tree(targettree, threshold)
+        intraneighbours = aligntree.query_ball_tree(aligntree, threshold)
+        extraneighbours = aligntree.query_ball_tree(targettree, threshold)
+        neighbours = []
+        for intra in intraneighbours:#XXX: Return an iterator
+            neighbours.append([intra, []])
+            for i in intra:
+                neighbours[-1][1].extend(extraneighbours[i])
+            if len(neighbours[-1][1] == 0):
+                neighbours[-1].pop()
+        return neighbours
 
 #### Minhashing #####
     elif mode == 'minhashing':
@@ -61,35 +70,37 @@
                         [elt[indexes[1]] or '' for elt in targetset],
                         k)
         rawneighbours = minhasher.findsimilarsentences(threshold)
-        neighbours = [[] for _ in xrange(len(alignset))]
-        for data in rawneighbours:
+        neighbours = []
+        for data in rawneighbours: #XXX: Return an iterator
+            neighbours.append([[], []])
             for i in data:
                 if i >= len(alignset):
-                    continue
-                neighbours[i].extend([e - len(alignset)
-                                      for e in data if e >= len(alignset)])
+                    neighbours[-1][1].append(i - len(alignset))
+                else:
+                    neighbours[-1][0].append(i)
+            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1] == 0):
+                neighbours.pop()
         return neighbours
 
 #### Kmeans #####
-    elif mode in set(['kmeans', 'minbatch']):
+    elif mode in set(['kmeans', 'minibatch']):
         from sklearn import cluster
+        n_clusters = n_clusters or len(alignset) / 10
+
         if mode == 'kmeans':
-            kmeans = cluster.KMeans(n_clusters=n_clusters or (len(alignset)/100))
+            kmeans = cluster.KMeans(n_clusters=n_clusters)
         else:
-            kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters or (len(alignset)/100))
+            kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters)
         # XXX : If there are more than 2 dimensions ??
         kmeans.fit([elt[indexes[0]] or (0, 0) for elt in alignset])
         predicted = kmeans.predict([elt[indexes[1]] or (0, 0) for elt in targetset])
 
-        clusters = [[] for _ in xrange(kmeans.n_clusters)]
-        print kmeans.n_clusters
-        for ind, j in enumerate(predicted):
-            clusters[j].append(ind)
-        neighbours = []
-        labels = kmeans.labels_
-        for i in xrange(len(alignset)):
-            neighbours.append(clusters[labels[i]])
-        return neighbours
+        clusters = [[[], []] for _ in xrange(kmeans.n_clusters)]
+        for ind, i in enumerate(predicted):
+            clusters[i][1].append(ind)
+        for ind, i in enumerate(kmeans.labels_):
+            clusters[i][0].append(ind)
+        return clusters
 
 def align(alignset, targetset, treatments, threshold, resultfile):
     """ Try to align the items of alignset onto targetset's ones
--- a/demo.py	Thu Nov 08 13:25:20 2012 +0100
+++ b/demo.py	Thu Nov 08 14:43:32 2012 +0100
@@ -106,13 +106,14 @@
               }
 
     print "Start computation"
-    for ind, nei in enumerate(neighbours):
-        m, b = align([alignset[ind][:2]],      # The dataset to align
-              [targetset[i][:2] for i in nei], # The target dataset
-              [tr_name],
-              0.3,
-              'demo2_results')  # Filename of the output
-                                #   result file
+    for ind, (alignid, targetid) in enumerate(neighbours):
+        print '%3d' % ind, len(alignid), 'x', len(targetid)
+        m, b = align([alignset[i][:2] for i in alignid],   # The dataset to align
+                     [targetset[i][:2] for i in targetid], # The target dataset
+                     [tr_name],
+                     0.3,
+                     'demo2_results')  # Filename of the output
+                                       #   result file
 
 if __name__ == '__main__':
     import sys