[aligner] higher level implementation of KDTree and Minhashing
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 07 Nov 2012 17:09:20 +0100
changeset 84 3cd1edfd28d0
parent 83 da44e0b956bb
child 85 ab75d8ffb260
[aligner] higher level implementation of KDTree and Minhashing
aligner.py
--- a/aligner.py	Wed Nov 07 15:58:46 2012 +0100
+++ b/aligner.py	Wed Nov 07 17:09:20 2012 +0100
@@ -19,9 +19,33 @@
 
 import csv
 
+from scipy.spatial import KDTree
+
 import alignment.matrix as m
+from alignment.minhashing import Minlsh
 
 
+def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
+                   threshold=0.1, extraargs=None):
+    """ Find, for each item of alignset, its neighbours in targetset
+
+        `indexes` is a pair: indexes[0] is the position, inside an item of
+        alignset, of the value to compare; indexes[1] is the same for
+        the items of targetset.
+
+        `mode` selects the search method:
+
+        - 'kdtree': the compared values are used as points to build one
+          scipy KDTree per set, and `threshold` is the radius given to
+          `query_ball_tree`. A falsy value is replaced by (0, 0).
+        - 'minhashing': the compared values of both sets are given to
+          `Minlsh.train` (with `extraargs` as keyword arguments), and
+          `threshold` is given to `findsimilarsentences`. A falsy value
+          is replaced by ''.
+
+        Return a list holding one entry per item of alignset; the i-th
+        entry lists the indexes, in targetset, of the neighbours of the
+        i-th item of alignset.
+
+        Raise ValueError if `mode` is not one of the supported modes.
+    """
+    if mode == 'kdtree':
+        aligntree = KDTree([elt[indexes[0]] or (0, 0) for elt in alignset])
+        targettree = KDTree([elt[indexes[1]] or (0, 0) for elt in targetset])
+        return aligntree.query_ball_tree(targettree, threshold)
+    if mode == 'minhashing':
+        minhasher = Minlsh()
+        # Both sets are trained as a single list: positions below nbalign
+        # come from alignset, the following ones from targetset.
+        minhasher.train([elt[indexes[0]] or '' for elt in alignset] +
+                        [elt[indexes[1]] or '' for elt in targetset],
+                        **(extraargs or {}))
+        nbalign = len(alignset)
+        neighbours = [[] for _ in alignset]
+        for data in minhasher.findsimilarsentences(threshold):
+            # Shift the target positions back to indexes of targetset;
+            # this list is the same for every align item of the group.
+            targets = [e - nbalign for e in data if e >= nbalign]
+            for i in data:
+                if i < nbalign:
+                    neighbours[i].extend(targets)
+        return neighbours
+    raise ValueError("unknown mode %r (expected 'kdtree' or 'minhashing')"
+                     % (mode,))
+
 def align(alignset, targetset, treatments, threshold, resultfile):
     """ Try to align the items of alignset onto targetset's ones