[aligner] Enable the user to customize the equality_threshold (closes #116940)
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 30 Jan 2013 14:43:24 +0100
changeset 243 7498d98cde7a
parent 242 f942f2393fb2
child 244 33cc52731e55
[aligner] Enable the user to customize the equality_threshold (closes #116940)
aligner.py
--- a/aligner.py	Thu Apr 04 18:10:10 2013 +0200
+++ b/aligner.py	Wed Jan 30 14:43:24 2013 +0100
@@ -299,9 +299,10 @@
             yield alignset[alignid][0], targetset[bestid][0]
 
 def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
-                       threshold, size=10000, treatments=None, indexes=(1,1),
-                       mode='kdtree', neighbours_threshold=0.1, n_clusters=None,
-                       kwordsgram=1, siglen=200, cache=None):
+                       threshold, size=10000, equality_threshold=0.01,
+                       treatments=None, indexes=(1,1), mode='kdtree',
+                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
+                       siglen=200, cache=None):
 
     """ This function helps you to align *huge* files.
         It takes your csv files as arguments and split them into smaller ones
@@ -315,6 +316,10 @@
         distance) as value. This dictionary can be regiven to this function to
         perform another alignment (with different parameters, or just to be
         sure everything has been caught)
+
+        If the distance of an alignment is at most `equality_threshold`, the
+        alignment is considered perfect, and the corresponding item is
+        removed from the alignset (to speed up the computation).
     """
 
     #Split the huge files into smaller ones
@@ -360,7 +365,7 @@
                     if not current_dist or current_dist > dist:
                         #If it's better, update the cache
                         cache[alignset[alignid][0]] = (targetset[bestid][0], dist)
-                        if dist <= 0.01 :
+                        if dist <= equality_threshold:
                             #If perfect, stop trying to align this one
                             doneids.add(alignset[alignid][0])