[aligner] Add the alignall_iterative() function (closes #116932)
authorSimon Chabot <simon.chabot@logilab.fr>
Thu, 14 Feb 2013 16:00:55 +0100
changeset 231 4b6119e623cf
parent 230 6afc3891e633
child 242 f942f2393fb2
[aligner] Add the alignall_iterative() function (closes #116932) This function splits the files to align into smaller ones, and runs the alignment using a cache. *** [aligner] Better display of progression
aligner.py
test/data/alignfile.csv
test/data/targetfile.csv
test/test_alignment.py
--- a/aligner.py	Fri Feb 15 10:37:39 2013 +0100
+++ b/aligner.py	Thu Feb 14 16:00:55 2013 +0100
@@ -15,12 +15,17 @@
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
+from os import listdir
+import os.path as osp
+from shutil import rmtree
+from tempfile import mkdtemp
+import sys
 
 from scipy.spatial import KDTree
 from scipy.sparse import lil_matrix
 
 from nazca.minhashing import Minlsh
-from nazca.dataio import write_results
+from nazca.dataio import write_results, split_file, parsefile
 import nazca.matrix as m
 
 
@@ -292,3 +297,73 @@
         for alignid in matched:
             bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
             yield alignset[alignid][0], targetset[bestid][0]
+
+def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
+                       threshold, size=10000, treatments=None, indexes=(1,1),
+                       mode='kdtree', neighbours_threshold=0.1, n_clusters=None,
+                       kwordsgram=1, siglen=200):
+
+    """ This function helps you to align *huge* files.
+        It takes your csv files as arguments and splits them into smaller ones
+        (files of `size` lines), and runs the alignment on those files.
+
+        `alignformat` and `targetformat` are keyword arguments given to the
+        nazca.dataio.parsefile function.
+    """
+
+    #Split the huge files into smaller ones
+    aligndir = mkdtemp()
+    targetdir = mkdtemp()
+    alignfiles = split_file(alignfile, aligndir, size)
+    targetfiles = split_file(targetfile, targetdir, size)
+
+    #Compute the number of iterations that must be done to complete the alignment
+    nb_iterations = len(alignfiles) * len(targetfiles)
+    current_it = 0
+
+    doneids = set([]) #Contains the ids of perfectly aligned data
+    cache = {} #Contains the best known alignments
+
+    try:
+        for alignfile in alignfiles:
+            alignset = parsefile(osp.join(aligndir, alignfile), **alignformat)
+            for targetfile in targetfiles:
+                if doneids: #If some alignments are already perfect,
+                            #don't redo them!
+                    tmp_align = []
+                    for a in alignset:
+                        if a[0] not in doneids:
+                            tmp_align.append(a)
+                    alignset = tmp_align
+
+                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
+                matched = conquer_and_divide_alignment(alignset, targetset,
+                                                       threshold,
+                                                       treatments=treatments,
+                                                       indexes=indexes,
+                                                       mode=mode,
+                                                       neighbours_threshold=neighbours_threshold,
+                                                       n_clusters=n_clusters,
+                                                       kwordsgram=kwordsgram,
+                                                       siglen=siglen,
+                                                       get_global_mat=False)
+                for alignid in matched:
+                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
+                    #Get the best known distance
+                    _, current_dist = cache.get(alignset[alignid][0], (None, None))
+                    if not current_dist or current_dist > dist:
+                        #If it's better, update the cache
+                        cache[alignset[alignid][0]] = (targetset[bestid][0], dist)
+                        if dist <= 0.01 :
+                            #If perfect, stop trying to align this one
+                            doneids.add(alignset[alignid][0])
+
+                current_it += 1
+                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
+                                                nb_iterations))
+                sys.stdout.flush()
+    finally:
+        rmtree(aligndir)
+        rmtree(targetdir)
+
+    return cache
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/data/alignfile.csv	Thu Feb 14 16:00:55 2013 +0100
@@ -0,0 +1,4 @@
+V1	label1	6.14194444444	48.67
+V2	label2	6.2	49
+V3	label3	5.1	48
+V4	label4	5.2	48.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/data/targetfile.csv	Thu Feb 14 16:00:55 2013 +0100
@@ -0,0 +1,3 @@
+T1	labelt1	6.17	48.7
+T2	labelt2	5.3	48.2
+T3	labelt3	6.25	48.91
--- a/test/test_alignment.py	Fri Feb 15 10:37:39 2013 +0100
+++ b/test/test_alignment.py	Thu Feb 14 16:00:55 2013 +0100
@@ -445,6 +445,29 @@
         self.assertEqual(predict_matched, all_matched)
         self.assertEqual(predict_uniq_matched, uniq_matched)
 
+    def test_alignall_iterative(self):
+        matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
+        treatments = {2: {'metric': 'geographical', 'matrix_normalized': False,
+                          'metric_params': {'units': 'km', 'in_radians': False}}}
+
+        _format={'indexes': [0, 1, (2, 3)]}
+        alignements = alig.alignall_iterative(path.join(TESTDIR, 'data',
+                                                        'alignfile.csv'),
+                                              path.join(TESTDIR, 'data',
+                                                        'targetfile.csv'),
+                                              _format, _format, threshold=30,
+                                              size=2, #very small files ;)
+                                              treatments=treatments,
+                                              indexes=(2,2),
+                                              neighbours_threshold=0.3)
+
+        predict_matched = set([(a, t) for (a, (t, _)) in
+                               alignements.iteritems()])
+        self.assertEqual(predict_matched, matched)
+
+
+
+
 
 
 if __name__ == '__main__':