[aligner] Simplify for now the aligners pipeline, see #183468
author Vincent Michel <vincent.michel@logilab.fr>
Tue, 15 Oct 2013 09:43:26 +0000
changeset 315 c66d1517d56f
parent 314 22de89ab1274
child 316 eecaebe54657
[aligner] Simplify for now the aligners pipeline, see #183468
aligner.py
test/test_alignment.py
--- a/aligner.py	Tue Oct 15 12:17:33 2013 +0000
+++ b/aligner.py	Tue Oct 15 09:43:26 2013 +0000
@@ -262,9 +262,8 @@
         self.time = None
         self.logger = logging.getLogger('nazca.aligner')
 
-    def align(self, refset, targetset):
-        """ Perform the alignment on the referenceset
-        and the targetset
+    def get_aligned_pairs(self, refset, targetset, unique=True):
+        """ Get the pairs of aligned elements
         """
         start_time = time.time()
         ref_index = range(len(refset))
@@ -273,41 +272,27 @@
         self.targetset_size = len(targetset)
         global_matched = {}
         global_mat = lil_matrix((len(refset), len(targetset)))
+        seen_refset = set()
         # Iteration over aligners
         for ind_aligner, aligner in enumerate(self.aligners):
             # Perform alignment
             _refset = [refset[i] for i in ref_index]
             _targetset = [targetset[i] for i in target_index]
-            _global_mat, _global_matched = aligner.align(_refset, _targetset, get_matrix=False)
-            # Store results
-            for k, values in _global_matched.iteritems():
-                subdict = global_matched.setdefault(ref_index[k], set())
-                for v, d in values:
-                    self.alignments_done += 1
-                    subdict.add((target_index [v], d))
+            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
+                self.pairs_found += 1
+                pair = ((pair[0][0], ref_index[pair[0][1]]),
+                        (pair[1][0], target_index[pair[1][1]]))
+                yield pair
+                seen_refset.add(pair[0][1])
             # Store stats
             self.nb_blocks += aligner.nb_blocks
             self.nb_comparisons += aligner.nb_comparisons
             # Update indexes if necessary
+            # For now, we remove all the elements of the reference set that are already matched
             if ind_aligner < len(self.aligners) - 1:
                 # There are other aligners after this one
-                _ref_index, _target_index = set(), set()
-                for k, values in _global_matched.iteritems():
-                    _ref_index.add(k)
-                    for v, d in values:
-                        _target_index.add(v)
-                ref_index = [i for i in ref_index if i not in _ref_index]
-                target_index = [i for i in target_index if i not in _target_index]
+                ref_index = [i for i in ref_index if i not in seen_refset]
         self.time = time.time() - start_time
-        return global_mat, global_matched
-
-    def get_aligned_pairs(self, refset, targetset, unique=True):
-        """ Get the pairs of aligned elements
-        """
-        global_mat, global_matched = self.align(refset, targetset)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
-            self.pairs_found += 1
-            yield pair
         self.log_infos()
 
     def log_infos(self):
--- a/test/test_alignment.py	Tue Oct 15 12:17:33 2013 +0000
+++ b/test/test_alignment.py	Tue Oct 15 09:43:26 2013 +0000
@@ -148,32 +148,6 @@
 
 class PipelineAlignerTestCase(unittest2.TestCase):
 
-    def test_pipeline_align(self):
-        refset = [['V1', 'aaa', (6.14194444444, 48.67)],
-                  ['V2', 'bbb', (6.2, 49)],
-                  ['V3', 'ccc', (5.1, 48)],
-                  ['V4', 'ddd', (5.2, 48.1)],
-                  ]
-        targetset = [['T1', 'zzz', (6.17, 48.7)],
-                     ['T2', 'eec', (5.3, 48.2)],
-                     ['T3', 'fff', (6.25, 48.91)],
-                     ['T4', 'ccd', (0, 0)],
-                     ]
-        # Creation of the aligner object
-        processings = (GeographicalProcessing(2, 2, units='km'),)
-        aligner_1 = alig.BaseAligner(threshold=30, processings=processings)
-        processings = (LevenshteinProcessing(1, 1),)
-        aligner_2 = alig.BaseAligner(threshold=1, processings=processings)
-        pipeline = alig.PipelineAligner((aligner_1, aligner_2))
-        global_mat, global_matched = pipeline.align(refset, targetset)
-        true_global_matched =  {0: set([(2, 29.124842), (0, 4.5532517)]),
-                                1: set([(2, 11.396689)]), 2: set([(3, 1.0)]),
-                                3: set([(1, 15.69241)])}
-        self.assertEqual(len(global_matched), len(true_global_matched))
-        for k, v in true_global_matched.iteritems():
-            self.assertIn(k, global_matched)
-            self.assertEqual(len(v), len(global_matched[k]))
-
     def test_pipeline_align_pairs(self):
         refset = [['V1', 'aaa', (6.14194444444, 48.67)],
                   ['V2', 'bbb', (6.2, 49)],