[aligner] Update verbose infos, see #183439
authorVincent Michel <vincent.michel@logilab.fr>
Tue, 15 Oct 2013 08:42:27 +0000
changeset 307 31a13077a293
parent 306 cf1e78baf803
child 308 723bfd8b0ff6
[aligner] Update verbose infos, see #183439
aligner.py
blocking.py
--- a/aligner.py	Tue Oct 15 08:41:20 2013 +0000
+++ b/aligner.py	Tue Oct 15 08:42:27 2013 +0000
@@ -14,7 +14,7 @@
 #
 # You should have received a copy of the GNU Lesser General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
-
+import time
 from collections import defaultdict
 
 from scipy import zeros
@@ -36,6 +36,10 @@
         self.target_normalizer = None
         self.blocking = None
         self.nb_comparisons = 0
+        self.nb_blocks = 0
+        self.refset_size = None
+        self.targetset_size = None
+        self.time = None
 
     def register_ref_normalizer(self, normalizer):
         """ Register normalizers to be applied
@@ -103,8 +107,11 @@
         """ Perform the alignment on the referenceset
         and the targetset
         """
+        start_time = time.time()
         refset = self.apply_normalization(refset, self.ref_normalizer)
         targetset = self.apply_normalization(targetset, self.target_normalizer)
+        self.refset_size = len(refset)
+        self.targetset_size = len(targetset)
         # If no blocking
         if not self.blocking:
             return self._get_match(refset, targetset)
@@ -113,21 +120,11 @@
         global_mat = lil_matrix((len(refset), len(targetset)))
         self.blocking.fit(refset, targetset)
         for refblock, targetblock in self.blocking.iter_blocks():
+            self.nb_blocks += 1
             ref_index = [r[0] for r in refblock]
             target_index = [r[0] for r in targetblock]
             self.nb_comparisons += len(ref_index)*len(target_index)
-            if self.verbose:
-                print 'Blocking: %s reference ids, %s target ids' % (len(ref_index),
-                                                                     len(target_index))
-                print 'Reference records :'
-                for ind in ref_index:
-                    print '\t--->', refset[ind]
-                print 'Target records :'
-                for ind in target_index:
-                    print '\t--->', targetset[ind]
             _, matched = self._get_match(refset, targetset, ref_index, target_index)
-            if self.verbose:
-                print 'Matched: %s / Total comparisons %s' % (len(matched), self.nb_comparisons)
             for k, values in matched.iteritems():
                 subdict = global_matched.setdefault(k, set())
                 for v, d in values:
@@ -135,6 +132,7 @@
                     if get_matrix:
                         # XXX avoid issue in sparse matrix
                         global_mat[k, v] = d or 10**(-10)
+        self.time = time.time() - start_time
         return global_mat, global_matched
 
     def _iter_aligned_pairs(self, refset, targetset, global_mat, global_matched, unique=True):
@@ -145,18 +143,21 @@
                 bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
                 ref_record = refset[refid]
                 target_record = targetset[bestid]
-                if self.verbose:
-                    print '\t\t', ref_record, ' <--> ', target_record
                 yield (ref_record[0], refid), (target_record[0], bestid)
         else:
             for refid in global_matched:
                 for targetid, _ in global_matched[refid]:
                     ref_record = refset[refid]
                     target_record = targetset[targetid]
-                    if self.verbose:
-                        print '\t\t', ref_record, ' <--> ', target_record
                     yield (ref_record[0], refid), (target_record[0], targetid)
-        print 'Total comparisons : ', self.nb_comparisons
+        if self.verbose:
+            print 'Computation time : ', self.time
+            print 'Size reference set : ', self.refset_size
+            print 'Size target set : ', self.targetset_size
+            print 'Done comparisons : ', self.nb_comparisons
+            print 'Maximum comparisons : ', self.refset_size * self.targetset_size
+            print 'Number of blocks : ', self.nb_blocks
+            print 'Blocking reduction : ', self.nb_comparisons/(self.refset_size * self.targetset_size)
 
     def get_aligned_pairs(self, refset, targetset, unique=True):
         """ Get the pairs of aligned elements
--- a/blocking.py	Tue Oct 15 08:41:20 2013 +0000
+++ b/blocking.py	Tue Oct 15 08:42:27 2013 +0000
@@ -536,7 +536,7 @@
     """ Pipeline multiple blocking techniques
     """
 
-    def __init__(self, blockings):
+    def __init__(self, blockings, collect_stats=False):
         """ Build the blocking object
 
         Parameters
@@ -546,6 +546,8 @@
         """
         self.blockings = blockings
         self.stored_blocks = []
+        self.collect_stats = collect_stats
+        self.stats = {}
 
     def _fit(self, refset, targetset):
         """ Internal fit of the pipeline """
@@ -564,6 +566,8 @@
             for block1, block2 in blocking.iter_indice_blocks():
                 ind_block1 = [ref_index[i] for i in block1]
                 ind_block2 = [target_index[i] for i in block2]
+                if self.collect_stats:
+                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
                 self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
         else:
             # This is the final blocking
@@ -574,6 +578,8 @@
             for block1, block2 in blocking.iter_blocks():
                 ind_block1 = [(ref_index[i], _id) for i, _id in block1]
                 ind_block2 = [(target_index[i], _id) for i, _id in block2]
+                if self.collect_stats:
+                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
                 self.stored_blocks.append((ind_block1, ind_block2))
 
     def _iter_blocks(self):