Add some XXX on Adrien's comments

author      Simon Chabot <simon.chabot@logilab.fr>
date        Thu, 15 Nov 2012 16:48:36 +0100
changeset   143 e538838ee124
parent      142 b5ffd85f4711
child       144 71aa735b6e3f

files:
    TODO
    demo.py
    distances.py
    matrix.py
    normalize.py
    test/test_alignment.py
--- a/TODO	Thu Nov 15 14:38:15 2012 +0100
+++ b/TODO	Thu Nov 15 16:48:36 2012 +0100
@@ -1,1 +0,0 @@
-Write tests for aligner.py
--- a/demo.py	Thu Nov 15 14:38:15 2012 +0100
+++ b/demo.py	Thu Nov 15 16:48:36 2012 +0100
@@ -3,6 +3,7 @@
 
 from os import path
 
+#XXX aln, ald
 import alignment.distances as d
 import alignment.normalize as n
 from alignment.aligner import align, subalign, findneighbours
@@ -16,6 +17,8 @@
 
     #We try to align Goncourt winners onto dbpedia results
 
+    #XXX Make some prints
+
 
     query = """
        SELECT ?writer, ?name WHERE {
@@ -25,6 +28,7 @@
        }
     """
 
+    print "Sending query to dbpedia"
     targetset = sparqlquery('http://dbpedia.org/sparql', query)
     alignset = parsefile(path.join(DEMODIR, 'demo','prixgoncourt'), indexes=[1, 1])
 
@@ -37,7 +41,7 @@
                'metric': d.levenshtein
               }
 
-    treatments = {1: tr_name }
+    treatments = {1: tr_name}
 
     dmatrix, hasmatched = align(alignset, targetset, 0.4, treatments,
                                 'demo0_results')
@@ -54,9 +58,9 @@
     # ``nbmax`` is the number of locations to load
 
     targetset = parsefile(path.join(DEMODIR, 'demo', 'FR.txt'), indexes=[0, 1, (4, 5)],
-                          nbmax = 2000)
+                          nbmax=2000)
     alignset = parsefile(path.join(DEMODIR, 'demo', 'frenchbnf'),
-                         indexes = [0, 2, (14, 12)], nbmax=1000)
+                         indexes=[0, 2, (14, 12)], nbmax=1000)
 
 
     # Let's define the treatments to apply on the location's name
@@ -90,6 +94,26 @@
     #    otherwise
     print dmatrix
 
+
+#def parsefile(filepath, transforms):
+#    pass
+#
+#
+#parsefile('fr.txt', {0: int, 1: lambda x: x.decode('utf-8'), 14: float, 12:
+#                     float}, indexes=[0, 2, (14, 12)])
+#
+#
+#def make_index_transformer(indexes, transform_map):
+#    def xxx(row):
+#        data = [transform_map[i](row[i]) for i in indexes]
+#        return data
+#    return xxx
+#
+#
+#
+#parsefile('fr.txt', line_transformer=make_index_transformer)
+#
+#
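+#
+# Hypothetical usage sketch for the proposal above (assumes parsefile() grows
+# a `line_transformer` keyword, which it does not have today): build the
+# transformer once, then let parsefile() apply it to every raw row.
+#transformer = make_index_transformer([0, 2], {0: int,
+#                                              2: lambda x: x.decode('utf-8')})
+#parsefile('fr.txt', line_transformer=transformer)
+#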
 def demo_2():
     targetset = parsefile(path.join(DEMODIR, 'demo', 'FR.txt'), indexes=[0, 1, (4, 5)])
     alignset = parsefile(path.join(DEMODIR, 'demo', 'frenchbnf'), indexes=[0, 2, (14, 12)],
--- a/distances.py	Thu Nov 15 14:38:15 2012 +0100
+++ b/distances.py	Thu Nov 15 16:48:36 2012 +0100
@@ -71,6 +71,7 @@
     try:
         return abs(a - b)
     except TypeError:
+        #a and b may be strings
         return abs(float(a) - float(b))
 
 
@@ -234,7 +235,7 @@
 
 
 ### GEOGRAPHICAL DISTANCES ####################################################
-def geographical(pointa, pointb, in_radians=False, planetRadius=6371009,
+def geographical(pointa, pointb, in_radians=False, planet_radius=6371009,
                  units='m'):
     """ Return the geographical distance between two points.
 
@@ -260,4 +261,4 @@
         meanlat *= pi/180.0
 
     coef = 1. if units == 'm' else 0.001
-    return coef*planetRadius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
+    return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
--- a/matrix.py	Thu Nov 15 14:38:15 2012 +0100
+++ b/matrix.py	Thu Nov 15 16:48:36 2012 +0100
@@ -45,7 +45,10 @@
 
 def pdist(X, metric='euclidean', matrix_normalized=True, metric_params=None):
     """ Compute the upper triangular matrix in a way similar
-    to scipy.spatial.metric"""
+    to scipy.spatial.metric
+    XXX Comment on normalization
+
+    """
     metric = metric if not isinstance(metric, basestring) else METRICS.get(metric, ds.euclidean)
     values = []
     for i in xrange(len(X)):
@@ -60,6 +63,7 @@
 
 def cdist(X, Y, metric='euclidean', matrix_normalized=True, metric_params=None):
     """ Compute the metric matrix, given two inputs and a metric
+    XXX Comment on normalization
     """
     metric = metric if not isinstance(metric, basestring) else METRICS.get(metric, ds.euclidean)
     distmatrix = empty((len(X), len(Y)), dtype='float32')
@@ -122,6 +126,7 @@
 
        /!\ All the `input1` must have the same length across tuples, and
            likewise all the `input2`
+      XXX Write an assertion
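+      A hypothetical sketch of that assertion (assuming each item is a
+      ``(weight, input1, input2, ...)`` tuple, as used below):
+          assert len(set(len(item[1]) for item in items)) == 1
+          assert len(set(len(item[2]) for item in items)) == 1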
     """
     globalmatrix = items[0][0]*cdist(*items[0][1:])
     for item in items[1:]:
--- a/normalize.py	Thu Nov 15 14:38:15 2012 +0100
+++ b/normalize.py	Thu Nov 15 16:48:36 2012 +0100
@@ -137,12 +137,14 @@
     """ Generator of k-wordgrams on the given sentence
     """
     words = sentence.split(' ')
+    #XXX Call tokenizer
     for r in xrange(len(words)):
         yield ' '.join(words[r:r + k])
 
 def loadlemmas(filename):
     """ Return the default lemmas dictionnary
     """
+    #XXX Make a loop
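+    # A possible loop form (hypothetical sketch, behaviour-equivalent to the
+    # dict comprehension below; kept commented until the XXX is addressed):
+    #lemmas = {}
+    #for line in open(filename):
+    #    cols = line.decode('utf-8').strip().split('\t')
+    #    if len(cols) == 2:
+    #        lemmas[cols[0]] = cols[1]
+    #return lemmas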
     return dict([line.decode('utf-8').strip().split('\t') for line in open(filename)
                  if len(line.strip().split('\t'))==2])
 
--- a/test/test_alignment.py	Thu Nov 15 14:38:15 2012 +0100
+++ b/test/test_alignment.py	Thu Nov 15 16:48:36 2012 +0100
@@ -374,7 +374,6 @@
                      ['T2', 'labelt2', (5.3, 48.2)],
                      ['T3', 'labelt3', (6.25, 48.91)],
                      ]
-        neighbours = alig.findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
         treatments = {2: {'metric': 'geographical', 'matrix_normalized':False,
                           'metric_params': {'units': 'km', 'in_radians': False}}}
         global_mat, global_matched = alig.conquer_and_divide_alignment(alignset, targetset,