[demo] Add some prints to show the alignment progress
author Simon Chabot <simon.chabot@logilab.fr>
Mon, 19 Nov 2012 11:30:02 +0100
changeset 146 48013ba27844
parent 145 3ab512c6bc1a
child 147 3c59f5cb4559
[demo] Add some prints to show the alignment progress
demo.py
--- a/demo.py	Mon Nov 19 10:43:11 2012 +0100
+++ b/demo.py	Mon Nov 19 11:30:02 2012 +0100
@@ -3,10 +3,12 @@
 
 from os import path
 
+import urllib
+
 #XXX aln, ald
 import alignment.distances as d
 import alignment.normalize as n
-from alignment.aligner import align, subalign, findneighbours
+from alignment.aligner import align, subalign, findneighbours, alignall
 from alignment.dataio import parsefile, sparqlquery, write_results
 
 DEMODIR = path.dirname(__file__)
@@ -14,6 +16,21 @@
 def dpath(filename):
     return path.join(DEMODIR, 'demo', filename)
 
+def remove_after(string, sub):
+    try:
+        return string[:string.lower().index(sub)].strip()
+    except ValueError:
+        return string
+
+def parserql(host, rql):
+    filehandle = urllib.urlopen('%(host)sview?'
+                                'rql=%(rql)s&vid=csvexport'
+                                % {'rql': rql, 'host': host})
+    filehandle.readline()
+    rset = [[e.decode('utf-8') for e in line.strip().split(';')]
+            for line in filehandle]
+    return rset
+
 def demo_0():
     # prixgoncourt is the list of Goncourt Prize, extracted
     # from wikipedia
@@ -35,12 +52,8 @@
     targetset = sparqlquery('http://dbpedia.org/sparql', query)
     alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])
 
-    def removeparenthesis(string):
-        if '(' in string:
-            return string[:string.index('(')]
-        return string
-
-    tr_name = {'normalization': [removeparenthesis, n.simplify],
+    tr_name = {'normalization': [lambda x:remove_after(x, '('),
+                                 n.simplify],
                'metric': d.levenshtein
               }
 
@@ -148,6 +161,32 @@
                                 treatments)
         write_results(matched, alignset, targetset, dpath('demo2_results'))
 
+def demo_3():
+    print "Parsing files"
+    alignset = parserql(host='http://demo.cubicweb.org/elections/',
+                        rql='Any E, N WHERE X is Commune, X eid E, X label N')
+    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1])
+    print '%s×%s' % (len(alignset), len(targetset))
+
+    tr_name = {'normalization': [n.simplify],
+               'metric': 'levenshtein'
+              }
+
+    print "Alignment started"
+    #XXX alignall rewrite the data (see normalize_set())
+    results = alignall(alignset, targetset, 0.75, treatments={1: tr_name},
+                       indexes=(1,1), mode='minhashing', kwordsgram=1, siglen=200,
+                       uniq=True)
+    dicresults = dict([(a, b) for (a, b) in results])
+
+    print "Done, writing output"
+
+    with open(dpath('demo3_res'), 'w') as fout:
+        for line in alignset:
+            sent = u'http://demo.cubicweb.org/elections/commune/%s;'\
+                   u'http://www.geonames.org/%s\n' \
+                   % (line[0], dicresults.get(line[0], 'not_found'))
+            fout.write(sent.encode('utf-8'))
 if __name__ == '__main__':
     import sys
     from time import time
@@ -167,5 +206,9 @@
         ## Same as demo_1, but in a more efficient way, using a KDTree
         demo_2()
 
+    if runall or '3' in sys.argv:
+        print "Running demo_3"
+        demo_3()
+
     print "Demo terminated"
     print "Took %d min" % ((time() - t)/60)