[dataio] Create a helper that executes a sparql query and returns a clean json dict, see #198745
author Vincent Michel <vincent.michel@logilab.fr>
Wed, 08 Jan 2014 16:52:18 +0000
changeset 380 041c88ad2c3b
parent 377 0b09ea26fff9
child 381 f6b7eff50f7f
[dataio] Create a helper that executes a sparql query and returns a clean json dict, see #198745
test/test_dataio.py
utils/dataio.py
--- a/test/test_dataio.py	Thu Dec 19 14:46:08 2013 +0000
+++ b/test/test_dataio.py	Wed Jan 08 16:52:18 2014 +0000
@@ -23,7 +23,8 @@
 from tempfile import mkdtemp
 
 from nazca.utils.dataio import (HTMLPrettyPrint, ValidXHTMLPrettyPrint,
-                                sparqlquery, rqlquery, parsefile,
+                                sparqlquery, sparqljson, _sparqlexecute,
+                                rqlquery, parsefile,
                                 autocast, split_file)
 from nazca.ner import NerProcess
 from nazca.ner.sources import NerSourceLexicon
@@ -142,6 +143,50 @@
                                    ['http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'],
                                    ['http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ']])
 
+    def test_sparql_execute(self):
+        """ Check the raw json structure returned for a valid query """
+        query = u'''SELECT DISTINCT ?uri
+                                    WHERE{
+                                    ?uri rdfs:label "Python"@en .
+                                    ?uri rdf:type ?type}'''
+        uris = (u'http://dbpedia.org/resource/Python',
+                u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+                u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ')
+        # One binding per expected uri, in the order sent back by the endpoint
+        bindings = [{u'uri': {u'type': u'uri', u'value': uri}} for uri in uris]
+        expected = {u'head': {u'link': [], u'vars': [u'uri']},
+                    u'results': {u'distinct': False,
+                                 u'bindings': bindings,
+                                 u'ordered': True}}
+        self.assertEqual(_sparqlexecute(u'http://dbpedia.org/sparql', query), expected)
+
+    def test_sparql_execute_no_raise_on_error(self):
+        """ A faulty query gives an empty list when errors are not raised """
+        faulty_query = u'''SELECT DISTINCT ?uri
+                                    WHERE{
+                                    ?uri faultyrdf
+                                    ?uri rdf:type ?type}'''
+        outcome = _sparqlexecute(u'http://dbpedia.org/sparql', faulty_query)
+        self.assertEqual(outcome, [])
+
+    def test_sparql_execute_raise_on_error(self):
+        """ A faulty query raises a RuntimeError when raise_on_error is set """
+        with self.assertRaises(RuntimeError):
+            # The call is expected to raise, so its result is not bound
+            _sparqlexecute(u'http://dbpedia.org/sparql',
+                           u'''SELECT DISTINCT ?uri
+                                        WHERE{
+                                        ?uri faultyrdf
+                                        ?uri rdf:type ?type}''',
+                           raise_on_error=True)
+
+    def test_sparql_json(self):
+        """ Uris bound to a single variable are gathered in a set """
+        query = u'''SELECT DISTINCT ?uri
+                             WHERE{
+                             ?uri rdfs:label "Python"@en .
+                             ?uri rdf:type ?type}'''
+        expected_uris = set([
+            u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ',
+            u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+            u'http://dbpedia.org/resource/Python'])
+        results = sparqljson(u'http://dbpedia.org/sparql', query)
+        self.assertEqual(results, {u'uri': expected_uris})
+
+    def test_sparql_json2(self):
+        """ Uris are gathered in a set, the label is reduced to one value """
+        query = u'''SELECT DISTINCT ?uri ?label
+                             WHERE{
+                             ?uri rdfs:label "Python"@en .
+                             ?uri rdfs:label ?label .
+                             ?uri rdf:type ?type}'''
+        expected_uris = set([
+            u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ',
+            u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+            u'http://dbpedia.org/resource/Python'])
+        results = sparqljson(u'http://dbpedia.org/sparql', query)
+        self.assertEqual(results, {u'uri': expected_uris, u'label': u'Python'})
+
     def test_sparql_autocast(self):
         alignset = sparqlquery('http://dbpedia.inria.fr/sparql',
                                  'prefix db-owl: <http://dbpedia.org/ontology/>'
--- a/utils/dataio.py	Thu Dec 19 14:46:08 2013 +0000
+++ b/utils/dataio.py	Wed Jan 08 16:52:18 2014 +0000
@@ -18,6 +18,7 @@
 from os.path import exists as fileexists
 from os import path as osp
 
+import json
 import csv
 import urllib
 
@@ -96,17 +97,33 @@
 ###############################################################################
 ### SPARQL FUNCTIONS ##########################################################
 ###############################################################################
-def sparqlquery(endpoint, query, indexes=None, autocaste_data=True):
-    """ Run the sparql query on the given endpoint, and wrap the items in the
-    indexes form. If indexes is empty, keep raw output"""
-
+def _sparqlexecute(endpoint, query, raise_on_error=False):
+    """ Execute a sparql query and return the raw results.
+
+    The query is sent to the given endpoint, asking for a json answer, and
+    the converted answer is returned.
+    If the query or its decoding fails, a RuntimeError is raised when
+    raise_on_error is True, otherwise an empty list is returned.
+    An ImportError is raised when SPARQL_ENABLED is false.
+    """
     if not SPARQL_ENABLED:
         raise ImportError("You have to install SPARQLWrapper and JSON modules to"
                           "used this function")
     sparql = SPARQLWrapper(endpoint)
     sparql.setQuery(query)
     sparql.setReturnFormat(JSON)
-    rawresults = sparql.query().convert()
+    try:
+        try:
+            return sparql.query().convert()
+        except ValueError:
+            # Bad json: run the query again and decode the escaped raw body
+            # NOTE(review): relies on a module-level "import codecs" that is
+            # not visible in this hunk - confirm it exists
+            rawresults = sparql.query()
+            return json.loads(codecs.escape_decode(rawresults.response.read())[0])
+    except Exception:
+        # Not a bare except, so that KeyboardInterrupt and SystemExit are
+        # neither swallowed nor reported as a query error
+        if raise_on_error:
+            raise RuntimeError('Error in sparql query')
+        return []
+
+def sparqlquery(endpoint, query, indexes=None, autocaste_data=True, raise_on_error=False):
+    """ Run the sparql query on the given endpoint, and wrap the items in the
+    indexes form. If indexes is empty, keep raw output"""
+    rawresults = _sparqlexecute(endpoint, query, raise_on_error)
     labels = rawresults['head']['vars']
     results = []
     indexes = indexes or []
@@ -127,6 +144,29 @@
         results.append(data)
     return results
 
+def sparqljson(endpoint, query, lang_order=('fr', 'en'), raise_on_error=False):
+    """ Execute and format the results of a sparql query.
+
+    Return a dict mapping each variable name found in the bindings to:
+      - a set of values, for uris,
+      - a single value (the last one seen), for typed literals,
+      - the value whose language comes first in lang_order, for the other
+        literals (languages that are not in lang_order are ranked last).
+    An empty dict is returned if the query failed and raise_on_error is False.
+    """
+    rawresults = _sparqlexecute(endpoint, query, raise_on_error)
+    if not rawresults:
+        # _sparqlexecute returns [] on error: there are no bindings to read,
+        # and indexing the list with a string would raise a TypeError
+        return {}
+    data_lang = {}
+    data = {}
+    for row in rawresults["results"]["bindings"]:
+        for k, v in row.iteritems():
+            if v['type'] == 'uri':
+                # Uri, keep it in a set
+                data.setdefault(k, set()).add(v['value'])
+            elif v['type'] == 'typed-literal':
+                # E.g. latitude, longitude, geometry - Keep one value
+                data[k] = v['value']
+            else:
+                # Literal - Use lang
+                data_lang.setdefault(k, []).append((v['value'], v.get('xml:lang')))
+
+    def keyfunc(x):
+        # Rank of a (value, lang) pair: position of its lang in lang_order
+        return lang_order.index(x[1]) if x[1] in lang_order else len(lang_order)
+
+    for k, v in data_lang.iteritems():
+        # min keeps the first best-ranked pair, as a stable sort would
+        data[k] = min(v, key=keyfunc)[0]
+    return data
+
 
 ###############################################################################
 ### FILE FUNCTIONS ############################################################