[dataio] Merge dataio and tests, related to #187461
author vincent.michel@logilab.fr
Thu, 19 Dec 2013 14:45:21 +0000
changeset 372 4ef3109eab7a
parent 371 b104bae5b9c2
child 373 77a3a4107f5c
[dataio] Merge dataio and tests, related to #187461
test/test_dataio.py
test/test_ner_dataio.py
utils/dataio.py
utils/ner_dataio.py
--- a/test/test_dataio.py	Thu Dec 19 14:45:14 2013 +0000
+++ b/test/test_dataio.py	Thu Dec 19 14:45:21 2013 +0000
@@ -22,8 +22,11 @@
 from os import path
 from tempfile import mkdtemp
 
-from nazca.utils.dataio import sparqlquery, parsefile, autocast, split_file
-
+from nazca.utils.dataio import (HTMLPrettyPrint, ValidXHTMLPrettyPrint,
+                                sparqlquery, rqlquery, parsefile,
+                                autocast, split_file)
+from nazca.named_entities import NerProcess
+from nazca.named_entities.sources import NerSourceLexicon
 
 TESTDIR = path.dirname(__file__)
 
@@ -39,7 +42,54 @@
             pass
 
 
+class ValidXHTMLPrettyPrintTest(unittest2.TestCase):
+    """ Tests of ValidXHTMLPrettyPrint.is_valid() and of the HTML rendering
+    of named entities by HTMLPrettyPrint """
+
+    def test_valid(self):
+        from lxml import etree
+        if etree.LXML_VERSION < (3, 2, 0):  # int tuple: no string ordering
+            # https://bugs.launchpad.net/lxml/+bug/673205
+            self.skipTest('Lxml version to old for ValidXHTMLPrettyPrint')
+        self.assertTrue(ValidXHTMLPrettyPrint().is_valid(u'<p>coucou</p>'))
+
+    def test_valid_unicode(self):
+        from lxml import etree
+        if etree.LXML_VERSION < (3, 2, 0):  # int tuple: no string ordering
+            # https://bugs.launchpad.net/lxml/+bug/673205
+            self.skipTest('Lxml version to old for ValidXHTMLPrettyPrint')
+        self.assertTrue(ValidXHTMLPrettyPrint().is_valid(u'<p>hé</p>'))
+
+    def test_invalid(self):
+        from lxml import etree
+        if etree.LXML_VERSION < (3, 2, 0):  # int tuple: no string ordering
+            # https://bugs.launchpad.net/lxml/+bug/673205
+            self.skipTest('Lxml version to old for ValidXHTMLPrettyPrint')
+        self.assertFalse(ValidXHTMLPrettyPrint().is_valid(u'<p><div>coucou</div></p>'))
+
+    def test_prettyprint(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                   'me': 'http://example.com/me'})
+        named_entities = NerProcess((source,)).process_text(text)
+        html = HTMLPrettyPrint().pprint_text(text, named_entities)
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
+                                u'this is   <a href="http://example.com/me">me</a> speaking. '
+                                u'And <a href="http://example.com/me">me</a>.'))
+
+    def test_prettyprint_class(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                   'me': 'http://example.com/me'})
+        named_entities = NerProcess((source,)).process_text(text)
+        html = HTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
+                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
+                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
+
+
 class DataIOTestCase(unittest2.TestCase):
+
     def test_parser(self):
         data = parsefile(path.join(TESTDIR, 'data', 'file2parse'),
                          [0, (2, 3), 4, 1], delimiter=',')
@@ -82,6 +132,16 @@
             with open(file2split) as fobj:
                 self.assertEqual(alllines, fobj.readlines())
 
+    def test_sparql_query(self):
+        query = u'''SELECT DISTINCT ?uri
+                                  WHERE{
+                                  ?uri rdfs:label "Python"@en .
+                                  ?uri rdf:type ?type}'''
+        expected = [['http://dbpedia.org/resource/Python'],
+                    ['http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'],
+                    ['http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ']]
+        self.assertEqual(sparqlquery(u'http://dbpedia.org/sparql', query), expected)
+
     def test_sparql_autocast(self):
         alignset = sparqlquery('http://dbpedia.inria.fr/sparql',
                                  'prefix db-owl: <http://dbpedia.org/ontology/>'
@@ -114,6 +174,11 @@
         self.assertEqual(len(alignset), 100)
         self.assertFalse(isinstance(alignset[0][2][0], float))
 
+    def test_rqlquery(self):
+        rql = 'Any U LIMIT 1 WHERE X cwuri U, X name "apycot"'
+        results = rqlquery('http://www.cubicweb.org', rql)
+        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
+
 
 if __name__ == '__main__':
     unittest2.main()
--- a/test/test_ner_dataio.py	Thu Dec 19 14:45:14 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nerdy import dataio, core
-
-
-class DataioTest(unittest2.TestCase):
-    """ Test of dataio """
-
-    def test_sparql_query(self):
-        results = dataio.sparql_query(query=u'''SELECT ?uri
-                                                WHERE{
-                                                ?uri rdfs:label "Python"@en .
-                                                ?uri rdf:type ?type}''',
-                                      endpoint=u'http://dbpedia.org/sparql')
-        truth = [{u'uri':
-                  {u'type': u'uri',
-                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
-                 {u'uri':
-                  {u'type': u'uri',
-                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
-        self.assertEqual(results, truth)
-
-    def test_rql_url_query(self):
-        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
-                                       'http://www.cubicweb.org')
-        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
-
-    def test_prettyprint(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
-        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
-                                u'this is   <a href="http://example.com/me">me</a> speaking. '
-                                u'And <a href="http://example.com/me">me</a>.'))
-
-    def test_prettyprint_class(self):
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        nerdy = core.NerdyProcess((source,))
-        named_entities = nerdy.process_text(text)
-        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
-        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
-                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
-                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
-
-
-class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
-
-    def test_valid(self):
-        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            '<p>coucou</p>'))
-
-    def test_valid_unicode(self):
-        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            u'<p>hé</p>'))
-
-    def test_invalid(self):
-        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
-            '<p><div>coucou</div></p>'))
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- a/utils/dataio.py	Thu Dec 19 14:45:14 2013 +0000
+++ b/utils/dataio.py	Thu Dec 19 14:45:21 2013 +0000
@@ -21,6 +21,8 @@
 import csv
 import urllib
 
+from lxml import etree
+
 try:
     from SPARQLWrapper import SPARQLWrapper, JSON
     SPARQL_ENABLED = True
@@ -50,20 +52,45 @@
 ###############################################################################
 ### RQL FUNCTIONS #############################################################
 ###############################################################################
-def rqlquery(host, rql, indexes=None, formatopt=None):
-    """ Run the rql query on the given cubicweb host
+def get_cw_cnx(endpoint):
+    """ Get a cnx on a CubicWeb database
     """
-
-    if host.endswith('/'):
-        host = host[:-1]
+    from cubicweb import dbapi
+    from cubicweb.cwconfig import CubicWebConfiguration
+    from cubicweb.entities import AnyEntity
+    CubicWebConfiguration.load_cwctl_plugins()
+    config = CubicWebConfiguration.config_for(endpoint)
+    sourceinfo = config.sources()['admin']
+    login = sourceinfo['login']
+    password = sourceinfo['password']
+    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
+    req = cnx.request()
+    return req
 
-    indexes = indexes or []
-    filehandle = urllib.urlopen('%(host)s/view?'
-                                'rql=%(rql)s&vid=csvexport'
-                                % {'rql': rql, 'host': host})
-    filehandle.readline()#Skip the first line
-    return parsefile(filehandle, delimiter=';', indexes=indexes,
-                     formatopt=formatopt);
+def rqlquery(host, rql, indexes=None, formatopt=None, _cache_cnx={}, **kwargs):
+    """ Run the rql query on the given cubicweb host.
+    If `host` is an http(s) url, the csvexport view is fetched and parsed
+    with parsefile(), using `indexes` and `formatopt`. Otherwise `host` is
+    taken as an appid and the result of execute() is returned; additional
+    keyword arguments are passed to execute() for substitution.
+    """
+    if host.startswith(('http://', 'https://')):
+        # By url
+        if host.endswith('/'):
+            host = host[:-1]
+        indexes = indexes or []
+        filehandle = urllib.urlopen('%(host)s/view?'
+                                    'rql=%(rql)s&vid=csvexport'
+                                    % {'rql': rql, 'host': host})
+        filehandle.readline()#Skip the first line
+        return parsefile(filehandle, delimiter=';', indexes=indexes,
+                         formatopt=formatopt)
+    # By appid: _cache_cnx is a deliberately shared default argument that
+    # keeps the object returned by get_cw_cnx() for each appid across calls
+    if host not in _cache_cnx:
+        _cache_cnx[host] = get_cw_cnx(host)
+    # The query is the `rql` argument (the former `query` name was undefined)
+    return _cache_cnx[host].execute(rql, kwargs)
 
 
 ###############################################################################
@@ -222,3 +249,73 @@
         outfile.close()
         count += 1
     return map(str, xrange(count))
+
+
+###############################################################################
+### OUTPUT UTILITIES ##########################################################
+###############################################################################
+class AbstractPrettyPrint(object):
+    """ Base class turning a text and its named entities into markup
+    """
+
+    def pprint_text(self, text, named_entities, **kwargs):
+        # Index the (uri, token) pairs by the offset where the token starts
+        by_start = dict((tok.start, (uri, tok)) for uri, _, tok in named_entities)
+        output, pos = u'', 0
+        while pos < len(text):
+            if pos not in by_start:
+                # No token starts here: copy the character and move on
+                output += text[pos]
+                pos += 1
+                continue
+            uri, tok = by_start[pos]
+            raw = text[tok.start:tok.end]
+            markup = self.pprint_entity(uri, raw, **kwargs)
+            # Keep the raw words when the enriched text would not validate
+            valid = self.is_valid(output + markup + text[tok.end:])
+            output += markup if valid else raw
+            pos = tok.end
+        return output
+
+    def pprint_entity(self, uri, word, **kwargs):
+        """ Render one entity; must be provided by subclasses """
+        raise NotImplementedError
+
+    def is_valid(self, newtext):
+        """ Hook called at each enrichment step; accepts everything here """
+        return True
+
+
+class HTMLPrettyPrint(AbstractPrettyPrint):
+    """ Pretty printer rendering each named entity as an HTML link """
+
+    def pprint_entity(self, uri, word, **kwargs):
+        """ Return `word` wrapped in a link to `uri` (optional `html_class`) """
+        has_class = 'html_class' in kwargs
+        attr = ' class="%s"' % kwargs['html_class'] if has_class else ''
+        return u'<a href="%s"%s>%s</a>' % (uri, attr, word)
+
+
+class ValidXHTMLPrettyPrint(HTMLPrettyPrint):
+    """ HTML pretty printer whose is_valid() runs an XHTML DTD validation """
+
+    XHTML_DOC_TEMPLATE = '''\
+<?xml version="1.0" encoding="UTF-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
+<title>ner</title>
+</head>
+<body><div>%s</div></body>
+</html>'''
+
+    def is_valid(self, html):
+        """ Check that `html` validates once embedded in the template """
+        document = self.XHTML_DOC_TEMPLATE % html.encode('utf-8')
+        validating_parser = etree.XMLParser(dtd_validation=True)
+        try:
+            etree.fromstring(document, parser=validating_parser)
+        except etree.XMLSyntaxError:
+            return False
+        return True
--- a/utils/ner_dataio.py	Thu Dec 19 14:45:14 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,140 +0,0 @@
-# -*- coding: utf-8 -*-
-""" IO for Named Entities Recognition.
-"""
-import json
-import urllib
-import lxml.etree as ET
-
-
-###############################################################################
-### SPARQL UTILITIES ##########################################################
-###############################################################################
-def sparql_query(query, endpoint):
-    """ Execute a query on an endpoint:
-
-    sparql_query(query=u'''SELECT ?uri ?type
-                           WHERE{
-                           ?uri rdfs:label "Python"@en .
-                           ?uri rdf:type ?type}''',
-                           endpoint=u'http://dbpedia.org/sparql')
-    """
-    from SPARQLWrapper import SPARQLWrapper, JSON
-    sparql = SPARQLWrapper(endpoint)
-    sparql.setQuery(query)
-    sparql.setReturnFormat(JSON)
-    try:
-        rawresults = sparql.query().convert()
-        labels = rawresults['head']['vars']
-        return rawresults["results"]["bindings"]
-    except:
-        print 'Error in sparql query'
-        return []
-
-
-###############################################################################
-### RQL UTILITIES #############################################################
-###############################################################################
-def get_cw_cnx(endpoint):
-    """ Get a cnx on a CubicWeb database
-    """
-    from cubicweb import dbapi
-    from cubicweb.cwconfig import CubicWebConfiguration
-    from cubicweb.entities import AnyEntity
-    CubicWebConfiguration.load_cwctl_plugins()
-    config = CubicWebConfiguration.config_for(endpoint)
-    sourceinfo = config.sources()['admin']
-    login = sourceinfo['login']
-    password = sourceinfo['password']
-    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
-    req = cnx.request()
-    return req
-
-def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
-    """ Execute a query on an appid endpoint:
-
-    rql_query('Any X WHERE X label "Python"', 'localhost')
-
-    Additional arguments can be passed to be properly substitued
-    in the execute() function.
-    """
-    if endpoint in _cache_cnx:
-        cnx = _cache_cnx[endpoint]
-    else:
-        cnx = get_cw_cnx(endpoint)
-        _cache_cnx[endpoint] = cnx
-    return cnx.execute(query, kwargs)
-
-def rql_url_query(query, endpoint):
-    """ Execute a query on an url endpoint:
-
-    rql_query('Any X WHERE X label "Python"', 'localhost')
-    """
-    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
-    return json.loads(urllib.urlopen(url).read())
-
-
-###############################################################################
-### OUTPUT UTILITIES ##########################################################
-###############################################################################
-class AbstractNerdyPrettyPrint(object):
-    """ Pretty print the output of a Nerdy process
-    """
-
-    def pprint_text(self, text, named_entities, **kwargs):
-        newtext = u''
-        indice = 0
-        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
-        while indice < len(text):
-            if indice in tindices:
-                uri, t = tindices[indice]
-                words = text[t.start:t.end]
-                fragment = self.pprint_entity(uri, words, **kwargs)
-                if not self.is_valid(newtext+fragment+text[t.end:]):
-                    fragment = words
-                newtext += fragment
-                indice = t.end
-            else:
-                newtext += text[indice]
-                indice += 1
-        return newtext
-
-    def pprint_entity(self, uri, word, **kwargs):
-        """ Pretty print an entity """
-        raise NotImplementedError
-
-    def is_valid(self, newtext):
-        """Override to check the validity of the prettified content at each
-        enrichement step"""
-        return True
-
-
-class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
-    """ Pretty print the output of a Nerdy process
-    """
-
-    def pprint_entity(self, uri, word, **kwargs):
-        """ Pretty print an entity """
-        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
-        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
-
-
-class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
-
-    XHTML_DOC_TEMPLATE = '''\
-<?xml version="1.0" encoding="UTF-8" ?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
-<title>nerdy</title>
-</head>
-<body><div>%s</div></body>
-</html>'''
-
-    def is_valid(self, html):
-        try:
-            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
-                          parser=ET.XMLParser(dtd_validation=True))
-        except ET.XMLSyntaxError:
-            return False
-        return True