Add a XHTML valid pretty printer
author Denis Laxalde <denis.laxalde@logilab.fr>
Wed, 26 Jun 2013 15:27:20 +0200
changeset 335 601731a76796
parent 334 3a319fcad5cb
child 336 45c21dd506eb
Add a XHTML valid pretty printer. It wraps the text with a link only if the resulting HTML fragment is valid. Closes #149377.
dataio.py
test/test_dataio.py
--- a/dataio.py	Wed Jun 12 11:39:19 2013 +0200
+++ b/dataio.py	Wed Jun 26 15:27:20 2013 +0200
@@ -3,6 +3,7 @@
 """
 import json
 import urllib
+import lxml.etree as ET
 
 
 ###############################################################################
@@ -86,7 +87,11 @@
         while indice < len(text):
             if indice in tindices:
                 uri, t = tindices[indice]
-                newtext += self.pprint_entity(uri, text[t.start:t.end])
+                words = text[t.start:t.end]
+                fragment = self.pprint_entity(uri, words)
+                if not self.is_valid(newtext+fragment+text[t.end:]):
+                    fragment = words
+                newtext += fragment
                 indice = t.end
             else:
                 newtext += text[indice]
@@ -97,6 +102,11 @@
         """ Pretty print an entity """
         raise NotImplementedError
 
+    def is_valid(self, newtext):
+        """Override to check the validity of the prettified content at each
+        enrichement step"""
+        return True
+
 
 class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
     """ Pretty print the output of a Nerdy process
@@ -107,4 +117,23 @@
         return u'<a href="%s">%s</a>' % (uri, word)
 
 
+class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
 
+    XHTML_DOC_TEMPLATE = '''\
+<?xml version="1.0" encoding="UTF-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
+<title>nerdy</title>
+</head>
+<body><div>%s</div></body>
+</html>'''
+
+    def is_valid(self, html):
+        try:
+            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
+                          parser=ET.XMLParser(dtd_validation=True))
+        except ET.XMLSyntaxError:
+            return False
+        return True
--- a/test/test_dataio.py	Wed Jun 12 11:39:19 2013 +0200
+++ b/test/test_dataio.py	Wed Jun 26 15:27:20 2013 +0200
@@ -42,7 +42,7 @@
                                        'http://www.cubicweb.org')
         self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
 
-    def test_prerryprint(self):
+    def test_prettyprint(self):
         text = 'Hello everyone, this is   me speaking. And me.'
         source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
                                           'me': 'http://example.com/me'})
@@ -53,6 +53,19 @@
                                 u'this is   <a href="http://example.com/me">me</a> speaking. '
                                 u'And <a href="http://example.com/me">me</a>.'))
 
+class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
+
+    def test_valid(self):
+        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            '<p>coucou</p>'))
+
+    def test_valid_unicode(self):
+        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            u'<p>hé</p>'))
+
+    def test_invalid(self):
+        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
+            '<p><div>coucou</div></p>'))
 
 
 if __name__ == '__main__':