add sioc parser to enable import/export of blog posts via sioc
author Nicolas Chauvat <nicolas.chauvat@logilab.fr>
Sun, 25 Jul 2010 17:03:20 +0200
changeset 215 4c9a9b321087
parent 214 4acad691ab73
child 216 e8340fe485c9
add sioc parser to enable import/export of blog posts via sioc
entities.py
site_cubicweb.py
sobjects.py
update-feeds.py
--- a/entities.py	Tue Jul 20 18:05:22 2010 +0200
+++ b/entities.py	Sun Jul 25 17:03:20 2010 +0200
@@ -27,7 +27,7 @@
                                   vtitle=self.entity.dc_title())
 
 class BlogISiocContainerAdapter(EntityAdapter):
-    __regid__ = 'ISiocContainer'
+    __regid__ = 'ISIOCContainer'
     __select__ = EntityAdapter.__select__ & is_instance('Blog')
 
     def isioc_type(self):
@@ -84,7 +84,7 @@
 
 
 class BlogEntryISiocItemAdapter(EntityAdapter):
-    __regid__ = 'ISiocItem'
+    __regid__ = 'ISIOCItem'
     __select__ = EntityAdapter.__select__ & is_instance('BlogEntry')
 
     def isioc_content(self):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/site_cubicweb.py	Sun Jul 25 17:03:20 2010 +0200
@@ -0,0 +1,6 @@
+# XML <-> yams equivalence
+from cubicweb.xy import xy
+xy.add_equivalence('Blog', 'sioc:Weblog')
+xy.add_equivalence('BlogEntry', 'sioc:BlogPost')
+xy.add_equivalence('BlogEntry title', 'dcterms:title')
+xy.add_equivalence('BlogEntry content', 'sioc:content')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sobjects.py	Sun Jul 25 17:03:20 2010 +0200
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+
+import sys
+from datetime import datetime
+from lxml import etree
+import feedparser
+import rdflib
+
+from cubes.datafeed.sobjects import DataFeedParser
+
+SIOC = 'http://rdfs.org/sioc/ns#'
+RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+DCTERMS = 'http://purl.org/dc/terms/'
+
+def get_subject(g, pred, obj):
+    subjects = list(g.subjects(pred, obj))
+    assert len(subjects) == 1
+    return subjects[0]
+
+def get_object(g, subj, pred):
+    objects = list(g.objects(subj, pred))
+    assert len(objects) == 1
+    return objects[0]
+
+def parse_blogpost_sioc(url):
+    g = rdflib.ConjunctiveGraph()
+    g.parse(url)
+    rdf_type = rdflib.URIRef(RDF+'type')
+    sioc_blogpost = rdflib.URIRef(SIOC+'BlogPost')
+    dcterms_title = rdflib.URIRef(DCTERMS+'title')
+    sioc_content = rdflib.URIRef(SIOC+'content')
+    for post, type_, blogpost_ in g.triples((None, rdf_type, sioc_blogpost)):
+        item = {'uri': unicode(post)}
+        item['title'] = unicode(get_object(g, post, dcterms_title))
+        item['content'] = unicode(get_object(g, post, sioc_content))
+        yield item
+
+def parse_blogpost_rss(url):
+    feed = feedparser.parse(url)
+    for entry in feed.entries:
+        item = {}
+        item['uri'] = entry.id
+        item['title'] = entry.title
+        item['content'] = entry.description
+        item['creation_date'] = datetime(*entry.date_parsed[:6])
+        yield item
+
+class BlogPostParser(DataFeedParser):
+    __abstract__ = True
+
+    def process(self, url):
+        for item in self.parse(url):
+            euri = self.sget_externaluri(item.pop('uri'))
+            if euri.same_as:
+                sys.stdout.write('.')
+                self.update_blogpost(euri.same_as[0], item)
+            else:
+                sys.stdout.write('+')
+                self.create_blogpost(item, euri)
+            sys.stdout.flush()
+
+    def create_blogpost(self, item, uri):
+        entity = self._cw.create_entity('BlogEntry', **item)
+        entity.set_relations(same_as=uri)
+        return self.update_blogpost(entity, None)
+
+    def update_blogpost(self, entity, item):
+        if item:
+            entity.set_attributes(**item)
+        return entity
+
+class BlogPostSiocParser(BlogPostParser):
+    __regid__ = 'blogpost-sioc'
+    parse = staticmethod(parse_blogpost_sioc)
+
+class BlogPostRSSParser(BlogPostParser):
+    __regid__ = 'blogpost-rss'
+    parse = staticmethod(parse_blogpost_rss)
+
+if __name__ == '__main__':
+    import sys
+    from pprint import pprint
+
+    name = sys.argv[1]
+    url = sys.argv[2]
+
+    parser = globals()[name]
+    pprint(list(parser(url)))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/update-feeds.py	Sun Jul 25 17:03:20 2010 +0200
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+import sys
+
+feeds = rql('Any A WHERE A is DataFeed').entities()
+for feed in feeds:
+    if 'reset' in sys.argv:
+        feed.set_attributes(latest_retrieval=None)
+    else:
+        print '----- processing %r with %s' % (feed.title, feed.parser)
+        feed.pull_data()
+        print
+commit()
+