conduit r1583 - in trunk: . conduit/modules/FeedModule

Author: jstowers
Date: Wed Jul 23 05:24:09 2008
New Revision: 1583
URL: http://svn.gnome.org/viewvc/conduit?rev=1583&view=rev

Log:
	* NEWS:
	* conduit/modules/FeedModule/FeedModule.py: Use feedparser to
	parse RSS/Atom feed enclosures and media content. Fixes #543685
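
For reference, a minimal sketch of the feedparser usage this change builds
on (the feed URL is a placeholder; 'enclosures' and 'media_content' are the
entry keys feedparser exposes for podcast attachments and Media RSS
content):

    import feedparser

    # Parse a feed; feedparser handles both RSS and Atom.
    d = feedparser.parse("http://example.com/feed.rss")  # placeholder URL

    # url : (title, mimetype), mirroring RSSSource.refresh() below
    files = {}
    for entry in d.entries:
        title = entry.get('title', '')
        # podcast-style attachments are exposed as 'enclosures'
        for enclosure in entry.get('enclosures', ()):
            files[enclosure['href']] = (title, enclosure['type'])
        # Media RSS content (e.g. Flickr) appears as 'media_content'
        for media in entry.get('media_content', ()):
            files[media['url']] = (title, media['type'])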


Modified:
   trunk/   (props changed)
   trunk/ChangeLog
   trunk/NEWS
   trunk/conduit/modules/FeedModule/FeedModule.py

Modified: trunk/NEWS
==============================================================================
--- trunk/NEWS	(original)
+++ trunk/NEWS	Wed Jul 23 05:24:09 2008
@@ -1,6 +1,7 @@
 NEW in 0.3.13:
 ==============
-* 
+* Much improved RSS feed enclosure support, thanks to the use of
+  the python-feedparser module.
 
 NEW in 0.3.12:
 ==============

Modified: trunk/conduit/modules/FeedModule/FeedModule.py
==============================================================================
--- trunk/conduit/modules/FeedModule/FeedModule.py	(original)
+++ trunk/conduit/modules/FeedModule/FeedModule.py	Wed Jul 23 05:24:09 2008
@@ -1,14 +1,3 @@
-try:
-    from elementtree import ElementTree
-except:
-    from xml.etree import ElementTree
-
-import traceback
-import urllib2
-import os
-from os.path import abspath, expanduser
-import sys
-from gettext import gettext as _
 import logging
 log = logging.getLogger("modules.Feed")
 
@@ -21,9 +10,47 @@
 import conduit.datatypes.Video as Video
 import conduit.datatypes.Photo as Photo
 
-MODULES = {
-    "RSSSource" : { "type": "dataprovider" }    
-}
+from gettext import gettext as _
+
+try:
+    import feedparser
+    MODULES = {
+        "RSSSource" : { "type": "dataprovider" }    
+    }
+    log.info("Module Information: %s" % Utils.get_module_information(feedparser, "__version__"))
+
+    #work around a bug in feedparser where it incorrectly detects
+    #media enclosures
+    #http://code.google.com/p/feedparser/issues/detail?id=100
+    if feedparser.__version__ <= '4.1':
+        log.info("Patching feedparser issue #100")
+        def _start_media_content(self, attrsD):
+            context = self._getContext()
+            context.setdefault('media_content', [])
+            context['media_content'].append(attrsD)
+
+
+        def _start_media_thumbnail(self, attrsD):
+            context = self._getContext()
+            context.setdefault('media_thumbnail', [])
+            self.push('url', 1) # new
+            context['media_thumbnail'].append(attrsD)
+
+
+        def _end_media_thumbnail(self):
+            url = self.pop('url')
+            context = self._getContext()
+            if url != None and len(url.strip()) != 0:
+                if not context['media_thumbnail'][-1].has_key('url'):
+                    context['media_thumbnail'][-1]['url'] = url
+
+        feedparser._FeedParserMixin._start_media_content = _start_media_content
+        feedparser._FeedParserMixin._start_media_thumbnail = _start_media_thumbnail
+        feedparser._FeedParserMixin._end_media_thumbnail = _end_media_thumbnail
+
+except ImportError:
+    MODULES = {}
+    log.info("RSS Feed support disabled")
 
 class RSSSource(DataProvider.DataSource):
 
@@ -55,6 +82,14 @@
             ok = Video.mimetype_is_video(mimetype)
         return ok
 
+    def _add_file(self, url, title, t):
+        log.debug("Got enclosure %s %s (%s)" % (title,url,t))
+        if self._is_allowed_type(t):
+            if len(self.files) < self.limit or self.limit == 0:
+                self.files[url] = (title,t)
+        else:
+            log.debug("Enclosure %s is an illegal type (%s)" % (title,t))
+
     def initialize(self):
         return True
 
@@ -100,38 +135,16 @@
 
     def refresh(self):
         DataProvider.DataSource.refresh(self)
+        #url : (title, mimetype)
         self.files = {}
-        try:
-            url_info = urllib2.urlopen(self.feedUrl)
-            if (url_info):
-                doc = ElementTree.parse(url_info).getroot()
-                #FIXME: My XML element tree foo is not that good. It seems to reprocess
-                #each item tag for each namespace???. This means i get n+1 copies
-                #of each enclosure. 1 from the bland doc, and one from each other namespace.
-                allreadyInserted = []
-                for item in doc.getiterator("item"):
-                    url = None
-                    t = None
-                    title = None
-                    for c in item.getchildren():
-                        if c.tag == "enclosure":
-                            url = c.get("url")
-                            t = c.get("type")
-                        if c.tag == "title":
-                            title = c.text
-                            
-                        #Check if we have all the info
-                        if url and t and title:
-                            log.debug("Got enclosure %s %s (%s)" % (title,url,t))
-                            if self._is_allowed_type(t):
-                                if ((url not in allreadyInserted) and ((len(allreadyInserted) < self.limit) or (self.limit == 0))):
-                                    allreadyInserted.append(url)
-                                    self.files[url] = (title,t)
-                            else:
-                                log.debug("Enclosure %s is an illegal type (%s)" % (title,t))
-        except:
-            log.info("Error getting/parsing feed \n%s" % traceback.format_exc())
-            raise Exceptions.RefreshError
+        d = feedparser.parse(self.feedUrl)
+        for entry in d.entries:
+            #check for enclosures first (i.e. podcasts)
+            for enclosure in entry.get('enclosures', ()):
+                self._add_file(enclosure['href'], entry.title, enclosure['type'])
+            #also check for media_content (like flickr)
+            for media in entry.get('media_content', ()):
+                self._add_file(media['url'], entry.title, media['type'])
 
     def get_all(self):
         DataProvider.DataSource.get_all(self)                            

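A note on the feedparser workaround above: releases up to 4.1 mishandle
<media:content> and <media:thumbnail> elements (feedparser issue #100,
linked in the comment), so the commit replaces the affected
_FeedParserMixin handlers in place. The same monkey-patching technique
works from any client code, provided it runs before feedparser.parse() is
first called; a minimal sketch, assuming a feedparser <= 4.1 install:

    import feedparser

    # Replacement handler: keep the attributes of each <media:content>
    # element instead of discarding them (feedparser issue #100).
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    if feedparser.__version__ <= '4.1':
        feedparser._FeedParserMixin._start_media_content = _start_media_content

Because Python resolves methods at call time, assigning a plain function to
the class is enough: every parser instance created afterwards uses the
patched handler.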