r7465 - dumbhippo/trunk/firehose/firehose/jobs



Author: walters
Date: 2008-05-01 16:04:07 -0500 (Thu, 01 May 2008)
New Revision: 7465

Modified:
   dumbhippo/trunk/firehose/firehose/jobs/poller.py
Log:
Add some more feed processing


Modified: dumbhippo/trunk/firehose/firehose/jobs/poller.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-01 20:10:26 UTC (rev 7464)
+++ dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-01 21:04:07 UTC (rev 7465)
@@ -68,6 +68,20 @@
             parent.remove(node)
         return lxml.etree.tostring(tree, pretty_print=True)
     
+class RegexpEater(FeedPostProcessor):
+    """Drop every line of the feed text in which any of the given regexps matches."""
+    def __init__(self, regexps):
+        self.__regexps = [re.compile(regexp) for regexp in regexps]
+
+    def get_value(self):
+        # Return the filtered text itself (a string), not the StringIO wrapper.
+        outvalue = StringIO()
+        for line in StringIO(super(RegexpEater, self).get_value()):
+            # Keep a line only if no regexp matches it, and write it exactly once.
+            if not any(regexp.search(line) for regexp in self.__regexps):
+                outvalue.write(line)
+        return outvalue.getvalue()
+    
 class ChainedProcessors(object):
     def __init__(self, processors):
         super(ChainedProcessors, self).__init__()
@@ -85,9 +99,14 @@
             buf = processor.get_value()
         return buf
     
+# Shared XmlElementEater for RSS feeds, targeting the channel's lastBuildDate and pubDate
+rss_eater = XmlElementEater(['/rss/channel/lastBuildDate', '/rss/channel/pubDate'])
 # This maps from a regular expression matching a URL to a list of processors
 feed_transformations = [
-  (r'digg.com/users/.*/history/diggs.rss', [XmlElementEater(['/rss/channel/lastBuildDate', '/rss/channel/pubDate'])]),
+  (r'digg.com/users/.*/history/diggs.rss', [rss_eater]),
+  (r'picasaweb.google.com.*feed.*base.*album', [rss_eater]),
+  (r'google.com/reader/public', [XmlElementEater(['/feed/updated'])]),
+  (r'blogs.gnome.org', [RegexpEater(['<!--.*page served in.*seconds.*-->'])]),
 ]
 feed_transformations = [(re.compile(r'^https?://([A-Z0-9]+\.)*' + x[0]), x[1]) for x in feed_transformations]
 
@@ -294,4 +313,7 @@
     processor = ChainedProcessors(transformers)
     processor.feed(testdata)
     print processor.get_value()
+    processor = ChainedProcessors([])
+    processor.feed(testdata)
+    print processor.get_value()
     
\ No newline at end of file



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]