r7472 - dumbhippo/trunk/firehose/firehose/jobs



Author: walters
Date: 2008-05-08 13:36:14 -0500 (Thu, 08 May 2008)
New Revision: 7472

Modified:
   dumbhippo/trunk/firehose/firehose/jobs/poller.py
Log:
Consume all HTML comments; also fix RegexpEater to return the buffer contents rather than the StringIO object

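For reference, the comment stripping below relies on BeautifulSoup's Comment
node type. A minimal standalone sketch of that pattern, separate from the
poller code (the function name here is illustrative, not from the commit):

    from BeautifulSoup import BeautifulSoup, Comment

    def strip_html_comments(markup):
        # Parse, detach every Comment node from the tree, then
        # re-serialize what remains.
        soup = BeautifulSoup(markup)
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup.prettify()

    print strip_html_comments('<html><body>hi<!-- goo --></body></html>')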

Modified: dumbhippo/trunk/firehose/firehose/jobs/poller.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-07 21:47:23 UTC (rev 7471)
+++ dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-08 18:36:14 UTC (rev 7472)
@@ -3,6 +3,7 @@
 import os,sys,re,heapq,time,Queue,sha,threading
 import BaseHTTPServer,httplib,urlparse,urllib
 from email.Utils import formatdate,parsedate_tz,mktime_tz
+from BeautifulSoup import BeautifulSoup,Comment
 import logging
 from StringIO import StringIO
 
@@ -74,28 +75,34 @@
                 if regexp.search(line):
                     continue
                 outvalue.write(line)
-        return outvalue
+        return outvalue.getvalue()
 
-# This one is designed solely for MySpace which adds some odd
-# base64 encoded binary goo in HTML comments
-class BasicHtmlCommentEater(FeedPostProcessor):
+class HtmlCommentEater(FeedPostProcessor):
     def __init__(self):
-        super(BasicHtmlCommentEater, self).__init__()
+        super(HtmlCommentEater, self).__init__()
         
     def process(self, data):
-        buf = StringIO(data)
-        outvalue = StringIO()
-        state = 0
-        for line in data:
-            if state == 0:
-                if line.startswith('<!--'):
-                    state = 1
-                else:
-                    outvalue.write(line)
-            elif state == 1:
-                if line.startswith('-->'):
-                    state = 0
-        return outvalue
+        is_xml = False
+        is_html = False
+        for i,line in enumerate(StringIO(data)):
+            if i > 20:
+                break
+            # This is low tech, but should be reliable enough; remember
+            # it's just an optimization here.
+            # We could investigate MIME sniffers though.
+            if re.search(r'<\?xml.*version', line):
+                is_xml = True
+                break
+            if line.find('<html>') >= 0:
+                is_html = True
+                break
+        if is_html and not is_xml:
+            soup = BeautifulSoup(data)
+            comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            return soup.prettify()
+        return data
                 
 class ChainedProcessors(object):
     def __init__(self, processors):
@@ -122,7 +129,8 @@
   (r'picasaweb.google.com.*feed.*base.*album', [rss_eater, atom_eater]),
   (r'google.com/reader/public', [XmlElementEater(['/feed/updated'])]),
   (r'blogs.gnome.org', [RegexpEater(['<!--.*page served in.*seconds.*-->'])]),
-  (r'blog.myspace.com', [BasicHtmlCommentEater()]),
+  # We try to consume comments from all feeds that look like HTML
+  (r'.*', [HtmlCommentEater()]),
 ]
 feed_transformations = [(re.compile(r'^https?://([A-Z0-9]+\.)*' + x[0]), x[1]) for x in feed_transformations]
 
@@ -341,4 +349,17 @@
     print processor.process(testdata)
     processor = ChainedProcessors([])
     print processor.process(testdata)
+    transformers = get_transformations('http://myspace.com/blah')
+    processor = ChainedProcessors(transformers)
+    print processor.process('''
+<html>
+  <!-- foo bar
+  baz -->
+  <head>moo</head>
+<body>
+  testing<!-- one -->two three
+  <b>four</b><!--
+    blabla-->
+</body>
+</html>''')
     
\ No newline at end of file
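A note on the "fix regexp return value" part: RegexpEater.process()
previously returned the StringIO buffer itself rather than its contents,
so callers expecting a string got a file-like object instead. A tiny
illustration of the distinction (assumed caller expectations, not code
from this commit):

    from StringIO import StringIO

    buf = StringIO()
    buf.write('<feed/>')
    # buf is a file-like object; getvalue() returns the accumulated
    # string, which is what downstream processors expect to receive.
    assert isinstance(buf.getvalue(), str)
    assert buf.getvalue() == '<feed/>'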


