r7481 - dumbhippo/trunk/firehose/firehose/jobs



Author: walters
Date: 2008-05-13 14:12:01 -0500 (Tue, 13 May 2008)
New Revision: 7481

Modified:
   dumbhippo/trunk/firehose/firehose/jobs/poller.py
Log:
Fix Picasa and MySpace filters


Modified: dumbhippo/trunk/firehose/firehose/jobs/poller.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-12 22:36:45 UTC (rev 7480)
+++ dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-13 19:12:01 UTC (rev 7481)
@@ -1,4 +1,5 @@
 #!/usr/bin/python
+# -*- coding: utf-8 -*-
 
 import os,sys,re,heapq,time,Queue,sha,threading
 import BaseHTTPServer,httplib,urlparse,urllib
@@ -46,16 +47,17 @@
         return data
 
 class XmlElementEater(FeedPostProcessor):
-    def __init__(self, deletepaths=[]):
+    def __init__(self, deletepaths=[], namespaces={}):
         super(XmlElementEater, self).__init__()
         self.__deletepaths = deletepaths
+        self.__namespaces = namespaces
         
     def process(self, data):
         import lxml.etree
         tree = lxml.etree.parse(StringIO(data))
         root = tree.getroot()
         for path in self.__deletepaths:
-            node = root.xpath(path)
+            node = root.xpath(path, namespaces=self.__namespaces)
             if not node:
                 continue
             node = node[0]
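
For context, the new namespaces argument is what lets an XPath expression address Atom elements at all: Atom documents put everything in the http://www.w3.org/2005/Atom namespace, so an unprefixed path like /feed/updated silently matches nothing. A minimal standalone sketch of the lxml pattern (the feed snippet and the 'a' prefix here are illustrative, not from the codebase):

    from StringIO import StringIO
    import lxml.etree

    # Illustrative Atom fragment; real feeds declare the same namespace.
    atom = '''<?xml version="1.0" encoding="UTF-8"?>
    <feed xmlns="http://www.w3.org/2005/Atom">
      <updated>2008-05-13T19:12:01Z</updated>
      <title>example</title>
    </feed>'''

    root = lxml.etree.parse(StringIO(atom)).getroot()
    print root.xpath('/feed/updated')    # [] - ignores the namespace
    nodes = root.xpath('/a:feed/a:updated',
                       namespaces={'a': 'http://www.w3.org/2005/Atom'})
    # Same removal XmlElementEater.process performs:
    nodes[0].getparent().remove(nodes[0])
    print lxml.etree.tostring(root)      # <updated> is gone
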
@@ -85,13 +87,14 @@
     def process(self, data):
         is_xml = False
         is_html = False
+        xmlre = re.compile(r'<\?.*xml.*version')
         for i,line in enumerate(StringIO(data)):
-            if i > 20:
+            if i > 200:
                 break
             # This is low tech, but should be reliable enough; remember
             # it's just an optimization here.
             # We could investigate MIME sniffers though.
-            if line.find('<?.*xml.*version') >= 0:
+            if xmlre.search(line):
                 is_xml = True
                 break
             if line.find('<html>') >= 0:
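
The old check was a literal substring search: str.find() does not interpret regular expressions, so line.find('<?.*xml.*version') could only ever match a line containing those exact characters and returned -1 for every real XML declaration. A quick illustration:

    import re

    line = '<?xml version="1.0" encoding="UTF-8"?>'
    # str.find treats the pattern as literal text, so this is always -1:
    print line.find('<?.*xml.*version')
    # A compiled regexp actually matches the declaration:
    print bool(re.compile(r'<\?.*xml.*version').search(line))   # True
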
@@ -128,17 +131,16 @@
 
 # Define shared eaters for these feed types
 rss_eater = XmlElementEater(['/rss/channel/lastBuildDate', '/rss/channel/pubDate'])
-atom_eater = XmlElementEater(['/feed/updated'])
+atom_eater = XmlElementEater(['/a:feed/a:updated'], {'a': 'http://www.w3.org/2005/Atom'})
 # This maps from a regular expression matching a URL to a list of processors
 feed_transformations = [
   (r'digg.com/users/.*/history/diggs.rss', [rss_eater]),
   (r'picasaweb.google.com/data/feed', [rss_eater, atom_eater]),
   (r'google.com/reader/public', [XmlElementEater(['/feed/updated'])]),
   (r'blogs.gnome.org', [RegexpEater(['<!--.*page served in.*seconds.*-->'])]),
-  # We try to consume all HTML
-  (r'.*', [HtmlCommentEater()]),
+  (r'blog.myspace.com', [HtmlCommentEater()]),
 ]
-feed_transformations = [(re.compile(r'^https?://([A-Z0-9]+\.)*' + x[0]), x[1]) for x in feed_transformations]
+feed_transformations = [(re.compile(r'^https?://([A-Za-z0-9]+\.)*' + x[0]), x[1]) for x in feed_transformations]
 
 def get_transformations(url):
     transformers = []
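
Two things change in this table. The Atom eater's paths now carry a namespace prefix so they actually resolve (see above), and the catch-all HtmlCommentEater entry that previously ran against every URL is narrowed to blog.myspace.com. The character-class fix also matters: the subdomain group was uppercase-only and these regexps are compiled without re.IGNORECASE, so any lowercase subdomain such as 'www.' failed to match. Roughly, for one illustrative entry:

    import re

    # Hypothetical before/after check for one entry in the table:
    old = re.compile(r'^https?://([A-Z0-9]+\.)*' + r'blogs.gnome.org')
    new = re.compile(r'^https?://([A-Za-z0-9]+\.)*' + r'blogs.gnome.org')
    url = 'http://www.blogs.gnome.org/2008/05/'
    print bool(old.match(url))   # False: 'www' is lowercase
    print bool(new.match(url))   # True
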
@@ -225,7 +227,13 @@
             if prev_hash != hash_hex:
                 _logger.info("Got new hash:%r (prev:%r) ts:%r %s for url %r", hash_hex, prev_hash, timestamp, filters_applied, targeturl)                
                 return (hash_hex, timestamp)
-            _logger.info("Fetched full unmodified content %s for %r", filters_applied, targeturl)             
+            if rawhash_hex != hash_hex:
+                filter_status = "(filters succeeded)"
+            elif filters_applied:
+                filter_status = "(filters applied)"
+            else:
+                filter_status = ""
+            _logger.info("Fetched full unmodified content %s for %r", filter_status, targeturl)             
             return (prev_hash, prev_timestamp)
         finally:
             try:
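
The extra branch distinguishes two previously conflated cases in the log line: filters were configured and actually stripped something (the raw hash differs from the filtered hash), versus filters were configured but turned out to be a no-op. A rough sketch of the comparison, using a hypothetical hash_content() in place of the surrounding method's sha logic:

    import sha

    def hash_content(data):
        # Stand-in for the sha hashing done in the real polling code.
        return sha.new(data).hexdigest()

    raw = '<feed><updated>NOW</updated><title>t</title></feed>'
    filtered = '<feed><title>t</title></feed>'   # after an XmlElementEater pass
    rawhash_hex = hash_content(raw)
    hash_hex = hash_content(filtered)
    # Differing hashes mean the filters genuinely removed volatile content.
    print rawhash_hex != hash_hex   # True
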
@@ -332,59 +340,3 @@
             thread.start()
         collector = threading.Thread(target=self.__run_collect_tasks, args=(taskcount,resultqueue,masterhost,serial))
         collector.start()
-
-if __name__ == '__main__':
-    testdata = '''<?xml version="1.0" encoding="UTF-8"?>
-<rss version="2.0">
-    <channel>
-        <title>digg / jdhore1 / history / diggs</title>
-        <link>http://digg.com/users/jdhore1/history/diggs</link>
-        <description>A history of jdhore1's diggs</description>
-        <language>en-us</language>
-        <pubDate>Wed, 30 Apr 2008 16:42:42 UTC</pubDate>
-        <lastBuildDate>Wed, 30 Apr 2008 16:42:42 UTC</lastBuildDate>
-        <generator>Digg.com</generator>
-        <item>
-            <title>Hans Reiser was convicted Monday of first degree murder in t</title>
-            <link>http://digg.com/linux_unix/Hans_Reiser_was_convicted_Monday_of_first_degree_murder_in_t</link>
-            <description><![CDATA[
-                 A jury has found an Oakland software programmer guilty in the death of his estranged wife.        
-    ]]></description>
-            <pubDate>Mon, 28 Apr 2008 23:41:43 UTC</pubDate>
-            <author>jdhore1</author>
-            <guid>http://digg.com/linux_unix/Hans_Reiser_was_convicted_Monday_of_first_degree_murder_in_t</guid>
-        </item>
-        <item>
-            <title>The Democrats Have a Nominee: It's Obama!</title>
-            <link>http://digg.com/political_opinion/The_Democrats_Have_a_Nominee_It_s_Obama</link>
-            <description><![CDATA[
-                Other than ensuring the Greatest Show on Earth will continue, does it matter that Hillary Clinton defeated Barack Obama Tuesday in Pennsylvania by nine-plus points? Barack Obama is the nominee.
-
-            ]]></description>
-            <pubDate>Fri, 25 Apr 2008 21:23:01 UTC</pubDate>
-            <author>jdhore1</author>
-            <guid>http://digg.com/political_opinion/The_Democrats_Have_a_Nominee_It_s_Obama</guid>
-        </item>
-    </channel>
-</rss>
-    '''
-    transformers = get_transformations('http://digg.com/users/jdhore/history/diggs.rss')
-    processor = ChainedProcessors(transformers)
-    print processor.process(testdata)
-    processor = ChainedProcessors([])
-    print processor.process(testdata)
-    transformers = get_transformations('http://myspace.com/blah')
-    processor = ChainedProcessors(transformers)
-    print processor.process('''
-<html>
-  <!-- foo bar
-  baz -->
-  <head>moo</head>
-<body>
-  testing<!-- one -->two three
-  <b>four</b><!--
-    blabla-->
-</body>
-</html>''')
-    
\ No newline at end of file
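
With the inline __main__ smoke test gone, the same pipeline can still be exercised from outside the module; a minimal sketch, assuming the firehose.jobs.poller import path and a local sample feed file:

    from firehose.jobs.poller import get_transformations, ChainedProcessors

    # 'digg-sample.rss' is a hypothetical local copy of a Digg history feed.
    data = open('digg-sample.rss').read()
    transformers = get_transformations('http://digg.com/users/jdhore/history/diggs.rss')
    print ChainedProcessors(transformers).process(data)
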


