r7471 - in dumbhippo/trunk: firehose/firehose firehose/firehose/jobs super/firehose/files/data



Author: walters
Date: 2008-05-07 16:47:23 -0500 (Wed, 07 May 2008)
New Revision: 7471

Added:
   dumbhippo/trunk/firehose/firehose/gendiffs.groovy
   dumbhippo/trunk/super/firehose/files/data/feedcache/
Modified:
   dumbhippo/trunk/firehose/firehose/find_smalldiffs.py
   dumbhippo/trunk/firehose/firehose/gendiffs.py
   dumbhippo/trunk/firehose/firehose/jobs/poller.py
Log:
Improve MySpace and Picasa filtering.  Tweak diff scripts and caching


Modified: dumbhippo/trunk/firehose/firehose/find_smalldiffs.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/find_smalldiffs.py	2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/find_smalldiffs.py	2008-05-07 21:47:23 UTC (rev 7471)
@@ -19,7 +19,7 @@
             c -= 3
         if c > 0 and c < 13:
             entry_lines[path] = c
-    simplejson.dump(entry_lines, sys.stdout)
+    simplejson.dump(entry_lines, sys.stdout, sort_keys=True, indent=4)
         
 if __name__ == '__main__':
     main()
\ No newline at end of file
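
A note on the find_smalldiffs.py change: sort_keys=True and indent=4 make the
JSON report deterministic and line-oriented, so two runs of the script can
themselves be compared with diff(1). A minimal standalone sketch of the effect
(the sample cache paths here are invented, not from the real feed cache):

    import sys
    import simplejson

    entry_lines = {'feedcache/b.example.com.1210190000': 5,
                   'feedcache/a.example.com.1210189000': 12}
    # sort_keys=True fixes the key order; indent=4 puts one entry per
    # line, so successive reports diff cleanly
    simplejson.dump(entry_lines, sys.stdout, sort_keys=True, indent=4)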

Added: dumbhippo/trunk/firehose/firehose/gendiffs.groovy
===================================================================
--- dumbhippo/trunk/firehose/firehose/gendiffs.groovy	2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/gendiffs.groovy	2008-05-07 21:47:23 UTC (rev 7471)
@@ -0,0 +1,45 @@
+#!/usr/bin/env groovy
+
+/* utility function */
+def zip(List a, List b) {
+	aIt = a.iterator()
+	bIt = b.iterator()
+	List ret = []
+	while (aIt.hasNext() && bIt.hasNext()) {
+		aV = aIt.next()
+		bV = bIt.next()
+		ret.add([aV, bV])
+	}
+	return ret
+}
+
+def cacheDir = new File(args[0])
+def cacheFiles = cacheDir.list({ d,n -> !n.endsWith('.tmp') && !n.endsWith('.diff')} as FilenameFilter).sort()
+
+def urlToSnapshots = new HashMap()
+cacheFiles.each { String urlTimestamp ->
+	int periodIdx = urlTimestamp.lastIndexOf('.')
+	String url = urlTimestamp.substring(0, periodIdx)
+	String ts = urlTimestamp.substring(periodIdx+1)
+	if (urlToSnapshots[url] == null)
+		urlToSnapshots[url] = []
+	urlToSnapshots[url].add(ts)
+}
+
+urlToSnapshots.each { String url,List timestamps ->
+	zip(timestamps.subList(0, timestamps.size()-1), timestamps.subList(1, timestamps.size())).each { ts1, ts2 ->
+		ts1Path = new File(cacheDir, "${url}.${ts1}")
+		ts2Path = new File(cacheDir, "${url}.${ts2}")
+		diffPath = new File(cacheDir, "${url}.${ts1}-${ts2}.diff")
+		if (diffPath.exists())
+			return
+		println "Generating diff between ${ts1Path.getName()} <=> ${ts2Path.getName()}"
+		def outStream = diffPath.newOutputStream()
+		def pb = new ProcessBuilder("diff", "-u", ts1Path.toString(), ts2Path.toString())
+		pb.redirectErrorStream(true)
+		def proc = pb.start()
+		outStream << proc.inputStream
+		proc.waitFor()
+		outStream.close()
+	}
+}


Property changes on: dumbhippo/trunk/firehose/firehose/gendiffs.groovy
___________________________________________________________________
Name: svn:executable
   + *
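
For readers more comfortable with Python, here is the same pairwise pass as
gendiffs.groovy, sketched under the cache layout the script assumes (snapshot
files named "<url>.<timestamp>", diffs written alongside as
"<url>.<ts1>-<ts2>.diff"). This sketch is illustrative only, not part of the
commit:

    import os
    import subprocess

    def gen_diffs(cachedir):
        # group snapshot files by url, skipping work files and
        # previously generated diffs
        snapshots = {}
        for name in sorted(os.listdir(cachedir)):
            if name.endswith('.tmp') or name.endswith('.diff'):
                continue
            (url, ts) = name.rsplit('.', 1)
            snapshots.setdefault(url, []).append(ts)
        for (url, stamps) in snapshots.items():
            # diff each consecutive pair of snapshots, like the
            # zip() helper in the Groovy version
            for (ts1, ts2) in zip(stamps, stamps[1:]):
                diffpath = os.path.join(cachedir, '%s.%s-%s.diff' % (url, ts1, ts2))
                if os.path.exists(diffpath):
                    continue
                outfile = open(diffpath, 'w')
                subprocess.call(['diff', '-u',
                                 os.path.join(cachedir, '%s.%s' % (url, ts1)),
                                 os.path.join(cachedir, '%s.%s' % (url, ts2))],
                                stdout=outfile, stderr=subprocess.STDOUT)
                outfile.close()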

Modified: dumbhippo/trunk/firehose/firehose/gendiffs.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/gendiffs.py	2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/gendiffs.py	2008-05-07 21:47:23 UTC (rev 7471)
@@ -8,7 +8,7 @@
     entries.sort()
     items = {}
     for entry in entries:
-        if entry.endswith('.diff'):
+        if entry.endswith('.diff') or entry.endswith('.tmp'):
             continue
         (key, ts) = entry.rsplit('.', 1)        
         if key not in items:
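
The new .tmp check matters because the poller appears to write each snapshot
to a temporary name first (note outpath_tmpname in poller.py below) and only
rename it into place once the download completes; gendiffs must not treat a
half-written file as a snapshot. A minimal sketch of that convention (the
helper name is hypothetical):

    import os

    def save_snapshot(path, data):
        # write to a work file so readers such as gendiffs never see a
        # half-written snapshot; rename() is atomic on POSIX
        tmppath = path + '.tmp'
        f = open(tmppath, 'w')
        f.write(data)
        f.close()
        os.rename(tmppath, path)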

Modified: dumbhippo/trunk/firehose/firehose/jobs/poller.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-07 21:47:23 UTC (rev 7471)
@@ -75,27 +75,54 @@
                     continue
                 outvalue.write(line)
         return outvalue
-    
+
+# This one is designed solely for MySpace, which adds some odd
+# base64-encoded binary goo in HTML comments
+class BasicHtmlCommentEater(FeedPostProcessor):
+    def __init__(self):
+        super(BasicHtmlCommentEater, self).__init__()
+        
+    def process(self, data):
+        buf = StringIO(data)
+        outvalue = StringIO()
+        state = 0
+        for line in buf:
+            if state == 0:
+                if line.startswith('<!--'):
+                    state = 1
+                else:
+                    outvalue.write(line)
+            elif state == 1:
+                if line.startswith('-->'):
+                    state = 0
+        return outvalue
+                
 class ChainedProcessors(object):
     def __init__(self, processors):
         super(ChainedProcessors, self).__init__()
-        if len(processors) == 0:
+        self.__is_identity = len(processors) == 0        
+        if self.__is_identity:
             processors = [FeedPostProcessor()]
         self._processors = processors
         
+    def is_identity(self):
+        return self.__is_identity
+        
     def process(self, data):
         for processor in self._processors:
             data = processor.process(data)
         return data
 
-# Define a shared eater for rss which has a lastBuildDate
+# Define shared eaters for these feed types
 rss_eater = XmlElementEater(['/rss/channel/lastBuildDate', '/rss/channel/pubDate'])
+atom_eater = XmlElementEater(['/feed/updated'])
 # This maps from a regular expression matching a URL to a list of processors
 feed_transformations = [
   (r'digg.com/users/.*/history/diggs.rss', [rss_eater]),
-  (r'picasaweb.google.com.*feed.*base.*album', [rss_eater]),
+  (r'picasaweb.google.com.*feed.*base.*album', [rss_eater, atom_eater]),
   (r'google.com/reader/public', [XmlElementEater(['/feed/updated'])]),
   (r'blogs.gnome.org', [RegexpEater(['<!--.*page served in.*seconds.*-->'])]),
+  (r'blog.myspace.com', [BasicHtmlCommentEater()]),
 ]
 feed_transformations = [(re.compile(r'^https?://([A-Z0-9]+\.)*' + x[0]), x[1]) for x in feed_transformations]
 
@@ -139,13 +166,16 @@
             else:
                 outpath_tmpname = None
                 outfile = None
+            rawhash = sha.new()
             data = StringIO()
             buf = response.read(8192)
             while buf:
                 if outfile is not None:
                     outfile.write(buf)
                 data.write(buf)
+                rawhash.update(buf)
                 buf = response.read(8192)
+            rawhash_hex = rawhash.hexdigest()
             datavalue = data.getvalue()
             processor = ChainedProcessors(transformlist)            
             processed = processor.process(datavalue)
@@ -164,10 +194,11 @@
             else:
                 _logger.debug("no last-modified for %r", targeturl)
                 timestamp = time.time()
+            filters_applied = (not processor.is_identity()) and "(filters applied)" or ""  
             if prev_hash != hash_hex:
-                _logger.info("Got new hash:%r (prev:%r) ts:%r for url %r", hash_hex, prev_hash, timestamp, targeturl)                
+                _logger.info("Got new hash:%r (prev:%r) ts:%r %s for url %r", hash_hex, prev_hash, timestamp, filters_applied, targeturl)                
                 return (hash_hex, timestamp)
-            _logger.info("Fetched full unmodified content for %r", targeturl)             
+            _logger.info("Fetched full unmodified content %s for %r", filters_applied, targeturl)             
             return (prev_hash, prev_timestamp)
         finally:
             try:
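
To see what the new MySpace filter does: BasicHtmlCommentEater is a two-state
line filter that drops every line from one starting with "<!--" through one
starting with "-->", inclusive, which is where MySpace hides its binary goo.
A standalone run of the same state machine (the function name and the sample
feed text are invented for illustration):

    from StringIO import StringIO

    def eat_comments(data):
        # same two-state loop as BasicHtmlCommentEater.process above
        outvalue = StringIO()
        state = 0
        for line in StringIO(data):
            if state == 0:
                if line.startswith('<!--'):
                    state = 1
                else:
                    outvalue.write(line)
            elif state == 1:
                if line.startswith('-->'):
                    state = 0
        return outvalue.getvalue()

    sample = ('<rss><channel><title>blog</title>\n'
              '<!--\n'
              'c29tZSBvZGQgYmluYXJ5IGdvbw==\n'
              '-->\n'
              '</channel></rss>\n')
    print eat_comments(sample)
    # prints only the first and last lines; the comment and the goo
    # inside it are gone, so the content hash stays stable across polls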


