r7471 - in dumbhippo/trunk: firehose/firehose firehose/firehose/jobs super/firehose/files/data
- From: commits@mugshot.org
- To: online-desktop-list@gnome.org
- Subject: r7471 - in dumbhippo/trunk: firehose/firehose firehose/firehose/jobs super/firehose/files/data
- Date: Wed, 7 May 2008 16:47:23 -0500 (CDT)
Author: walters
Date: 2008-05-07 16:47:23 -0500 (Wed, 07 May 2008)
New Revision: 7471
Added:
dumbhippo/trunk/firehose/firehose/gendiffs.groovy
dumbhippo/trunk/super/firehose/files/data/feedcache/
Modified:
dumbhippo/trunk/firehose/firehose/find_smalldiffs.py
dumbhippo/trunk/firehose/firehose/gendiffs.py
dumbhippo/trunk/firehose/firehose/jobs/poller.py
Log:
Improve MySpace and Picasa filtering. Tweak diff scripts and caching
Modified: dumbhippo/trunk/firehose/firehose/find_smalldiffs.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/find_smalldiffs.py 2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/find_smalldiffs.py 2008-05-07 21:47:23 UTC (rev 7471)
@@ -19,7 +19,7 @@
         c -= 3
         if c > 0 and c < 13:
             entry_lines[path] = c
-    simplejson.dump(entry_lines, sys.stdout)
+    simplejson.dump(entry_lines, sys.stdout, sort_keys=True, indent=4)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
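For context, sort_keys=True plus indent=4 makes the JSON report deterministic and one-key-per-line, so successive runs of find_smalldiffs.py can themselves be compared. A minimal sketch of the effect (the sample data here is made up):

import sys
import simplejson

# Made-up sample; find_smalldiffs actually maps cache entries to their
# diff line counts.
entry_lines = {'b.example.com/feed': 4, 'a.example.com/feed': 9}

# Sorted keys give a stable order across runs, and indent=4 pretty-prints
# one key per line instead of a single unordered line of JSON.
simplejson.dump(entry_lines, sys.stdout, sort_keys=True, indent=4)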
Added: dumbhippo/trunk/firehose/firehose/gendiffs.groovy
===================================================================
--- dumbhippo/trunk/firehose/firehose/gendiffs.groovy 2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/gendiffs.groovy 2008-05-07 21:47:23 UTC (rev 7471)
@@ -0,0 +1,45 @@
+#!/usr/bin/env groovy
+
+/* utility function */
+def zip(List a, List b) {
+    aIt = a.iterator()
+    bIt = b.iterator()
+    List ret = []
+    while (aIt.hasNext()) {
+        aV = aIt.next()
+        bV = bIt.next()
+        ret.add([aV, bV])
+    }
+    return ret
+}
+
+def cacheDir = new File(args[0])
+def cacheFiles = cacheDir.list({ d,n -> !n.endsWith('.tmp') && !n.endsWith('.diff') } as FilenameFilter)
+
+def urlToSnapshots = new HashMap()
+cacheFiles.each { String urlTimestamp ->
+    int periodIdx = urlTimestamp.lastIndexOf('.')
+    String url = urlTimestamp.substring(0, periodIdx)
+    String ts = urlTimestamp.substring(periodIdx+1)
+    if (urlToSnapshots[url] == null)
+        urlToSnapshots[url] = []
+    urlToSnapshots[url].add(ts)
+}
+
+urlToSnapshots.each { String url, List timestamps ->
+    zip(timestamps.subList(0, timestamps.size()-1), timestamps.subList(1, timestamps.size())).each { ts1, ts2 ->
+        ts1Path = new File(cacheDir, "${url}.${ts1}")
+        ts2Path = new File(cacheDir, "${url}.${ts2}")
+        diffPath = new File(cacheDir, "${url}.${ts1}-${ts2}.diff")
+        if (diffPath.exists())
+            return
+        println "Generating diff between ${ts1Path.getName()} <=> ${ts2Path.getName()}"
+        def outStream = diffPath.newOutputStream()
+        def pb = new ProcessBuilder("diff", "-u", ts1Path.toString(), ts2Path.toString())
+        pb.redirectErrorStream(true)
+        def proc = pb.start()
+        outStream << proc.inputStream
+        proc.waitFor()
+        outStream.close()
+    }
+}
Property changes on: dumbhippo/trunk/firehose/firehose/gendiffs.groovy
___________________________________________________________________
Name: svn:executable
+ *
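The zip() above is plain consecutive pairing: each snapshot is paired with its successor, and a .diff is generated for every adjacent pair. The same idea in a minimal Python sketch, with made-up timestamps:

# Made-up snapshot timestamps for a single cached URL.
timestamps = ['1210100000', '1210103600', '1210107200']

# Zipping the list against itself shifted by one yields the consecutive
# (ts1, ts2) ranges that each generated .diff file covers.
for ts1, ts2 in zip(timestamps[:-1], timestamps[1:]):
    print('diff -u url.%s url.%s > url.%s-%s.diff' % (ts1, ts2, ts1, ts2))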
Modified: dumbhippo/trunk/firehose/firehose/gendiffs.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/gendiffs.py 2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/gendiffs.py 2008-05-07 21:47:23 UTC (rev 7471)
@@ -8,7 +8,7 @@
     entries.sort()
     items = {}
     for entry in entries:
-        if entry.endswith('.diff'):
+        if entry.endswith('.diff') or entry.endswith('.tmp'):
             continue
         (key, ts) = entry.rsplit('.', 1)
         if key not in items:
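Skipping '.tmp' matters because the poller (below) appears to download into a temporary name first (see outpath_tmpname); without this check, rsplit would treat 'tmp' as the snapshot timestamp. A hypothetical illustration:

# Hypothetical name of a download still in progress; the real cache
# naming scheme is '<key>.<timestamp>' as parsed above.
entry = 'blog.example.com.1210190843.tmp'
print(entry.rsplit('.', 1))   # ['blog.example.com.1210190843', 'tmp']
# 'tmp' would be mistaken for a timestamp, so gendiffs.py now skips
# these entries, matching the FilenameFilter in gendiffs.groovy above.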
Modified: dumbhippo/trunk/firehose/firehose/jobs/poller.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/jobs/poller.py 2008-05-06 22:19:31 UTC (rev 7470)
+++ dumbhippo/trunk/firehose/firehose/jobs/poller.py 2008-05-07 21:47:23 UTC (rev 7471)
@@ -75,27 +75,54 @@
                 continue
             outvalue.write(line)
         return outvalue
-    
+
+# This one is designed solely for MySpace which adds some odd
+# base64 encoded binary goo in HTML comments
+class BasicHtmlCommentEater(FeedPostProcessor):
+    def __init__(self):
+        super(BasicHtmlCommentEater, self).__init__()
+
+    def process(self, data):
+        buf = StringIO(data)
+        outvalue = StringIO()
+        state = 0
+        for line in buf:
+            if state == 0:
+                if line.startswith('<!--'):
+                    state = 1
+                else:
+                    outvalue.write(line)
+            elif state == 1:
+                if line.startswith('-->'):
+                    state = 0
+        return outvalue
+
 
 class ChainedProcessors(object):
     def __init__(self, processors):
         super(ChainedProcessors, self).__init__()
-        if len(processors) == 0:
+        self.__is_identity = len(processors) == 0
+        if self.__is_identity:
             processors = [FeedPostProcessor()]
         self._processors = processors
 
+    def is_identity(self):
+        return self.__is_identity
+
     def process(self, data):
         for processor in self._processors:
             data = processor.process(data)
         return data
 
-# Define a shared eater for rss which has a lastBuildDate
+# Define shared eaters for these feed types
 rss_eater = XmlElementEater(['/rss/channel/lastBuildDate', '/rss/channel/pubDate'])
+atom_eater = XmlElementEater(['/feed/updated'])
 # This maps from a regular expression matching a URL to a list of processors
 feed_transformations = [
     (r'digg.com/users/.*/history/diggs.rss', [rss_eater]),
-    (r'picasaweb.google.com.*feed.*base.*album', [rss_eater]),
+    (r'picasaweb.google.com.*feed.*base.*album', [rss_eater, atom_eater]),
     (r'google.com/reader/public', [XmlElementEater(['/feed/updated'])]),
     (r'blogs.gnome.org', [RegexpEater(['<!--.*page served in.*seconds.*-->'])]),
+    (r'blog.myspace.com', [BasicHtmlCommentEater()]),
 ]
 feed_transformations = [(re.compile(r'^https?://([A-Z0-9]+\.)*' + x[0]), x[1]) for x in feed_transformations]
@@ -139,13 +166,16 @@
         else:
             outpath_tmpname = None
             outfile = None
+        rawhash = sha.new()
         data = StringIO()
         buf = response.read(8192)
         while buf:
             if outfile is not None:
                 outfile.write(buf)
             data.write(buf)
+            rawhash.update(buf)
             buf = response.read(8192)
+        rawhash_hex = rawhash.hexdigest()
         datavalue = data.getvalue()
         processor = ChainedProcessors(transformlist)
         processed = processor.process(datavalue)
@@ -164,10 +194,11 @@
             else:
                 _logger.debug("no last-modified for %r", targeturl)
                 timestamp = time.time()
+            filters_applied = (not processor.is_identity()) and "(filters applied)" or ""
             if prev_hash != hash_hex:
-                _logger.info("Got new hash:%r (prev:%r) ts:%r for url %r", hash_hex, prev_hash, timestamp, targeturl)
+                _logger.info("Got new hash:%r (prev:%r) ts:%r %s for url %r", hash_hex, prev_hash, timestamp, filters_applied, targeturl)
                 return (hash_hex, timestamp)
-            _logger.info("Fetched full unmodified content for %r", targeturl)
+            _logger.info("Fetched full unmodified content %s for %r", filters_applied, targeturl)
             return (prev_hash, prev_timestamp)
         finally:
             try:
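Taken together, a sketch of how these pieces fit at poll time; the import path and the find_transformations() helper are assumptions for illustration, while ChainedProcessors, feed_transformations, and is_identity() come from the diff itself:

from firehose.jobs.poller import ChainedProcessors, feed_transformations

def find_transformations(url):
    # feed_transformations holds (compiled regexp, [processor, ...]) pairs.
    for regexp, processors in feed_transformations:
        if regexp.match(url):
            return processors
    return []

url = 'http://blog.myspace.com/someone'        # made-up URL
processor = ChainedProcessors(find_transformations(url))
# MySpace URLs get BasicHtmlCommentEater, so the base64 goo inside HTML
# comments is eaten before the content hash is computed, and is_identity()
# is False, which makes the new log lines read "(filters applied)".
processed = processor.process('<html>\n<!--\nGOO\n-->\nbody\n</html>\n')
print(processed.getvalue())                    # eaters return StringIO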