r7470 - in dumbhippo/trunk/firehose/firehose: . jobs



Author: walters
Date: 2008-05-06 17:19:31 -0500 (Tue, 06 May 2008)
New Revision: 7470

Modified:
   dumbhippo/trunk/firehose/firehose/jobs/poller.py
   dumbhippo/trunk/firehose/firehose/logstats.groovy
Log:
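Tweaks to log parsing and feed caching: the poller is now handed a cache
directory and builds the timestamped cache file name itself, keeping a fetched
copy only when its hash differs from the previous hash; logstats now prints the
counterpart (modified/unmodified) count in parentheses beside each top key.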


Modified: dumbhippo/trunk/firehose/firehose/jobs/poller.py
===================================================================
--- dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-06 19:48:53 UTC (rev 7469)
+++ dumbhippo/trunk/firehose/firehose/jobs/poller.py	2008-05-06 22:19:31 UTC (rev 7470)
@@ -109,7 +109,7 @@
 class FeedTaskHandler(object):
     FAMILY = 'FEED'
 
-    def run(self, id, prev_hash, prev_timestamp, outpath=None):
+    def run(self, id, prev_hash, prev_timestamp, cachedir=None):
         targeturl = id
         transformlist = get_transformations(targeturl)
         parsedurl = urlparse.urlparse(targeturl)
@@ -130,13 +130,15 @@
             if response.status == 304:
                 _logger.info("Got 304 Unmodified for %r", targeturl)
                 return (prev_hash, prev_timestamp)
-            if outpath is not None:
-                outpath_tmpname = outpath+'.tmp'
+            if cachedir is not None:
+                quotedname = urllib.quote_plus(targeturl)
+                ts = int(time.time())
+                outpath = os.path.join(cachedir, quotedname + '.' + unicode(ts))
+                outpath_tmpname = outpath + '.tmp'
                 outfile = open(outpath_tmpname, 'w')
             else:
                 outpath_tmpname = None
                 outfile = None
-            processor = ChainedProcessors(transformlist)
             data = StringIO()
             buf = response.read(8192)
             while buf:
@@ -145,13 +147,17 @@
                 data.write(buf)
                 buf = response.read(8192)
             datavalue = data.getvalue()
+            processor = ChainedProcessors(transformlist)            
             processed = processor.process(datavalue)
             hash = sha.new()
             hash.update(processed)
             hash_hex = hash.hexdigest()
             if outfile is not None:
                 outfile.close()
-                os.rename(outpath_tmpname, outpath)
+                if prev_hash != hash_hex:                
+                    os.rename(outpath_tmpname, outpath)
+                else:
+                    os.unlink(outpath_tmpname)
             timestamp_str = response.getheader('Last-Modified', None)
             if timestamp_str is not None:
                 timestamp = mktime_tz(parsedate_tz(timestamp_str))
@@ -241,10 +247,8 @@
         inst = fclass()
         kwargs = {}
         if self.__savefetches:
-            quotedname = urllib.quote_plus(taskid)
-            ts = int(time.time())
-            outpath = os.path.join(os.getcwd(), 'data', quotedname + '.' + unicode(ts))
-            kwargs['outpath'] = outpath       
+            outpath = os.path.join(os.getcwd(), 'data', 'feedcache')
+            kwargs['cachedir'] = outpath       
         try:
             (new_hash, new_timestamp) = inst.run(tid, prev_hash, prev_timestamp, **kwargs)            
         except Exception, e:

Modified: dumbhippo/trunk/firehose/firehose/logstats.groovy
===================================================================
--- dumbhippo/trunk/firehose/firehose/logstats.groovy	2008-05-06 19:48:53 UTC (rev 7469)
+++ dumbhippo/trunk/firehose/firehose/logstats.groovy	2008-05-06 22:19:31 UTC (rev 7470)
@@ -79,11 +79,16 @@
     	return
 }
 
-def printTopKeys(String prefix, domainHash) {
+def printTopKeys(String prefix, domainHash, other) {
 	def keys = new ArrayList(domainHash.keySet())
 	keys.sort({ a,b -> domainHash[b].compareTo(domainHash[a])})
 	keys.subList(0, 5).each { k ->
-		println " ${prefix} ${k} -> ${domainHash[k]}"
+		def otherValue = other != null ? other[k] : null;
+		print " ${prefix} ${k} -> ${domainHash[k]}"
+		if (otherValue != null)
+			println " (${otherValue})"
+		else
+			println ""	
 	}
 }
 
@@ -92,6 +97,6 @@
     Date d = new Date(group.startDate)
     def pollsPerSec = group.updates/(timeSliceMilliseconds/1000)
 	println "updates from ${d}: ${group.updates} (${pollsPerSec} checks per second)"
-	printTopKeys("U", group.domainUnmodified)
-	printTopKeys("M", group.domainModified)
+	printTopKeys("U", group.domainUnmodified, group.domainModified)
+	printTopKeys("M", group.domainModified, group.domainUnmodified)
 }



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]