r7469 - dumbhippo/trunk/firehose/firehose



Author: walters
Date: 2008-05-06 14:48:53 -0500 (Tue, 06 May 2008)
New Revision: 7469

Modified:
   dumbhippo/trunk/firehose/firehose/logstats.groovy
Log:
Much more extensive analysis


Modified: dumbhippo/trunk/firehose/firehose/logstats.groovy
===================================================================
--- dumbhippo/trunk/firehose/firehose/logstats.groovy	2008-05-06 16:39:53 UTC (rev 7468)
+++ dumbhippo/trunk/firehose/firehose/logstats.groovy	2008-05-06 19:48:53 UTC (rev 7469)
@@ -2,27 +2,96 @@
 
 import java.text.SimpleDateFormat
 
+long timeSliceMilliseconds = 1*60*60*1000 /* 1 hour; a result group is closed once it is older than this */
+
 def updatePattern = ~/splitting ([0-9]+) tasks into messages/
-
+def fullUnmodifiedFetchPattern = ~/Fetched full unmodified content for (.+)$/ /* tallied per host as unmodified */
+def unmodifiedFetchPattern = ~/Got 304 Unmodified for (.+)$/ /* tallied per host as unmodified */
+def modifiedFetchPattern = ~/Got new hash.+for url (.+)$/ /* tallied per host as modified */
 def asctimeFormat = new SimpleDateFormat('yyyy-MM-dd HH:mm:ss,SSS')
 
+/* Drops the first two characters and the last one, i.e. the u prefix with its opening quote and the closing quote of a Python unicode repr. Escapes are not decoded, so only plain ASCII comes back intact. */
+def parsePyUnicodeRepr(String s) {
+	s.substring(2, s.length() - 1)
+}
+
+println "Parsing ${args[0]}..."
 def logf = new File(args[0])
-def firstDate = null
-def lastDate = null
-def total = 0
+long tsStart = 0 /* epoch ms of the count line that opened the current group; 0 until the first one is seen */
+long nextCutoff = 0 /* NOTE(review): neither read nor assigned anywhere else in this diff */
+long tsEnd = 0 /* epoch ms of the latest count line; assigned in the loop but not read in this diff */
+int total = 0 /* sum of every task count parsed from the log */
+def resultGroups = [] /* closed ResultGroup instances, in the order they were closed */
+class ResultGroup { /* statistics gathered for one time slice of the log */
+	long startDate /* epoch ms of the count line that opened the slice */
+	long updates = 0 /* task counts credited to the slice */
+	def domainUnmodified = new HashMap() /* host -> number of fetches logged as unmodified */
+	def domainModified = new HashMap() /* host -> number of fetches logged as modified */
+}
+def currentResultGroup = null /* the group still being filled; null until the first count line */
+/* Tally one fetch for the host named in a log message.
+ * When msg matches pattern in full, capture group 1 is treated as the
+ * Python repr of a URL; the host of that URL is looked up in hash and its
+ * counter is raised by one (a host not seen before starts at 1).
+ * Returns true when msg matched and a host was counted, false when msg
+ * did not match, in which case hash is left untouched.
+ */
+def incrementHashIfMatches(String msg, pattern, hash) {
+	def m = pattern.matcher(msg)
+	if (!m.matches())
+		return false
+	def fetchedUrl = new URL(parsePyUnicodeRepr(m.group(1)))
+	def host = fetchedUrl.host
+	hash[host] = hash.containsKey(host) ? hash[host] + 1 : 1
+	return true
+}
 logf.eachLine { line ->
+	/* Per-line pass: groups task counts and per-host fetch tallies into time slices. Only lines starting with '2' are examined */
+	if (!line.startsWith('2'))
+		return
     def elts = line.split(' ', 5)
-    def msg = elts[4]
+    /* Fields 0 and 1 hold the date and time, field 4 the message; skip lines too short to have one instead of failing on elts[4] */
+    if (elts.length < 5)
+    	return
+    String msg = elts[4]
 	def matcher = updatePattern.matcher(msg)
-	if (!matcher.matches())
-		return
-	def count = Integer.parseInt(matcher.group(1))
-	total += count
-	
-    def date = asctimeFormat.parse(elts[0] + ' ' + elts[1])
-	if (firstDate == null)
-		firstDate = date
-	lastDate = date
+	/* Task-count line: add it to the totals (crediting the group already open, if any), then start a new group once the open one is older than timeSliceMilliseconds */
+	if (matcher.matches()) {
+		int updateCount = Integer.parseInt(matcher.group(1))
+		total += updateCount
+		if (currentResultGroup != null)
+			currentResultGroup.updates += updateCount
+	    long date = asctimeFormat.parse(elts[0] + ' ' + elts[1]).getTime()
+		if (tsStart == 0 || date - tsStart > timeSliceMilliseconds) {
+			if (tsStart != 0) resultGroups.add(currentResultGroup)
+			currentResultGroup = new ResultGroup(startDate: date)
+			tsStart = date
+		}
+	    tsEnd = date
+	    return
+	}
+    /* Fetch lines seen before the first group is opened are not counted */
+    if (currentResultGroup == null)
+    	return
+    /* The patterns cannot match the same message, so stop at the first hit; both "unmodified" kinds share one tally */
+    incrementHashIfMatches(msg, fullUnmodifiedFetchPattern, currentResultGroup.domainUnmodified) ||
+    	incrementHashIfMatches(msg, unmodifiedFetchPattern, currentResultGroup.domainUnmodified) ||
+    	incrementHashIfMatches(msg, modifiedFetchPattern, currentResultGroup.domainModified)
 }
-def overSeconds = (lastDate.getTime() - firstDate.getTime()) / 1000
-println "total: ${total} in ${overSeconds} seconds"
+
+/* Print up to five entries of domainHash, highest count first, one per line as " <prefix> <key> -> <count>" */
+def printTopKeys(String prefix, domainHash) {
+	def keys = new ArrayList(domainHash.keySet())
+	keys.sort({ a,b -> domainHash[b].compareTo(domainHash[a])})
+	/* Clamp the end index: subList(0, 5) throws IndexOutOfBoundsException when there are fewer than five keys */
+	keys.subList(0, Math.min(5, keys.size())).each { k -> println " ${prefix} ${k} -> ${domainHash[k]}" }
+}
+/* Report. The loop only queues a group when its successor starts, so queue the one still open at end of input too (its rate below is understated if it covers less than a full slice) */
+if (currentResultGroup != null) resultGroups.add(currentResultGroup)
+println "Got ${resultGroups.size()} groups"
+resultGroups.each { group ->
+	def pollsPerSec = group.updates/(timeSliceMilliseconds/1000)
+	println "updates from ${new Date(group.startDate)}: ${group.updates} (${pollsPerSec} checks per second)"
+	printTopKeys("U", group.domainUnmodified)
+	printTopKeys("M", group.domainModified)
+}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]