beagle r4786 - in trunk/beagle: Filters Util beagled/KonqHistoryQueryable



Author: dbera
Date: Sun Jun 15 00:47:25 2008
New Revision: 4786
URL: http://svn.gnome.org/viewvc/beagle?rev=4786&view=rev

Log:
Add support for KDE4 konqueror history.
* The cache files are now stored gzipped.
* The format of the gunzipped file has changed: the HTTP response headers are now also stored in the cached file.


Modified:
   trunk/beagle/Filters/FilterKonqHistory.cs
   trunk/beagle/Util/KonqHistoryUtil.cs
   trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs

Modified: trunk/beagle/Filters/FilterKonqHistory.cs
==============================================================================
--- trunk/beagle/Filters/FilterKonqHistory.cs	(original)
+++ trunk/beagle/Filters/FilterKonqHistory.cs	Sun Jun 15 00:47:25 2008
@@ -29,6 +29,7 @@
 using System.Collections;
 using System.IO;
 using System.Text;
+using ICSharpCode.SharpZipLib.GZip;
 
 using Beagle.Daemon;
 using Beagle.Util;
@@ -51,35 +52,69 @@
 			if (buf == null)
 				buf = new byte [1024];
 
-			StreamReader reader = new StreamReader (Stream, Encoding.GetEncoding (28591));
-
 			// read the charset hint from indexable
 			string charset = null;
+			string gzipped = null;
+			string revision = "7";
+
 			foreach (Property property in Indexable.Properties) {
-				if (property.Key != (StringFu.UnindexedNamespace + "charset"))
-					continue;
-				charset = (string) property.Value;
+				if (property.Key == (StringFu.UnindexedNamespace + "charset"))
+					charset = (string) property.Value;
+				else if (property.Key == (StringFu.UnindexedNamespace + "revision"))
+					revision = (string) property.Value;
+				else if (property.Key == (StringFu.UnindexedNamespace + "gzipped"))
+					gzipped = (string) property.Value.ToLower ();
 				//Console.WriteLine ("charset hint accepted: " + charset);
-				break;
 			}
-					
 
-			// now create a memorystream where htmlfilter will begin his show
-			Stream.Seek (0, SeekOrigin.Begin);
+			if (revision != "7" && revision != "9") {
+				Error ();
+				return;
+			}
+
+			Stream stream;
+			if (gzipped == "true")
+				stream = new GZipInputStream (Stream);
+			else
+				stream = Stream;
+
 			// count past 8 lines ... Streams suck!
 			int c = 0; // stores the number of newlines read
 			int b = 0;
-			while (c < 8 && (b = Stream.ReadByte ()) != -1) {
+			int lines_to_skip = 0;
+			if (revision == "7")
+				lines_to_skip = 8;
+			else if (revision == "9")
+				lines_to_skip = 7;
+
+			while (c < lines_to_skip && (b = stream.ReadByte ()) != -1) {
 				if (b == '\n')
 					c ++;
-			}	
+			}
+
+			if (revision == "9") {
+				// skip HTTP response headers i.e. keep reading till a blank line is found
+				long last_pos = 0; // GZipInputStream.Position can be fake, keep our own count
+				long cur_pos = 0;
+
+				while ((b = stream.ReadByte ()) != -1) {
+					cur_pos ++;
+					if (b != '\n')
+						continue;
+
+					if (cur_pos == last_pos + 1)
+						break;
+					else
+						last_pos = cur_pos;
+				}
+			}
+
 			// copy the rest of the file to a memory stream
 			MemoryStream mem_stream = new MemoryStream ();
-			while ((b = Stream.Read (buf, 0, 1024)) != 0)
+			while ((b = stream.Read (buf, 0, 1024)) != 0)
 				mem_stream.Write (buf, 0, b);
 			mem_stream.Seek (0, SeekOrigin.Begin);
-			reader.Close ();
-			
+
 			HtmlDocument doc = new HtmlDocument ();
 			doc.ReportNode += HandleNodeEvent;
 			doc.StreamMode = true;

Modified: trunk/beagle/Util/KonqHistoryUtil.cs
==============================================================================
--- trunk/beagle/Util/KonqHistoryUtil.cs	(original)
+++ trunk/beagle/Util/KonqHistoryUtil.cs	Sun Jun 15 00:47:25 2008
@@ -31,16 +31,46 @@
 	public class KonqHistoryUtil {
 		public const string KonqCacheMimeType = "beagle/x-konq-cache";
 
+		public static bool IsGZipCache (Stream stream)
+		{
+			byte id1 = (byte) stream.ReadByte ();
+			byte id2 = (byte) stream.ReadByte ();
+			stream.Position = 0;
+
+			return (id1 == (byte) 0x1f && id2 == (byte) 0x8b);
+		}
+
 		public static bool ShouldIndex (StreamReader reader,
 					  out string url,
 					  out string creation_date,
 					  out string mimetype,
-					  out string charset)
+					  out string charset,
+					  out string revision)
 		{
+			url = null;
+			creation_date = null;
+			mimetype = null;
+			charset = null;
+
 			// format from kdelibs/kioslave/http/http.cc
-			// line-1: Cache revision - mine is 7
+			// line-1: Cache revision
 			// FIXME: What happens when cache revision changes ???
-			reader.ReadLine ();
+			revision = reader.ReadLine ().Trim ();
+			if (revision == "7")
+				return ShouldIndexRev7 (reader, out url, out creation_date, out mimetype, out charset);
+			else if (revision == "9")
+				return ShouldIndexRev9 (reader, out url, out creation_date, out mimetype, out charset);
+
+			return false;
+		}
+
+		public static bool ShouldIndexRev7 (StreamReader reader,
+					  out string url,
+					  out string creation_date,
+					  out string mimetype,
+					  out string charset)
+		{
+			// format from kdelibs/kioslave/http/http.cc
 
 			// line-2: URL
 			url = reader.ReadLine ();
@@ -79,5 +109,34 @@
 			// rest is data ...
 			return (mimetype == "text/html");
 		}
+
+		public static bool ShouldIndexRev9 (StreamReader reader,
+					  out string url,
+					  out string creation_date,
+					  out string mimetype,
+					  out string charset)
+		{
+			bool ret = ShouldIndexRev7 (reader, out url, out creation_date, out mimetype, out charset);
+
+			// Revision 9 is mostly the same as revision 7:
+			// 7 lines followed by HTTP response headers.
+			// http://websvn.kde.org/trunk/KDE/kdelibs/kioslave/http/http.cpp
+
+			// The charset line is at a different place. What the above function read was the HTTP response headers.
+			// Keep reading the HTTP response headers until an empty line is found, taking the charset from the Content-Type header if present
+			string line;
+			while ((line = reader.ReadLine ().ToLower ()) != String.Empty) {
+				if (! line.StartsWith ("content-type: "))
+					continue;
+
+				int pos = line.IndexOf ("charset=");
+				if (pos != -1)
+					charset = line.Substring (pos + 8);
+				else
+					charset = "utf-8";
+			}
+
+			return ret;
+		}
 	}
 }

Modified: trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs
==============================================================================
--- trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs	(original)
+++ trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs	Sun Jun 15 00:47:25 2008
@@ -28,6 +28,7 @@
 using System.Collections;
 using System.Threading;
 using System.Text;
+using ICSharpCode.SharpZipLib.GZip;
 
 using Beagle.Daemon;
 using Beagle.Util;
@@ -166,24 +167,34 @@
 		private Indexable FileToIndexable (string path, bool crawling) {
 			//Logger.Log.Debug ("KonqQ: Trying to index " + path);
 
-			FileStream stream;
+			Stream stream;
 			try {
 				stream = new FileStream (path, FileMode.Open, FileAccess.Read, FileShare.Read);
 			} catch (FileNotFoundException) {
 				// that was fast - lost the file
 				return null;
 			}
-			
+
+			bool gzip_cached = false;
+
+			if (KonqHistoryUtil.IsGZipCache (stream))
+				gzip_cached = true;
+
+			if (gzip_cached)
+				stream = new GZipInputStream (stream);
+
 			using (StreamReader reader = new StreamReader (stream, latin_encoding)) {
 				string url = null;
 				string creation_date = null;
 				string mimetype = null;
 				string charset = null;
+				string revision = null;
 				bool is_ok = KonqHistoryUtil.ShouldIndex (reader, 
 									  out url, 
 									  out creation_date,
 									  out mimetype, 
-									  out charset);
+									  out charset,
+									  out revision);
 			
 				if (!is_ok || url == String.Empty) {
 					//Logger.Log.Debug ("KonqQ: Skipping non-html file " + path + " of type=" + mimetype);
@@ -193,13 +204,13 @@
 					return null; // we wont index bad files and non-html files
 				}
 
-				//Logger.Log.Debug ("KonqQ: Indexing " + path + " with url=" + url);
+				//Logger.Log.Debug ("KonqQ: Indexing {0} = {1},{2},{3},{4},{5}", path, url, creation_date, mimetype, charset, revision);
 				Uri uri = new Uri (url, true);
 				if (uri.Scheme == Uri.UriSchemeHttps) {
 					Logger.Log.Error ("Indexing secure https:// URIs is not secure!");
 					return null;
 				}
-			
+
 				Indexable indexable = new Indexable (uri);
 				indexable.HitType = "WebHistory";
 				indexable.MimeType = KonqHistoryUtil.KonqCacheMimeType;
@@ -207,8 +218,10 @@
 				indexable.AddProperty (Property.NewUnstored ("fixme:urltoken", StringFu.UrlFuzzyDivide (url)));
 				// hint for the filter about the charset
 				indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "charset", charset));
+				indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "gzipped", gzip_cached));
+				indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "revision", revision));
 				indexable.FlushBufferCache = crawling;
-			
+
 				DateTime date = DateTimeUtil.UnixToDateTimeUtc (0);
 				date = date.AddSeconds (Int64.Parse (creation_date));
 				indexable.Timestamp = date;
@@ -253,7 +266,8 @@
 				current_file = (FileInfo) file_enumerator.Current;
 				//if (!IsUpToDate (current_file.FullName))
 				//	Logger.Log.Debug (current_file.FullName + " is not upto date");
-			} while (IsUpToDate (current_file.FullName));
+				// KDE4 cache contains _freq files which are non-cache files
+			} while (current_file.FullName.EndsWith ("_freq") || IsUpToDate (current_file.FullName));
 
 			return true;
 		}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]