beagle r4786 - in trunk/beagle: Filters Util beagled/KonqHistoryQueryable
- From: dbera svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4786 - in trunk/beagle: Filters Util beagled/KonqHistoryQueryable
- Date: Sun, 15 Jun 2008 00:47:26 +0000 (UTC)
Author: dbera
Date: Sun Jun 15 00:47:25 2008
New Revision: 4786
URL: http://svn.gnome.org/viewvc/beagle?rev=4786&view=rev
Log:
Add support for KDE4 Konqueror history.
* The cache files are now stored gzipped.
* The format of the gunzipped file has changed: the HTTP response headers are now also stored in the cached file.
Modified:
trunk/beagle/Filters/FilterKonqHistory.cs
trunk/beagle/Util/KonqHistoryUtil.cs
trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs
Modified: trunk/beagle/Filters/FilterKonqHistory.cs
==============================================================================
--- trunk/beagle/Filters/FilterKonqHistory.cs (original)
+++ trunk/beagle/Filters/FilterKonqHistory.cs Sun Jun 15 00:47:25 2008
@@ -29,6 +29,7 @@
using System.Collections;
using System.IO;
using System.Text;
+using ICSharpCode.SharpZipLib.GZip;
using Beagle.Daemon;
using Beagle.Util;
@@ -51,35 +52,69 @@
if (buf == null)
buf = new byte [1024];
- StreamReader reader = new StreamReader (Stream, Encoding.GetEncoding (28591));
-
// read the charset hint from indexable
string charset = null;
+ string gzipped = null;
+ string revision = "7";
+
foreach (Property property in Indexable.Properties) {
- if (property.Key != (StringFu.UnindexedNamespace + "charset"))
- continue;
- charset = (string) property.Value;
+ if (property.Key == (StringFu.UnindexedNamespace + "charset"))
+ charset = (string) property.Value;
+ else if (property.Key == (StringFu.UnindexedNamespace + "revision"))
+ revision = (string) property.Value;
+ else if (property.Key == (StringFu.UnindexedNamespace + "gzipped"))
+ gzipped = (string) property.Value.ToLower ();
//Console.WriteLine ("charset hint accepted: " + charset);
- break;
}
-
- // now create a memorystream where htmlfilter will begin his show
- Stream.Seek (0, SeekOrigin.Begin);
+ if (revision != "7" && revision != "9") {
+ Error ();
+ return;
+ }
+
+ Stream stream;
+ if (gzipped == "true")
+ stream = new GZipInputStream (Stream);
+ else
+ stream = Stream;
+
// count past 8 lines ... Streams suck!
int c = 0; // stores the number of newlines read
int b = 0;
- while (c < 8 && (b = Stream.ReadByte ()) != -1) {
+ int lines_to_skip = 0;
+ if (revision == "7")
+ lines_to_skip = 8;
+ else if (revision == "9")
+ lines_to_skip = 7;
+
+ while (c < lines_to_skip && (b = stream.ReadByte ()) != -1) {
if (b == '\n')
c ++;
- }
+ }
+
+ if (revision == "9") {
+ // skip HTTP response headers i.e. keep reading till a blank line is found
+ long last_pos = 0; // GZipInputStream.Position can be fake, keep our own count
+ long cur_pos = 0;
+
+ while ((b = stream.ReadByte ()) != -1) {
+ cur_pos ++;
+ if (b != '\n')
+ continue;
+
+ if (cur_pos == last_pos + 1)
+ break;
+ else
+ last_pos = cur_pos;
+ }
+ }
+
// copy the rest of the file to a memory stream
MemoryStream mem_stream = new MemoryStream ();
- while ((b = Stream.Read (buf, 0, 1024)) != 0)
+ while ((b = stream.Read (buf, 0, 1024)) != 0)
mem_stream.Write (buf, 0, b);
mem_stream.Seek (0, SeekOrigin.Begin);
- reader.Close ();
-
+
HtmlDocument doc = new HtmlDocument ();
doc.ReportNode += HandleNodeEvent;
doc.StreamMode = true;
Modified: trunk/beagle/Util/KonqHistoryUtil.cs
==============================================================================
--- trunk/beagle/Util/KonqHistoryUtil.cs (original)
+++ trunk/beagle/Util/KonqHistoryUtil.cs Sun Jun 15 00:47:25 2008
@@ -31,16 +31,46 @@
public class KonqHistoryUtil {
public const string KonqCacheMimeType = "beagle/x-konq-cache";
+ public static bool IsGZipCache (Stream stream)
+ {
+ byte id1 = (byte) stream.ReadByte ();
+ byte id2 = (byte) stream.ReadByte ();
+ stream.Position = 0;
+
+ return (id1 == (byte) 0x1f && id2 == (byte) 0x8b);
+ }
+
public static bool ShouldIndex (StreamReader reader,
out string url,
out string creation_date,
out string mimetype,
- out string charset)
+ out string charset,
+ out string revision)
{
+ url = null;
+ creation_date = null;
+ mimetype = null;
+ charset = null;
+
// format from kdelibs/kioslave/http/http.cc
- // line-1: Cache revision - mine is 7
+ // line-1: Cache revision
// FIXME: What happens when cache revision changes ???
- reader.ReadLine ();
+ revision = reader.ReadLine ().Trim ();
+ if (revision == "7")
+ return ShouldIndexRev7 (reader, out url, out creation_date, out mimetype, out charset);
+ else if (revision == "9")
+ return ShouldIndexRev9 (reader, out url, out creation_date, out mimetype, out charset);
+
+ return false;
+ }
+
+ public static bool ShouldIndexRev7 (StreamReader reader,
+ out string url,
+ out string creation_date,
+ out string mimetype,
+ out string charset)
+ {
+ // format from kdelibs/kioslave/http/http.cc
// line-2: URL
url = reader.ReadLine ();
@@ -79,5 +109,34 @@
// rest is data ...
return (mimetype == "text/html");
}
+
+ public static bool ShouldIndexRev9 (StreamReader reader,
+ out string url,
+ out string creation_date,
+ out string mimetype,
+ out string charset)
+ {
+ bool ret = ShouldIndexRev7 (reader, out url, out creation_date, out mimetype, out charset);
+
+ // Revision 9 is mostly the same as revision 7
+ // 7 lines followed by HTTP response headers.
+ // http://websvn.kde.org/trunk/KDE/kdelibs/kioslave/http/http.cpp
+
+ // The charset line is at a different place. What the above function read was the HTTP response headers.
+ // Keep reading the HTTP response headers till an empty line is found or the charset line is found
+ string line;
+ while ((line = reader.ReadLine ().ToLower ()) != String.Empty) {
+ if (! line.StartsWith ("content-type: "))
+ continue;
+
+ int pos = line.IndexOf ("charset=");
+ if (pos != -1)
+ charset = line.Substring (pos + 8);
+ else
+ charset = "utf-8";
+ }
+
+ return ret;
+ }
}
}
Modified: trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs
==============================================================================
--- trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs (original)
+++ trunk/beagle/beagled/KonqHistoryQueryable/KonqQueryable.cs Sun Jun 15 00:47:25 2008
@@ -28,6 +28,7 @@
using System.Collections;
using System.Threading;
using System.Text;
+using ICSharpCode.SharpZipLib.GZip;
using Beagle.Daemon;
using Beagle.Util;
@@ -166,24 +167,34 @@
private Indexable FileToIndexable (string path, bool crawling) {
//Logger.Log.Debug ("KonqQ: Trying to index " + path);
- FileStream stream;
+ Stream stream;
try {
stream = new FileStream (path, FileMode.Open, FileAccess.Read, FileShare.Read);
} catch (FileNotFoundException) {
// that was fast - lost the file
return null;
}
-
+
+ bool gzip_cached = false;
+
+ if (KonqHistoryUtil.IsGZipCache (stream))
+ gzip_cached = true;
+
+ if (gzip_cached)
+ stream = new GZipInputStream (stream);
+
using (StreamReader reader = new StreamReader (stream, latin_encoding)) {
string url = null;
string creation_date = null;
string mimetype = null;
string charset = null;
+ string revision = null;
bool is_ok = KonqHistoryUtil.ShouldIndex (reader,
out url,
out creation_date,
out mimetype,
- out charset);
+ out charset,
+ out revision);
if (!is_ok || url == String.Empty) {
//Logger.Log.Debug ("KonqQ: Skipping non-html file " + path + " of type=" + mimetype);
@@ -193,13 +204,13 @@
return null; // we wont index bad files and non-html files
}
- //Logger.Log.Debug ("KonqQ: Indexing " + path + " with url=" + url);
+ //Logger.Log.Debug ("KonqQ: Indexing {0} = {1},{2},{3},{4},{5}", path, url, creation_date, mimetype, charset, revision);
Uri uri = new Uri (url, true);
if (uri.Scheme == Uri.UriSchemeHttps) {
Logger.Log.Error ("Indexing secure https:// URIs is not secure!");
return null;
}
-
+
Indexable indexable = new Indexable (uri);
indexable.HitType = "WebHistory";
indexable.MimeType = KonqHistoryUtil.KonqCacheMimeType;
@@ -207,8 +218,10 @@
indexable.AddProperty (Property.NewUnstored ("fixme:urltoken", StringFu.UrlFuzzyDivide (url)));
// hint for the filter about the charset
indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "charset", charset));
+ indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "gzipped", gzip_cached));
+ indexable.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "revision", revision));
indexable.FlushBufferCache = crawling;
-
+
DateTime date = DateTimeUtil.UnixToDateTimeUtc (0);
date = date.AddSeconds (Int64.Parse (creation_date));
indexable.Timestamp = date;
@@ -253,7 +266,8 @@
current_file = (FileInfo) file_enumerator.Current;
//if (!IsUpToDate (current_file.FullName))
// Logger.Log.Debug (current_file.FullName + " is not upto date");
- } while (IsUpToDate (current_file.FullName));
+ // KDE4 cache contains _freq files which are non-cache files
+ } while (current_file.FullName.EndsWith ("_freq") || IsUpToDate (current_file.FullName));
return true;
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]