beagle r4755 - in trunk/beagle: . BeagleClient Filters beagled



Author: dbera
Date: Sun May 18 02:46:42 2008
New Revision: 4755
URL: http://svn.gnome.org/viewvc/beagle?rev=4755&view=rev

Log:
Merge the links-extraction feature from the rdf branch. Builds with and without ENABLE_RDF_ADAPTER. TODO for the links feature:
- store the links in the index (do we need to ?)
- create virtual RDF property for the links metadata


Modified:
   trunk/beagle/BeagleClient/Indexable.cs
   trunk/beagle/Filters/FilterHtml.cs
   trunk/beagle/Filters/FilterMail.cs
   trunk/beagle/beagled/ExtractContent.cs
   trunk/beagle/beagled/Filter.cs
   trunk/beagle/beagled/FilterFactory.cs
   trunk/beagle/beagled/LuceneCommon.cs
   trunk/beagle/beagled/LuceneIndexingDriver.cs
   trunk/beagle/beagled/Makefile.am
   trunk/beagle/beagled/NoiseFilter.cs
   trunk/beagle/beagled/TextCache.cs
   trunk/beagle/configure.in

Modified: trunk/beagle/BeagleClient/Indexable.cs
==============================================================================
--- trunk/beagle/BeagleClient/Indexable.cs	(original)
+++ trunk/beagle/BeagleClient/Indexable.cs	Sun May 18 02:46:42 2008
@@ -28,6 +28,7 @@
 
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Threading;
@@ -420,6 +421,17 @@
 			binary_stream = stream;
 		}
 
+#if ENABLE_RDF_ADAPTER
+		// List of links found in the content while indexing
+		private IList<string> links = null;
+
+		[XmlIgnore]
+		public IList<string> Links {
+			get { return links; }
+			set { links = value; }
+		}
+#endif
+
 		[XmlArrayItem (ElementName="Property", Type=typeof (Property))]
 		public ArrayList Properties {
 			get { return properties; }

Modified: trunk/beagle/Filters/FilterHtml.cs
==============================================================================
--- trunk/beagle/Filters/FilterHtml.cs	(original)
+++ trunk/beagle/Filters/FilterHtml.cs	Sun May 18 02:46:42 2008
@@ -61,6 +61,7 @@
 		public delegate void AddPropertyCallback (Beagle.Property p);
 		public delegate bool AppendSpaceCallback ();
 		public delegate void HotCallback ();
+		public delegate void AddLinkCallback (string s);
 
 		// delegates
 		private new AppendTextCallback AppendText;
@@ -70,6 +71,7 @@
 		private new AppendSpaceCallback AppendStructuralBreak;
 		private new HotCallback HotUp;
 		private new HotCallback HotDown;
+		private new AddLinkCallback AddLink;
 
 		// 1: Add meta keyword fields as meta:key
 		private int version = 1;
@@ -88,6 +90,9 @@
 				AppendStructuralBreak = new AppendSpaceCallback (base.AppendStructuralBreak);
 				HotUp = new HotCallback (base.HotUp);
 				HotDown = new HotCallback (base.HotDown);
+#if ENABLE_RDF_ADAPTER
+				AddLink = new AddLinkCallback (base.AddLink);
+#endif
 			}
 
 			ignore_level = 0;
@@ -103,6 +108,14 @@
 			base.SetVersion (this.version);
 		}
 
+#if ENABLE_RDF_ADAPTER
+		public void SetAddLinkHandler (AddLinkCallback link_handler)
+		{
+			if (link_handler != null)
+				AddLink = link_handler;
+		}
+#endif
+
 		protected bool NodeIsHot (String nodeName) 
 		{
 			return nodeName == "b"
@@ -222,6 +235,11 @@
 						string s = HtmlEntity.DeEntitize (
 							    SW.HttpUtility.UrlDecode (attr, enc));
 						AppendWord (s);
+#if ENABLE_RDF_ADAPTER
+						// Add valid and global URLs to special field "Link"
+						if (s.StartsWith ("http://") || s.StartsWith ("mailto:") || s.StartsWith ("ftp://"))
+							AddLink (s);
+#endif
 						ret = AppendWhiteSpace ();
 					}
 				} else if (node.Name == "br") // both <br> and </br> are used - special case
@@ -438,11 +456,19 @@
 
 		public static TextReader GetHtmlReader (Stream stream, string charset)
 		{
+			return GetHtmlReader (stream, charset, null);
+		}
+
+		public static TextReader GetHtmlReader (Stream stream, string charset, AddLinkCallback link_handler)
+		{
 			if (stream == null)
 				throw new ArgumentNullException ("stream");
 
 			FilterHtml html_filter = new FilterHtml ();
 			html_filter.SnippetMode = false;
+#if ENABLE_RDF_ADAPTER
+			html_filter.SetAddLinkHandler (link_handler);
+#endif
 
 			html_filter.Indexable = new Indexable (); // fake an indexable
 			html_filter.AddProperty (Property.NewUnsearched (StringFu.UnindexedNamespace + "encoding", charset));

Modified: trunk/beagle/Filters/FilterMail.cs
==============================================================================
--- trunk/beagle/Filters/FilterMail.cs	(original)
+++ trunk/beagle/Filters/FilterMail.cs	Sun May 18 02:46:42 2008
@@ -147,6 +147,7 @@
 					AddProperty (Property.New ("fixme:to_address", ia.Addr));
 
 				AddProperty (Property.New ("fixme:to_name", ia.Name));
+				AddEmailLink (ia);
 			}
 			addrs.Dispose ();
 
@@ -157,6 +158,7 @@
 					AddProperty (Property.New ("fixme:cc_address", ia.Addr));
 
 				AddProperty (Property.New ("fixme:cc_name", ia.Name));
+				AddEmailLink (ia);
 			}
 			addrs.Dispose ();
 
@@ -167,6 +169,7 @@
 					AddProperty (Property.New ("fixme:from_address", ia.Addr));
 
 				AddProperty (Property.New ("fixme:from_name", ia.Name));
+				AddEmailLink (ia);
 			}
 			addrs.Dispose ();
 
@@ -202,9 +205,23 @@
 				AddProperty (Property.NewFlag ("fixme:isSent"));
 		}
 
+		private void AddEmailLink (GMime.InternetAddress ia)
+		{
+#if ENABLE_RDF_ADAPTER
+			if (String.IsNullOrEmpty (ia.Name))
+				AddLink (String.Concat ("mailto://", ia.Addr));
+			else
+				AddLink (String.Concat ("mailto://", ia.Addr, "/", Uri.EscapeDataString (ia.Name)));
+#endif
+		}
+
 		protected override void DoPullSetup ()
 		{
-			this.handler = new PartHandler (Indexable);
+			this.handler = new PartHandler (Indexable,
+							delegate (string s)
+							{
+								AddLink (s);
+							});
 			using (GMime.Object mime_part = this.message.MimePart)
 				this.handler.OnEachPart (mime_part);
 
@@ -274,6 +291,7 @@
 			private int depth = 0; // part recursion depth
 			private ArrayList child_indexables = new ArrayList ();
 			private TextReader reader;
+			private FilterHtml.AddLinkCallback link_handler;
 
 			private bool html_part = false;
 			internal bool HtmlPart {
@@ -291,9 +309,10 @@
 				"text/x-vcard"
 			};
 
-			public PartHandler (Indexable parent_indexable)
+			public PartHandler (Indexable parent_indexable, FilterHtml.AddLinkCallback link_handler)
 			{
 				this.indexable = parent_indexable;
+				this.link_handler = link_handler;
 			}
 
 			private bool IsMimeTypeHandled (string mime_type)
@@ -406,7 +425,7 @@
 							stream.Close ();
 
 							try {
-								this.reader = FilterHtml.GetHtmlReader (html_stream, enc);
+								this.reader = FilterHtml.GetHtmlReader (html_stream, enc, link_handler);
 							} catch (Exception e) {
 								Log.Debug (e, "Exception while filtering HTML email {0}", this.indexable.Uri);
 								this.reader = null;

Modified: trunk/beagle/beagled/ExtractContent.cs
==============================================================================
--- trunk/beagle/beagled/ExtractContent.cs	(original)
+++ trunk/beagle/beagled/ExtractContent.cs	Sun May 18 02:46:42 2008
@@ -30,6 +30,7 @@
 using System.Net;
 using System.Reflection;
 using System.Collections;
+using System.Collections.Generic;
 
 using Beagle;
 using Beagle.Util;
@@ -278,6 +279,16 @@
 		Console.WriteLine ();
 		Console.WriteLine ("Text extracted in {0}", watch);
 
+#if ENABLE_RDF_ADAPTER
+		IList<string> links = indexable.Links;
+		if (links != null && links.Count != 0) {
+			Console.WriteLine ("Links:");
+			foreach (string link in links)
+				Console.WriteLine (link);
+			Console.WriteLine ();
+		}
+#endif
+
 		foreach (Indexable gi in generated_indexables)
 			Display (gi);
 

Modified: trunk/beagle/beagled/Filter.cs
==============================================================================
--- trunk/beagle/beagled/Filter.cs	(original)
+++ trunk/beagle/beagled/Filter.cs	Sun May 18 02:46:42 2008
@@ -28,6 +28,7 @@
 
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Reflection;
@@ -448,6 +449,21 @@
 			return UpdateCharsAdded (1);
 		}
 
+#if ENABLE_RDF_ADAPTER
+		private IList<string> links_list = new List<string> ();
+
+		public IList<string> Links {
+			get { return links_list; }
+		}
+#endif
+
+		public void AddLink (string link)
+		{
+#if ENABLE_RDF_ADAPTER
+			links_list.Add (link);
+#endif
+		}
+
 		//private bool NeedsWhiteSpace (ArrayList array)
 		//{
 		//	if (array.Count == 0)

Modified: trunk/beagle/beagled/FilterFactory.cs
==============================================================================
--- trunk/beagle/beagled/FilterFactory.cs	(original)
+++ trunk/beagle/beagled/FilterFactory.cs	Sun May 18 02:46:42 2008
@@ -329,6 +329,9 @@
 
 					indexable.SetTextReader (candidate_filter.GetTextReader ());
 					indexable.SetHotTextReader (candidate_filter.GetHotTextReader ());
+#if ENABLE_RDF_ADAPTER
+					indexable.Links = candidate_filter.Links;
+#endif
 
 					if (Debug)
 						Logger.Log.Debug ("Successfully filtered {0} with {1}", path, candidate_filter);

Modified: trunk/beagle/beagled/LuceneCommon.cs
==============================================================================
--- trunk/beagle/beagled/LuceneCommon.cs	(original)
+++ trunk/beagle/beagled/LuceneCommon.cs	Sun May 18 02:46:42 2008
@@ -492,6 +492,15 @@
 			private bool strip_extra_property_info = false;
 			private bool tokenize_email_hostname = false;
 
+			private NoiseEmailHostFilter.LinkCallback add_link = null;
+
+			public NoiseEmailHostFilter.LinkCallback AddLink {
+				set {
+				    lock (this)
+					    add_link = value;
+				}
+			}
+
 			public BeagleAnalyzer (bool is_indexing_analyzer)
 			{
 				if (is_indexing_analyzer) {
@@ -545,11 +554,17 @@
 				TokenStream outstream;
 				outstream = base.TokenStream (fieldName, reader);
 
+				NoiseEmailHostFilter.LinkCallback add_link_callback = null;
+				lock (this) {
+					if (fieldName == "Text")
+						add_link_callback = add_link;
+				}
+
 				if (fieldName == "Text"
 				    || fieldName == "HotText"
 				    || fieldName == "PropertyText"
 				    || is_text_prop) {
-					outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
+					outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname, add_link_callback);
 					// Sharing Stemmer is not thread safe.
 					// Currently our underlying lucene indexing is not done in multiple threads.
 					StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
@@ -560,11 +575,11 @@
 			}
 		}
 
-		static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
-		static private Analyzer query_analyzer = new BeagleAnalyzer (false);
+		static private BeagleAnalyzer indexing_analyzer = new BeagleAnalyzer (true);
+		static private BeagleAnalyzer query_analyzer = new BeagleAnalyzer (false);
 
-		static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
-		static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
+		static protected BeagleAnalyzer IndexingAnalyzer { get { return indexing_analyzer; } }
+		static protected BeagleAnalyzer QueryAnalyzer { get { return query_analyzer; } }
 
 		////////////////////////////////////////////////////////////////
 

Modified: trunk/beagle/beagled/LuceneIndexingDriver.cs
==============================================================================
--- trunk/beagle/beagled/LuceneIndexingDriver.cs	(original)
+++ trunk/beagle/beagled/LuceneIndexingDriver.cs	Sun May 18 02:46:42 2008
@@ -622,8 +622,21 @@
 			Document primary_doc = null, secondary_doc = null;
 
 			try {
+#if ENABLE_RDF_ADAPTER
+				// Add a callback to extract emails and links from the anaylyzer
+				// and add them to secondary_doc's "References" field.
+				IndexingAnalyzer.AddLink = delegate (string s, bool email)
+							    {
+								    // Only add emails for now
+								    // NoiseFilter is not good with URLs
+								    if (! email || indexable.Links == null)
+									    return;
+								    indexable.Links.Add (s);
+							    };
+#endif
 				BuildDocuments (indexable, out primary_doc, out secondary_doc);
 				primary_writer.AddDocument (primary_doc);
+				IndexingAnalyzer.AddLink = null;
 			} catch (Exception ex) {
 					
 				// If an exception was thrown, something bad probably happened
@@ -652,6 +665,12 @@
 				secondary_writer.AddDocument (secondary_doc);
 			}
 
+#if ENABLE_RDF_ADAPTER
+			// Store the extracted links in the textcache
+			if (! disable_textcache && text_cache != null)
+				text_cache.AddLinks (indexable.Uri, indexable.Links);
+#endif
+
 			AdjustItemCount (1);
 		}
 

Modified: trunk/beagle/beagled/Makefile.am
==============================================================================
--- trunk/beagle/beagled/Makefile.am	(original)
+++ trunk/beagle/beagled/Makefile.am	Sun May 18 02:46:42 2008
@@ -54,7 +54,7 @@
 
 PLUGIN_TARGET = BeagleDaemonPlugins.dll
 
-PLUGIN_CSFLAGS = -target:library
+PLUGIN_CSFLAGS = -target:library $(BEAGLE_DEFINES)
 
 PLUGIN_CSFILES =				\
 	$(srcdir)/Flavor.cs			\
@@ -527,7 +527,8 @@
 
 
 EXTRACT_CONTENT_CSFLAGS = \
-	-target:exe
+	-target:exe	  \
+	$(BEAGLE_DEFINES)
 
 EXTRACT_CONTENT_CSFILES = \
 	$(srcdir)/ExtractContent.cs		\

Modified: trunk/beagle/beagled/NoiseFilter.cs
==============================================================================
--- trunk/beagle/beagled/NoiseFilter.cs	(original)
+++ trunk/beagle/beagled/NoiseFilter.cs	Sun May 18 02:46:42 2008
@@ -40,15 +40,24 @@
 	// 3. Splits hostnames into subparts
 	public class NoiseEmailHostFilter : TokenFilter {
 			
+		public delegate void LinkCallback (string s, bool email);
+		private LinkCallback link_call_back;
+
 		private bool tokenize_email_hostname;
 
 		TokenStream token_stream;
 
 		public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname)
+			: this (input, tokenize_email_hostname, null)
+		{
+		}
+
+		public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname, LinkCallback link_call_back)
 			: base (input)
 		{
 			this.token_stream = input;
 			this.tokenize_email_hostname = tokenize_email_hostname;
+			this.link_call_back = link_call_back;
 		}
 
 		// FIXME: we should add some heuristics that are stricter
@@ -248,6 +257,10 @@
 			// and also remove the final tld part
 			Array.Copy (parts, 0, parts, 1, parts.Length - 1);
 			parts [0] = email.Substring (0, index_at);
+#if ENABLE_RDF_ADAPTER
+			if (link_call_back != null)
+				link_call_back ("mailto://" + email, true);
+#endif
 		}
 
 		private void ProcessURLToken (Lucene.Net.Analysis.Token token)

Modified: trunk/beagle/beagled/TextCache.cs
==============================================================================
--- trunk/beagle/beagled/TextCache.cs	(original)
+++ trunk/beagle/beagled/TextCache.cs	Sun May 18 02:46:42 2008
@@ -27,7 +27,9 @@
 
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using System.IO;
+using System.Text;
 using System.Threading;
 
 using Mono.Data.Sqlite;
@@ -50,10 +52,14 @@
 
 		private const string SELF_CACHE_TAG = "*self*";
 		private const string BLOB_TAG = "*blob*";
-		public SqliteCommand InsertCommand;
-		public SqliteCommand LookupPathCommand;
-		public SqliteCommand LookupDataCommand;
-		public SqliteCommand DeleteCommand;
+		private SqliteCommand InsertCommand;
+		private SqliteCommand LookupPathCommand;
+		private SqliteCommand LookupDataCommand;
+		private SqliteCommand DeleteCommand;
+#if ENABLE_RDF_ADAPTER
+		private SqliteCommand UpdateLinksCommand;
+		private SqliteCommand LookupLinksCommand;
+#endif
 		private string text_cache_dir;
 		internal string TextCacheDir {
 			get { return text_cache_dir; }
@@ -163,12 +169,21 @@
 
 				// Database schema: uri, filename, data
 				SqliteUtils.DoNonQuery (connection,
-							"CREATE TABLE textcache_data (            " +
+							"CREATE TABLE textcache_data (     " +
 							"  uri      TEXT UNIQUE NOT NULL,  " +
 							"  filename TEXT NOT NULL,         " +
-							"  data     BLOB                     " +
+							"  data     BLOB                   " +
 							")");
 			}
+#if ENABLE_RDF_ADAPTER
+			try {
+				SqliteUtils.DoNonQuery (connection,
+							"CREATE TABLE links_data (  " +
+							"  uri TEXT UNIQUE NOT NULL," +
+							"  links TEXT		    " +
+							")");
+			} catch { }
+#endif
 			this.InitCommands ();
 		}
 
@@ -182,6 +197,14 @@
 			LookupDataCommand.CommandText = "SELECT filename, data FROM textcache_data WHERE uri=@uri";
 			DeleteCommand = new SqliteCommand (this.connection);
 			DeleteCommand.CommandText = "DELETE FROM textcache_data WHERE uri=@uri";
+
+#if ENABLE_RDF_ADAPTER
+			UpdateLinksCommand = new SqliteCommand (this.connection);
+			UpdateLinksCommand.CommandText = "UPDATE links_data SET links=@links WHERE uri=@uri";
+
+			LookupLinksCommand = new SqliteCommand (this.connection);
+			LookupLinksCommand.CommandText = "SELECT links FROM links_data WHERE uri=@uri";
+#endif
 		}
 		private SqliteConnection Open (string db_filename)
 		{
@@ -458,7 +481,7 @@
 
 			lock (connection) {
 				
-				LookupDataCommand.Parameters.AddWithValue("@uri",UriToString (uri));
+				LookupDataCommand.Parameters.AddWithValue ("@uri", UriToString (uri));
 				using (SqliteDataReader reader = SqliteUtils.ExecuteReaderOrWait (LookupDataCommand)) {
 					if (! SqliteUtils.ReadOrWait (reader)) {
 						if (self_cache)
@@ -466,9 +489,9 @@
 						return null;
 					}
 
-				filename = reader.GetString (0);
-				if (! reader.IsDBNull (1))
-					blob = reader.GetValue (1) as byte [];
+					filename = reader.GetString (0);
+					if (! reader.IsDBNull (1))
+						blob = reader.GetValue (1) as byte [];
 				}
 
 			}
@@ -518,6 +541,58 @@
 			}
 		}
 
+#if ENABLE_RDF_ADAPTER
+		public void AddLinks (Uri uri, IList<string> links)
+		{
+			lock (connection) {
+				string path = LookupPathRawUnlocked (uri);
+				if (path == null)
+					return;
+				MaybeStartTransaction_Unlocked ();
+				UpdateLinksCommand.Parameters.AddWithValue("@uri", UriToString (uri));
+				UpdateLinksCommand.Parameters.AddWithValue("@links", GetLinksText (links));
+				SqliteUtils.DoNonQuery (UpdateLinksCommand);
+			}
+		}
+
+		private string GetLinksText (IList<string> links)
+		{
+			if (links == null || links.Count == 0)
+				return String.Empty;
+
+			StringBuilder sb = new StringBuilder ();
+			foreach (string s in links) {
+				sb.Append (s);
+				sb.Append (' ');
+			}
+
+			return sb.ToString ();
+		}
+
+		public IList<string> GetLinks (Uri uri)
+		{
+			string links_text = null;
+			List<string> links = null;
+
+			lock (connection) {
+				LookupLinksCommand.Parameters.AddWithValue ("@uri", UriToString (uri));
+				using (SqliteDataReader reader = SqliteUtils.ExecuteReaderOrWait (LookupLinksCommand)) {
+					if (! SqliteUtils.ReadOrWait (reader))
+						return null;
+
+					links_text = reader.GetString (0);
+				}
+			}
+
+			if (String.IsNullOrEmpty (links_text))
+				return null;
+
+			return links_text.Split (links_separator, StringSplitOptions.RemoveEmptyEntries);
+		}
+
+		static readonly char[] links_separator = new char[] {' '};
+#endif
+
 		private void MaybeStartTransaction_Unlocked ()
 		{
 			if (transaction_state == TransactionState.Requested)

Modified: trunk/beagle/configure.in
==============================================================================
--- trunk/beagle/configure.in	(original)
+++ trunk/beagle/configure.in	Sun May 18 02:46:42 2008
@@ -717,6 +717,6 @@
 	beagle-search GUI	  ${enable_gui}
 	Qt beagle-settings GUI    ${enable_qt}
 
-	Build RDF Adapter         ${enable_rdf_adapter}
+	Build RDF Adapter         ${enable_rdf_adapter} (purely experimental)
 	Build docs?               ${with_docs}
 "



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]