beagle r4755 - in trunk/beagle: . BeagleClient Filters beagled
- From: dbera svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4755 - in trunk/beagle: . BeagleClient Filters beagled
- Date: Sun, 18 May 2008 02:46:43 +0000 (UTC)
Author: dbera
Date: Sun May 18 02:46:42 2008
New Revision: 4755
URL: http://svn.gnome.org/viewvc/beagle?rev=4755&view=rev
Log:
Merge the links-extraction feature from the rdf branch. Builds with and without ENABLE_RDF_ADAPTER. TODO for the links feature:
- store the links in the index (do we need to?)
- create virtual RDF property for the links metadata
Modified:
trunk/beagle/BeagleClient/Indexable.cs
trunk/beagle/Filters/FilterHtml.cs
trunk/beagle/Filters/FilterMail.cs
trunk/beagle/beagled/ExtractContent.cs
trunk/beagle/beagled/Filter.cs
trunk/beagle/beagled/FilterFactory.cs
trunk/beagle/beagled/LuceneCommon.cs
trunk/beagle/beagled/LuceneIndexingDriver.cs
trunk/beagle/beagled/Makefile.am
trunk/beagle/beagled/NoiseFilter.cs
trunk/beagle/beagled/TextCache.cs
trunk/beagle/configure.in
Modified: trunk/beagle/BeagleClient/Indexable.cs
==============================================================================
--- trunk/beagle/BeagleClient/Indexable.cs (original)
+++ trunk/beagle/BeagleClient/Indexable.cs Sun May 18 02:46:42 2008
@@ -28,6 +28,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading;
@@ -420,6 +421,17 @@
binary_stream = stream;
}
+#if ENABLE_RDF_ADAPTER
+ // List of links found in the content while indexing
+ private IList<string> links = null;
+
+ [XmlIgnore]
+ public IList<string> Links {
+ get { return links; }
+ set { links = value; }
+ }
+#endif
+
[XmlArrayItem (ElementName="Property", Type=typeof (Property))]
public ArrayList Properties {
get { return properties; }
Modified: trunk/beagle/Filters/FilterHtml.cs
==============================================================================
--- trunk/beagle/Filters/FilterHtml.cs (original)
+++ trunk/beagle/Filters/FilterHtml.cs Sun May 18 02:46:42 2008
@@ -61,6 +61,7 @@
public delegate void AddPropertyCallback (Beagle.Property p);
public delegate bool AppendSpaceCallback ();
public delegate void HotCallback ();
+ public delegate void AddLinkCallback (string s);
// delegates
private new AppendTextCallback AppendText;
@@ -70,6 +71,7 @@
private new AppendSpaceCallback AppendStructuralBreak;
private new HotCallback HotUp;
private new HotCallback HotDown;
+ private new AddLinkCallback AddLink;
// 1: Add meta keyword fields as meta:key
private int version = 1;
@@ -88,6 +90,9 @@
AppendStructuralBreak = new AppendSpaceCallback (base.AppendStructuralBreak);
HotUp = new HotCallback (base.HotUp);
HotDown = new HotCallback (base.HotDown);
+#if ENABLE_RDF_ADAPTER
+ AddLink = new AddLinkCallback (base.AddLink);
+#endif
}
ignore_level = 0;
@@ -103,6 +108,14 @@
base.SetVersion (this.version);
}
+#if ENABLE_RDF_ADAPTER
+ public void SetAddLinkHandler (AddLinkCallback link_handler)
+ {
+ if (link_handler != null)
+ AddLink = link_handler;
+ }
+#endif
+
protected bool NodeIsHot (String nodeName)
{
return nodeName == "b"
@@ -222,6 +235,11 @@
string s = HtmlEntity.DeEntitize (
SW.HttpUtility.UrlDecode (attr, enc));
AppendWord (s);
+#if ENABLE_RDF_ADAPTER
+ // Add valid and global URLs to special field "Link"
+ if (s.StartsWith ("http://") || s.StartsWith ("mailto:") || s.StartsWith ("ftp://"))
+ AddLink (s);
+#endif
ret = AppendWhiteSpace ();
}
} else if (node.Name == "br") // both <br> and </br> are used - special case
@@ -438,11 +456,19 @@
public static TextReader GetHtmlReader (Stream stream, string charset)
{
+ return GetHtmlReader (stream, charset, null);
+ }
+
+ public static TextReader GetHtmlReader (Stream stream, string charset, AddLinkCallback link_handler)
+ {
if (stream == null)
throw new ArgumentNullException ("stream");
FilterHtml html_filter = new FilterHtml ();
html_filter.SnippetMode = false;
+#if ENABLE_RDF_ADAPTER
+ html_filter.SetAddLinkHandler (link_handler);
+#endif
html_filter.Indexable = new Indexable (); // fake an indexable
html_filter.AddProperty (Property.NewUnsearched (StringFu. UnindexedNamespace + "encoding", charset));
Modified: trunk/beagle/Filters/FilterMail.cs
==============================================================================
--- trunk/beagle/Filters/FilterMail.cs (original)
+++ trunk/beagle/Filters/FilterMail.cs Sun May 18 02:46:42 2008
@@ -147,6 +147,7 @@
AddProperty (Property.New ("fixme:to_address", ia.Addr));
AddProperty (Property.New ("fixme:to_name", ia.Name));
+ AddEmailLink (ia);
}
addrs.Dispose ();
@@ -157,6 +158,7 @@
AddProperty (Property.New ("fixme:cc_address", ia.Addr));
AddProperty (Property.New ("fixme:cc_name", ia.Name));
+ AddEmailLink (ia);
}
addrs.Dispose ();
@@ -167,6 +169,7 @@
AddProperty (Property.New ("fixme:from_address", ia.Addr));
AddProperty (Property.New ("fixme:from_name", ia.Name));
+ AddEmailLink (ia);
}
addrs.Dispose ();
@@ -202,9 +205,23 @@
AddProperty (Property.NewFlag ("fixme:isSent"));
}
+ private void AddEmailLink (GMime.InternetAddress ia)
+ {
+#if ENABLE_RDF_ADAPTER
+ if (String.IsNullOrEmpty (ia.Name))
+ AddLink (String.Concat ("mailto://", ia.Addr));
+ else
+ AddLink (String.Concat ("mailto://", ia.Addr, "/", Uri.EscapeDataString (ia.Name)));
+#endif
+ }
+
protected override void DoPullSetup ()
{
- this.handler = new PartHandler (Indexable);
+ this.handler = new PartHandler (Indexable,
+ delegate (string s)
+ {
+ AddLink (s);
+ });
using (GMime.Object mime_part = this.message.MimePart)
this.handler.OnEachPart (mime_part);
@@ -274,6 +291,7 @@
private int depth = 0; // part recursion depth
private ArrayList child_indexables = new ArrayList ();
private TextReader reader;
+ private FilterHtml.AddLinkCallback link_handler;
private bool html_part = false;
internal bool HtmlPart {
@@ -291,9 +309,10 @@
"text/x-vcard"
};
- public PartHandler (Indexable parent_indexable)
+ public PartHandler (Indexable parent_indexable, FilterHtml.AddLinkCallback link_handler)
{
this.indexable = parent_indexable;
+ this.link_handler = link_handler;
}
private bool IsMimeTypeHandled (string mime_type)
@@ -406,7 +425,7 @@
stream.Close ();
try {
- this.reader = FilterHtml.GetHtmlReader (html_stream, enc);
+ this.reader = FilterHtml.GetHtmlReader (html_stream, enc, link_handler);
} catch (Exception e) {
Log.Debug (e, "Exception while filtering HTML email {0}", this.indexable.Uri);
this.reader = null;
Modified: trunk/beagle/beagled/ExtractContent.cs
==============================================================================
--- trunk/beagle/beagled/ExtractContent.cs (original)
+++ trunk/beagle/beagled/ExtractContent.cs Sun May 18 02:46:42 2008
@@ -30,6 +30,7 @@
using System.Net;
using System.Reflection;
using System.Collections;
+using System.Collections.Generic;
using Beagle;
using Beagle.Util;
@@ -278,6 +279,16 @@
Console.WriteLine ();
Console.WriteLine ("Text extracted in {0}", watch);
+#if ENABLE_RDF_ADAPTER
+ IList<string> links = indexable.Links;
+ if (links != null && links.Count != 0) {
+ Console.WriteLine ("Links:");
+ foreach (string link in links)
+ Console.WriteLine (link);
+ Console.WriteLine ();
+ }
+#endif
+
foreach (Indexable gi in generated_indexables)
Display (gi);
Modified: trunk/beagle/beagled/Filter.cs
==============================================================================
--- trunk/beagle/beagled/Filter.cs (original)
+++ trunk/beagle/beagled/Filter.cs Sun May 18 02:46:42 2008
@@ -28,6 +28,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Reflection;
@@ -448,6 +449,21 @@
return UpdateCharsAdded (1);
}
+#if ENABLE_RDF_ADAPTER
+ private IList<string> links_list = new List<string> ();
+
+ public IList<string> Links {
+ get { return links_list; }
+ }
+#endif
+
+ public void AddLink (string link)
+ {
+#if ENABLE_RDF_ADAPTER
+ links_list.Add (link);
+#endif
+ }
+
//private bool NeedsWhiteSpace (ArrayList array)
//{
// if (array.Count == 0)
Modified: trunk/beagle/beagled/FilterFactory.cs
==============================================================================
--- trunk/beagle/beagled/FilterFactory.cs (original)
+++ trunk/beagle/beagled/FilterFactory.cs Sun May 18 02:46:42 2008
@@ -329,6 +329,9 @@
indexable.SetTextReader (candidate_filter.GetTextReader ());
indexable.SetHotTextReader (candidate_filter.GetHotTextReader ());
+#if ENABLE_RDF_ADAPTER
+ indexable.Links = candidate_filter.Links;
+#endif
if (Debug)
Logger.Log.Debug ("Successfully filtered {0} with {1}", path, candidate_filter);
Modified: trunk/beagle/beagled/LuceneCommon.cs
==============================================================================
--- trunk/beagle/beagled/LuceneCommon.cs (original)
+++ trunk/beagle/beagled/LuceneCommon.cs Sun May 18 02:46:42 2008
@@ -492,6 +492,15 @@
private bool strip_extra_property_info = false;
private bool tokenize_email_hostname = false;
+ private NoiseEmailHostFilter.LinkCallback add_link = null;
+
+ public NoiseEmailHostFilter.LinkCallback AddLink {
+ set {
+ lock (this)
+ add_link = value;
+ }
+ }
+
public BeagleAnalyzer (bool is_indexing_analyzer)
{
if (is_indexing_analyzer) {
@@ -545,11 +554,17 @@
TokenStream outstream;
outstream = base.TokenStream (fieldName, reader);
+ NoiseEmailHostFilter.LinkCallback add_link_callback = null;
+ lock (this) {
+ if (fieldName == "Text")
+ add_link_callback = add_link;
+ }
+
if (fieldName == "Text"
|| fieldName == "HotText"
|| fieldName == "PropertyText"
|| is_text_prop) {
- outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
+ outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname, add_link_callback);
// Sharing Stemmer is not thread safe.
// Currently our underlying lucene indexing is not done in multiple threads.
StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
@@ -560,11 +575,11 @@
}
}
- static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
- static private Analyzer query_analyzer = new BeagleAnalyzer (false);
+ static private BeagleAnalyzer indexing_analyzer = new BeagleAnalyzer (true);
+ static private BeagleAnalyzer query_analyzer = new BeagleAnalyzer (false);
- static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
- static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
+ static protected BeagleAnalyzer IndexingAnalyzer { get { return indexing_analyzer; } }
+ static protected BeagleAnalyzer QueryAnalyzer { get { return query_analyzer; } }
////////////////////////////////////////////////////////////////
Modified: trunk/beagle/beagled/LuceneIndexingDriver.cs
==============================================================================
--- trunk/beagle/beagled/LuceneIndexingDriver.cs (original)
+++ trunk/beagle/beagled/LuceneIndexingDriver.cs Sun May 18 02:46:42 2008
@@ -622,8 +622,21 @@
Document primary_doc = null, secondary_doc = null;
try {
+#if ENABLE_RDF_ADAPTER
+ // Add a callback to extract emails and links from the analyzer
+ // and add them to secondary_doc's "References" field.
+ IndexingAnalyzer.AddLink = delegate (string s, bool email)
+ {
+ // Only add emails for now
+ // NoiseFilter is not good with URLs
+ if (! email || indexable.Links == null)
+ return;
+ indexable.Links.Add (s);
+ };
+#endif
BuildDocuments (indexable, out primary_doc, out secondary_doc);
primary_writer.AddDocument (primary_doc);
+ IndexingAnalyzer.AddLink = null;
} catch (Exception ex) {
// If an exception was thrown, something bad probably happened
@@ -652,6 +665,12 @@
secondary_writer.AddDocument (secondary_doc);
}
+#if ENABLE_RDF_ADAPTER
+ // Store the extracted links in the textcache
+ if (! disable_textcache && text_cache != null)
+ text_cache.AddLinks (indexable.Uri, indexable.Links);
+#endif
+
AdjustItemCount (1);
}
Modified: trunk/beagle/beagled/Makefile.am
==============================================================================
--- trunk/beagle/beagled/Makefile.am (original)
+++ trunk/beagle/beagled/Makefile.am Sun May 18 02:46:42 2008
@@ -54,7 +54,7 @@
PLUGIN_TARGET = BeagleDaemonPlugins.dll
-PLUGIN_CSFLAGS = -target:library
+PLUGIN_CSFLAGS = -target:library $(BEAGLE_DEFINES)
PLUGIN_CSFILES = \
$(srcdir)/Flavor.cs \
@@ -527,7 +527,8 @@
EXTRACT_CONTENT_CSFLAGS = \
- -target:exe
+ -target:exe \
+ $(BEAGLE_DEFINES)
EXTRACT_CONTENT_CSFILES = \
$(srcdir)/ExtractContent.cs \
Modified: trunk/beagle/beagled/NoiseFilter.cs
==============================================================================
--- trunk/beagle/beagled/NoiseFilter.cs (original)
+++ trunk/beagle/beagled/NoiseFilter.cs Sun May 18 02:46:42 2008
@@ -40,15 +40,24 @@
// 3. Splits hostnames into subparts
public class NoiseEmailHostFilter : TokenFilter {
+ public delegate void LinkCallback (string s, bool email);
+ private LinkCallback link_call_back;
+
private bool tokenize_email_hostname;
TokenStream token_stream;
public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname)
+ : this (input, tokenize_email_hostname, null)
+ {
+ }
+
+ public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname, LinkCallback link_call_back)
: base (input)
{
this.token_stream = input;
this.tokenize_email_hostname = tokenize_email_hostname;
+ this.link_call_back = link_call_back;
}
// FIXME: we should add some heuristics that are stricter
@@ -248,6 +257,10 @@
// and also remove the final tld part
Array.Copy (parts, 0, parts, 1, parts.Length - 1);
parts [0] = email.Substring (0, index_at);
+#if ENABLE_RDF_ADAPTER
+ if (link_call_back != null)
+ link_call_back ("mailto://" + email, true);
+#endif
}
private void ProcessURLToken (Lucene.Net.Analysis.Token token)
Modified: trunk/beagle/beagled/TextCache.cs
==============================================================================
--- trunk/beagle/beagled/TextCache.cs (original)
+++ trunk/beagle/beagled/TextCache.cs Sun May 18 02:46:42 2008
@@ -27,7 +27,9 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.IO;
+using System.Text;
using System.Threading;
using Mono.Data.Sqlite;
@@ -50,10 +52,14 @@
private const string SELF_CACHE_TAG = "*self*";
private const string BLOB_TAG = "*blob*";
- public SqliteCommand InsertCommand;
- public SqliteCommand LookupPathCommand;
- public SqliteCommand LookupDataCommand;
- public SqliteCommand DeleteCommand;
+ private SqliteCommand InsertCommand;
+ private SqliteCommand LookupPathCommand;
+ private SqliteCommand LookupDataCommand;
+ private SqliteCommand DeleteCommand;
+#if ENABLE_RDF_ADAPTER
+ private SqliteCommand UpdateLinksCommand;
+ private SqliteCommand LookupLinksCommand;
+#endif
private string text_cache_dir;
internal string TextCacheDir {
get { return text_cache_dir; }
@@ -163,12 +169,21 @@
// Database schema: uri, filename, data
SqliteUtils.DoNonQuery (connection,
- "CREATE TABLE textcache_data ( " +
+ "CREATE TABLE textcache_data ( " +
" uri TEXT UNIQUE NOT NULL, " +
" filename TEXT NOT NULL, " +
- " data BLOB " +
+ " data BLOB " +
")");
}
+#if ENABLE_RDF_ADAPTER
+ try {
+ SqliteUtils.DoNonQuery (connection,
+ "CREATE TABLE links_data ( " +
+ " uri TEXT UNIQUE NOT NULL," +
+ " links TEXT " +
+ ")");
+ } catch { }
+#endif
this.InitCommands ();
}
@@ -182,6 +197,14 @@
LookupDataCommand.CommandText = "SELECT filename, data FROM textcache_data WHERE uri=@uri";
DeleteCommand = new SqliteCommand (this.connection);
DeleteCommand.CommandText = "DELETE FROM textcache_data WHERE uri=@uri";
+
+#if ENABLE_RDF_ADAPTER
+ UpdateLinksCommand = new SqliteCommand (this.connection);
+ UpdateLinksCommand.CommandText = "UPDATE links_data SET links=@links WHERE uri=@uri";
+
+ LookupLinksCommand = new SqliteCommand (this.connection);
+ LookupLinksCommand.CommandText = "SELECT links FROM links_data WHERE uri=@uri";
+#endif
}
private SqliteConnection Open (string db_filename)
{
@@ -458,7 +481,7 @@
lock (connection) {
- LookupDataCommand.Parameters.AddWithValue("@uri",UriToString (uri));
+ LookupDataCommand.Parameters.AddWithValue ("@uri", UriToString (uri));
using (SqliteDataReader reader = SqliteUtils.ExecuteReaderOrWait (LookupDataCommand)) {
if (! SqliteUtils.ReadOrWait (reader)) {
if (self_cache)
@@ -466,9 +489,9 @@
return null;
}
- filename = reader.GetString (0);
- if (! reader.IsDBNull (1))
- blob = reader.GetValue (1) as byte [];
+ filename = reader.GetString (0);
+ if (! reader.IsDBNull (1))
+ blob = reader.GetValue (1) as byte [];
}
}
@@ -518,6 +541,58 @@
}
}
+#if ENABLE_RDF_ADAPTER
+ public void AddLinks (Uri uri, IList<string> links)
+ {
+ lock (connection) {
+ string path = LookupPathRawUnlocked (uri);
+ if (path == null)
+ return;
+ MaybeStartTransaction_Unlocked ();
+ UpdateLinksCommand.Parameters.AddWithValue("@uri", UriToString (uri));
+ UpdateLinksCommand.Parameters.AddWithValue("@links", GetLinksText (links));
+ SqliteUtils.DoNonQuery (UpdateLinksCommand);
+ }
+ }
+
+ private string GetLinksText (IList<string> links)
+ {
+ if (links == null || links.Count == 0)
+ return String.Empty;
+
+ StringBuilder sb = new StringBuilder ();
+ foreach (string s in links) {
+ sb.Append (s);
+ sb.Append (' ');
+ }
+
+ return sb.ToString ();
+ }
+
+ public IList<string> GetLinks (Uri uri)
+ {
+ string links_text = null;
+ List<string> links = null;
+
+ lock (connection) {
+ LookupLinksCommand.Parameters.AddWithValue ("@uri", UriToString (uri));
+ using (SqliteDataReader reader = SqliteUtils.ExecuteReaderOrWait (LookupLinksCommand)) {
+ if (! SqliteUtils.ReadOrWait (reader))
+ return null;
+
+ links_text = reader.GetString (0);
+ }
+ }
+
+ if (String.IsNullOrEmpty (links_text))
+ return null;
+
+ return links_text.Split (links_separator, StringSplitOptions.RemoveEmptyEntries);
+ }
+
+ static readonly char[] links_separator = new char[] {' '};
+#endif
+
private void MaybeStartTransaction_Unlocked ()
{
if (transaction_state == TransactionState.Requested)
Modified: trunk/beagle/configure.in
==============================================================================
--- trunk/beagle/configure.in (original)
+++ trunk/beagle/configure.in Sun May 18 02:46:42 2008
@@ -717,6 +717,6 @@
beagle-search GUI ${enable_gui}
Qt beagle-settings GUI ${enable_qt}
- Build RDF Adapter ${enable_rdf_adapter}
+ Build RDF Adapter ${enable_rdf_adapter} (purely experimental)
Build docs? ${with_docs}
"
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]