beagle r4598 - in branches/beagle-rdf: BeagleClient Filters beagled
- From: dbera svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4598 - in branches/beagle-rdf: BeagleClient Filters beagled
- Date: Sun, 9 Mar 2008 21:36:01 +0000 (GMT)
Author: dbera
Date: Sun Mar 9 21:36:01 2008
New Revision: 4598
URL: http://svn.gnome.org/viewvc/beagle?rev=4598&view=rev
Log:
Extract URLs and email addresses from the text content and store them in the textcache. That completes the cycle of linking documents - we had parentdir, parent email for attachments and referrer for web history as incoming links, and now we also have "outgoing" links - the URLs and email addresses (which are made into URIs e.g. "D Bera" <dbera web gmail com> becomes mailto://dbera web gmail com/D%20Bera). Only the HTML filter is capable of extracting URLs right now; I don't know if it is possible to get URLs from other types of files. Email addresses are extracted from anywhere, even text files, thanks to the Lucene analyzers.
WARNING: Delete TextCache/ - the new textcache is incompatible with the previous one. When merged with trunk, the textcache will be appropriately versioned.
Modified:
branches/beagle-rdf/BeagleClient/Indexable.cs
branches/beagle-rdf/Filters/FilterHtml.cs
branches/beagle-rdf/Filters/FilterMail.cs
branches/beagle-rdf/beagled/ExtractContent.cs
branches/beagle-rdf/beagled/Filter.cs
branches/beagle-rdf/beagled/FilterFactory.cs
branches/beagle-rdf/beagled/LuceneCommon.cs
branches/beagle-rdf/beagled/LuceneIndexingDriver.cs
branches/beagle-rdf/beagled/NoiseFilter.cs
branches/beagle-rdf/beagled/TextCache.cs
Modified: branches/beagle-rdf/BeagleClient/Indexable.cs
==============================================================================
--- branches/beagle-rdf/BeagleClient/Indexable.cs (original)
+++ branches/beagle-rdf/BeagleClient/Indexable.cs Sun Mar 9 21:36:01 2008
@@ -28,6 +28,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading;
@@ -122,6 +123,9 @@
// When should we try to filter this indexable?
private IndexableFiltering filtering = IndexableFiltering.Automatic;
+ // List of links found in the content while indexing
+ private IList<string> links = null;
+
// Local state: these are key/value pairs that never get serialized
// into XML
Hashtable local_state = new Hashtable ();
@@ -420,6 +424,12 @@
binary_stream = stream;
}
+ [XmlIgnore]
+ public IList<string> Links {
+ get { return links; }
+ set { links = value; }
+ }
+
[XmlArrayItem (ElementName="Property", Type=typeof (Property))]
public ArrayList Properties {
get { return properties; }
Modified: branches/beagle-rdf/Filters/FilterHtml.cs
==============================================================================
--- branches/beagle-rdf/Filters/FilterHtml.cs (original)
+++ branches/beagle-rdf/Filters/FilterHtml.cs Sun Mar 9 21:36:01 2008
@@ -61,6 +61,7 @@
public delegate void AddPropertyCallback (Beagle.Property p);
public delegate bool AppendSpaceCallback ();
public delegate void HotCallback ();
+ public delegate void AddLinkCallback (string s);
// delegates
private new AppendTextCallback AppendText;
@@ -70,6 +71,7 @@
private new AppendSpaceCallback AppendStructuralBreak;
private new HotCallback HotUp;
private new HotCallback HotDown;
+ private new AddLinkCallback AddLink;
// 1: Add meta keyword fields as meta:key
private int version = 1;
@@ -88,6 +90,7 @@
AppendStructuralBreak = new AppendSpaceCallback (base.AppendStructuralBreak);
HotUp = new HotCallback (base.HotUp);
HotDown = new HotCallback (base.HotDown);
+ AddLink = new AddLinkCallback (base.AddLink);
}
ignore_level = 0;
@@ -103,6 +106,12 @@
base.SetVersion (this.version);
}
+ public void SetAddLinkHandler (AddLinkCallback link_handler)
+ {
+ if (link_handler != null)
+ AddLink = link_handler;
+ }
+
protected bool NodeIsHot (String nodeName)
{
return nodeName == "b"
@@ -222,6 +231,9 @@
string s = HtmlEntity.DeEntitize (
SW.HttpUtility.UrlDecode (attr, enc));
AppendWord (s);
+ // Add valid and global URLs to special field "Link"
+ if (s.StartsWith ("http://") || s.StartsWith ("mailto:") || s.StartsWith ("ftp://"))
+ AddLink (s);
ret = AppendWhiteSpace ();
}
} else if (node.Name == "br") // both <br> and </br> are used - special case
@@ -438,11 +450,17 @@
public static TextReader GetHtmlReader (Stream stream, string charset)
{
+ return GetHtmlReader (stream, charset, null);
+ }
+
+ public static TextReader GetHtmlReader (Stream stream, string charset, AddLinkCallback link_handler)
+ {
if (stream == null)
throw new ArgumentNullException ("stream");
FilterHtml html_filter = new FilterHtml ();
html_filter.SnippetMode = false;
+ html_filter.SetAddLinkHandler (link_handler);
html_filter.Indexable = new Indexable (); // fake an indexable
html_filter.AddProperty (Property.NewUnsearched (StringFu. UnindexedNamespace + "encoding", charset));
Modified: branches/beagle-rdf/Filters/FilterMail.cs
==============================================================================
--- branches/beagle-rdf/Filters/FilterMail.cs (original)
+++ branches/beagle-rdf/Filters/FilterMail.cs Sun Mar 9 21:36:01 2008
@@ -147,6 +147,7 @@
AddProperty (Property.New ("fixme:to_address", ia.Addr));
AddProperty (Property.New ("fixme:to_name", ia.Name));
+ AddEmailLink (ia);
}
addrs.Dispose ();
@@ -157,6 +158,7 @@
AddProperty (Property.New ("fixme:cc_address", ia.Addr));
AddProperty (Property.New ("fixme:cc_name", ia.Name));
+ AddEmailLink (ia);
}
addrs.Dispose ();
@@ -167,6 +169,7 @@
AddProperty (Property.New ("fixme:from_address", ia.Addr));
AddProperty (Property.New ("fixme:from_name", ia.Name));
+ AddEmailLink (ia);
}
addrs.Dispose ();
@@ -202,9 +205,21 @@
AddProperty (Property.NewFlag ("fixme:isSent"));
}
+ private void AddEmailLink (GMime.InternetAddress ia)
+ {
+ if (String.IsNullOrEmpty (ia.Name))
+ AddLink (String.Concat ("mailto://", ia.Addr));
+ else
+ AddLink (String.Concat ("mailto://", ia.Addr, "/", Uri.EscapeDataString (ia.Name)));
+ }
+
protected override void DoPullSetup ()
{
- this.handler = new PartHandler (Indexable);
+ this.handler = new PartHandler (Indexable,
+ delegate (string s)
+ {
+ AddLink (s);
+ });
using (GMime.Object mime_part = this.message.MimePart)
this.handler.OnEachPart (mime_part);
@@ -274,6 +289,7 @@
private int depth = 0; // part recursion depth
private ArrayList child_indexables = new ArrayList ();
private TextReader reader;
+ private FilterHtml.AddLinkCallback link_handler;
private bool html_part = false;
internal bool HtmlPart {
@@ -291,9 +307,10 @@
"text/x-vcard"
};
- public PartHandler (Indexable parent_indexable)
+ public PartHandler (Indexable parent_indexable, FilterHtml.AddLinkCallback link_handler)
{
this.indexable = parent_indexable;
+ this.link_handler = link_handler;
}
private bool IsMimeTypeHandled (string mime_type)
@@ -406,7 +423,7 @@
stream.Close ();
try {
- this.reader = FilterHtml.GetHtmlReader (html_stream, enc);
+ this.reader = FilterHtml.GetHtmlReader (html_stream, enc, link_handler);
} catch (Exception e) {
Log.Debug (e, "Exception while filtering HTML email {0}", this.indexable.Uri);
this.reader = null;
Modified: branches/beagle-rdf/beagled/ExtractContent.cs
==============================================================================
--- branches/beagle-rdf/beagled/ExtractContent.cs (original)
+++ branches/beagle-rdf/beagled/ExtractContent.cs Sun Mar 9 21:36:01 2008
@@ -30,6 +30,7 @@
using System.Net;
using System.Reflection;
using System.Collections;
+using System.Collections.Generic;
using Beagle;
using Beagle.Util;
@@ -278,6 +279,14 @@
Console.WriteLine ();
Console.WriteLine ("Text extracted in {0}", watch);
+ IList<string> links = indexable.Links;
+ if (links != null && links.Count != 0) {
+ Console.WriteLine ("Links:");
+ foreach (string link in links)
+ Console.WriteLine (link);
+ Console.WriteLine ();
+ }
+
foreach (Indexable gi in generated_indexables)
Display (gi);
Modified: branches/beagle-rdf/beagled/Filter.cs
==============================================================================
--- branches/beagle-rdf/beagled/Filter.cs (original)
+++ branches/beagle-rdf/beagled/Filter.cs Sun Mar 9 21:36:01 2008
@@ -28,6 +28,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Reflection;
@@ -229,6 +230,7 @@
private StringBuilder text_builder = new StringBuilder (DEFAULT_CHARS_TO_PULL);
private ArrayList textPool;
private ArrayList hotPool;
+ private IList<string> links_list = new List<string> ();
private bool last_was_structural_break = false;
const string WHITESPACE = " ";
@@ -448,6 +450,11 @@
return UpdateCharsAdded (1);
}
+ public void AddLink (string link)
+ {
+ links_list.Add (link);
+ }
+
//private bool NeedsWhiteSpace (ArrayList array)
//{
// if (array.Count == 0)
@@ -893,6 +900,10 @@
return new PullingReader (new PullingReader.Pull (PullHotText));
}
+ public IList<string> Links {
+ get { return links_list; }
+ }
+
//////////////////////////////
//////////// Default implementation of generated indexables
Modified: branches/beagle-rdf/beagled/FilterFactory.cs
==============================================================================
--- branches/beagle-rdf/beagled/FilterFactory.cs (original)
+++ branches/beagle-rdf/beagled/FilterFactory.cs Sun Mar 9 21:36:01 2008
@@ -332,6 +332,7 @@
indexable.SetTextReader (candidate_filter.GetTextReader ());
indexable.SetHotTextReader (candidate_filter.GetHotTextReader ());
+ indexable.Links = candidate_filter.Links;
if (Debug)
Logger.Log.Debug ("Successfully filtered {0} with {1}", path, candidate_filter);
Modified: branches/beagle-rdf/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneCommon.cs (original)
+++ branches/beagle-rdf/beagled/LuceneCommon.cs Sun Mar 9 21:36:01 2008
@@ -492,6 +492,15 @@
private bool strip_extra_property_info = false;
private bool tokenize_email_hostname = false;
+ private NoiseEmailHostFilter.LinkCallback add_link = null;
+
+ public NoiseEmailHostFilter.LinkCallback AddLink {
+ set {
+ lock (this)
+ add_link = value;
+ }
+ }
+
public BeagleAnalyzer (bool is_indexing_analyzer)
{
if (is_indexing_analyzer) {
@@ -545,11 +554,17 @@
TokenStream outstream;
outstream = base.TokenStream (fieldName, reader);
+ NoiseEmailHostFilter.LinkCallback add_link_callback = null;
+ lock (this) {
+ if (fieldName == "Text")
+ add_link_callback = add_link;
+ }
+
if (fieldName == "Text"
|| fieldName == "HotText"
|| fieldName == "PropertyText"
|| is_text_prop) {
- outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
+ outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname, add_link_callback);
// Sharing Stemmer is not thread safe.
// Currently our underlying lucene indexing is not done in multiple threads.
StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
@@ -560,11 +575,11 @@
}
}
- static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
- static private Analyzer query_analyzer = new BeagleAnalyzer (false);
+ static private BeagleAnalyzer indexing_analyzer = new BeagleAnalyzer (true);
+ static private BeagleAnalyzer query_analyzer = new BeagleAnalyzer (false);
- static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
- static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
+ static protected BeagleAnalyzer IndexingAnalyzer { get { return indexing_analyzer; } }
+ static protected BeagleAnalyzer QueryAnalyzer { get { return query_analyzer; } }
////////////////////////////////////////////////////////////////
@@ -1003,7 +1018,7 @@
if (seen_properties)
doc.RemoveFields ("Properties");
- Field field = new Field ("Properties", sb.ToString (), Field.Store.YES, Field.Index.TOKENIZED); // FIXME: Field.Store.No
+ Field field = new Field ("Properties", sb.ToString (), Field.Store.NO, Field.Index.TOKENIZED);
doc.Add (field);
}
@@ -1037,8 +1052,8 @@
static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
{
+ Property prop;
foreach (Field f in doc.Fields ()) {
- Property prop;
prop = GetPropertyFromDocument (f, doc, from_primary_index);
if (prop != null)
hit.AddProperty (prop);
Modified: branches/beagle-rdf/beagled/LuceneIndexingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneIndexingDriver.cs (original)
+++ branches/beagle-rdf/beagled/LuceneIndexingDriver.cs Sun Mar 9 21:36:01 2008
@@ -616,8 +616,19 @@
Document primary_doc = null, secondary_doc = null;
try {
+ // Add a callback to extract emails and links from the analyzer
+ // and add them to secondary_doc's "References" field.
+ IndexingAnalyzer.AddLink = delegate (string s, bool email)
+ {
+ // Only add emails for now
+ // NoiseFilter is not good with URLs
+ if (! email || indexable.Links == null)
+ return;
+ indexable.Links.Add (s);
+ };
BuildDocuments (indexable, out primary_doc, out secondary_doc);
primary_writer.AddDocument (primary_doc);
+ IndexingAnalyzer.AddLink = null;
} catch (Exception ex) {
// If an exception was thrown, something bad probably happened
@@ -646,6 +657,10 @@
secondary_writer.AddDocument (secondary_doc);
}
+ // Store the extracted links in the textcache
+ if (! disable_textcache && text_cache != null)
+ text_cache.AddLinks (indexable.Uri, indexable.Links);
+
AdjustItemCount (1);
}
Modified: branches/beagle-rdf/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/NoiseFilter.cs (original)
+++ branches/beagle-rdf/beagled/NoiseFilter.cs Sun Mar 9 21:36:01 2008
@@ -40,15 +40,24 @@
// 3. Splits hostnames into subparts
public class NoiseEmailHostFilter : TokenFilter {
+ public delegate void LinkCallback (string s, bool email);
+ private LinkCallback link_call_back;
+
private bool tokenize_email_hostname;
TokenStream token_stream;
public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname)
+ : this (input, tokenize_email_hostname, null)
+ {
+ }
+
+ public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname, LinkCallback link_call_back)
: base (input)
{
this.token_stream = input;
this.tokenize_email_hostname = tokenize_email_hostname;
+ this.link_call_back = link_call_back;
}
// FIXME: we should add some heuristics that are stricter
@@ -248,6 +257,9 @@
// and also remove the final tld part
Array.Copy (parts, 0, parts, 1, parts.Length - 1);
parts [0] = email.Substring (0, index_at);
+
+ if (link_call_back != null)
+ link_call_back ("mailto://" + email, true);
}
private void ProcessURLToken (Lucene.Net.Analysis.Token token)
Modified: branches/beagle-rdf/beagled/TextCache.cs
==============================================================================
--- branches/beagle-rdf/beagled/TextCache.cs (original)
+++ branches/beagle-rdf/beagled/TextCache.cs Sun Mar 9 21:36:01 2008
@@ -27,7 +27,9 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.IO;
+using System.Text;
using System.Threading;
using Mono.Data.Sqlite;
@@ -50,10 +52,12 @@
private const string SELF_CACHE_TAG = "*self*";
private const string BLOB_TAG = "*blob*";
- public SqliteCommand InsertCommand;
- public SqliteCommand LookupPathCommand;
- public SqliteCommand LookupDataCommand;
- public SqliteCommand DeleteCommand;
+ private SqliteCommand InsertCommand;
+ private SqliteCommand LookupPathCommand;
+ private SqliteCommand LookupDataCommand;
+ private SqliteCommand DeleteCommand;
+ private SqliteCommand UpdateLinksCommand;
+ private SqliteCommand LookupLinksCommand;
private string text_cache_dir;
internal string TextCacheDir {
get { return text_cache_dir; }
@@ -166,7 +170,8 @@
"CREATE TABLE textcache_data ( " +
" uri TEXT UNIQUE NOT NULL, " +
" filename TEXT NOT NULL, " +
- " data BLOB " +
+ " data BLOB, " +
+ " links TEXT " +
")");
}
this.InitCommands ();
@@ -182,6 +187,12 @@
LookupDataCommand.CommandText = "SELECT filename, data FROM textcache_data WHERE uri=@uri";
DeleteCommand = new SqliteCommand (this.connection);
DeleteCommand.CommandText = "DELETE FROM textcache_data WHERE uri=@uri";
+
+ UpdateLinksCommand = new SqliteCommand (this.connection);
+ UpdateLinksCommand.CommandText = "UPDATE textcache_data SET links=@links WHERE uri=@uri";
+
+ LookupLinksCommand = new SqliteCommand (this.connection);
+ LookupLinksCommand.CommandText = "SELECT links FROM textcache_data WHERE uri=@uri";
}
private SqliteConnection Open (string db_filename)
{
@@ -458,7 +469,7 @@
lock (connection) {
- LookupDataCommand.Parameters.AddWithValue("@uri",UriToString (uri));
+ LookupDataCommand.Parameters.AddWithValue ("@uri", UriToString (uri));
using (SqliteDataReader reader = SqliteUtils.ExecuteReaderOrWait (LookupDataCommand)) {
if (! SqliteUtils.ReadOrWait (reader)) {
if (self_cache)
@@ -466,9 +477,9 @@
return null;
}
- filename = reader.GetString (0);
- if (! reader.IsDBNull (1))
- blob = reader.GetValue (1) as byte [];
+ filename = reader.GetString (0);
+ if (! reader.IsDBNull (1))
+ blob = reader.GetValue (1) as byte [];
}
}
@@ -518,6 +529,56 @@
}
}
+ public void AddLinks (Uri uri, IList<string> links)
+ {
+ lock (connection) {
+ string path = LookupPathRawUnlocked (uri);
+ if (path == null)
+ return;
+ MaybeStartTransaction_Unlocked ();
+ UpdateLinksCommand.Parameters.AddWithValue("@uri", UriToString (uri));
+ UpdateLinksCommand.Parameters.AddWithValue("@links", GetLinksText (links));
+ SqliteUtils.DoNonQuery (UpdateLinksCommand);
+ }
+ }
+
+ private string GetLinksText (IList<string> links)
+ {
+ if (links == null || links.Count == 0)
+ return String.Empty;
+
+ StringBuilder sb = new StringBuilder ();
+ foreach (string s in links) {
+ sb.Append (s);
+ sb.Append (' ');
+ }
+
+ return sb.ToString ();
+ }
+
+ public IList<string> GetLinks (Uri uri)
+ {
+ string links_text = null;
+ List<string> links = null;
+
+ lock (connection) {
+ LookupLinksCommand.Parameters.AddWithValue ("@uri", UriToString (uri));
+ using (SqliteDataReader reader = SqliteUtils.ExecuteReaderOrWait (LookupLinksCommand)) {
+ if (! SqliteUtils.ReadOrWait (reader))
+ return null;
+
+ links_text = reader.GetString (0);
+ }
+ }
+
+ if (String.IsNullOrEmpty (links_text))
+ return null;
+
+ return links_text.Split (links_separator, StringSplitOptions.RemoveEmptyEntries);
+ }
+
+ static readonly char[] links_separator = new char[] {' '};
+
private void MaybeStartTransaction_Unlocked ()
{
if (transaction_state == TransactionState.Requested)
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]