beagle r4757 - trunk/beagle/beagled



Author: dbera
Date: Sun May 18 05:54:29 2008
New Revision: 4757
URL: http://svn.gnome.org/viewvc/beagle?rev=4757&view=rev

Log:
Finish the remaining work wrt extracted-links for rdf. Specifically,
* Add an unstored field "TextLinks" in the primary-index which is a whitespace separated list of all extracted links.
* Store the actual links in the textcache.
* The links are not stored in the index itself because they are extracted while the "Text" field is being added to the index, and Lucene makes it impossible to add a stored field whose content is dynamic.
* This textcache-index shuffling requires us to use the text-cache when dealing with the "TextLinks" predicate. So expose the TextCache in LuceneQueryable and pass it to the rdf query function. It is exposed as an overridable property because static queryables handle their own textcache.
* The rest is a continuation of creating virtual TextLinks properties for a Hit as and when needed. This is expensive and adds a necessary but costly step to rdf overlaying.

In short, if you are asking for all properties for all documents, watch out for smoke coming out of your computer.


Modified:
   trunk/beagle/beagled/LuceneCommon.cs
   trunk/beagle/beagled/LuceneQueryable.cs
   trunk/beagle/beagled/LuceneQueryingDriver.cs
   trunk/beagle/beagled/StaticQueryable.cs
   trunk/beagle/beagled/TextCache.cs

Modified: trunk/beagle/beagled/LuceneCommon.cs
==============================================================================
--- trunk/beagle/beagled/LuceneCommon.cs	(original)
+++ trunk/beagle/beagled/LuceneCommon.cs	Sun May 18 05:54:29 2008
@@ -549,7 +549,8 @@
 					return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
 				else if (fieldName == "Properties")
 					return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
-
+				else if (fieldName == "TextLinks")
+					return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
 
 				TokenStream outstream;
 				outstream = base.TokenStream (fieldName, reader);
@@ -893,13 +894,61 @@
 			}
 #if ENABLE_RDF_ADAPTER
 
-			// Now add a field containing a whitespace separated list of other fields in the document
+			// Now add the whitespace separated list of links extracted from the document of the text
+			// Add the property to the primary document. Why primary ?
+			// Because it stays with the "Text" property
+			Fieldable links_field = new LinksField (indexable.Links);
+			primary_doc.Add (links_field);
+
+			// Finally add a field containing a whitespace separated list of other fields in the document
 			AddFieldProperies (primary_doc);
 			if (secondary_doc != null)
 				AddFieldProperies (secondary_doc);
 #endif
 		}
 
+#if ENABLE_RDF_ADAPTER
+		private class LinksField : Fieldable {
+			IList<string> links;
+			internal LinksField (IList<string> links)
+			{
+				this.links = links;
+			}
+
+			public void  SetBoost(float boost) { }
+			public float GetBoost() { return 1.0f; }
+			public System.String Name() { return "TextLinks"; }
+
+			public System.String StringValue()
+			{
+				if (links == null)
+					return String.Empty;
+
+				StringBuilder sb = new StringBuilder ();
+				foreach (string link in links) {
+					sb.Append (link);
+					sb.Append (" ");
+				}
+
+				return sb.ToString ();
+			}
+
+			public System.IO.TextReader ReaderValue() { return null; }
+			public byte[] BinaryValue() { return null; }
+			public bool IsStored() { return false; }
+			public bool IsIndexed() { return true; }
+			public bool IsTokenized() { return true; }
+			public bool IsCompressed() { return false; }
+			public bool IsTermVectorStored() { return false; }
+			public bool IsStoreOffsetWithTermVector() { return false; }
+			public bool IsStorePositionWithTermVector() { return false; }
+			public bool IsBinary() { return false; }
+			public bool GetOmitNorms() { return true; }
+			public void  SetOmitNorms(bool omitNorms) { }
+			public bool IsLazy() { return false; }
+		}
+#endif
+
 		static private Document CreateSecondaryDocument (Uri uri, Uri parent_uri)
 		{
 			Document secondary_doc = new Document ();
@@ -1008,7 +1057,7 @@
 			StringBuilder sb = new StringBuilder ();
 			bool seen_properties = false;
 
-			foreach (Field f in doc.Fields ()) {
+			foreach (Fieldable f in doc.Fields ()) {
 				if (f.Name () == "Properties") {
 					seen_properties = true;
 					continue;

Modified: trunk/beagle/beagled/LuceneQueryable.cs
==============================================================================
--- trunk/beagle/beagled/LuceneQueryable.cs	(original)
+++ trunk/beagle/beagled/LuceneQueryable.cs	Sun May 18 05:54:29 2008
@@ -282,9 +282,13 @@
 		}
 
 #if ENABLE_RDF_ADAPTER
+		protected virtual TextCache TextCache {
+			get { return TextCache.UserCache; }
+		}
+
 		public ICollection DoRDFQuery (Query query)
 		{
-			return Driver.DoRDFQuery (query);
+			return Driver.DoRDFQuery (query, TextCache);
 		}
 #endif
 

Modified: trunk/beagle/beagled/LuceneQueryingDriver.cs
==============================================================================
--- trunk/beagle/beagled/LuceneQueryingDriver.cs	(original)
+++ trunk/beagle/beagled/LuceneQueryingDriver.cs	Sun May 18 05:54:29 2008
@@ -321,7 +321,7 @@
 		// They will come into play in the final FetchDocument part
 		// FIXME: Should RDFQuery do any query mapping using backend_query_part_hook ?
 		// I think it should not. QueryPart hooks are for human beings, RDF is for softwares.
-		public ICollection DoRDFQuery (Query _query)
+		public ICollection DoRDFQuery (Query _query, TextCache text_cache)
 		{
 			RDFQuery query = (RDFQuery) _query;
 
@@ -339,8 +339,13 @@
 			// ******** 8 cases **********
 
 			// Return all uris
-			if (subject == String.Empty && predicate == String.Empty && _object == String.Empty)
-				return GetAllHitsByUri ().Values;
+			if (subject == String.Empty && predicate == String.Empty && _object == String.Empty) {
+				ICollection hits = GetAllHitsByUri ().Values;
+				foreach (Hit hit in hits)
+					foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
+						hit.AddProperty (text_link_property);
+				return hits;
+			}
 
 			// Normal query
 			if (subject == String.Empty && predicate == String.Empty && _object != String.Empty) {
@@ -348,7 +353,7 @@
 				part.Text = _object;
 				part.SearchFullText = false; // We only search properties in RDF query
 				query.AddPart (part);
-				return DoLowLevelRDFQuery (query, pred_type, predicate, _object);
+				return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
 			}
 
 			// Return uris for all documents with this property
@@ -361,7 +366,7 @@
 				part.Value = field_name;
 				query.AddPart (part);
 
-				return DoLowLevelRDFQuery (query, pred_type, predicate, null);
+				return DoLowLevelRDFQuery (query, pred_type, predicate, null, text_cache);
 			}
 
 			// Property query
@@ -371,7 +376,7 @@
 				part.Key = predicate;
 				part.Value = _object;
 				query.AddPart (part);
-				return DoLowLevelRDFQuery (query, pred_type, predicate, _object);
+				return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
 			}
 
 			// Return if the URI exists
@@ -380,7 +385,7 @@
 				part.Uri = UriFu.UserUritoEscapedUri (subject); // better be URI!
 				query.AddPart (part);
 				// FIXME: Which properties to return in the hit? All or none ?
-				return DoLowLevelRDFQuery (query, pred_type, predicate, null);
+				return DoLowLevelRDFQuery (query, pred_type, predicate, null, text_cache);
 			}
 
 			// Normal query in the document with this URI
@@ -394,7 +399,7 @@
 				part.SearchFullText = false; // We only search properties in RDF query
 				query.AddPart (part);
 
-				return DoLowLevelRDFQuery (query, pred_type, predicate, _object);
+				return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
 			}
 
 			// Return URI if the document with this URI contains this property
@@ -407,6 +412,11 @@
 				string field_name = PropertyToFieldName (pred_type, predicate);
 				FieldSelector fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", field_name });
 				ICollection hits = GetHitsForUris (uri_list, fields);
+				if (predicate == "TextLinks") {
+					foreach (Hit hit in hits)
+						foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
+							hit.AddProperty (text_link_property);
+				}
 
 				return hits;
 			}
@@ -423,17 +433,17 @@
 				part.Value = _object;
 				query.AddPart (part);
 
-				return DoLowLevelRDFQuery (query, pred_type, predicate, _object);
+				return DoLowLevelRDFQuery (query, pred_type, predicate, _object, text_cache);
 			}
 
 			throw new Exception ("Never reaches");
 		}
-#endif
 
 		private ICollection DoLowLevelRDFQuery (Query query,
 							PropertyType pred_type,
 							string predicate,
-							string field_value)
+							string field_value,
+							TextCache text_cache)
 		{
 
 			Stopwatch total, a, b, c, d, e, f;
@@ -540,7 +550,7 @@
 
 			d.Stop ();
 			if (Debug)
-				Logger.Log.Debug ("###### {0}: Low-level queries finished in {1}", IndexName, d);
+				Logger.Log.Debug ("###### {0}: Low-level queries finished in {1} and returned {2} matches", IndexName, d, primary_matches.TrueCount);
 
 			e.Start ();
 
@@ -627,6 +637,17 @@
 					}
 					
 					hits.Add (hit);
+				} else if (predicate == "TextLinks") {
+					// Special treatment: TextLinks is not stored but can be queried
+					doc = primary_searcher.Doc (match_index, fields_timestamp_uri);
+					Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);
+					if (field_value != null)
+						hit.AddProperty (Property.New ("TextLinks", field_value));
+					else {
+						foreach (Property text_link_property in GetTextLinks (hit.Uri, text_cache))
+							hit.AddProperty (text_link_property);
+					}
+					hits.Add (hit);
 				} else {
 					doc = primary_searcher.Doc (match_index, fields);
 					Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs, fields);
@@ -723,6 +744,19 @@
 			return field_analyzed.Contains (value_analyzed);
 		}
 
+		private IEnumerable GetTextLinks (Uri uri, TextCache text_cache)
+		{
+			if (text_cache == null)
+				yield break;
+
+			IList<string> links = text_cache.GetLinks (uri);
+			if (links == null)
+				yield break;
+
+			foreach (string link in links)
+				yield return Property.NewKeyword ("TextLinks", link);
+		}
+#endif
 		////////////////////////////////////////////////////////////////
 
 		public int DoCountMatchQuery (Query query, QueryPartHook query_part_hook)

Modified: trunk/beagle/beagled/StaticQueryable.cs
==============================================================================
--- trunk/beagle/beagled/StaticQueryable.cs	(original)
+++ trunk/beagle/beagled/StaticQueryable.cs	Sun May 18 05:54:29 2008
@@ -38,7 +38,7 @@
 
 	public class StaticQueryable : LuceneQueryable 	{
 
-		protected TextCache text_cache;
+		protected TextCache text_cache = null;
 
 		public StaticQueryable (string index_name, string index_path, bool read_only_mode) : base (index_path, read_only_mode)
 		{
@@ -53,6 +53,12 @@
 			}
 		}
 
+#if ENABLE_RDF_ADAPTER
+		protected override TextCache TextCache {
+			get { return text_cache; }
+		}
+#endif
+
 		override public ISnippetReader GetSnippet (string[] query_terms, Hit hit, bool full_text, int ctx_length, int snp_length) 
 		{
 			if (text_cache == null)

Modified: trunk/beagle/beagled/TextCache.cs
==============================================================================
--- trunk/beagle/beagled/TextCache.cs	(original)
+++ trunk/beagle/beagled/TextCache.cs	Sun May 18 05:54:29 2008
@@ -178,11 +178,11 @@
 #if ENABLE_RDF_ADAPTER
 			try {
 				SqliteUtils.DoNonQuery (connection,
-							"CREATE TABLE links_data (  " +
-							"  uri TEXT UNIQUE NOT NULL," +
-							"  links TEXT		    " +
+							"CREATE TABLE IF NOT EXISTS links_data (  " +
+							"  uri TEXT UNIQUE NOT NULL,		  " +
+							"  links TEXT				  " +
 							")");
-			} catch { }
+			} catch (SqliteException) { }
 #endif
 			this.InitCommands ();
 		}
@@ -200,7 +200,7 @@
 
 #if ENABLE_RDF_ADAPTER
 			UpdateLinksCommand = new SqliteCommand (this.connection);
-			UpdateLinksCommand.CommandText = "UPDATE links_data SET links= links WHERE uri= uri";
+			UpdateLinksCommand.CommandText = "INSERT OR REPLACE INTO links_data (uri, links) VALUES (@uri, @links);";
 
 			LookupLinksCommand = new SqliteCommand (this.connection);
 			LookupLinksCommand.CommandText = "SELECT links FROM links_data WHERE uri= uri";
@@ -256,7 +256,7 @@
 				if (SqliteUtils.ReadOrWait (reader))
 					path = reader.GetString (0);
 			}
-			
+
 			return path;
 		}
 
@@ -545,9 +545,6 @@
 		public void AddLinks (Uri uri, IList<string> links)
 		{
 			lock (connection) {
-				string path = LookupPathRawUnlocked (uri);
-				if (path == null)
-					return;
 				MaybeStartTransaction_Unlocked ();
 				UpdateLinksCommand.Parameters.AddWithValue("@uri", UriToString (uri));
 				UpdateLinksCommand.Parameters.AddWithValue("@links", GetLinksText (links));



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]