beagle r4493 - branches/beagle-rdf/beagled



Author: dbera
Date: Sun Feb 17 00:34:20 2008
New Revision: 4493
URL: http://svn.gnome.org/viewvc/beagle?rev=4493&view=rev

Log:
Remove the cumbersome GetDocsWithProperty method. Instead, store a field containing the whitespace-separated names of all the other properties, and use that field to query. Interestingly, this increased the query time; however, the earlier method only searched the PrimaryIndex, while this one searches both indexes (and is, of course, much cleaner and a lot less code).
Use a FieldSelector in LuceneBitArray; again, no improvement in query time. Still, it is the right thing to do.
Several other minor fixes.


Modified:
   branches/beagle-rdf/beagled/DumpIndex.cs
   branches/beagle-rdf/beagled/LuceneBitArray.cs
   branches/beagle-rdf/beagled/LuceneCommon.cs
   branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
   branches/beagle-rdf/beagled/NoiseFilter.cs

Modified: branches/beagle-rdf/beagled/DumpIndex.cs
==============================================================================
--- branches/beagle-rdf/beagled/DumpIndex.cs	(original)
+++ branches/beagle-rdf/beagled/DumpIndex.cs	Sun Feb 17 00:34:20 2008
@@ -205,7 +205,7 @@
 				int freq;
 				freq = term_enum.DocFreq ();
 
-				Console.WriteLine ("{0} {1} {2}", index_name, term_enum.Term ().Text (), freq);
+				Console.WriteLine ("{0} '{1}' {2}", index_name, term_enum.Term ().Text (), freq);
 
 				// FIXME: spew these as a count
 				++distinct_term_count;

Modified: branches/beagle-rdf/beagled/LuceneBitArray.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneBitArray.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneBitArray.cs	Sun Feb 17 00:34:20 2008
@@ -198,6 +198,8 @@
 
 		////////////////////////////////////////////////////////////
 
+		static string[] fields_uri = { "Timestamp", "Uri" };
+
 		public void ProjectOnto (LuceneBitArray other)
 		{
 			int j = 0;
@@ -209,7 +211,7 @@
 				j = i+1;
 
 				Document doc;
-				doc = searcher.Doc (i);
+				doc = searcher.Doc (i, fields_uri);
 
 				other.AddUri (doc.Get ("Uri"));
 			}

Modified: branches/beagle-rdf/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneCommon.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneCommon.cs	Sun Feb 17 00:34:20 2008
@@ -83,7 +83,8 @@
 		// 18: add IsPersistent to properties, and adjust coded values
 		//     in AddPropertyToDocument() and GetPropertyFromDocument();
 		//     changed subdate field format rules for better readability
-		private const int MAJOR_VERSION = 18;
+		// 19: store a list of current properties in a field
+		private const int MAJOR_VERSION = 19;
 		private int minor_version = 0;
 
 		private string index_name;
@@ -524,6 +525,9 @@
 					}
 				} else if (fieldName == "PropertyKeyword")
 					return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
+				else if (fieldName == "Properties")
+					return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
+
 
 				TokenStream outstream;
 				outstream = base.TokenStream (fieldName, reader);
@@ -856,6 +860,11 @@
 					
 				AddPropertyToDocument (prop, target_doc);
 			}
+
+			// Now add a field containing a whitespace separated list of other fields in the document
+			AddFieldProperies (primary_doc);
+			if (secondary_doc != null)
+				AddFieldProperies (secondary_doc);
 		}
 
 		static private Document CreateSecondaryDocument (Uri uri, Uri parent_uri)
@@ -928,6 +937,7 @@
 				}
 			}
 
+			AddFieldProperies (new_doc);
 			return new_doc;
 		}
 
@@ -949,9 +959,38 @@
 				}
 			}
 
+			AddFieldProperies (doc);
 			return doc;
 		}
 
+		// Add a new field with whitespace separated names of the existing fields
+		static protected void AddFieldProperies (Document doc)
+		{
+			const string Separator = " ";
+
+			StringBuilder sb = new StringBuilder ();
+			bool seen_properties = false;
+
+			foreach (Field f in doc.Fields ()) {
+				if (f.Name () == "Properties") {
+					seen_properties = true;
+					continue;
+				}
+
+				sb.Append (f.Name ());
+				sb.Append (Separator);
+			}
+
+			if (sb.Length > 0)
+				sb.Length -= Separator.Length;
+
+			if (seen_properties)
+				doc.RemoveFields ("Properties");
+
+			Field field = new Field ("Properties", sb.ToString (), Field.Store.YES, Field.Index.TOKENIZED); // FIXME: Field.Store.No
+			doc.Add (field);
+		}
+
 		static protected Uri GetUriFromDocument (Document doc)
 		{
 			string uri;
@@ -1633,11 +1672,13 @@
 				else
 					field_name = PropertyToFieldName (part.Type, part.Key);
 
+				// Details of the conversion here depends on BeagleAnalyzer::TokenStream
 				if (part.Type == PropertyType.Text)
 					primary_query = StringToQuery (field_name, part.Value, term_list);
 				else {
 					Term term;
-					if (field_name.StartsWith ("prop:k:" + Property.PrivateNamespace))
+					// FIXME: Handle date queries for other date fields
+					if (part.Type == PropertyType.Internal || field_name.StartsWith ("prop:k:" + Property.PrivateNamespace))
 						term = new Term (field_name, part.Value);
 					else
 						term = new Term (field_name, part.Value.ToLower ());

Modified: branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneQueryingDriver.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneQueryingDriver.cs	Sun Feb 17 00:34:20 2008
@@ -355,7 +355,15 @@
 
 			// Return uris for all documents with this property
 			if (subject == String.Empty && predicate != String.Empty && _object == String.Empty) {
-				return GetDocsWithProperty (predicate, pred_type);
+				string field_name = PropertyToFieldName (pred_type, predicate);
+
+				QueryPart_Property part = new QueryPart_Property ();
+				part.Type = PropertyType.Internal;
+				part.Key = "Properties";
+				part.Value = field_name;
+				query.AddPart (part);
+
+				return DoLowLevelRDFQuery (query, field_name, null);
 			}
 
 			// Property query
@@ -425,120 +433,6 @@
 			throw new Exception ("Never reaches");
 		}
 
-		// FIXME FIXME FIXME: Rewrite this horrible method by keeping a field containing
-		// the names of all properties in that document ?
-		// What about SecondaryDocument ? Which index to store this field in ?
-		private ICollection GetDocsWithProperty (string propname, PropertyType prop_type)
-		{
-			// This is the hardest!
-			// Most of the times either all docs will have the property or
-			// neither will, but we also have to cover the rare cases.
-			// Possible approach: Do a term_enum with this property name.
-			// Keep a Set of all Docs (rather Uris) which contain that term
-			// (pretty expensive - since most probably all documents will contain that
-			// property).
-			//
-			// Another approach: Get all hits from the driver, scan them one by one
-			// and return URIs for the hits which contain the property *shudder*
-			//
-
-			// FIXME: Uses PrimaryIndex only!
-			// Create a bitarray and mark all docs with that property by using a termenum
-
-			IndexReader primary_reader;
-			primary_reader = LuceneCommon.GetReader (PrimaryStore);
-
-			BetterBitArray all_docs = new BetterBitArray (primary_reader.MaxDoc ());
-
-			TermDocs docs = primary_reader.TermDocs ();
-			string field_name = PropertyToFieldName (prop_type, propname);
-			Console.WriteLine (field_name);
-			TermEnum enumerator = primary_reader.Terms (new Term (field_name, String.Empty));
-			Term term;
-			bool field_present = false;
-
-			do {
-				// Find all terms with given field
-				term = enumerator.Term ();
-			
-				if (term.Field () != field_name)
-					break;
-
-				field_present = true;
-
-				docs.Seek (enumerator);
-
-				// Find all docs with that term
-				while (docs.Next ())
-					all_docs [docs.Doc ()] = true;
-			} while (enumerator.Next ());
-			Console.WriteLine (field_present);
-
-			enumerator.Close ();
-
-			// Maxdoc could be millions!
-			ArrayList hits = new ArrayList (primary_reader.MaxDoc ());
-
-			// If field_present is false, preempt
-			if (! field_present) {
-				docs.Close ();
-				LuceneCommon.ReleaseReader (primary_reader);
-
-				return hits;
-			}
-
-			IndexReader secondary_reader = null;
-			LNS.IndexSearcher secondary_searcher = null;
-
-			if (SecondaryStore != null) {
-				secondary_reader = LuceneCommon.GetReader (SecondaryStore);
-				if (secondary_reader.NumDocs () == 0) {
-					ReleaseReader (secondary_reader);
-					secondary_reader = null;
-				}
-			}
-
-			if (secondary_reader != null)
-				secondary_searcher = new LNS.IndexSearcher (secondary_reader);
-
-			TermDocs secondary_term_docs = null;
-			if (secondary_searcher != null)
-				secondary_term_docs = secondary_searcher.Reader.TermDocs ();
-
-			string[] fields = { "Uri", "Timestamp", field_name };
-
-			// Go through all Uris now
-			enumerator = primary_reader.Terms (new Term ("Uri", String.Empty));
-			Document doc;
-
-			do {
-				// Find all terms with 
-				term = enumerator.Term ();
-			
-				if (term.Field () != "Uri")
-					break;
-
-				docs.Seek (enumerator);
-				// Assume only one doc with an uri.
-				// Go to the doc with this uri
-				// If this doc's id is present in bit_array, return the uri
-				if (docs.Next () && all_docs [docs.Doc ()]) {
-					doc = primary_reader.Document (docs.Doc (), fields);
-					Hit hit = CreateHit (doc, secondary_searcher, secondary_term_docs, fields);
-					hits.Add (hit); 
-				}
-
-			} while (enumerator.Next ());
-
-			// Traverse all docs in all_docs
-
-			enumerator.Close ();
-			docs.Close ();
-			LuceneCommon.ReleaseReader (primary_reader);
-
-			return hits;
-		}
-
 		private ICollection DoLowLevelRDFQuery (Query query,
 							string field_name,
 							string field_value)

Modified: branches/beagle-rdf/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/NoiseFilter.cs	(original)
+++ branches/beagle-rdf/beagled/NoiseFilter.cs	Sun Feb 17 00:34:20 2008
@@ -248,7 +248,13 @@
 	}
 
 #if false
+	// To build: gmcs NoiseFilter.cs LuceneCommon.cs -r:../Util/Util.dll -r:../BeagleClient/Beagle.dll -r:BeagleDaemonLib.dll
 	public class AnalyzerTest {
+		public static void Main ()
+		{
+			Analyze (Console.In);
+		}
+
 		public static void Analyze (TextReader reader)
 		{
 			Lucene.Net.Analysis.Token lastToken = null;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]