beagle r4826 - in trunk/beagle/beagled: . TomboyQueryable



Author: dbera
Date: Fri Jul 11 02:38:38 2008
New Revision: 4826
URL: http://svn.gnome.org/viewvc/beagle?rev=4826&view=rev

Log:
Resolve the mysterious "Too many matches. Returned top 10 of 200 results" message that appeared even when query.max-hits was 100. This happened because we applied hit_filter just before sending the results, and for FSQ the hit_filter is responsible for checking whether the file actually exists. So if the number of initial matches was 200 but, for some reason, 90 of the first 100 were bad matches, we would return only 10 hits! Fix this by removing uri-filter. The uri-filter is no longer useful since we already have a hit-filter, and the two were applied more or less one after another. Now do all kinds of hit validation in hit-filter, and call it exactly where we used to call uri-filter, i.e. when creating hits.


Modified:
   trunk/beagle/beagled/LuceneFileQueryable.cs
   trunk/beagle/beagled/LuceneQueryable.cs
   trunk/beagle/beagled/LuceneQueryingDriver.cs
   trunk/beagle/beagled/StaticQueryable.cs
   trunk/beagle/beagled/TomboyQueryable/TomboyQueryable.cs

Modified: trunk/beagle/beagled/LuceneFileQueryable.cs
==============================================================================
--- trunk/beagle/beagled/LuceneFileQueryable.cs	(original)
+++ trunk/beagle/beagled/LuceneFileQueryable.cs	Fri Jul 11 02:38:38 2008
@@ -186,7 +186,12 @@
 			return indexable.Uri;
 		}
 
-		override protected bool HitIsValid (Uri uri)
+		override protected bool HitFilter (Hit hit)
+		{
+			return HitIsValid (hit.Uri);
+		}
+
+		private bool HitIsValid (Uri uri)
 		{
 			// Do the right thing if the Uri is a file.
 			// If the file Uri we need is the ContentUri, this won't work.

Modified: trunk/beagle/beagled/LuceneQueryable.cs
==============================================================================
--- trunk/beagle/beagled/LuceneQueryable.cs	(original)
+++ trunk/beagle/beagled/LuceneQueryable.cs	Fri Jul 11 02:38:38 2008
@@ -74,7 +74,6 @@
 		private LuceneQueryingDriver driver;
 		private IIndexer indexer = null;
 
-		private LuceneQueryingDriver.UriFilter our_uri_filter;
 		private LuceneCommon.HitFilter our_hit_filter;
 		private LuceneCommon.QueryPartHook backend_query_part_hook;
 		private Scheduler.Task our_final_flush_task = null;
@@ -98,7 +97,6 @@
 			this.read_only_mode = read_only_mode;
 
 			driver = BuildLuceneQueryingDriver (this.index_name, this.minor_version, this.read_only_mode);
-			our_uri_filter = new LuceneQueryingDriver.UriFilter (this.HitIsValid);
 			our_hit_filter = new LuceneCommon.HitFilter (this.HitFilter);
 			backend_query_part_hook = new LuceneCommon.QueryPartHook (this.QueryPartHook);
 
@@ -181,11 +179,6 @@
 
 		/////////////////////////////////////////
 
-		virtual protected bool HitIsValid (Uri uri)
-		{
-			return true;
-		}
-
 		virtual protected bool HitFilter (Hit hit)
 		{
 			return true;
@@ -322,19 +315,6 @@
 				if (query.IsIndexListener) {
 					ArrayList synthetic_hits = new ArrayList ();
 					foreach (Uri uri in added_uris) {
-						if (our_uri_filter != null) {
-							bool accept = false;
-
-							try {
-								accept = our_uri_filter (uri);
-							} catch (Exception e) {
-								Log.Warn (e, "Caught an exception in HitIsValid for {0}", uri);
-							}
-
-							if (! accept)
-								continue;
-						}
-
 						Hit hit = new Hit ();
 						hit.Uri = uri;
 
@@ -363,7 +343,6 @@
 					query_result,
 					added_uris,
 					backend_query_part_hook,
-					our_uri_filter,
 					our_hit_filter);
 		}
 

Modified: trunk/beagle/beagled/LuceneQueryingDriver.cs
==============================================================================
--- trunk/beagle/beagled/LuceneQueryingDriver.cs	(original)
+++ trunk/beagle/beagled/LuceneQueryingDriver.cs	Fri Jul 11 02:38:38 2008
@@ -51,7 +51,6 @@
 
 		static private bool Debug = false;
 
-		public delegate bool UriFilter (Uri uri);
 		public delegate double RelevancyMultiplier (Hit hit);
 
 		public LuceneQueryingDriver (string index_name, bool read_only)
@@ -572,14 +571,8 @@
 
 				count++;
 
-				// If we have a UriFilter, apply it.
-				// RDF FIXME: Ignore Uri Filter for now
-				//if (uri_filter != null) {
-				//	Uri uri;
-				//	uri = GetUriFromDocument (doc);
-				//	if (! uri_filter (uri))
-				//		continue;
-				//}
+				// If we have a HitFilter, apply it.
+				// RDF FIXME: Ignore Hit Filter for now
 
 				// If predicate was not specified but object was specified,
 				// then figure out the right predicate
@@ -845,7 +838,6 @@
 				     IQueryResult        result,
 				     ICollection         search_subset_uris, // should be internal uris
 				     QueryPartHook       query_part_hook,
-				     UriFilter           uri_filter,
 				     HitFilter           hit_filter)
 		{
 			if (Debug)
@@ -963,7 +955,6 @@
 						      result,
 						      term_list,
 						      query.MaxHits,
-						      uri_filter,
 						      new HitFilter (all_hit_filters.HitFilter),
 						      IndexName);
 			}
@@ -1196,7 +1187,6 @@
 							  IQueryResult      result,
 							  ICollection       query_term_list,
 							  int               max_results,
-							  UriFilter         uri_filter,
 							  HitFilter         hit_filter,
 							  string            index_name)
 		{
@@ -1227,13 +1217,16 @@
 			// This is used only for scoring
 			Dictionary<int, Hit> hits_by_id = new Dictionary<int, Hit> (num_hits);
 
+			int total_number_of_matches = primary_matches.TrueCount;
+
 			if (primary_matches.TrueCount > max_results)
 				final_list_of_hits = ScanRecentDocs (primary_reader,
 					secondary_reader,
 					primary_matches,
 					hits_by_id,
 					max_results,
-					uri_filter,
+					ref total_number_of_matches,
+					hit_filter,
 					index_name);
 
 			if (final_list_of_hits == null)
@@ -1242,7 +1235,8 @@
 					primary_matches,
 					hits_by_id,
 					max_results,
-					uri_filter,
+					ref total_number_of_matches,
+					hit_filter,
 					index_name);
 
 			d.Start ();
@@ -1257,8 +1251,6 @@
 
 			e.Start ();
 
-			int total_number_of_matches = primary_matches.TrueCount;
-
 			// 25 hits seems to be the sweet spot: anything lower
 			// and serialization overhead gets us, higher takes
 			// longer to send out.
@@ -1333,7 +1325,8 @@
 						    BetterBitArray	    primary_matches,
 						    Dictionary<int, Hit>    hits_by_id,
 						    int			    max_results,
-						    UriFilter		    uri_filter,
+						    ref int		    total_number_of_matches,
+						    HitFilter		    hit_filter,
 						    string		    index_name)
 		{
 			Stopwatch a = new Stopwatch ();
@@ -1344,6 +1337,7 @@
 			ArrayList results = new ArrayList (max_results);
 			int docs_found = 0;
 			int docs_walked = 0;
+			int hit_filter_removed = 0;
 			int max_docs = (int) (primary_matches.TrueCount * 1.25);
 
 			Term term;
@@ -1366,18 +1360,19 @@
 
 					if (primary_matches.Get (doc_id)) {
 						Document doc = primary_reader.Document (doc_id);
-						// If we have a UriFilter, apply it.
-						if (uri_filter != null) {
-							Uri uri;
-							uri = GetUriFromDocument (doc);
-							if (uri_filter (uri)) {
-								Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);
-								hits_by_id [doc_id] = hit;
-								// Add the result, last modified first
-								results.Add (hit);
-								docs_found++;
-							}
+						Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);
+
+						// If we have a HitFilter, apply it.
+						if (hit_filter != null && ! hit_filter (hit)) {
+							if (Debug)
+								Log.Debug ("Filtered out {0}", hit.Uri);
+							hit_filter_removed ++;
+							continue;
 						}
+						hits_by_id [doc_id] = hit;
+						// Add the result, last modified first
+						results.Add (hit);
+						docs_found++;
 					}
 			
 					docs_walked++;
@@ -1396,6 +1391,13 @@
 				// Otherwise bad luck! Not all docs found
 				// Start afresh - this time traversing all results
 				results = null;
+			} else {
+				// Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
+				// max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
+				// We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
+				// every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
+				// 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
+				total_number_of_matches -= hit_filter_removed;
 			}
 
 			a.Stop ();
@@ -1414,7 +1416,8 @@
 							      BetterBitArray	    primary_matches,
 							      Dictionary<int, Hit>  hits_by_id,
 							      int		    max_results,
-							      UriFilter		    uri_filter,
+							      ref int		    total_number_of_matches,
+							      HitFilter		    hit_filter,
 							      string		    index_name)
 		{
 			Stopwatch b = new Stopwatch ();
@@ -1461,17 +1464,18 @@
 						continue;
 				}
 
-				// If we have a UriFilter, apply it.
-				if (uri_filter != null) {
-					Uri uri;
-					uri = GetUriFromDocument (doc);
-					if (! uri_filter (uri))
-						continue;
-				}
-
 				// Get the actual hit now
-				// doc was created with only 2 fields, so first get the complete lucene document for primary document
+				// doc was created with only 2 fields, so first get the complete lucene document for primary document.
+				// Also run our hit_filter now, if we have one. Since we insist of returning max_results
+				// most recent hits, any hits that would be filtered out should happen now and not later.
 				Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
+				if (hit_filter != null && ! hit_filter (hit)) {
+					if (Debug)
+						Log.Debug ("Filtered out {0}", hit.Uri);
+					total_number_of_matches --;
+					continue;
+				}
+
 				hits_by_id [match_index] = hit;
 
 				// Add the document to the appropriate data structure.

Modified: trunk/beagle/beagled/StaticQueryable.cs
==============================================================================
--- trunk/beagle/beagled/StaticQueryable.cs	(original)
+++ trunk/beagle/beagled/StaticQueryable.cs	Fri Jul 11 02:38:38 2008
@@ -172,7 +172,7 @@
 			return SnippetFu.GetSnippet (query_terms, reader, full_text, ctx_length, snp_length);
 		}
 
-		override protected bool HitIsValid (Uri uri)
+		private bool HitIsValid (Uri uri)
 		{
 			// We can't check anything else than file uris
 			if (! uri.IsFile)
@@ -193,6 +193,14 @@
 		// FIXME: Allow option to search unmounted media ? Return false in that case.
 		override protected bool HitFilter (Hit hit)
 		{
+			if (! HitIsValid (hit.Uri))
+				return false;
+
+			return HitIsValidRemovable (hit);
+		}
+
+		private bool HitIsValidRemovable (Hit hit)
+		{
 			if (mount_dir == null || hit.Uri.Scheme != "removable")
 				return true;
 

Modified: trunk/beagle/beagled/TomboyQueryable/TomboyQueryable.cs
==============================================================================
--- trunk/beagle/beagled/TomboyQueryable/TomboyQueryable.cs	(original)
+++ trunk/beagle/beagled/TomboyQueryable/TomboyQueryable.cs	Fri Jul 11 02:38:38 2008
@@ -212,8 +212,9 @@
 			return indexable.Uri;
 		}
 
-		override protected bool HitIsValid (Uri uri)
+		override protected bool HitFilter (Hit hit)
 		{
+			Uri uri = hit.Uri;
 			string note = Path.Combine (tomboy_dir, uri.Segments [1] + ".note");
 
 			if (File.Exists (note))



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]