[geary/wip/720361-stemming] Parameterize search matching scope, from EXACT to HORIZON.



commit a345c0295387a7015bb215cc826fb0ff0e278cc4
Author: Jim Nelson <jim yorba org>
Date:   Thu Dec 11 15:56:21 2014 -0800

    Parameterize search matching scope, from EXACT to HORIZON.

 src/client/application/geary-controller.vala       |    3 +-
 src/engine/abstract/geary-abstract-account.vala    |    2 +-
 src/engine/api/geary-account.vala                  |   12 +++-
 src/engine/api/geary-search-folder.vala            |    9 ++-
 src/engine/api/geary-search-query.vala             |   47 ++++++++++++-
 src/engine/imap-db/imap-db-account.vala            |   29 ++++----
 src/engine/imap-db/imap-db-search-query.vala       |   72 +++++++++++++++++++-
 .../imap-engine/imap-engine-generic-account.vala   |    4 +-
 8 files changed, 150 insertions(+), 28 deletions(-)
---
diff --git a/src/client/application/geary-controller.vala b/src/client/application/geary-controller.vala
index 08f3fe4..4bb7083 100644
--- a/src/client/application/geary-controller.vala
+++ b/src/client/application/geary-controller.vala
@@ -2512,7 +2512,8 @@ public class GearyController : Geary.BaseObject {
         
         cancel_search(); // Stop any search in progress.
         
-        folder.set_search_query(search_text, cancellable_search);
+        folder.set_search_query(search_text, Geary.SearchQuery.Matching.CONSERVATIVE,
+            cancellable_search);
         
         main_window.folder_list.set_search(folder);
         search_text_changed(main_window.main_toolbar.search_text);
diff --git a/src/engine/abstract/geary-abstract-account.vala b/src/engine/abstract/geary-abstract-account.vala
index 409d0b8..c56b9aa 100644
--- a/src/engine/abstract/geary-abstract-account.vala
+++ b/src/engine/abstract/geary-abstract-account.vala
@@ -118,7 +118,7 @@ public abstract class Geary.AbstractAccount : BaseObject, Geary.Account {
     public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
         Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
     
-    public abstract Geary.SearchQuery open_search(string query);
+    public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Matching matching);
     
     public abstract async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
         int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala
index 0dfc7a6..6b26097 100644
--- a/src/engine/api/geary-account.vala
+++ b/src/engine/api/geary-account.vala
@@ -325,11 +325,19 @@ public interface Geary.Account : BaseObject {
     /**
      * Create a new {@link SearchQuery} for this {@link Account}.
      *
+     * See {@link Geary.SearchQuery.Matching} for more information about how it's interpreted by the
+     * Engine.  In particular, note that it's an advisory parameter only and may have no effect,
+     * especially on server searches.  However, it may also have a dramatic effect on what search
+     * results are returned and so should be used with some caution.  Whether this parameter is
+     * user-configurable, available through GSettings or another configuration mechanism, or simply
+     * baked into the caller's code is up to the caller.  CONSERVATIVE is designed to be a good
+     * default.
+     *
      * The SearchQuery object can only be used with calls into this Account.
      *
-     * Dropping the last reference to the SearchQuery is sufficient to close it.
+     * Dropping the last reference to the SearchQuery will close it.
      */
-    public abstract Geary.SearchQuery open_search(string query);
+    public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Matching matching);
     
     /**
      * Performs a search with the given query.  Optionally, a list of folders not to search
diff --git a/src/engine/api/geary-search-folder.vala b/src/engine/api/geary-search-folder.vala
index e70b933..e7246a7 100644
--- a/src/engine/api/geary-search-folder.vala
+++ b/src/engine/api/geary-search-folder.vala
@@ -203,8 +203,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
     /**
      * Sets the keyword string for this search.
      */
-    public void set_search_query(string query, Cancellable? cancellable = null) {
-        set_search_query_async.begin(query, cancellable, on_set_search_query_complete);
+    public void set_search_query(string query, SearchQuery.Matching matching, Cancellable? cancellable = null) {
+        set_search_query_async.begin(query, matching, cancellable, on_set_search_query_complete);
     }
     
     private void on_set_search_query_complete(Object? source, AsyncResult result) {
@@ -215,8 +215,9 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
         }
     }
     
-    private async void set_search_query_async(string query, Cancellable? cancellable = null) throws Error {
-        Geary.SearchQuery search_query = account.open_search(query);
+    private async void set_search_query_async(string query, SearchQuery.Matching matching,
+        Cancellable? cancellable) throws Error {
+        Geary.SearchQuery search_query = account.open_search(query, matching);
         
         int result_mutex_token = yield result_mutex.claim_async();
         
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 3ed0101..adfc99b 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -10,16 +10,59 @@
  *
  * The only interaction the API user should have with this is creating new ones and then passing
  * them to the search methods in the Engine.
+ *
+ * @see Geary.Account.open_search
  */
 
 public abstract class Geary.SearchQuery : BaseObject {
     /**
+     * An advisory parameter regarding search quality and scope.
+     *
+     * The Engine can perform searches based on (unspecified, uncontracted) textual variations of
+     * a query's search terms.  Some of those variations may produce undesirable results due to
+     * "greedy" matching of terms.  The Matching parameter allows for an advisory to the Engine
+     * about how to use those textual variants, if any at all.
+     *
+     * This may be respected or ignored by the Engine.  In particular, there's no guarantee it will
+     * have any effect on server search.
+     */
+    public enum Matching {
+        /**
+         * Only return exact matches, perform no searches for textual variants.
+         *
+         * Note that Geary's search syntax does prefix-matching for unquoted strings.  EXACT means
+         * exact ''prefix-''matching in this case.
+         */
+        EXACT,
+        /**
+         * Allow for searching for a small set of textual variants and small differences in search
+         * terms.  This is a good default.
+         */
+        CONSERVATIVE,
+        /**
+         * Allow for searching for a broad set of textual variants and larger differences in
+         * search terms.
+         */
+        AGGRESSIVE,
+        /**
+         * Search for all textual variants, i.e. "the sky's the limit."
+         */
+        HORIZON
+    }
+    
+    /**
      * The original user search text.
      */
     public string raw { get; private set; }
     
-    protected SearchQuery(string query) {
-        raw = query;
+    /**
+     * The selected {@link Matching} quality.
+     */
+    public Matching matching { get; private set; }
+    
+    protected SearchQuery(string raw, Matching matching) {
+        this.raw = raw;
+        this.matching = matching;
     }
 }
 
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index 52315af..7eb74e5 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -7,10 +7,6 @@
 private class Geary.ImapDB.Account : BaseObject {
     private const int POPULATE_SEARCH_TABLE_DELAY_SEC = 5;
     
-    private const int MIN_TERM_LENGTH_FOR_STEMMING = 6;
-    private const int MAX_DIFF_TERM_STEM_LENGTH = 2;
-    private const int MAX_MATCH_LENGTH_STEMMED_VARIANT = 2;
-    
     private class FolderReference : Geary.SmartReference {
         public Geary.FolderPath path;
         
@@ -745,17 +741,22 @@ private class Geary.ImapDB.Account : BaseObject {
      *
      * Some hueristics are in place simply to determine if stemming should occur:
      *
-     * # If the term is < MIN_TERM_LENGTH_FOR_STEMMING, no stemming occurs.
+     * # If stemming is not allowed, no stemming occurs.
+     * # If the term is < min. term length for stemming, no stemming occurs.
      * # If the stemmer returns a stem that is the same as the original term, no stemming occurs.
      * # If the difference between the stemmed word and the original term is more than
-     *   MAX_DIFF_TERM_STEM_LENGTH, no stemming occurs.  This works under the assumption that if
+     *   maximum allowed, no stemming occurs.  This works under the assumption that if
      *   the user has typed a long word, they do not want to "go back" to searching for a much
-     *   shorter version of it.  (For example, "accountancy" stems to "account").
+     *   shorter version of it.  (For example, "accountancies" stems to "account").
      *
      * Otherwise, the stem for the term is returned.
      */
-    private string? stem_search_term(string term) {
-        if (term.length < MIN_TERM_LENGTH_FOR_STEMMING)
+    private string? stem_search_term(ImapDB.SearchQuery query, string term) {
+        if (!query.allow_stemming)
+            return null;
+        
+        int term_length = term.length;
+        if (term_length < query.min_term_length_for_stemming)
             return null;
         
         string? stemmed = null;
@@ -790,7 +791,7 @@ private class Geary.ImapDB.Account : BaseObject {
             return null;
         
         // Don't search for stemmed words that are significantly shorter than the user's search term
-        if (term.length - stemmed.length > MAX_DIFF_TERM_STEM_LENGTH) {
+        if (term_length - stemmed.length > query.max_difference_term_stem_lengths) {
             debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms",
                 stemmed, term);
             
@@ -877,7 +878,7 @@ private class Geary.ImapDB.Account : BaseObject {
                 if (parts.length > 1)
                     field = extract_field_from_token(parts, ref s);
                 
-                term = new SearchTerm(original, s, stem_search_term(s), str => "%s*".printf(str));
+                term = new SearchTerm(original, s, stem_search_term(query, s), str => "%s*".printf(str));
             }
             
             if (in_quote && quotes % 2 != 0)
@@ -1093,8 +1094,8 @@ private class Geary.ImapDB.Account : BaseObject {
             
             // For each matched string in this message, retain the message in the search results
             // if it prefix-matches any of the straight-up parsed terms or matches a stemmed
-            // variant (with only MAX_MATCH_LENGTH_STEMMED_VARIANT differences in their lengths,
-            // i.e. not a "greedy" match)
+            // variant (with only max. difference in their lengths allowed, i.e. not a "greedy"
+            // match)
             bool good_match_found = false;
             foreach (string match in matches) {
                 foreach (SearchTerm term in query.get_all_terms()) {
@@ -1108,7 +1109,7 @@ private class Geary.ImapDB.Account : BaseObject {
                     // if prefix-matches stemmed term w/o doing so greedily, then don't strip
                     if (term.stemmed != null && match.has_prefix(term.stemmed)) {
                         int diff = match.length - term.stemmed.length;
-                        if (diff <= MAX_MATCH_LENGTH_STEMMED_VARIANT) {
+                        if (diff <= query.max_difference_match_stem_lengths) {
                             good_match_found = true;
                             
                             break;
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
index 28676e7..eea1f68 100644
--- a/src/engine/imap-db/imap-db-search-query.vala
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -19,15 +19,83 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
      */
     public bool parsed { get; set; default = false; }
     
+    /**
+     * Determined by {@link matching}.
+     */
+    public bool allow_stemming { get; private set; }
+    
+    /**
+     * Minimum length of the term before stemming is allowed.
+     *
+     * This prevents short words that might be stemmed from being stemmed.
+     *
+     * Overridden by {@link allow_stemming}.  Determined by {@link matching}.
+     */
+    public int min_term_length_for_stemming { get; private set; }
+    
+    /**
+     * Maximum difference in lengths between term and stemmed variant.
+     *
+     * This prevents long words from being stemmed to much shorter words (which creates
+     * opportunities for greedy matching).
+     *
+     * Overridden by {@link allow_stemming}.  Determined by {@link matching}.
+     */
+    public int max_difference_term_stem_lengths { get; private set; }
+    
+    /**
+     * Maximum difference in lengths between a matched word and the stemmed variant it matched
+     * against.
+     *
+     * This prevents long words being matched to short stem variants (which creates opportunities
+     * for greedy matching).
+     *
+     * Overridden by {@link allow_stemming}.  Determined by {@link matching}.
+     */
+    public int max_difference_match_stem_lengths { get; private set; }
+    
     // Not using a MultiMap because we (might) need a guarantee of order.
     private Gee.HashMap<string?, Gee.ArrayList<SearchTerm>> field_map
         = new Gee.HashMap<string?, Gee.ArrayList<SearchTerm>>();
     private Gee.ArrayList<SearchTerm> all = new Gee.ArrayList<SearchTerm>();
     
-    public SearchQuery(ImapDB.Account account, string query) {
-        base (query);
+    public SearchQuery(ImapDB.Account account, string query, Geary.SearchQuery.Matching matching) {
+        base (query, matching);
         
         this.account = account;
+        
+        switch (matching) {
+            case Matching.EXACT:
+                allow_stemming = false;
+                min_term_length_for_stemming = int.MAX;
+                max_difference_term_stem_lengths = 0;
+                max_difference_match_stem_lengths = 0;
+            break;
+            
+            case Matching.CONSERVATIVE:
+                allow_stemming = true;
+                min_term_length_for_stemming = 6;
+                max_difference_term_stem_lengths = 2;
+                max_difference_match_stem_lengths = 2;
+            break;
+            
+            case Matching.AGGRESSIVE:
+                allow_stemming = true;
+                min_term_length_for_stemming = 4;
+                max_difference_term_stem_lengths = 4;
+                max_difference_match_stem_lengths = 3;
+            break;
+            
+            case Matching.HORIZON:
+                allow_stemming = true;
+                min_term_length_for_stemming = 0;
+                max_difference_term_stem_lengths = int.MAX;
+                max_difference_match_stem_lengths = int.MAX;
+            break;
+            
+            default:
+                assert_not_reached();
+        }
     }
     
     public void add_search_term(string? field, SearchTerm term) {
diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala
index 47ce2ac..f59af5a 100644
--- a/src/engine/imap-engine/imap-engine-generic-account.vala
+++ b/src/engine/imap-engine/imap-engine-generic-account.vala
@@ -824,8 +824,8 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
         return yield local.fetch_email_async(check_id(email_id), required_fields, cancellable);
     }
     
-    public override Geary.SearchQuery open_search(string query) {
-        return new ImapDB.SearchQuery(local, query);
+    public override Geary.SearchQuery open_search(string query, SearchQuery.Matching matching) {
+        return new ImapDB.SearchQuery(local, query, matching);
     }
     
     public override async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]