[geary/wip/720361-stemming] Improvements on search/stemming via some heuristics



commit f314c18b1b792f9615df9d66a64c388467e47a3d
Author: Jim Nelson <jim yorba org>
Date:   Wed Dec 10 17:51:59 2014 -0800

    Improvements on search/stemming via some heuristics
    
    Not bulletproof, but now heading in a direction where (a) exact-prefix
    searching is possible (because the FTS is not tokenized via the
    stemmer), (b) stemmed variant searching is possible (because the
    "fake" TokenizerTable is used to call it up), (c) some heuristics are
    performed on the stemmed variant to see if it's suitable for searching,
    and (d) the search results are culled to strip out any results which
    rely solely on "greedy" prefix matching of the stemmed variants only.
    
    Still some cleanup to do.  Also considering making this configurable
    by the client (although may not expose this via the UI) so there's
    some way to choose between strict/aggressive/forgiving searches w/o
    changing consts and recompiling.  Highlighting is somewhat broken at
    the moment.

 src/CMakeLists.txt                                 |    2 +
 src/engine/abstract/geary-abstract-account.vala    |    2 +
 src/engine/api/geary-account.vala                  |    9 +
 src/engine/api/geary-search-folder.vala            |    2 +-
 src/engine/api/geary-search-query.vala             |   45 +---
 src/engine/imap-db/imap-db-account.vala            |  243 ++++++++++++++------
 src/engine/imap-db/imap-db-search-query.vala       |   53 +++++
 src/engine/imap-db/imap-db-search-term.vala        |   66 ++++++
 .../imap-engine/imap-engine-generic-account.vala   |    4 +
 9 files changed, 321 insertions(+), 105 deletions(-)
---
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 13d1f2b..dbd4d98 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -170,6 +170,8 @@ engine/imap-db/imap-db-email-identifier.vala
 engine/imap-db/imap-db-folder.vala
 engine/imap-db/imap-db-message-addresses.vala
 engine/imap-db/imap-db-message-row.vala
+engine/imap-db/imap-db-search-query.vala
+engine/imap-db/imap-db-search-term.vala
 engine/imap-db/imap-db-search-email-identifier.vala
 engine/imap-db/outbox/smtp-outbox-email-identifier.vala
 engine/imap-db/outbox/smtp-outbox-email-properties.vala
diff --git a/src/engine/abstract/geary-abstract-account.vala b/src/engine/abstract/geary-abstract-account.vala
index 7c194f8..409d0b8 100644
--- a/src/engine/abstract/geary-abstract-account.vala
+++ b/src/engine/abstract/geary-abstract-account.vala
@@ -118,6 +118,8 @@ public abstract class Geary.AbstractAccount : BaseObject, Geary.Account {
     public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
         Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
     
+    public abstract Geary.SearchQuery open_search(string query);
+    
     public abstract async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
         int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
         Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws Error;
diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala
index a144566..0dfc7a6 100644
--- a/src/engine/api/geary-account.vala
+++ b/src/engine/api/geary-account.vala
@@ -323,6 +323,15 @@ public interface Geary.Account : BaseObject {
         Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
     
     /**
+     * Create a new {@link SearchQuery} for this {@link Account}.
+     *
+     * The SearchQuery object can only be used with calls into this Account.
+     *
+     * Dropping the last reference to the SearchQuery is sufficient to close it.
+     */
+    public abstract Geary.SearchQuery open_search(string query);
+    
+    /**
      * Performs a search with the given query.  Optionally, a list of folders not to search
      * can be passed as well as a list of email identifiers to restrict the search to only those messages.
      * Returns a list of EmailIdentifiers, or null if there are no results.
diff --git a/src/engine/api/geary-search-folder.vala b/src/engine/api/geary-search-folder.vala
index 4d03421..e70b933 100644
--- a/src/engine/api/geary-search-folder.vala
+++ b/src/engine/api/geary-search-folder.vala
@@ -216,7 +216,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
     }
     
     private async void set_search_query_async(string query, Cancellable? cancellable = null) throws Error {
-        Geary.SearchQuery search_query = new Geary.SearchQuery(query);
+        Geary.SearchQuery search_query = account.open_search(query);
         
         int result_mutex_token = yield result_mutex.claim_async();
         
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 557809c..3ed0101 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -6,45 +6,20 @@
 
 /**
  * An object to hold state for various search subsystems that might need to
- * parse the same text string different ways.  The only interaction the API
- * user should have with this is creating new ones and then passing them off to
- * the search methods in the engine.
+ * parse the same text string different ways.
  *
- * TODO: support anything other than ImapDB.Account's search methods.
+ * The only interaction the API user should have with this is creating new ones and then passing
+ * them to the search methods in the Engine.
  */
-public class Geary.SearchQuery : BaseObject {
+
+public abstract class Geary.SearchQuery : BaseObject {
+    /**
+     * The original user search text.
+     */
     public string raw { get; private set; }
-    public bool parsed { get; internal set; default = false; }
-    
-    internal int stemming_level { get; set; default = 0; }
-    
-    // Not using a MultiMap because we (might) need a guarantee of order.
-    private Gee.HashMap<string?, Gee.ArrayList<string>> field_map
-        = new Gee.HashMap<string?, Gee.ArrayList<string>>();
     
-    public SearchQuery(string query) {
+    protected SearchQuery(string query) {
         raw = query;
     }
-    
-    public void clear() {
-        field_map.clear();
-    }
-    
-    internal void add_token(string? field, string token) {
-        if (!field_map.has_key(field))
-            field_map.set(field, new Gee.ArrayList<string>());
-        
-        field_map.get(field).add(token);
-    }
-    
-    internal Gee.Collection<string?> get_fields() {
-        return field_map.keys;
-    }
-    
-    internal Gee.List<string>? get_tokens(string? field) {
-        if (!field_map.has_key(field))
-            return null;
-        
-        return field_map.get(field);
-    }
 }
+
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index da66583..d589113 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -7,6 +7,10 @@
 private class Geary.ImapDB.Account : BaseObject {
     private const int POPULATE_SEARCH_TABLE_DELAY_SEC = 5;
     
+    private const int MIN_TERM_LENGTH_FOR_STEMMING = 6;
+    private const int MAX_DIFF_TERM_STEM_LENGTH = 2;
+    private const int MAX_MATCH_LENGTH_STEMMED_VARIANT = 2;
+    
     private class FolderReference : Geary.SmartReference {
         public Geary.FolderPath path;
         
@@ -61,6 +65,14 @@ private class Geary.ImapDB.Account : BaseObject {
             throw new EngineError.OPEN_REQUIRED("Database not open");
     }
     
+    private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error {
+        ImapDB.SearchQuery? query = q as ImapDB.SearchQuery;
+        if (query == null || query.account != this)
+            throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name);
+        
+        return query;
+    }
+    
     public static void get_imap_db_storage_locations(File user_data_dir, out File db_file,
         out File attachments_dir) {
         db_file = ImapDB.Database.get_db_file(user_data_dir);
@@ -716,27 +728,35 @@ private class Geary.ImapDB.Account : BaseObject {
     }
     
     /**
-     * This method is used to convert an unquoted user-entered search terms into a search term best
-     * suited for the SQLite FTS table.
+     * This method is used to convert an unquoted user-entered search term into a stemmed search
+     * term.
      *
      * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
-     * needs when coupled with prefix-matching of all unquoted terms (see
-     * https://bugzilla.gnome.org/show_bug.cgi?id=713179)   This method is a heuristic designed to
-     * dampen that aggressiveness without losing the benefits of stemming entirely.
+     * tastes when coupled with prefix-matching of all unquoted terms (see
+     * https://bugzilla.gnome.org/show_bug.cgi?id=713179)   This method is part of a larger strategy
+     * designed to dampen that aggressiveness without losing the benefits of stemming entirely.
      *
      * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
      * with no stemming (using only SQLite's "simple" tokenizer).  It also creates a "magic" SQLite
      * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
-     * is still installed in Geary.
+     * is still installed in Geary.  Thus, we are now in the position to search for the original
+     * term and its stemmed variant, then do post-search processing to strip results which are
+     * too "greedy" due to prefix-matching the stemmed variant.
      *
-     * For each term, this heuristic makes the following decisions:
+     * Some heuristics are in place simply to determine if stemming should occur:
      *
-     * # 
+     * # If the term is < MIN_TERM_LENGTH_FOR_STEMMING, no stemming occurs.
+     * # If the stemmer returns a stem that is the same as the original term, no stemming occurs.
+     * # If the difference between the stemmed word and the original term is more than
+     *   MAX_DIFF_TERM_STEM_LENGTH, no stemming occurs.  This works under the assumption that if
+     *   the user has typed a long word, they do not want to "go back" to searching for a much
+     *   shorter version of it.  (For example, "accountancy" stems to "account").
+     *
+     * Otherwise, the stem for the term is returned.
      */
-    private string stem_search_term(string term, int stemming_level) {
-        int term_length = term.length;
-        if (term_length <= 5)
-            return term;
+    private string? stem_search_term(string term) {
+        if (term.length < MIN_TERM_LENGTH_FOR_STEMMING)
+            return null;
         
         string? stemmed = null;
         try {
@@ -752,29 +772,37 @@ private class Geary.ImapDB.Account : BaseObject {
             if (!result.finished)
                 stemmed = result.string_at(0);
             else
-                message("No stemmed term returned for \"%s\"", term);
+                debug("No stemmed term returned for \"%s\"", term);
         } catch (Error err) {
-            message("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
+            debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
             
             // fall-through
         }
         
         if (String.is_empty(stemmed)) {
-            message("Empty stemmed term returned for \"%s\"", term);
+            debug("Empty stemmed term returned for \"%s\"", term);
             
-            return term;
+            return null;
         }
         
-        int stemmed_length = stemmed.length;
-        int diff = term_length - stemmed_length;
+        // If same term returned, treat as non-stemmed
+        if (stemmed == term)
+            return null;
         
-        message("TERM->STEM: \"%s\" -> \"%s\" (diff=%d, stemming_level=%d)", term, stemmed, diff,
-            stemming_level);
+        // Don't search for stemmed words that are significantly shorter than the user's search term
+        if (term.length - stemmed.length > MAX_DIFF_TERM_STEM_LENGTH) {
+            debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms",
+                stemmed, term);
+            
+            return null;
+        }
         
-        return (diff <= stemming_level) ? stemmed : term;
+        debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
+        
+        return stemmed;
     }
     
-    private void prepare_search_query(Geary.SearchQuery query) {
+    private void prepare_search_query(ImapDB.SearchQuery query) {
         if (query.parsed)
             return;
         
@@ -812,16 +840,31 @@ private class Geary.ImapDB.Account : BaseObject {
                 --quotes;
             }
             
+            SearchTerm? term;
             if (in_quote) {
                 // HACK: this helps prevent a syntax error when the user types
                 // something like from:"somebody".  If we ever properly support
                 // quotes after : we can get rid of this.
-                s = s.replace(":", " ");
+                term = new SearchTerm(s, s, null, str => str.replace(":", " "));
             } else {
+                string original = s;
+                
+                // some common search phrases we don't respect and therefore don't want to fall
+                // through to search results
                 string lower = s.down();
-                if (lower == "" || lower == "and" || lower == "or" || lower == "not" || lower == "near"
-                    || lower.has_prefix("near/"))
-                    continue;
+                switch (s.down()) {
+                    case "":
+                    case "and":
+                    case "or":
+                    case "not":
+                    case "near":
+                        continue;
+                    
+                    default:
+                        if (lower.has_prefix("near/"))
+                            continue;
+                    break;
+                }
                 
                 if (s.has_prefix("-"))
                     s = s.substring(1);
@@ -834,13 +877,13 @@ private class Geary.ImapDB.Account : BaseObject {
                 if (parts.length > 1)
                     field = extract_field_from_token(parts, ref s);
                 
-                s = "\"%s*\"".printf(query.stemming_level > 0 ? stem_search_term(s, query.stemming_level) : s);
+                term = new SearchTerm(original, s, stem_search_term(s), str => "%s*".printf(str));
             }
             
             if (in_quote && quotes % 2 != 0)
                 in_quote = false;
             
-            query.add_token(field, s);
+            query.add_search_term(field, term);
         }
         
         assert(!in_quote);
@@ -849,28 +892,50 @@ private class Geary.ImapDB.Account : BaseObject {
     }
     
     // Return a map of column -> phrase, to use as WHERE column MATCH 'phrase'.
-    private Gee.HashMap<string, string> get_query_phrases(Geary.SearchQuery query) {
+    private Gee.HashMap<string, string> get_query_phrases(ImapDB.SearchQuery query) {
         prepare_search_query(query);
         
         Gee.HashMap<string, string> phrases = new Gee.HashMap<string, string>();
         foreach (string? field in query.get_fields()) {
-            string? phrase = null;
-            Gee.List<string>? tokens = query.get_tokens(field);
-            if (tokens != null) {
-                string[] array = tokens.to_array();
-                // HACK: work around a bug in vala where it's not null-terminating
-                // arrays created from generic-typed functions (Gee.Collection.to_array)
-                // before passing them off to g_strjoinv.  Simply making a copy to a
-                // local proper string array adds the null for us.
-                string[] copy = new string[array.length];
-                for (int i = 0; i < array.length; ++i)
-                    copy[i] = array[i];
-                phrase = string.joinv(" ", copy).strip();
+            Gee.List<SearchTerm>? terms = query.get_search_terms(field);
+            if (terms == null || terms.size == 0)
+                continue;
+            
+            StringBuilder builder = new StringBuilder("(");
+            // Each SearchTerm is an AND but the SQL texts within it are OR ... this allows for
+            // each user term to be AND but the variants of each term are OR.  So, if terms are
+            // [party] and [eventful] and stems are [parti] and [event], the search would be:
+            //
+            // (party* OR parti*) AND (eventful* OR event*)
+            //
+            // Obviously with stemming there's the possibility of the stemmed variant being nothing
+            // but a broader search of the original term (such as event* and eventful*) but do both
+            // to determine from each hit result which term caused the hit, and if it's too greedy
+            // a match of the stemmed variant, it can be stripped from the results.
+            bool is_first_term = true;
+            foreach (SearchTerm term in terms) {
+                if (term.sql.size == 0)
+                    continue;
+                
+                if (!is_first_term)
+                    builder.append(") AND (");
+                
+                bool is_first_sql = true;
+                foreach (string sql in term.sql) {
+                    if (!is_first_sql)
+                        builder.append(" OR ");
+                    
+                    builder.append_printf("%s ", sql);
+                    is_first_sql = false;
+                }
+                
+                is_first_term = false;
             }
+            builder.append(")");
             
-            if (!Geary.String.is_empty(phrase))
-                phrases.set((field == null ? "MessageSearchTable" : field), phrase);
+            phrases.set((field == null ? "MessageSearchTable" : field), builder.str);
         }
+        
         return phrases;
     }
     
@@ -924,31 +989,13 @@ private class Geary.ImapDB.Account : BaseObject {
         return sql.str;
     }
     
-    public async Gee.Collection<Geary.EmailIdentifier>? search_async(Geary.SearchQuery query,
+    public async Gee.Collection<Geary.EmailIdentifier>? search_async(Geary.SearchQuery q,
         int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
         Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null)
         throws Error {
         check_open();
+        ImapDB.SearchQuery query = check_search_query(q);
         
-        for (;;) {
-            Gee.Collection<Geary.EmailIdentifier>? results = yield do_search_async(query,
-                limit, offset, folder_blacklist, search_ids, cancellable);
-            if (results != null && results.size > 0)
-                return results;
-            
-            if (query.stemming_level >= 2)
-                return null;
-            
-            query.stemming_level = query.stemming_level + 1;
-            query.parsed = false;
-            query.clear();
-            debug("RETYING SEARCH WITH STEMMING LEVEL %d: NO RESULTS FOUND", query.stemming_level);
-        }
-    }
-    
-    private async Gee.Collection<Geary.EmailIdentifier>? do_search_async(Geary.SearchQuery query,
-        int limit, int offset, Gee.Collection<Geary.FolderPath?>? folder_blacklist,
-        Gee.Collection<Geary.EmailIdentifier>? search_ids, Cancellable? cancellable) throws Error {
         Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
         if (query_phrases.size == 0)
             return null;
@@ -1006,11 +1053,11 @@ private class Geary.ImapDB.Account : BaseObject {
             
             Db.Result result = stmt.exec(cancellable);
             while (!result.finished) {
-                int64 id = result.int64_at(0);
+                int64 message_id = result.int64_at(0);
                 int64 internaldate_time_t = result.int64_at(1);
                 DateTime? internaldate = (internaldate_time_t == -1
                     ? null : new DateTime.from_unix_local(internaldate_time_t));
-                search_results.add(new ImapDB.SearchEmailIdentifier(id, internaldate));
+                search_results.add(new ImapDB.SearchEmailIdentifier(message_id, internaldate));
                 
                 result.next(cancellable);
             }
@@ -1018,6 +1065,61 @@ private class Geary.ImapDB.Account : BaseObject {
             return Db.TransactionOutcome.DONE;
         }, cancellable);
         
+        if (search_results.size == 0)
+            return null;
+        
+        //
+        // Strip out search results that only contain a hit due to "greedy" matching of the stemmed
+        // variants on all search terms
+        //
+        
+        Gee.HashSet<string> stripped_matches = new Gee.HashSet<string>();
+        Gee.HashSet<string> accepted_matches = new Gee.HashSet<string>();
+        Gee.Iterator<ImapDB.SearchEmailIdentifier> iter = search_results.iterator();
+        while (iter.next()) {
+            Gee.Collection<string>? matches = yield get_search_matches_async(query,
+                iterate<ImapDB.EmailIdentifier>(iter.get()).to_array_list(), cancellable);
+            if (matches == null || matches.size == 0) {
+                iter.remove();
+                
+                continue;
+            }
+            
+            bool good_match_found = false;
+            foreach (string match in matches) {
+                foreach (SearchTerm term in query.get_all_terms()) {
+                    if (match.has_prefix(term.parsed)) {
+                        good_match_found = true;
+                        
+                        break;
+                    }
+                    
+                    if (term.stemmed != null && match.has_prefix(term.stemmed)) {
+                        int diff = match.length - term.stemmed.length;
+                        if (diff <= MAX_MATCH_LENGTH_STEMMED_VARIANT) {
+                            good_match_found = true;
+                            accepted_matches.add(match);
+                            
+                            break;
+                        }
+                        
+                        stripped_matches.add(match);
+                    }
+                }
+                
+                if (good_match_found)
+                    break;
+            }
+            
+            if (!good_match_found)
+                iter.remove();
+        }
+        
+        foreach (string accepted_match in accepted_matches)
+            debug("Accepted with \"%s\": %s", query.raw, accepted_match);
+        foreach (string stripped_match in stripped_matches)
+            debug("Stripped from \"%s\": %s", query.raw, stripped_match);
+        
         return (search_results.size == 0 ? null : search_results);
     }
     
@@ -1026,6 +1128,9 @@ private class Geary.ImapDB.Account : BaseObject {
     // address and the database tokenizes out the @ and ., etc.  It's not meant
     // to be comprehensive, just a little extra highlighting applied to make
     // the results look a little closer to what you typed.
+    //
+    // TODO: This needs to be done by the client, not the library.
+    /*
     private void add_literal_matches(string raw_query, Gee.Set<string> search_matches) {
         foreach (string word in raw_query.split(" ")) {
             if (word.has_suffix("\""))
@@ -1037,10 +1142,12 @@ private class Geary.ImapDB.Account : BaseObject {
                 search_matches.add(word);
         }
     }
+    */
     
-    public async Gee.Collection<string>? get_search_matches_async(Geary.SearchQuery query,
+    public async Gee.Collection<string>? get_search_matches_async(Geary.SearchQuery q,
         Gee.Collection<ImapDB.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error {
         check_open();
+        ImapDB.SearchQuery query = check_search_query(q);
         
         Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
         if (query_phrases.size == 0)
@@ -1064,8 +1171,6 @@ private class Geary.ImapDB.Account : BaseObject {
             foreach (string key in query_phrases.keys)
                 builder.append_printf("%s ", query_phrases[key]);
             
-            debug("\nMATCHES:\n%s\nPHRASES:%s\n", sql.str, builder.str);
-            
             Db.Statement stmt = cx.prepare(sql.str);
             sql_bind_query_phrases(stmt, 0, query_phrases);
             
@@ -1096,7 +1201,7 @@ private class Geary.ImapDB.Account : BaseObject {
             return Db.TransactionOutcome.DONE;
         }, cancellable);
         
-        add_literal_matches(query.raw, search_matches);
+        //add_literal_matches(query.raw, search_matches);
         
         return (search_matches.size == 0 ? null : search_matches);
     }
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
new file mode 100644
index 0000000..28676e7
--- /dev/null
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -0,0 +1,53 @@
+/* Copyright 2014 Yorba Foundation
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later).  See the COPYING file in this distribution.
+ */
+
+/**
+ * Internal implementation of {@link Geary.SearchQuery}.
+ */
+
+private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
+    /**
+     * Associated {@link ImapDB.Account}.
+     */
+    public weak ImapDB.Account account { get; private set; }
+    
+    /**
+     * Whether or not the query has been parsed and processed prior to search submission.
+     */
+    public bool parsed { get; set; default = false; }
+    
+    // Not using a MultiMap because we (might) need a guarantee of order.
+    private Gee.HashMap<string?, Gee.ArrayList<SearchTerm>> field_map
+        = new Gee.HashMap<string?, Gee.ArrayList<SearchTerm>>();
+    private Gee.ArrayList<SearchTerm> all = new Gee.ArrayList<SearchTerm>();
+    
+    public SearchQuery(ImapDB.Account account, string query) {
+        base (query);
+        
+        this.account = account;
+    }
+    
+    public void add_search_term(string? field, SearchTerm term) {
+        if (!field_map.has_key(field))
+            field_map.set(field, new Gee.ArrayList<SearchTerm>());
+        
+        field_map.get(field).add(term);
+        all.add(term);
+    }
+    
+    public Gee.Collection<string?> get_fields() {
+        return field_map.keys;
+    }
+    
+    public Gee.List<SearchTerm>? get_search_terms(string? field) {
+        return field_map.has_key(field) ? field_map.get(field) : null;
+    }
+    
+    public Gee.List<SearchTerm>? get_all_terms() {
+        return all;
+    }
+}
+
diff --git a/src/engine/imap-db/imap-db-search-term.vala b/src/engine/imap-db/imap-db-search-term.vala
new file mode 100644
index 0000000..a1fe8d0
--- /dev/null
+++ b/src/engine/imap-db/imap-db-search-term.vala
@@ -0,0 +1,66 @@
+/* Copyright 2014 Yorba Foundation
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later).  See the COPYING file in this distribution.
+ */
+
+/**
+ * Various associated state with a single term in a {@link ImapDB.SearchQuery}.
+ */
+
+private class Geary.ImapDB.SearchTerm : BaseObject {
+    /**
+     * Convert search term text into SQL for SQLite's MATCH operator.
+     *
+     * If null or an empty string is returned, the text will be dropped.
+     */
+    public delegate string? TextToMatch(string text);
+    
+    /**
+     * The original tokenized search term with minimal other processing performed.
+     *
+     * For example, punctuation might be removed, but no casefolding has occurred.
+     */
+    public string original { get; private set; }
+    
+    /**
+     * The parsed tokenized search term.
+     *
+     * Casefolding and other normalizing text operations have been performed.
+     */
+    public string parsed { get; private set; }
+    
+    /**
+     * The stemmed search term.
+     *
+     * Only used if stemming is being done ''and'' the stem is different than the {@link parsed}
+     * term.
+     */
+    public string? stemmed { get; private set; }
+    
+    /**
+     * A list of terms ready for binding to an SQLite statement.
+     *
+     * This should include prefix operators and quotes (i.e. ["party"] or [party*]).  These texts
+     * are guaranteed not to be null or empty strings.
+     */
+    public Gee.List<string> sql { get; private set; default = new Gee.ArrayList<string>(); }
+    
+    public SearchTerm(string original, string parsed, string? stemmed, TextToMatch text_to_match) {
+        this.original = original;
+        this.parsed = parsed;
+        this.stemmed = stemmed;
+        
+        // for now, only two variations: the parsed string and the stemmed; since stem is usually
+        // shorter (and will be first in the OR statement), include it first
+        if (stemmed != null)
+            add_sql(text_to_match(stemmed));
+        add_sql(text_to_match(parsed));
+    }
+    
+    private void add_sql(string? match_term) {
+        if (!String.is_empty(match_term))
+            sql.add(match_term);
+    }
+}
+
diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala
index ef84ee6..47ce2ac 100644
--- a/src/engine/imap-engine/imap-engine-generic-account.vala
+++ b/src/engine/imap-engine/imap-engine-generic-account.vala
@@ -824,6 +824,10 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
         return yield local.fetch_email_async(check_id(email_id), required_fields, cancellable);
     }
     
+    public override Geary.SearchQuery open_search(string query) {
+        return new ImapDB.SearchQuery(local, query);
+    }
+    
     public override async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
         int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
         Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws Error {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]