[geary/wip/720361-stemming] First stab at this



commit 1a94ec6cf4edeb52963b5fbd2f8489f340cb0b28
Author: Jim Nelson <jim yorba org>
Date:   Tue Dec 9 17:40:54 2014 -0800

    First stab at this
    
    This deletes the old search FTS table and installs a new one, so
    back up your Geary data directory before using.

 sql/CMakeLists.txt                           |    1 +
 sql/version-023.sql                          |   21 ++++++
 src/client/application/geary-controller.vala |    5 +-
 src/engine/api/geary-search-query.vala       |    6 ++
 src/engine/imap-db/imap-db-account.vala      |   95 +++++++++++++++++++++++++-
 src/engine/imap-db/imap-db-database.vala     |   27 +++++++-
 6 files changed, 149 insertions(+), 6 deletions(-)
---
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index bbd8f91..40184ce 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -22,3 +22,4 @@ install(FILES version-019.sql DESTINATION ${SQL_DEST})
 install(FILES version-020.sql DESTINATION ${SQL_DEST})
 install(FILES version-021.sql DESTINATION ${SQL_DEST})
 install(FILES version-022.sql DESTINATION ${SQL_DEST})
+install(FILES version-023.sql DESTINATION ${SQL_DEST})
diff --git a/sql/version-023.sql b/sql/version-023.sql
new file mode 100644
index 0000000..d282516
--- /dev/null
+++ b/sql/version-023.sql
@@ -0,0 +1,21 @@
+--
+-- Database upgrade that replaces the stemmed FTS table with an unstemmed one.  The FTS tokenize
+-- virtual table, which allows querying the tokenizer directly for stemmed words, is created in
+-- post-upgrade code.  We now use the stemmer manually to generate search queries.
+--
+
+DROP TABLE MessageSearchTable;
+
+CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
+    body,
+    attachment,
+    subject,
+    from_field,
+    receivers,
+    cc,
+    bcc,
+    
+    tokenize=simple,
+    prefix="2,4,6,8,10"
+);
+
diff --git a/src/client/application/geary-controller.vala b/src/client/application/geary-controller.vala
index 808f3f4..08f3fe4 100644
--- a/src/client/application/geary-controller.vala
+++ b/src/client/application/geary-controller.vala
@@ -81,7 +81,7 @@ public class GearyController : Geary.BaseObject {
     private const string MOVE_MESSAGE_TOOLTIP_MULTIPLE = _("Move conversations");
     
     private const int SELECT_FOLDER_TIMEOUT_USEC = 100 * 1000;
-    private const int SEARCH_TIMEOUT_MSEC = 100;
+    private const int SEARCH_TIMEOUT_MSEC = 250;
     
     private const string PROP_ATTEMPT_OPEN_ACCOUNT = "attempt-open-account";
     
@@ -2523,7 +2523,8 @@ public class GearyController : Geary.BaseObject {
         // search after a quick delay when they finish typing.
         if (search_timeout_id != 0)
             Source.remove(search_timeout_id);
-        search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout);
+        
+        search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout, Priority.LOW);
     }
     
     private bool on_search_timeout() {
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 51bcbd3..557809c 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -16,6 +16,8 @@ public class Geary.SearchQuery : BaseObject {
     public string raw { get; private set; }
     public bool parsed { get; internal set; default = false; }
     
+    internal int stemming_level { get; set; default = 0; }
+    
     // Not using a MultiMap because we (might) need a guarantee of order.
     private Gee.HashMap<string?, Gee.ArrayList<string>> field_map
         = new Gee.HashMap<string?, Gee.ArrayList<string>>();
@@ -24,6 +26,10 @@ public class Geary.SearchQuery : BaseObject {
         raw = query;
     }
     
+    public void clear() {
+        field_map.clear();
+    }
+    
     internal void add_token(string? field, string token) {
         if (!field_map.has_key(field))
             field_map.set(field, new Gee.ArrayList<string>());
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index c4a370e..da66583 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -715,6 +715,65 @@ private class Geary.ImapDB.Account : BaseObject {
         return null;
     }
     
+    /**
+     * This method is used to convert an unquoted user-entered search term into the form best
+     * suited for the SQLite FTS table.
+     *
+     * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
+     * needs when coupled with prefix-matching of all unquoted terms (see
+     * https://bugzilla.gnome.org/show_bug.cgi?id=713179).  This method is a heuristic designed to
+     * dampen that aggressiveness without losing the benefits of stemming entirely.
+     *
+     * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
+     * with no stemming (using only SQLite's "simple" tokenizer).  It also creates a "magic" SQLite
+     * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
+     * is still installed in Geary.
+     *
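+     * For each term: terms of 5 bytes or fewer, or with no stemmer result, are used unchanged;
+     * otherwise the stemmed form is used only if it is at most stemming_level bytes shorter than
+     * the original term.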
+     */
+    private string stem_search_term(string term, int stemming_level) {
+        int term_length = term.length;
+        if (term_length <= 5)
+            return term;
+        
+        string? stemmed = null;
+        try {
+            Db.Statement stmt = db.prepare("""
+                SELECT token
+                FROM TokenizerTable
+                WHERE input=?
+            """);
+            stmt.bind_string(0, term);
+            
+            // get stemmed string; if no result, fall through
+            Db.Result result = stmt.exec();
+            if (!result.finished)
+                stemmed = result.string_at(0);
+            else
+                message("No stemmed term returned for \"%s\"", term);
+        } catch (Error err) {
+            message("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
+            
+            // fall-through
+        }
+        
+        if (String.is_empty(stemmed)) {
+            message("Empty stemmed term returned for \"%s\"", term);
+            
+            return term;
+        }
+        
+        int stemmed_length = stemmed.length;
+        int diff = term_length - stemmed_length;
+        
+        message("TERM->STEM: \"%s\" -> \"%s\" (diff=%d, stemming_level=%d)", term, stemmed, diff,
+            stemming_level);
+        
+        return (diff <= stemming_level) ? stemmed : term;
+    }
+    
     private void prepare_search_query(Geary.SearchQuery query) {
         if (query.parsed)
             return;
@@ -775,7 +834,7 @@ private class Geary.ImapDB.Account : BaseObject {
                 if (parts.length > 1)
                     field = extract_field_from_token(parts, ref s);
                 
-                s = "\"" + s + "*\"";
+                s = "\"%s*\"".printf(query.stemming_level > 0 ? stem_search_term(s, query.stemming_level) : s);
             }
             
             if (in_quote && quotes % 2 != 0)
@@ -867,9 +926,29 @@ private class Geary.ImapDB.Account : BaseObject {
     
     public async Gee.Collection<Geary.EmailIdentifier>? search_async(Geary.SearchQuery query,
         int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
-        Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws Error {
+        Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null)
+        throws Error {
         check_open();
         
+        for (;;) {
+            Gee.Collection<Geary.EmailIdentifier>? results = yield do_search_async(query,
+                limit, offset, folder_blacklist, search_ids, cancellable);
+            if (results != null && results.size > 0)
+                return results;
+            
+            if (query.stemming_level >= 2)
+                return null;
+            
+            query.stemming_level = query.stemming_level + 1;
+            query.parsed = false;
+            query.clear();
+            debug("RETYING SEARCH WITH STEMMING LEVEL %d: NO RESULTS FOUND", query.stemming_level);
+        }
+    }
+    
+    private async Gee.Collection<Geary.EmailIdentifier>? do_search_async(Geary.SearchQuery query,
+        int limit, int offset, Gee.Collection<Geary.FolderPath?>? folder_blacklist,
+        Gee.Collection<Geary.EmailIdentifier>? search_ids, Cancellable? cancellable) throws Error {
         Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
         if (query_phrases.size == 0)
             return null;
@@ -912,6 +991,12 @@ private class Geary.ImapDB.Account : BaseObject {
             if (limit > 0)
                 sql.append(" LIMIT ? OFFSET ?");
             
+            StringBuilder builder = new StringBuilder();
+            foreach (string key in query_phrases.keys)
+                builder.append_printf("%s ", query_phrases[key]);
+            
+            debug("\nSEARCH:\n%s\nPHRASES:%s\n", sql.str, builder.str);
+            
             Db.Statement stmt = cx.prepare(sql.str);
             int bind_index = sql_bind_query_phrases(stmt, 0, query_phrases);
             if (limit > 0) {
@@ -975,6 +1060,12 @@ private class Geary.ImapDB.Account : BaseObject {
             sql.append(")");
             sql_add_query_phrases(sql, query_phrases);
             
+            StringBuilder builder = new StringBuilder();
+            foreach (string key in query_phrases.keys)
+                builder.append_printf("%s ", query_phrases[key]);
+            
+            debug("\nMATCHES:\n%s\nPHRASES:%s\n", sql.str, builder.str);
+            
             Db.Statement stmt = cx.prepare(sql.str);
             sql_bind_query_phrases(stmt, 0, query_phrases);
             
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index 704d730..533209f 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -107,7 +107,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
             break;
             
             case 22:
-                post_rebuild_attachments();
+                post_upgrade_rebuild_attachments();
+            break;
+            
+            case 23:
+                post_upgrade_add_tokenizer_table();
             break;
         }
     }
@@ -407,7 +411,7 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
     }
     
     // Version 22
-    private void post_rebuild_attachments() {
+    private void post_upgrade_rebuild_attachments() {
         try {
             exec_transaction(Db.TransactionType.RW, (cx) => {
                 Db.Statement stmt = cx.prepare("""
@@ -471,6 +475,25 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
         }
     }
     
+    // Version 23
+    private void post_upgrade_add_tokenizer_table() {
+        try {
+            string stemmer = find_appropriate_search_stemmer();
+            debug("Creating tokenizer table using %s stemmer", stemmer);
+            
+            // This can't go in the .sql file because its schema (the stemmer
+            // algorithm) is determined at runtime.
+            exec("""
+                CREATE VIRTUAL TABLE TokenizerTable USING fts3tokenize(
+                    unicodesn,
+                    "stemmer=%s"
+                );
+            """.printf(stemmer));
+        } catch (Error e) {
+            error("Error creating tokenizer table: %s", e.message);
+        }
+    }
+    
     private void on_prepare_database_connection(Db.Connection cx) throws Error {
         cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
         cx.set_foreign_keys(true);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]