[geary/wip/720361-stemming] Optimizations to speed up stripping of "greedy" search results



commit bbb0855bb35e58e2d6b67437a38a9bb7b85a5299
Author: Jim Nelson <jim yorba org>
Date:   Mon Dec 15 17:32:16 2014 -0800

    Optimizations to speed up stripping of "greedy" search results

 src/engine/imap-db/imap-db-account.vala |  204 ++++++++++++++++++-------------
 1 files changed, 121 insertions(+), 83 deletions(-)
---
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index 9a184d5..21b2c69 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -17,18 +17,6 @@ private class Geary.ImapDB.Account : BaseObject {
         }
     }
     
-    private class SearchOffset {
-        public int column;      // Column in search table
-        public int byte_offset; // Offset (in bytes) of search term in string
-        public int size;        // Size (in bytes) of the search term in string
-        
-        public SearchOffset(string[] offset_string) {
-            column = int.parse(offset_string[0]);
-            byte_offset = int.parse(offset_string[2]);
-            size = int.parse(offset_string[3]);
-        }
-    }
-    
     public signal void email_sent(Geary.RFC822.Message rfc822);
     
     // Only available when the Account is opened
@@ -1020,10 +1008,28 @@ private class Geary.ImapDB.Account : BaseObject {
         if (query_phrases.size == 0)
             return null;
         
-        Gee.ArrayList<ImapDB.SearchEmailIdentifier> search_results
-            = new Gee.ArrayList<ImapDB.SearchEmailIdentifier>();
-        
+        // Do this outside of transaction to catch invalid search ids up-front
         string? search_ids_sql = get_search_ids_sql(search_ids);
+        
+        // for some searches, results are stripped if they're too "greedy", but this requires
+        // examining the matched text, which has an expense to fetch, so avoid doing so unless
+        // necessary
+        bool strip_results = true;
+        
+        // HORIZON strategy is configured in such a way to allow all stemmed variants to match,
+        // so don't do any stripping in that case
+        //
+        // If any of the search terms is exact-match (no prefix matching) or none have stemmed
+        // variants, then don't do stripping of "greedy" stemmed matching (because in both cases,
+        // there are none)
+        if (query.strategy == Geary.SearchQuery.Strategy.HORIZON)
+            strip_results = false;
+        else if (traverse<SearchTerm>(query.get_all_terms()).any(term => term.stemmed == null || 
term.is_exact))
+            strip_results = false;
+        
+        Gee.Set<ImapDB.EmailIdentifier> unstripped_ids = new Gee.HashSet<ImapDB.EmailIdentifier>();
+        Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>>? search_results = null;
+        
         yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
             string blacklisted_ids_sql = do_get_blacklisted_message_ids_sql(
                 folder_blacklist, cx, cancellable);
@@ -1065,33 +1071,40 @@ private class Geary.ImapDB.Account : BaseObject {
                 stmt.bind_int(bind_index++, offset);
             }
             
+            Gee.HashMap<int64?, ImapDB.EmailIdentifier> id_map = new Gee.HashMap<int64?, 
ImapDB.EmailIdentifier>(
+                Collection.int64_hash_func, Collection.int64_equal_func);
+            
             Db.Result result = stmt.exec(cancellable);
             while (!result.finished) {
                 int64 message_id = result.int64_at(0);
                 int64 internaldate_time_t = result.int64_at(1);
                 DateTime? internaldate = (internaldate_time_t == -1
                     ? null : new DateTime.from_unix_local(internaldate_time_t));
-                search_results.add(new ImapDB.SearchEmailIdentifier(message_id, internaldate));
+                
+                ImapDB.EmailIdentifier id = new ImapDB.SearchEmailIdentifier(message_id, internaldate);
+                
+                unstripped_ids.add(id);
+                id_map.set(message_id, id);
                 
                 result.next(cancellable);
             }
             
+            if (!strip_results)
+                return Db.TransactionOutcome.DONE;
+            
+            search_results = do_get_search_matches(cx, query, id_map, cancellable);
+            
             return Db.TransactionOutcome.DONE;
         }, cancellable);
         
-        if (search_results.size == 0)
+        if (unstripped_ids == null || unstripped_ids.size == 0)
             return null;
         
-        // HORIZON strategy is configured in such a way to allow all stemmed variants to match,
-        // so don't do any stripping in that case
-        if (query.strategy == Geary.SearchQuery.Strategy.HORIZON)
-            return search_results;
+        if (!strip_results)
+            return unstripped_ids;
         
-        // if any of the search terms is exact-match (no prefix matching) or none have stemmed
-        // variants, then don't do stripping of "greedy" stemmed matching (because in both cases,
-        // there are none)
-        if (traverse<SearchTerm>(query.get_all_terms()).any(term => term.stemmed == null || term.is_exact))
-            return search_results;
+        // at this point, there should be some "full" search results to strip from
+        assert(search_results != null && search_results.size > 0);
         
         //
         // Strip out search results that only contain a hit due to "greedy" matching of the stemmed
@@ -1099,22 +1112,14 @@ private class Geary.ImapDB.Account : BaseObject {
         //
         
         int prestripped_results = search_results.size;
-        Gee.Iterator<ImapDB.SearchEmailIdentifier> iter = search_results.iterator();
+        Gee.MapIterator<ImapDB.EmailIdentifier, Gee.Set<string>> iter = search_results.map_iterator();
         while (iter.next()) {
-            Gee.Collection<string>? matches = yield get_search_matches_async(query,
-                iterate<ImapDB.EmailIdentifier>(iter.get()).to_array_list(), cancellable);
-            if (matches == null || matches.size == 0) {
-                iter.remove();
-                
-                continue;
-            }
-            
             // For each matched string in this message, retain the message in the search results
             // if it prefix-matches any of the straight-up parsed terms or matches a stemmed
             // variant (with only max. difference in their lengths allowed, i.e. not a "greedy"
             // match)
             bool good_match_found = false;
-            foreach (string match in matches) {
+            foreach (string match in iter.get_value()) {
                 foreach (SearchTerm term in query.get_all_terms()) {
                     // if prefix-matches parsed term, then don't strip
                     if (match.has_prefix(term.parsed)) {
@@ -1139,71 +1144,40 @@ private class Geary.ImapDB.Account : BaseObject {
             }
             
             if (!good_match_found)
-                iter.remove();
+                iter.unset();
         }
         
         debug("Stripped %d emails from search for [%s] due to greedy stem matching",
             prestripped_results - search_results.size, query.raw);
         
-        return (search_results.size == 0 ? null : search_results);
+        return search_results.size == 0 ? null : search_results.keys;
     }
     
-    // See add_literal_matches() for explanation of include_literal_matches
     public async Gee.Set<string>? get_search_matches_async(Geary.SearchQuery q,
         Gee.Collection<ImapDB.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error {
         check_open();
         ImapDB.SearchQuery query = check_search_query(q);
         
-        Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
-        if (query_phrases.size == 0)
-            return null;
-        
-        Gee.Set<string> search_matches = new Gee.HashSet<string>();
-        
+        Gee.Set<string>? search_matches = null;
         yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
-            StringBuilder sql = new StringBuilder();
-            sql.append("""
-                SELECT offsets(MessageSearchTable), *
-                FROM MessageSearchTable
-                WHERE docid IN (
-            """);
-            sql_append_ids(sql,
-                Geary.traverse<ImapDB.EmailIdentifier>(ids).map<int64?>(id => 
id.message_id).to_gee_iterable());
-            sql.append(")");
-            sql_add_query_phrases(sql, query_phrases);
-            
-            Db.Statement stmt = cx.prepare(sql.str);
-            sql_bind_query_phrases(stmt, 0, query_phrases);
+            Gee.HashMap<int64?, ImapDB.EmailIdentifier> id_map = new Gee.HashMap<
+                int64?, ImapDB.EmailIdentifier>(Collection.int64_hash_func, Collection.int64_equal_func);
+            foreach (ImapDB.EmailIdentifier id in ids)
+                id_map.set(id.message_id, id);
+            
+            Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>>? match_map =
+                do_get_search_matches(cx, query, id_map, cancellable);
+            if (match_map == null || match_map.size == 0)
+                return Db.TransactionOutcome.DONE;
             
-            Db.Result result = stmt.exec(cancellable);
-            while (!result.finished) {
-                // Build a list of search offsets.
-                string[] offset_array = result.nonnull_string_at(0).split(" ");
-                
-                Gee.ArrayList<SearchOffset> all_offsets = new Gee.ArrayList<SearchOffset>();
-                int j = 0;
-                while (true) {
-                    all_offsets.add(new SearchOffset(offset_array[j:j+4]));
-                    
-                    j += 4;
-                    if (j >= offset_array.length)
-                        break;
-                }
-                
-                // Iterate over the offset list, scrape strings from the database, and push
-                // the results into our return set.
-                foreach(SearchOffset offset in all_offsets) {
-                    unowned string text = result.nonnull_string_at(offset.column + 1);
-                    search_matches.add(text[offset.byte_offset : offset.byte_offset + offset.size].down());
-                }
-                
-                result.next(cancellable);
-            }
+            search_matches = new Gee.HashSet<string>();
+            foreach (Gee.Set<string> matches in match_map.values)
+                search_matches.add_all(matches);
             
             return Db.TransactionOutcome.DONE;
         }, cancellable);
         
-        return (search_matches.size == 0 ? null : search_matches);
+        return search_matches;
     }
     
     public async Geary.Email fetch_email_async(ImapDB.EmailIdentifier email_id,
@@ -1757,5 +1731,69 @@ private class Geary.ImapDB.Account : BaseObject {
                 unread_change.get(path));
         }
     }
+    
+    // Not using a MultiMap because when traversing want to process all values at once per iteration,
+    // not per key-value
+    public Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>>? do_get_search_matches(Db.Connection cx,
+        ImapDB.SearchQuery query, Gee.Map<int64?, ImapDB.EmailIdentifier> id_map, Cancellable? cancellable)
+        throws Error {
+        if (id_map.size == 0)
+            return null;
+        
+        Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
+        if (query_phrases.size == 0)
+            return null;
+        
+        StringBuilder sql = new StringBuilder();
+        sql.append("""
+            SELECT docid, offsets(MessageSearchTable), *
+            FROM MessageSearchTable
+            WHERE docid IN (
+        """);
+        sql_append_ids(sql, id_map.keys);
+        sql.append(")");
+        sql_add_query_phrases(sql, query_phrases);
+        
+        Db.Statement stmt = cx.prepare(sql.str);
+        sql_bind_query_phrases(stmt, 0, query_phrases);
+        
+        Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>> search_matches = new Gee.HashMap<
+            ImapDB.EmailIdentifier, Gee.Set<string>>();
+        
+        Db.Result result = stmt.exec(cancellable);
+        while (!result.finished) {
+            int64 docid = result.rowid_at(0);
+            assert(id_map.contains(docid));
+            ImapDB.EmailIdentifier id = id_map.get(docid);
+            
+            // offsets() function returns a list of 4 strings that are ints indicating position
+            // and length of match string in search table corpus
+            string[] offset_array = result.nonnull_string_at(1).split(" ");
+            
+            Gee.Set<string> matches = new Gee.HashSet<string>();
+            
+            int j = 0;
+            while (true) {
+                unowned string[] offset_string = offset_array[j:j+4];
+                
+                int column = int.parse(offset_string[0]);
+                int byte_offset = int.parse(offset_string[2]);
+                int size = int.parse(offset_string[3]);
+                
+                unowned string text = result.nonnull_string_at(column + 2);
+                matches.add(text[byte_offset : byte_offset + size].down());
+                
+                j += 4;
+                if (j >= offset_array.length)
+                    break;
+            }
+            
+            search_matches.set(id, matches);
+            
+            result.next(cancellable);
+        }
+        
+        return search_matches.size > 0 ? search_matches : null;
+    }
 }
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]