[geary/wip/720361-stemming] Optimizations to speed up stripping of "greedy" search results
- From: Jim Nelson <jnelson src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/wip/720361-stemming] Optimizations to speed up stripping of "greedy" search results
- Date: Tue, 16 Dec 2014 01:32:30 +0000 (UTC)
commit bbb0855bb35e58e2d6b67437a38a9bb7b85a5299
Author: Jim Nelson <jim yorba org>
Date: Mon Dec 15 17:32:16 2014 -0800
Optimizations to speed up stripping of "greedy" search results
src/engine/imap-db/imap-db-account.vala | 204 ++++++++++++++++++-------------
1 files changed, 121 insertions(+), 83 deletions(-)
---
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index 9a184d5..21b2c69 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -17,18 +17,6 @@ private class Geary.ImapDB.Account : BaseObject {
}
}
- private class SearchOffset {
- public int column; // Column in search table
- public int byte_offset; // Offset (in bytes) of search term in string
- public int size; // Size (in bytes) of the search term in string
-
- public SearchOffset(string[] offset_string) {
- column = int.parse(offset_string[0]);
- byte_offset = int.parse(offset_string[2]);
- size = int.parse(offset_string[3]);
- }
- }
-
public signal void email_sent(Geary.RFC822.Message rfc822);
// Only available when the Account is opened
@@ -1020,10 +1008,28 @@ private class Geary.ImapDB.Account : BaseObject {
if (query_phrases.size == 0)
return null;
- Gee.ArrayList<ImapDB.SearchEmailIdentifier> search_results
- = new Gee.ArrayList<ImapDB.SearchEmailIdentifier>();
-
+ // Do this outside of transaction to catch invalid search ids up-front
string? search_ids_sql = get_search_ids_sql(search_ids);
+
+ // for some searches, results are stripped if they're too "greedy", but this requires
+ // examining the matched text, which has an expense to fetch, so avoid doing so unless
+ // necessary
+ bool strip_results = true;
+
+ // HORIZON strategy is configured in such a way to allow all stemmed variants to match,
+ // so don't do any stripping in that case
+ //
+ // If any of the search terms is exact-match (no prefix matching) or none have stemmed
+ // variants, then don't do stripping of "greedy" stemmed matching (because in both cases,
+ // there are none)
+ if (query.strategy == Geary.SearchQuery.Strategy.HORIZON)
+ strip_results = false;
+ else if (traverse<SearchTerm>(query.get_all_terms()).any(term => term.stemmed == null ||
term.is_exact))
+ strip_results = false;
+
+ Gee.Set<ImapDB.EmailIdentifier> unstripped_ids = new Gee.HashSet<ImapDB.EmailIdentifier>();
+ Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>>? search_results = null;
+
yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
string blacklisted_ids_sql = do_get_blacklisted_message_ids_sql(
folder_blacklist, cx, cancellable);
@@ -1065,33 +1071,40 @@ private class Geary.ImapDB.Account : BaseObject {
stmt.bind_int(bind_index++, offset);
}
+ Gee.HashMap<int64?, ImapDB.EmailIdentifier> id_map = new Gee.HashMap<int64?,
ImapDB.EmailIdentifier>(
+ Collection.int64_hash_func, Collection.int64_equal_func);
+
Db.Result result = stmt.exec(cancellable);
while (!result.finished) {
int64 message_id = result.int64_at(0);
int64 internaldate_time_t = result.int64_at(1);
DateTime? internaldate = (internaldate_time_t == -1
? null : new DateTime.from_unix_local(internaldate_time_t));
- search_results.add(new ImapDB.SearchEmailIdentifier(message_id, internaldate));
+
+ ImapDB.EmailIdentifier id = new ImapDB.SearchEmailIdentifier(message_id, internaldate);
+
+ unstripped_ids.add(id);
+ id_map.set(message_id, id);
result.next(cancellable);
}
+ if (!strip_results)
+ return Db.TransactionOutcome.DONE;
+
+ search_results = do_get_search_matches(cx, query, id_map, cancellable);
+
return Db.TransactionOutcome.DONE;
}, cancellable);
- if (search_results.size == 0)
+ if (unstripped_ids == null || unstripped_ids.size == 0)
return null;
- // HORIZON strategy is configured in such a way to allow all stemmed variants to match,
- // so don't do any stripping in that case
- if (query.strategy == Geary.SearchQuery.Strategy.HORIZON)
- return search_results;
+ if (!strip_results)
+ return unstripped_ids;
- // if any of the search terms is exact-match (no prefix matching) or none have stemmed
- // variants, then don't do stripping of "greedy" stemmed matching (because in both cases,
- // there are none)
- if (traverse<SearchTerm>(query.get_all_terms()).any(term => term.stemmed == null || term.is_exact))
- return search_results;
+ // at this point, there should be some "full" search results to strip from
+ assert(search_results != null && search_results.size > 0);
//
// Strip out search results that only contain a hit due to "greedy" matching of the stemmed
@@ -1099,22 +1112,14 @@ private class Geary.ImapDB.Account : BaseObject {
//
int prestripped_results = search_results.size;
- Gee.Iterator<ImapDB.SearchEmailIdentifier> iter = search_results.iterator();
+ Gee.MapIterator<ImapDB.EmailIdentifier, Gee.Set<string>> iter = search_results.map_iterator();
while (iter.next()) {
- Gee.Collection<string>? matches = yield get_search_matches_async(query,
- iterate<ImapDB.EmailIdentifier>(iter.get()).to_array_list(), cancellable);
- if (matches == null || matches.size == 0) {
- iter.remove();
-
- continue;
- }
-
// For each matched string in this message, retain the message in the search results
// if it prefix-matches any of the straight-up parsed terms or matches a stemmed
// variant (with only max. difference in their lengths allowed, i.e. not a "greedy"
// match)
bool good_match_found = false;
- foreach (string match in matches) {
+ foreach (string match in iter.get_value()) {
foreach (SearchTerm term in query.get_all_terms()) {
// if prefix-matches parsed term, then don't strip
if (match.has_prefix(term.parsed)) {
@@ -1139,71 +1144,40 @@ private class Geary.ImapDB.Account : BaseObject {
}
if (!good_match_found)
- iter.remove();
+ iter.unset();
}
debug("Stripped %d emails from search for [%s] due to greedy stem matching",
prestripped_results - search_results.size, query.raw);
- return (search_results.size == 0 ? null : search_results);
+ return search_results.size == 0 ? null : search_results.keys;
}
- // See add_literal_matches() for explanation of include_literal_matches
public async Gee.Set<string>? get_search_matches_async(Geary.SearchQuery q,
Gee.Collection<ImapDB.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error {
check_open();
ImapDB.SearchQuery query = check_search_query(q);
- Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
- if (query_phrases.size == 0)
- return null;
-
- Gee.Set<string> search_matches = new Gee.HashSet<string>();
-
+ Gee.Set<string>? search_matches = null;
yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
- StringBuilder sql = new StringBuilder();
- sql.append("""
- SELECT offsets(MessageSearchTable), *
- FROM MessageSearchTable
- WHERE docid IN (
- """);
- sql_append_ids(sql,
- Geary.traverse<ImapDB.EmailIdentifier>(ids).map<int64?>(id =>
id.message_id).to_gee_iterable());
- sql.append(")");
- sql_add_query_phrases(sql, query_phrases);
-
- Db.Statement stmt = cx.prepare(sql.str);
- sql_bind_query_phrases(stmt, 0, query_phrases);
+ Gee.HashMap<int64?, ImapDB.EmailIdentifier> id_map = new Gee.HashMap<
+ int64?, ImapDB.EmailIdentifier>(Collection.int64_hash_func, Collection.int64_equal_func);
+ foreach (ImapDB.EmailIdentifier id in ids)
+ id_map.set(id.message_id, id);
+
+ Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>>? match_map =
+ do_get_search_matches(cx, query, id_map, cancellable);
+ if (match_map == null || match_map.size == 0)
+ return Db.TransactionOutcome.DONE;
- Db.Result result = stmt.exec(cancellable);
- while (!result.finished) {
- // Build a list of search offsets.
- string[] offset_array = result.nonnull_string_at(0).split(" ");
-
- Gee.ArrayList<SearchOffset> all_offsets = new Gee.ArrayList<SearchOffset>();
- int j = 0;
- while (true) {
- all_offsets.add(new SearchOffset(offset_array[j:j+4]));
-
- j += 4;
- if (j >= offset_array.length)
- break;
- }
-
- // Iterate over the offset list, scrape strings from the database, and push
- // the results into our return set.
- foreach(SearchOffset offset in all_offsets) {
- unowned string text = result.nonnull_string_at(offset.column + 1);
- search_matches.add(text[offset.byte_offset : offset.byte_offset + offset.size].down());
- }
-
- result.next(cancellable);
- }
+ search_matches = new Gee.HashSet<string>();
+ foreach (Gee.Set<string> matches in match_map.values)
+ search_matches.add_all(matches);
return Db.TransactionOutcome.DONE;
}, cancellable);
- return (search_matches.size == 0 ? null : search_matches);
+ return search_matches;
}
public async Geary.Email fetch_email_async(ImapDB.EmailIdentifier email_id,
@@ -1757,5 +1731,69 @@ private class Geary.ImapDB.Account : BaseObject {
unread_change.get(path));
}
}
+
+ // Not using a MultiMap because when traversing want to process all values at once per iteration,
+ // not per key-value
+ public Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>>? do_get_search_matches(Db.Connection cx,
+ ImapDB.SearchQuery query, Gee.Map<int64?, ImapDB.EmailIdentifier> id_map, Cancellable? cancellable)
+ throws Error {
+ if (id_map.size == 0)
+ return null;
+
+ Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
+ if (query_phrases.size == 0)
+ return null;
+
+ StringBuilder sql = new StringBuilder();
+ sql.append("""
+ SELECT docid, offsets(MessageSearchTable), *
+ FROM MessageSearchTable
+ WHERE docid IN (
+ """);
+ sql_append_ids(sql, id_map.keys);
+ sql.append(")");
+ sql_add_query_phrases(sql, query_phrases);
+
+ Db.Statement stmt = cx.prepare(sql.str);
+ sql_bind_query_phrases(stmt, 0, query_phrases);
+
+ Gee.Map<ImapDB.EmailIdentifier, Gee.Set<string>> search_matches = new Gee.HashMap<
+ ImapDB.EmailIdentifier, Gee.Set<string>>();
+
+ Db.Result result = stmt.exec(cancellable);
+ while (!result.finished) {
+ int64 docid = result.rowid_at(0);
+ assert(id_map.contains(docid));
+ ImapDB.EmailIdentifier id = id_map.get(docid);
+
+ // offsets() function returns a list of 4 strings that are ints indicating position
+ // and length of match string in search table corpus
+ string[] offset_array = result.nonnull_string_at(1).split(" ");
+
+ Gee.Set<string> matches = new Gee.HashSet<string>();
+
+ int j = 0;
+ while (true) {
+ unowned string[] offset_string = offset_array[j:j+4];
+
+ int column = int.parse(offset_string[0]);
+ int byte_offset = int.parse(offset_string[2]);
+ int size = int.parse(offset_string[3]);
+
+ unowned string text = result.nonnull_string_at(column + 2);
+ matches.add(text[byte_offset : byte_offset + size].down());
+
+ j += 4;
+ if (j >= offset_array.length)
+ break;
+ }
+
+ search_matches.set(id, matches);
+
+ result.next(cancellable);
+ }
+
+ return search_matches.size > 0 ? search_matches : null;
+ }
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]