[geary/wip/search-fixes: 3/14] Make ImapDb.SearchQuery prepare terms at construction time
- From: Michael Gratton <mjog src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/wip/search-fixes: 3/14] Make ImapDb.SearchQuery prepare terms at construction time
- Date: Tue, 13 Aug 2019 21:39:48 +0000 (UTC)
commit 0be19a1c3e4cac9b61bd5b1a99aa1ab005a999bf
Author: Michael Gratton <mike vee net>
Date: Sun Aug 4 19:38:10 2019 +1000
Make ImapDb.SearchQuery prepare terms at construction time
This allows us to move all search query specific code from
ImapDb.Account to that class.
src/engine/imap-db/imap-db-account.vala | 542 +--------------------
.../imap-db/search/imap-db-search-query.vala | 540 +++++++++++++++++++-
2 files changed, 535 insertions(+), 547 deletions(-)
---
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index 5b708d46..44cc0894 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -8,37 +8,6 @@
private class Geary.ImapDB.Account : BaseObject {
- // These characters are chosen for being commonly used to continue a single word (such as
- // extended last names, i.e. "Lars-Eric") or in terms commonly searched for in an email client,
- // i.e. unadorned mailbox addresses. Note that characters commonly used for wildcards or that
- // would be interpreted as wildcards by SQLite are not included here.
- private const unichar[] SEARCH_TERM_CONTINUATION_CHARS = { '-', '_', '.', '@' };
-
- // Search operator field names, eg: "to:foo example com" or "is:unread"
- private const string SEARCH_OP_ATTACHMENT = "attachment";
- private const string SEARCH_OP_BCC = "bcc";
- private const string SEARCH_OP_BODY = "body";
- private const string SEARCH_OP_CC = "cc";
- private const string SEARCH_OP_FROM = "from_field";
- private const string SEARCH_OP_IS = "is";
- private const string SEARCH_OP_SUBJECT = "subject";
- private const string SEARCH_OP_TO = "receivers";
-
- // Operators allowing finding mail addressed to "me"
- private const string[] SEARCH_OP_TO_ME_FIELDS = {
- SEARCH_OP_BCC,
- SEARCH_OP_CC,
- SEARCH_OP_TO,
- };
-
- // The addressable op value for "me"
- private const string SEARCH_OP_ADDRESSABLE_VALUE_ME = "me";
-
- // Search operator field values
- private const string SEARCH_OP_VALUE_READ = "read";
- private const string SEARCH_OP_VALUE_STARRED = "starred";
- private const string SEARCH_OP_VALUE_UNREAD = "unread";
-
// Storage path names
private const string DB_FILENAME = "geary.db";
private const string ATTACHMENTS_DIR = "attachments";
@@ -55,18 +24,6 @@ private class Geary.ImapDB.Account : BaseObject {
}
- // Maps of localised search operator names and values to their
- // internal forms
- private static Gee.HashMap<string, string> search_op_names =
- new Gee.HashMap<string, string>();
- private static Gee.ArrayList<string> search_op_to_me_values =
- new Gee.ArrayList<string>();
- private static Gee.ArrayList<string> search_op_from_me_values =
- new Gee.ArrayList<string>();
- private static Gee.HashMap<string, string> search_op_is_values =
- new Gee.HashMap<string, string>();
-
-
/**
* The root path for all remote IMAP folders.
*
@@ -91,162 +48,15 @@ private class Geary.ImapDB.Account : BaseObject {
/** The backing database for the account. */
public ImapDB.Database db { get; private set; }
+ internal AccountInformation account_information { get; private set; }
+
private string name;
- private AccountInformation account_information;
private GLib.File db_file;
private GLib.File attachments_dir;
private Gee.HashMap<Geary.FolderPath, FolderReference> folder_refs =
new Gee.HashMap<Geary.FolderPath, FolderReference>();
private Cancellable? background_cancellable = null;
- static construct {
- // Map of possibly translated search operator names and values
- // to English/internal names and values. We include the
- // English version anyway so that when translations provide a
- // localised version of the operator names but have not also
- // translated the user manual, the English version in the
- // manual still works.
-
- // Can be typed in the search box like "attachment:file.txt"
- // to find messages with attachments with a particular name.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "attachment"), SEARCH_OP_ATTACHMENT);
- // Can be typed in the search box like
- // "bcc:johndoe example com" to find messages bcc'd to a
- // particular person.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "bcc"), SEARCH_OP_BCC);
- // Can be typed in the search box like "body:word" to find
- // "word" only if it occurs in the body of a message.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "body"), SEARCH_OP_BODY);
- // Can be typed in the search box like
- // "cc:johndoe example com" to find messages cc'd to a
- // particular person.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "cc"), SEARCH_OP_CC);
- // Can be typed in the search box like
- // "from:johndoe example com" to find messages from a
- // particular sender.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "from"), SEARCH_OP_FROM);
- // Can be typed in the search box like "is:unread" to find
- // messages that are read, unread, or starred.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "is"), SEARCH_OP_IS);
- // Can be typed in the search box like "subject:word" to find
- // "word" only if it occurs in the subject of a message.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary
- // User Guide.
- search_op_names.set(C_("Search operator", "subject"), SEARCH_OP_SUBJECT);
- // Can be typed in the search box like
- // "to:johndoe example com" to find messages received by a
- // particular person.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_names.set(C_("Search operator", "to"), SEARCH_OP_TO);
-
- // And the English language versions
- search_op_names.set("attachment", SEARCH_OP_ATTACHMENT);
- search_op_names.set("bcc", SEARCH_OP_BCC);
- search_op_names.set("body", SEARCH_OP_BODY);
- search_op_names.set("cc", SEARCH_OP_CC);
- search_op_names.set("from", SEARCH_OP_FROM);
- search_op_names.set("is", SEARCH_OP_IS);
- search_op_names.set("subject", SEARCH_OP_SUBJECT);
- search_op_names.set("to", SEARCH_OP_TO);
-
- // Can be typed in the search box after "to:", "cc:" and
- // "bcc:" e.g.: "to:me". Matches conversations that are
- // addressed to the user.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_to_me_values.add(
- C_("Search operator value - mail addressed to the user", "me")
- );
- search_op_to_me_values.add(SEARCH_OP_ADDRESSABLE_VALUE_ME);
-
- // Can be typed in the search box after "from:" i.e.:
- // "from:me". Matches conversations were sent by the user.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_from_me_values.add(
- C_("Search operator value - mail sent by the user", "me")
- );
- search_op_from_me_values.add(SEARCH_OP_ADDRESSABLE_VALUE_ME);
-
- // Can be typed in the search box after "is:" i.e.:
- // "is:read". Matches conversations that are flagged as read.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_is_values.set(
- C_("'is:' search operator value", "read"), SEARCH_OP_VALUE_READ
- );
- // Can be typed in the search box after "is:" i.e.:
- // "is:starred". Matches conversations that are flagged as
- // starred.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_is_values.set(
- C_("'is:' search operator value", "starred"), SEARCH_OP_VALUE_STARRED
- );
- // Can be typed in the search box after "is:" i.e.:
- // "is:unread". Matches conversations that are flagged unread.
- //
- // The translated string must be a single word (use '-', '_'
- // or similar to combine words into one), should be short, and
- // also match the translation in "search.page" of the Geary User
- // Guide.
- search_op_is_values.set(
- C_("'is:' search operator value", "unread"), SEARCH_OP_VALUE_UNREAD
- );
- search_op_is_values.set(SEARCH_OP_VALUE_READ, SEARCH_OP_VALUE_READ);
- search_op_is_values.set(SEARCH_OP_VALUE_STARRED, SEARCH_OP_VALUE_STARRED);
- search_op_is_values.set(SEARCH_OP_VALUE_UNREAD, SEARCH_OP_VALUE_UNREAD);
- }
-
public Account(AccountInformation config,
GLib.File data_dir,
GLib.File schema_dir) {
@@ -264,14 +74,6 @@ private class Geary.ImapDB.Account : BaseObject {
);
}
- private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error {
- ImapDB.SearchQuery? query = q as ImapDB.SearchQuery;
- if (query == null || query.account != this)
- throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name);
-
- return query;
- }
-
public async void open_async(GLib.Cancellable? cancellable)
throws GLib.Error {
if (this.db.is_open) {
@@ -677,288 +479,6 @@ private class Geary.ImapDB.Account : BaseObject {
return (messages.size == 0 ? null : messages);
}
- private string? extract_field_from_token(string[] parts, ref string token) {
- string? field = null;
- if (Geary.String.is_empty_or_whitespace(parts[1])) {
- // User stopped at "field:", treat it as if they hadn't
- // typed the ':'
- token = parts[0];
- } else {
- field = search_op_names.get(parts[0].down());
- if (field == SEARCH_OP_IS) {
- string? value = search_op_is_values.get(parts[1].down());
- if (value != null) {
- token = value;
- } else {
- // Unknown op value, pretend there is no search op
- field = null;
- }
- } else if (field == SEARCH_OP_FROM &&
- parts[1].down() in search_op_from_me_values) {
- // Search for all addresses on the account. Bug 768779
- token = account_information.primary_mailbox.address;
- } else if (field in SEARCH_OP_TO_ME_FIELDS &&
- parts[1].down() in search_op_to_me_values) {
- // Search for all addresses on the account. Bug 768779
- token = account_information.primary_mailbox.address;
- } else if (field != null) {
- token = parts[1];
- }
- }
- return field;
- }
-
- /**
- * This method is used to convert an unquoted user-entered search terms into a stemmed search
- * term.
- *
- * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
- * tastes when coupled with prefix-matching of all unquoted terms (see
- * https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy
- * designed to dampen that aggressiveness without losing the benefits of stemming entirely.
- *
- * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
- * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
- * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
- * is still installed in Geary. Thus, we are now in the position to search for the original
- * term and its stemmed variant, then do post-search processing to strip results which are
- * too "greedy" due to prefix-matching the stemmed variant.
- *
- * Some heuristics are in place simply to determine if stemming should occur:
- *
- * # If stemming is unallowed, no stemming occurs.
- * # If the term is < min. term length for stemming, no stemming occurs.
- * # If the stemmer returns a stem that is the same as the original term, no stemming occurs.
- * # If the difference between the stemmed word and the original term is more than
- * maximum allowed, no stemming occurs. This works under the assumption that if
- * the user has typed a long word, they do not want to "go back" to searching for a much
- * shorter version of it. (For example, "accountancies" stems to "account").
- *
- * Otherwise, the stem for the term is returned.
- */
- private string? stem_search_term(ImapDB.SearchQuery query, string term) {
- if (!query.allow_stemming)
- return null;
-
- int term_length = term.length;
- if (term_length < query.min_term_length_for_stemming)
- return null;
-
- string? stemmed = null;
- try {
- Db.Statement stmt = db.prepare("""
- SELECT token
- FROM TokenizerTable
- WHERE input=?
- """);
- stmt.bind_string(0, term);
-
- // get stemmed string; if no result, fall through
- Db.Result result = stmt.exec();
- if (!result.finished)
- stemmed = result.string_at(0);
- else
- debug("No stemmed term returned for \"%s\"", term);
- } catch (Error err) {
- debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
-
- // fall-through
- }
-
- if (String.is_empty(stemmed)) {
- debug("Empty stemmed term returned for \"%s\"", term);
-
- return null;
- }
-
- // If same term returned, treat as non-stemmed
- if (stemmed == term)
- return null;
-
- // Don't search for stemmed words that are significantly shorter than the user's search term
- if (term_length - stemmed.length > query.max_difference_term_stem_lengths) {
- debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms",
- stemmed, term);
-
- return null;
- }
-
- debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
-
- return stemmed;
- }
-
- private void prepare_search_query(ImapDB.SearchQuery query) {
- if (query.parsed)
- return;
-
- // A few goals here:
- // 1) Append an * after every term so it becomes a prefix search
- // (see <https://www.sqlite.org/fts3.html#section_3>)
- // 2) Strip out common words/operators that might get interpreted as
- // search operators
- // 3) Parse each word into a list of which field it applies to, so
- // you can do "to:johndoe example com thing" (quotes excluded)
- // to find messages to John containing the word thing
- // We ignore everything inside quotes to give the user a way to
- // override our algorithm here. The idea is to offer one search query
- // syntax for Geary that we can use locally and via IMAP, etc.
-
- string quote_balanced = query.raw;
- if (Geary.String.count_char(query.raw, '"') % 2 != 0) {
- // Remove the last quote if it's not balanced. This has the
- // benefit of showing decent results as you type a quoted phrase.
- int last_quote = query.raw.last_index_of_char('"');
- assert(last_quote >= 0);
- quote_balanced = query.raw.splice(last_quote, last_quote + 1, " ");
- }
-
- string[] words = quote_balanced.split_set(" \t\r\n()%*\\");
- bool in_quote = false;
- foreach (string s in words) {
- string? field = null;
-
- s = s.strip();
-
- int quotes = Geary.String.count_char(s, '"');
- if (!in_quote && quotes > 0) {
- in_quote = true;
- --quotes;
- }
-
- SearchTerm? term;
- if (in_quote) {
- // HACK: this helps prevent a syntax error when the user types
- // something like from:"somebody". If we ever properly support
- // quotes after : we can get rid of this.
- term = new SearchTerm(s, s, null, s.replace(":", " "), null);
- } else {
- string original = s;
-
- // Some common search phrases we don't respect and
- // therefore don't want to fall through to search
- // results
- // XXX translate these
- string lower = s.down();
- switch (lower) {
- case "":
- case "and":
- case "or":
- case "not":
- case "near":
- continue;
-
- default:
- if (lower.has_prefix("near/"))
- continue;
- break;
- }
-
- if (s.has_prefix("-"))
- s = s.substring(1);
-
- if (s == "")
- continue;
-
- // TODO: support quotes after :
- string[] parts = s.split(":", 2);
- if (parts.length > 1)
- field = extract_field_from_token(parts, ref s);
-
- if (field == SEARCH_OP_IS) {
- // s will have been de-translated
- term = new SearchTerm(original, s, null, null, null);
- } else {
- // SQL MATCH syntax for parsed term
- string? sql_s = "%s*".printf(s);
-
- // stem the word, but if stemmed and stem is
- // simply shorter version of original term, only
- // prefix-match search for it (i.e. avoid
- // searching for [archive* OR archiv*] when that's
- // the same as [archiv*]), otherwise search for
- // both
- string? stemmed = stem_search_term(query, s);
-
- string? sql_stemmed = null;
- if (stemmed != null) {
- sql_stemmed = "%s*".printf(stemmed);
- if (s.has_prefix(stemmed))
- sql_s = null;
- }
-
- // if term contains continuation characters, treat
- // as exact search to reduce effects of tokenizer
- // splitting terms w/ punctuation in them
- if (String.contains_any_char(s, SEARCH_TERM_CONTINUATION_CHARS))
- s = "\"%s\"".printf(s);
-
- term = new SearchTerm(original, s, stemmed, sql_s, sql_stemmed);
- }
- }
-
- if (in_quote && quotes % 2 != 0)
- in_quote = false;
-
- query.add_search_term(field, term);
- }
-
- assert(!in_quote);
-
- query.parsed = true;
- }
-
- // Return a map of column -> phrase, to use as WHERE column MATCH 'phrase'.
- private Gee.HashMap<string, string> get_query_phrases(ImapDB.SearchQuery query) {
- prepare_search_query(query);
-
- Gee.HashMap<string, string> phrases = new Gee.HashMap<string, string>();
- foreach (string? field in query.get_fields()) {
- Gee.List<SearchTerm>? terms = query.get_search_terms(field);
- if (terms == null || terms.size == 0 || field == "is")
- continue;
-
- // Each SearchTerm is an AND but the SQL text within in are OR ... this allows for
- // each user term to be AND but the variants of each term are or. So, if terms are
- // [party] and [eventful] and stems are [parti] and [event], the search would be:
- //
- // (party* OR parti*) AND (eventful* OR event*)
- //
- // Obviously with stemming there's the possibility of the stemmed variant being nothing
- // but a broader search of the original term (such as event* and eventful*) but do both
- // to determine from each hit result which term caused the hit, and if it's too greedy
- // a match of the stemmed variant, it can be stripped from the results.
- //
- // Note that this uses SQLite's "standard" query syntax for MATCH, where AND is implied
- // (and would be treated as search term if included), parentheses are not allowed, and
- // OR has a higher precedence than AND. So the above example in standard syntax is:
- //
- // party* OR parti* eventful* OR event*
- StringBuilder builder = new StringBuilder();
- foreach (SearchTerm term in terms) {
- if (term.sql.size == 0)
- continue;
-
- if (term.is_exact) {
- builder.append_printf("%s ", term.parsed);
- } else {
- bool is_first_sql = true;
- foreach (string sql in term.sql) {
- if (!is_first_sql)
- builder.append(" OR ");
-
- builder.append_printf("%s ", sql);
- is_first_sql = false;
- }
- }
- }
-
- phrases.set(field ?? "MessageSearchTable", builder.str);
- }
-
- return phrases;
- }
-
private void sql_add_query_phrases(StringBuilder sql, Gee.HashMap<string, string> query_phrases,
string operator, string columns, string condition) {
bool is_first_field = true;
@@ -1034,8 +554,8 @@ private class Geary.ImapDB.Account : BaseObject {
check_open();
ImapDB.SearchQuery query = check_search_query(q);
- Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
- Gee.Map<Geary.NamedFlag, bool> removal_conditions = get_removal_conditions(query);
+ Gee.HashMap<string, string> query_phrases = query.get_query_phrases();
+ Gee.Map<Geary.NamedFlag, bool> removal_conditions = query.get_removal_conditions();
if (query_phrases.size == 0 && removal_conditions.is_empty)
return null;
@@ -1059,7 +579,7 @@ private class Geary.ImapDB.Account : BaseObject {
// Do this outside of transaction to catch invalid search ids up-front
string? search_ids_sql = get_search_ids_sql(search_ids);
- bool strip_greedy = should_strip_greedy_results(query);
+ bool strip_greedy = query.should_strip_greedy_results();
Gee.Set<EmailIdentifier> matching_ids = new Gee.HashSet<EmailIdentifier>();
Gee.Map<EmailIdentifier,Gee.Set<string>>? search_matches = null;
@@ -1152,23 +672,6 @@ private class Geary.ImapDB.Account : BaseObject {
return matching_ids.is_empty ? null : matching_ids;
}
- private Gee.Map<Geary.NamedFlag, bool> get_removal_conditions(ImapDB.SearchQuery query) {
- Gee.Map<Geary.NamedFlag, bool> removal_conditions = new Gee.HashMap<Geary.NamedFlag, bool>();
- foreach (string? field in query.get_fields())
- if (field == SEARCH_OP_IS) {
- Gee.List<SearchTerm>? terms = query.get_search_terms(field);
- foreach (SearchTerm term in terms)
- if (term.parsed == SEARCH_OP_VALUE_READ)
- removal_conditions.set(new NamedFlag("UNREAD"), true);
- else if (term.parsed == SEARCH_OP_VALUE_UNREAD)
- removal_conditions.set(new NamedFlag("UNREAD"), false);
- else if (term.parsed == SEARCH_OP_VALUE_STARRED)
- removal_conditions.set(new NamedFlag("FLAGGED"), false);
- return removal_conditions;
- }
- return removal_conditions;
- }
-
// Strip out from the given collection any email that matches the
// given removal conditions
private async void strip_removal_conditions(ImapDB.SearchQuery query,
@@ -1192,29 +695,6 @@ private class Geary.ImapDB.Account : BaseObject {
}
}
- // For some searches, results are stripped if they're too
- // "greedy", but this requires examining the matched text, which
- // has an expense to fetch, so avoid doing so unless necessary
- private bool should_strip_greedy_results(SearchQuery query) {
- // HORIZON strategy is configured in such a way to allow all
- // stemmed variants to match, so don't do any stripping in
- // that case
- //
- // If any of the search terms is exact-match (no prefix
- // matching) or none have stemmed variants, then don't do
- // stripping of "greedy" stemmed matching (because in both
- // cases, there are none)
-
- bool strip_results = true;
- if (query.strategy == Geary.SearchQuery.Strategy.HORIZON)
- strip_results = false;
- else if (traverse<SearchTerm>(query.get_all_terms()).any(
- term => term.stemmed == null || term.is_exact)) {
- strip_results = false;
- }
- return strip_results;
- }
-
// Strip out from the given collection of matching ids and results
// for any search results that only contain a hit due to "greedy"
// matching of the stemmed variants on all search terms.
@@ -1284,7 +764,7 @@ private class Geary.ImapDB.Account : BaseObject {
if (match_map == null || match_map.size == 0)
return Db.TransactionOutcome.DONE;
- if (should_strip_greedy_results(query)) {
+ if (query.should_strip_greedy_results()) {
strip_greedy_results(query, ids, match_map);
}
@@ -1792,7 +1272,7 @@ private class Geary.ImapDB.Account : BaseObject {
if (id_map.size == 0)
return null;
- Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
+ Gee.HashMap<string, string> query_phrases = query.get_query_phrases();
if (query_phrases.size == 0)
return null;
@@ -1898,4 +1378,12 @@ private class Geary.ImapDB.Account : BaseObject {
}
}
+ private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error {
+ ImapDB.SearchQuery? query = q as ImapDB.SearchQuery;
+ if (query == null || query.account != this)
+ throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name);
+
+ return query;
+ }
+
}
diff --git a/src/engine/imap-db/search/imap-db-search-query.vala b/src/engine/imap-db/search/imap-db-search-query.vala
index 5e43eac4..543eec18 100644
--- a/src/engine/imap-db/search/imap-db-search-query.vala
+++ b/src/engine/imap-db/search/imap-db-search-query.vala
@@ -1,8 +1,9 @@
-/* Copyright 2016 Software Freedom Conservancy Inc.
+/*
* Copyright 2016 Software Freedom Conservancy Inc.
+ * Copyright 2019 Michael Gratton <mike vee net>.
*
* This software is licensed under the GNU Lesser General Public License
- * (version 2.1 or later). See the COPYING file in this distribution.
+ * (version 2.1 or later). See the COPYING file in this distribution.
*/
/**
@@ -10,17 +11,204 @@
*/
private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
+ // These characters are chosen for being commonly used to continue a single word (such as
+ // extended last names, i.e. "Lars-Eric") or in terms commonly searched for in an email client,
+ // i.e. unadorned mailbox addresses. Note that characters commonly used for wildcards or that
+ // would be interpreted as wildcards by SQLite are not included here.
+ private const unichar[] SEARCH_TERM_CONTINUATION_CHARS = { '-', '_', '.', '@' };
+
+ // Search operator field names, eg: "to:foo example com" or "is:unread"
+ private const string SEARCH_OP_ATTACHMENT = "attachment";
+ private const string SEARCH_OP_BCC = "bcc";
+ private const string SEARCH_OP_BODY = "body";
+ private const string SEARCH_OP_CC = "cc";
+ private const string SEARCH_OP_FROM = "from_field";
+ private const string SEARCH_OP_IS = "is";
+ private const string SEARCH_OP_SUBJECT = "subject";
+ private const string SEARCH_OP_TO = "receivers";
+
+ // Operators allowing finding mail addressed to "me"
+ private const string[] SEARCH_OP_TO_ME_FIELDS = {
+ SEARCH_OP_BCC,
+ SEARCH_OP_CC,
+ SEARCH_OP_TO,
+ };
+
+ // The addressable op value for "me"
+ private const string SEARCH_OP_ADDRESSABLE_VALUE_ME = "me";
+
+ // Search operator field values
+ private const string SEARCH_OP_VALUE_READ = "read";
+ private const string SEARCH_OP_VALUE_STARRED = "starred";
+ private const string SEARCH_OP_VALUE_UNREAD = "unread";
+
+
+ // Maps of localised search operator names and values to their
+ // internal forms
+ private static Gee.HashMap<string, string> search_op_names =
+ new Gee.HashMap<string, string>();
+ private static Gee.ArrayList<string> search_op_to_me_values =
+ new Gee.ArrayList<string>();
+ private static Gee.ArrayList<string> search_op_from_me_values =
+ new Gee.ArrayList<string>();
+ private static Gee.HashMap<string, string> search_op_is_values =
+ new Gee.HashMap<string, string>();
+
+
+ static construct {
+ // Map of possibly translated search operator names and values
+ // to English/internal names and values. We include the
+ // English version anyway so that when translations provide a
+ // localised version of the operator names but have not also
+ // translated the user manual, the English version in the
+ // manual still works.
+
+ // Can be typed in the search box like "attachment:file.txt"
+ // to find messages with attachments with a particular name.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "attachment"), SEARCH_OP_ATTACHMENT);
+ // Can be typed in the search box like
+ // "bcc:johndoe example com" to find messages bcc'd to a
+ // particular person.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "bcc"), SEARCH_OP_BCC);
+ // Can be typed in the search box like "body:word" to find
+ // "word" only if it occurs in the body of a message.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "body"), SEARCH_OP_BODY);
+ // Can be typed in the search box like
+ // "cc:johndoe example com" to find messages cc'd to a
+ // particular person.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "cc"), SEARCH_OP_CC);
+ // Can be typed in the search box like
+ // "from:johndoe example com" to find messages from a
+ // particular sender.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "from"), SEARCH_OP_FROM);
+ // Can be typed in the search box like "is:unread" to find
+ // messages that are read, unread, or starred.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "is"), SEARCH_OP_IS);
+ // Can be typed in the search box like "subject:word" to find
+ // "word" only if it occurs in the subject of a message.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary
+ // User Guide.
+ search_op_names.set(C_("Search operator", "subject"), SEARCH_OP_SUBJECT);
+ // Can be typed in the search box like
+ // "to:johndoe example com" to find messages received by a
+ // particular person.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_names.set(C_("Search operator", "to"), SEARCH_OP_TO);
+
+ // And the English language versions
+ search_op_names.set("attachment", SEARCH_OP_ATTACHMENT);
+ search_op_names.set("bcc", SEARCH_OP_BCC);
+ search_op_names.set("body", SEARCH_OP_BODY);
+ search_op_names.set("cc", SEARCH_OP_CC);
+ search_op_names.set("from", SEARCH_OP_FROM);
+ search_op_names.set("is", SEARCH_OP_IS);
+ search_op_names.set("subject", SEARCH_OP_SUBJECT);
+ search_op_names.set("to", SEARCH_OP_TO);
+
+ // Can be typed in the search box after "to:", "cc:" and
+ // "bcc:" e.g.: "to:me". Matches conversations that are
+ // addressed to the user.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_to_me_values.add(
+ C_("Search operator value - mail addressed to the user", "me")
+ );
+ search_op_to_me_values.add(SEARCH_OP_ADDRESSABLE_VALUE_ME);
+
+ // Can be typed in the search box after "from:" i.e.:
+ // "from:me". Matches conversations that were sent by the user.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_from_me_values.add(
+ C_("Search operator value - mail sent by the user", "me")
+ );
+ search_op_from_me_values.add(SEARCH_OP_ADDRESSABLE_VALUE_ME);
+
+ // Can be typed in the search box after "is:" i.e.:
+ // "is:read". Matches conversations that are flagged as read.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_is_values.set(
+ C_("'is:' search operator value", "read"), SEARCH_OP_VALUE_READ
+ );
+ // Can be typed in the search box after "is:" i.e.:
+ // "is:starred". Matches conversations that are flagged as
+ // starred.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_is_values.set(
+ C_("'is:' search operator value", "starred"), SEARCH_OP_VALUE_STARRED
+ );
+ // Can be typed in the search box after "is:" i.e.:
+ // "is:unread". Matches conversations that are flagged unread.
+ //
+ // The translated string must be a single word (use '-', '_'
+ // or similar to combine words into one), should be short, and
+ // also match the translation in "search.page" of the Geary User
+ // Guide.
+ search_op_is_values.set(
+ C_("'is:' search operator value", "unread"), SEARCH_OP_VALUE_UNREAD
+ );
+ search_op_is_values.set(SEARCH_OP_VALUE_READ, SEARCH_OP_VALUE_READ);
+ search_op_is_values.set(SEARCH_OP_VALUE_STARRED, SEARCH_OP_VALUE_STARRED);
+ search_op_is_values.set(SEARCH_OP_VALUE_UNREAD, SEARCH_OP_VALUE_UNREAD);
+ }
+
+
/**
* Associated {@link ImapDB.Account}.
*/
public weak ImapDB.Account account { get; private set; }
- /**
- * Whether or not the query has been parsed and processed prior to
- * search submission.
- */
- public bool parsed { get; set; default = false; }
-
/**
* Returns whether stemming may be used when exerting the search.
*
@@ -73,7 +261,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
// A list of all search terms, regardless of search op field name
private Gee.ArrayList<SearchTerm> all = new Gee.ArrayList<SearchTerm>();
- public SearchQuery(ImapDB.Account account, string query, Geary.SearchQuery.Strategy strategy) {
+ public SearchQuery(ImapDB.Account account,
+ string query,
+ Geary.SearchQuery.Strategy strategy) {
base (query, strategy);
this.account = account;
@@ -106,18 +296,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
max_difference_term_stem_lengths = int.MAX;
max_difference_match_stem_lengths = int.MAX;
break;
-
- default:
- assert_not_reached();
}
- }
-
- public void add_search_term(string? field, SearchTerm term) {
- if (!field_map.has_key(field))
- field_map.set(field, new Gee.ArrayList<SearchTerm>());
- field_map.get(field).add(term);
- all.add(term);
+ prepare();
}
public Gee.Collection<string?> get_fields() {
@@ -131,5 +312,324 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
public Gee.List<SearchTerm>? get_all_terms() {
return all;
}
-}
+    // Decides whether search results need post-filtering to drop
+    // matches that are too "greedy". That filtering has to examine
+    // the matched text, which is expensive to fetch, so it is only
+    // requested when it can have an effect.
+    internal bool should_strip_greedy_results() {
+        // The HORIZON strategy is configured to let every stemmed
+        // variant match, so nothing is stripped for it
+        if (this.strategy == Geary.SearchQuery.Strategy.HORIZON) {
+            return false;
+        }
+
+        // Stripping is also skipped as soon as one term either has
+        // no stemmed variant or is an exact (non-prefix) match
+        bool has_unstemmed_or_exact_term = traverse<SearchTerm>(this.all).any(
+            term => term.stemmed == null || term.is_exact
+        );
+        return !has_unstemmed_or_exact_term;
+    }
+
+    // Translates this query's "is:" terms into a flag -> bool map:
+    //
+    //   is:read    => UNREAD  -> true
+    //   is:unread  => UNREAD  -> false
+    //   is:starred => FLAGGED -> false
+    //
+    // NOTE(review): going by the method name, a message whose flag
+    // state equals the mapped value is presumably removed from the
+    // results -- confirm against the caller.
+    internal Gee.Map<Geary.NamedFlag,bool> get_removal_conditions() {
+        var conditions = new Gee.HashMap<Geary.NamedFlag,bool>();
+
+        // Only "is:" terms contribute; nothing to do without them
+        if (!this.field_map.has_key(SEARCH_OP_IS)) {
+            return conditions;
+        }
+
+        Gee.List<SearchTerm>? is_terms = get_search_terms(SEARCH_OP_IS);
+        foreach (SearchTerm is_term in is_terms) {
+            string op_value = is_term.parsed;
+            if (op_value == SEARCH_OP_VALUE_READ) {
+                conditions.set(new NamedFlag("UNREAD"), true);
+            } else if (op_value == SEARCH_OP_VALUE_UNREAD) {
+                conditions.set(new NamedFlag("UNREAD"), false);
+            } else if (op_value == SEARCH_OP_VALUE_STARRED) {
+                conditions.set(new NamedFlag("FLAGGED"), false);
+            }
+        }
+        return conditions;
+    }
+
+    // Return a map of column -> phrase, to use as WHERE column MATCH 'phrase'.
+    //
+    // Terms for the "is:" operator are left out, and terms that are
+    // not bound to any field are stored under "MessageSearchTable".
+    internal Gee.HashMap<string, string> get_query_phrases() {
+        Gee.HashMap<string, string> phrases = new Gee.HashMap<string, string>();
+        foreach (string? field in this.field_map.keys) {
+            Gee.List<SearchTerm>? terms = get_search_terms(field);
+            // Use the shared constant rather than a bare "is" literal
+            // so this stays in step with the rest of the class
+            if (terms == null || terms.size == 0 || field == SEARCH_OP_IS)
+                continue;
+
+            // Each SearchTerm is ANDed with the others, while the SQL
+            // variants within a single term are ORed. This lets every
+            // user term be required while any variant of that term may
+            // match. So, if terms are [party] and [eventful] and stems
+            // are [parti] and [event], the search would be:
+            //
+            // (party* OR parti*) AND (eventful* OR event*)
+            //
+            // Obviously with stemming there's the possibility of the
+            // stemmed variant being nothing but a broader search of the
+            // original term (such as event* and eventful*), but both are
+            // searched for so that each hit can be traced back to the
+            // term that caused it; if it is too greedy a match of the
+            // stemmed variant, it can be stripped from the results.
+            //
+            // Note that this uses SQLite's "standard" query syntax for
+            // MATCH, where AND is implied (and would be treated as a
+            // search term if included), parentheses are not allowed, and
+            // OR has a higher precedence than AND. So the above example
+            // in standard syntax is:
+            //
+            // party* OR parti* eventful* OR event*
+            StringBuilder builder = new StringBuilder();
+            foreach (SearchTerm term in terms) {
+                // Terms without any SQL variant contribute nothing
+                if (term.sql.size == 0)
+                    continue;
+
+                if (term.is_exact) {
+                    // Exact terms are matched as parsed, with no variants
+                    builder.append_printf("%s ", term.parsed);
+                } else {
+                    bool is_first_sql = true;
+                    foreach (string sql in term.sql) {
+                        if (!is_first_sql)
+                            builder.append(" OR ");
+
+                        builder.append_printf("%s ", sql);
+                        is_first_sql = false;
+                    }
+                }
+            }
+
+            phrases.set(field ?? "MessageSearchTable", builder.str);
+        }
+
+        return phrases;
+    }
+
+    // Parses the raw query text into SearchTerm objects, filing each
+    // one under its search operator field (or null for none) in
+    // field_map and appending it to the list of all terms.
+    private void prepare() {
+        // A few goals here:
+        //   1) Append an * after every term so it becomes a prefix search
+        //      (see <https://www.sqlite.org/fts3.html#section_3>)
+        //   2) Strip out common words/operators that might get interpreted as
+        //      search operators
+        //   3) Parse each word into a list of which field it applies to, so
+        //      you can do "to:johndoe@example.com thing" (quotes excluded)
+        //      to find messages to John containing the word thing
+        // We ignore everything inside quotes to give the user a way to
+        // override our algorithm here.  The idea is to offer one search query
+        // syntax for Geary that we can use locally and via IMAP, etc.
+
+        string quote_balanced = this.raw;
+        if (Geary.String.count_char(this.raw, '"') % 2 != 0) {
+            // Remove the last quote if it's not balanced.  This has the
+            // benefit of showing decent results as you type a quoted phrase.
+            int last_quote = this.raw.last_index_of_char('"');
+            assert(last_quote >= 0);
+            quote_balanced = this.raw.splice(last_quote, last_quote + 1, " ");
+        }
+
+        string[] words = quote_balanced.split_set(" \t\r\n()%*\\");
+        bool in_quote = false;
+        foreach (string s in words) {
+            string? field = null;
+
+            s = s.strip();
+
+            // Track quote state per word: the first quote seen while
+            // outside a quoted phrase opens one and is not counted
+            // again when deciding below whether the phrase closes
+            int quotes = Geary.String.count_char(s, '"');
+            if (!in_quote && quotes > 0) {
+                in_quote = true;
+                --quotes;
+            }
+
+            SearchTerm? term;
+            if (in_quote) {
+                // HACK: this helps prevent a syntax error when the user types
+                // something like from:"somebody".  If we ever properly support
+                // quotes after : we can get rid of this.
+                term = new SearchTerm(s, s, null, s.replace(":", " "), null);
+            } else {
+                string original = s;
+
+                // Some common search phrases we don't respect and
+                // therefore don't want to fall through to search
+                // results
+                // XXX translate these
+                string lower = s.down();
+                switch (lower) {
+                    case "":
+                    case "and":
+                    case "or":
+                    case "not":
+                    case "near":
+                        continue;
+
+                    default:
+                        if (lower.has_prefix("near/"))
+                            continue;
+                    break;
+                }
+
+                if (s.has_prefix("-"))
+                    s = s.substring(1);
+
+                if (s == "")
+                    continue;
+
+                // TODO: support quotes after :
+                string[] parts = s.split(":", 2);
+                if (parts.length > 1)
+                    field = extract_field_from_token(parts, ref s);
+
+                // A token that was nothing but ":" comes back empty
+                // once the colon is dropped. Skip it like the other
+                // empty tokens above, otherwise it would become a bare
+                // "*" prefix search with nothing in front of it.
+                if (s == "")
+                    continue;
+
+                if (field == SEARCH_OP_IS) {
+                    // s will have been de-translated
+                    term = new SearchTerm(original, s, null, null, null);
+                } else {
+                    // SQL MATCH syntax for parsed term
+                    string? sql_s = "%s*".printf(s);
+
+                    // stem the word, but if stemmed and stem is
+                    // simply shorter version of original term, only
+                    // prefix-match search for it (i.e. avoid
+                    // searching for [archive* OR archiv*] when that's
+                    // the same as [archiv*]), otherwise search for
+                    // both
+                    string? stemmed = stem_search_term(s);
+
+                    string? sql_stemmed = null;
+                    if (stemmed != null) {
+                        sql_stemmed = "%s*".printf(stemmed);
+                        if (s.has_prefix(stemmed))
+                            sql_s = null;
+                    }
+
+                    // if term contains continuation characters, treat
+                    // as exact search to reduce effects of tokenizer
+                    // splitting terms w/ punctuation in them
+                    if (String.contains_any_char(s, SEARCH_TERM_CONTINUATION_CHARS))
+                        s = "\"%s\"".printf(s);
+
+                    term = new SearchTerm(original, s, stemmed, sql_s, sql_stemmed);
+                }
+            }
+
+            // An odd number of remaining quotes in this word closes
+            // the quoted phrase
+            if (in_quote && quotes % 2 != 0)
+                in_quote = false;
+
+            // Finally, add the term
+            if (!this.field_map.has_key(field)) {
+                this.field_map.set(field, new Gee.ArrayList<SearchTerm>());
+            }
+            this.field_map.get(field).add(term);
+            this.all.add(term);
+        }
+    }
+
+    // Works out which search operator field a "name:value" token
+    // refers to, and rewrites the token to the text to search for.
+    //
+    // `parts` is the token split in two at its first ':'. Returns the
+    // field for a recognised operator name, or null when the token
+    // should be treated as having no operator. `token` is replaced
+    // with the name alone when the value is missing, with the
+    // looked-up "is:" value, with the account's primary mailbox
+    // address for a "me" value, or otherwise with the value as typed.
+    private string? extract_field_from_token(string[] parts, ref string token) {
+        if (Geary.String.is_empty_or_whitespace(parts[1])) {
+            // Only "field:" has been typed so far; behave as though
+            // the ':' was never entered
+            token = parts[0];
+            return null;
+        }
+
+        string? field = search_op_names.get(parts[0].down());
+        if (field == null) {
+            // Not a known operator name, the token is left untouched
+            return null;
+        }
+
+        string value = parts[1].down();
+        if (field == SEARCH_OP_IS) {
+            string? canonical = search_op_is_values.get(value);
+            if (canonical == null) {
+                // Unknown op value, pretend there is no search op
+                return null;
+            }
+            token = canonical;
+            return field;
+        }
+
+        bool sent_by_me = (
+            field == SEARCH_OP_FROM && (value in search_op_from_me_values)
+        );
+        bool sent_to_me = (
+            (field in SEARCH_OP_TO_ME_FIELDS) && (value in search_op_to_me_values)
+        );
+        if (sent_by_me || sent_to_me) {
+            // Bug 768779 asks for all of the account's addresses to be
+            // searched; only the primary mailbox address is used here
+            token = this.account.account_information.primary_mailbox.address;
+        } else {
+            token = parts[1];
+        }
+        return field;
+    }
+
+    /**
+     * Converts an unquoted, user-entered search term into its stemmed form.
+     *
+     * The Unicode Snowball stemmer has proven too aggressive when
+     * combined with prefix-matching of all unquoted terms (see
+     * https://bugzilla.gnome.org/show_bug.cgi?id=713179). This method
+     * is one part of a larger strategy to tone that down while
+     * keeping the benefits of stemming.
+     *
+     * Database upgrade 23 replaced the Snowball-stemmed FTS table
+     * with an unstemmed one using SQLite's "simple" tokenizer, and
+     * added a "magic" table named TokenizerTable that can be queried
+     * for the Snowball stem of any input. This makes it possible to
+     * search for both the original term and its stem, then strip
+     * results afterwards that only matched because prefix-matching
+     * the stem was too "greedy".
+     *
+     * No stem is returned (i.e. null is) when:
+     *
+     *  # Stemming is not allowed for this query.
+     *  # The term is shorter than the minimum length for stemming.
+     *  # The tokenizer lookup fails or yields nothing.
+     *  # The stem is identical to the original term.
+     *  # The stem is shorter than the term by more than the maximum
+     *    allowed difference. The assumption is that a user typing a
+     *    long word does not want to fall back to a much shorter one
+     *    (for example, "accountancies" stems to "account").
+     *
+     * Otherwise, the stem for the term is returned.
+     */
+    private string? stem_search_term(string term) {
+        int term_length = term.length;
+        if (!this.allow_stemming ||
+            term_length < this.min_term_length_for_stemming) {
+            return null;
+        }
+
+        string? candidate = null;
+        try {
+            Db.Statement lookup = this.account.db.prepare("""
+ SELECT token
+ FROM TokenizerTable
+ WHERE input=?
+ """);
+            lookup.bind_string(0, term);
+
+            // Take the stem from the first row, if there is one
+            Db.Result row = lookup.exec();
+            if (row.finished) {
+                debug("No stemmed term returned for \"%s\"", term);
+            } else {
+                candidate = row.string_at(0);
+            }
+        } catch (Error err) {
+            // Best effort only: a failed lookup means no stemming
+            debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
+        }
+
+        if (String.is_empty(candidate)) {
+            debug("Empty stemmed term returned for \"%s\"", term);
+            return null;
+        }
+
+        // A stem equal to the term adds nothing
+        if (candidate == term) {
+            return null;
+        }
+
+        // Reject stems that are significantly shorter than what the
+        // user actually typed
+        int shortened_by = term_length - candidate.length;
+        if (shortened_by > this.max_difference_term_stem_lengths) {
+            debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms",
+                candidate, term);
+            return null;
+        }
+
+        debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, candidate);
+        return candidate;
+    }
+
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]