[geary/wip/720361-stemming] Parameterize search matching scope, from EXACT to HORIZON.
- From: Jim Nelson <jnelson src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/wip/720361-stemming] Parameterize search matching scope, from EXACT to HORIZON.
- Date: Thu, 11 Dec 2014 23:56:48 +0000 (UTC)
commit a345c0295387a7015bb215cc826fb0ff0e278cc4
Author: Jim Nelson <jim yorba org>
Date: Thu Dec 11 15:56:21 2014 -0800
Parameterize search matching scope, from EXACT to HORIZON.
src/client/application/geary-controller.vala | 3 +-
src/engine/abstract/geary-abstract-account.vala | 2 +-
src/engine/api/geary-account.vala | 12 +++-
src/engine/api/geary-search-folder.vala | 9 ++-
src/engine/api/geary-search-query.vala | 47 ++++++++++++-
src/engine/imap-db/imap-db-account.vala | 29 ++++----
src/engine/imap-db/imap-db-search-query.vala | 72 +++++++++++++++++++-
.../imap-engine/imap-engine-generic-account.vala | 4 +-
8 files changed, 150 insertions(+), 28 deletions(-)
---
diff --git a/src/client/application/geary-controller.vala b/src/client/application/geary-controller.vala
index 08f3fe4..4bb7083 100644
--- a/src/client/application/geary-controller.vala
+++ b/src/client/application/geary-controller.vala
@@ -2512,7 +2512,8 @@ public class GearyController : Geary.BaseObject {
cancel_search(); // Stop any search in progress.
- folder.set_search_query(search_text, cancellable_search);
+ folder.set_search_query(search_text, Geary.SearchQuery.Matching.CONSERVATIVE,
+ cancellable_search);
main_window.folder_list.set_search(folder);
search_text_changed(main_window.main_toolbar.search_text);
diff --git a/src/engine/abstract/geary-abstract-account.vala b/src/engine/abstract/geary-abstract-account.vala
index 409d0b8..c56b9aa 100644
--- a/src/engine/abstract/geary-abstract-account.vala
+++ b/src/engine/abstract/geary-abstract-account.vala
@@ -118,7 +118,7 @@ public abstract class Geary.AbstractAccount : BaseObject, Geary.Account {
public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
- public abstract Geary.SearchQuery open_search(string query);
+ public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Matching matching);
public abstract async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala
index 0dfc7a6..6b26097 100644
--- a/src/engine/api/geary-account.vala
+++ b/src/engine/api/geary-account.vala
@@ -325,11 +325,19 @@ public interface Geary.Account : BaseObject {
/**
* Create a new {@link SearchQuery} for this {@link Account}.
*
+ * See {@link Geary.SearchQuery.Matching} for more information about how it's interpreted by the
+ * Engine. In particular, note that it's an advisory parameter only and may have no effect,
+ * especially on server searches. However, it may also have a dramatic effect on what search
+ * results are returned and so should be used with some caution. Whether this parameter is
+ * user-configurable, available through GSettings or another configuration mechanism, or simply
+ * baked into the caller's code is up to the caller. CONSERVATIVE is designed to be a good
+ * default.
+ *
* The SearchQuery object can only be used with calls into this Account.
*
- * Dropping the last reference to the SearchQuery is sufficient to close it.
+ * Dropping the last reference to the SearchQuery will close it.
*/
- public abstract Geary.SearchQuery open_search(string query);
+ public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Matching matching);
/**
* Performs a search with the given query. Optionally, a list of folders not to search
diff --git a/src/engine/api/geary-search-folder.vala b/src/engine/api/geary-search-folder.vala
index e70b933..e7246a7 100644
--- a/src/engine/api/geary-search-folder.vala
+++ b/src/engine/api/geary-search-folder.vala
@@ -203,8 +203,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
/**
* Sets the keyword string for this search.
*/
- public void set_search_query(string query, Cancellable? cancellable = null) {
- set_search_query_async.begin(query, cancellable, on_set_search_query_complete);
+ public void set_search_query(string query, SearchQuery.Matching matching, Cancellable? cancellable =
null) {
+ set_search_query_async.begin(query, matching, cancellable, on_set_search_query_complete);
}
private void on_set_search_query_complete(Object? source, AsyncResult result) {
@@ -215,8 +215,9 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
}
- private async void set_search_query_async(string query, Cancellable? cancellable = null) throws Error {
- Geary.SearchQuery search_query = account.open_search(query);
+ private async void set_search_query_async(string query, SearchQuery.Matching matching,
+ Cancellable? cancellable) throws Error {
+ Geary.SearchQuery search_query = account.open_search(query, matching);
int result_mutex_token = yield result_mutex.claim_async();
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 3ed0101..adfc99b 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -10,16 +10,59 @@
*
* The only interaction the API user should have with this is creating new ones and then passing
* them to the search methods in the Engine.
+ *
+ * @see Geary.Account.open_search
*/
public abstract class Geary.SearchQuery : BaseObject {
/**
+ * An advisory parameter regarding search quality and scope.
+ *
+ * The Engine can perform searches based on (unspecified, uncontracted) textual variations of
+ * a query's search terms. Some of those variations may produce undesirable results due to
+ * "greedy" matching of terms. The Matching parameter allows for an advisory to the Engine
+ * about how to use those textual variants, if any at all.
+ *
+ * This may be respected or ignored by the Engine. In particular, there's no guarantee it will
+ * have any effect on server search.
+ */
+ public enum Matching {
+ /**
+ * Only return exact matches, perform no searches for textual variants.
+ *
+ * Note that Geary's search syntax does prefix-matching for unquoted strings. EXACT means
+ * exact ''prefix-''matching in this case.
+ */
+ EXACT,
+ /**
+ * Allow for searching for a small set of textual variants and small differences in search
+ * terms. This is a good default.
+ */
+ CONSERVATIVE,
+ /**
+ * Allow for searching for a broad set of textual variants and larger differences in
+ * search terms.
+ */
+ AGGRESSIVE,
+ /**
+ * Search for all textual variants, i.e. "the sky's the limit."
+ */
+ HORIZON
+ }
+
+ /**
* The original user search text.
*/
public string raw { get; private set; }
- protected SearchQuery(string query) {
- raw = query;
+ /**
+ * The selected {@link Matching} quality.
+ */
+ public Matching matching { get; private set; }
+
+ protected SearchQuery(string raw, Matching matching) {
+ this.raw = raw;
+ this.matching = matching;
}
}
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index 52315af..7eb74e5 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -7,10 +7,6 @@
private class Geary.ImapDB.Account : BaseObject {
private const int POPULATE_SEARCH_TABLE_DELAY_SEC = 5;
- private const int MIN_TERM_LENGTH_FOR_STEMMING = 6;
- private const int MAX_DIFF_TERM_STEM_LENGTH = 2;
- private const int MAX_MATCH_LENGTH_STEMMED_VARIANT = 2;
-
private class FolderReference : Geary.SmartReference {
public Geary.FolderPath path;
@@ -745,17 +741,22 @@ private class Geary.ImapDB.Account : BaseObject {
*
* Some heuristics are in place simply to determine if stemming should occur:
*
- * # If the term is < MIN_TERM_LENGTH_FOR_STEMMING, no stemming occurs.
+ * # If stemming is not allowed, no stemming occurs.
+ * # If the term is < min. term length for stemming, no stemming occurs.
* # If the stemmer returns a stem that is the same as the original term, no stemming occurs.
* # If the difference between the stemmed word and the original term is more than
- * MAX_DIFF_TERM_STEM_LENGTH, no stemming occurs. This works under the assumption that if
+ * maximum allowed, no stemming occurs. This works under the assumption that if
* the user has typed a long word, they do not want to "go back" to searching for a much
- * shorter version of it. (For example, "accountancy" stems to "account").
+ * shorter version of it. (For example, "accountancies" stems to "account").
*
* Otherwise, the stem for the term is returned.
*/
- private string? stem_search_term(string term) {
- if (term.length < MIN_TERM_LENGTH_FOR_STEMMING)
+ private string? stem_search_term(ImapDB.SearchQuery query, string term) {
+ if (!query.allow_stemming)
+ return null;
+
+ int term_length = term.length;
+ if (term_length < query.min_term_length_for_stemming)
return null;
string? stemmed = null;
@@ -790,7 +791,7 @@ private class Geary.ImapDB.Account : BaseObject {
return null;
// Don't search for stemmed words that are significantly shorter than the user's search term
- if (term.length - stemmed.length > MAX_DIFF_TERM_STEM_LENGTH) {
+ if (term_length - stemmed.length > query.max_difference_term_stem_lengths) {
debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms",
stemmed, term);
@@ -877,7 +878,7 @@ private class Geary.ImapDB.Account : BaseObject {
if (parts.length > 1)
field = extract_field_from_token(parts, ref s);
- term = new SearchTerm(original, s, stem_search_term(s), str => "%s*".printf(str));
+ term = new SearchTerm(original, s, stem_search_term(query, s), str => "%s*".printf(str));
}
if (in_quote && quotes % 2 != 0)
@@ -1093,8 +1094,8 @@ private class Geary.ImapDB.Account : BaseObject {
// For each matched string in this message, retain the message in the search results
// if it prefix-matches any of the straight-up parsed terms or matches a stemmed
- // variant (with only MAX_MATCH_LENGTH_STEMMED_VARIANT differences in their lengths,
- // i.e. not a "greedy" match)
+ // variant (with only max. difference in their lengths allowed, i.e. not a "greedy"
+ // match)
bool good_match_found = false;
foreach (string match in matches) {
foreach (SearchTerm term in query.get_all_terms()) {
@@ -1108,7 +1109,7 @@ private class Geary.ImapDB.Account : BaseObject {
// if prefix-matches stemmed term w/o doing so greedily, then don't strip
if (term.stemmed != null && match.has_prefix(term.stemmed)) {
int diff = match.length - term.stemmed.length;
- if (diff <= MAX_MATCH_LENGTH_STEMMED_VARIANT) {
+ if (diff <= query.max_difference_match_stem_lengths) {
good_match_found = true;
break;
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
index 28676e7..eea1f68 100644
--- a/src/engine/imap-db/imap-db-search-query.vala
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -19,15 +19,83 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
*/
public bool parsed { get; set; default = false; }
+ /**
+ * Determined by {@link matching}.
+ */
+ public bool allow_stemming { get; private set; }
+
+ /**
+ * Minimum length of the term before stemming is allowed.
+ *
+ * This prevents short words that might be stemmed from being stemmed.
+ *
+ * Overridden by {@link allow_stemming}. Determined by {@link matching}.
+ */
+ public int min_term_length_for_stemming { get; private set; }
+
+ /**
+ * Maximum difference in lengths between term and stemmed variant.
+ *
+ * This prevents long words from being stemmed to much shorter words (which creates
+ * opportunities for greedy matching).
+ *
+ * Overridden by {@link allow_stemming}. Determined by {@link matching}.
+ */
+ public int max_difference_term_stem_lengths { get; private set; }
+
+ /**
+ * Maximum difference in lengths between a matched word and the stemmed variant it matched
+ * against.
+ *
+ * This prevents long words being matched to short stem variants (which creates opportunities
+ * for greedy matching).
+ *
+ * Overridden by {@link allow_stemming}. Determined by {@link matching}.
+ */
+ public int max_difference_match_stem_lengths { get; private set; }
+
// Not using a MultiMap because we (might) need a guarantee of order.
private Gee.HashMap<string?, Gee.ArrayList<SearchTerm>> field_map
= new Gee.HashMap<string?, Gee.ArrayList<SearchTerm>>();
private Gee.ArrayList<SearchTerm> all = new Gee.ArrayList<SearchTerm>();
- public SearchQuery(ImapDB.Account account, string query) {
- base (query);
+ public SearchQuery(ImapDB.Account account, string query, Geary.SearchQuery.Matching matching) {
+ base (query, matching);
this.account = account;
+
+ switch (matching) {
+ case Matching.EXACT:
+ allow_stemming = false;
+ min_term_length_for_stemming = int.MAX;
+ max_difference_term_stem_lengths = 0;
+ max_difference_match_stem_lengths = 0;
+ break;
+
+ case Matching.CONSERVATIVE:
+ allow_stemming = true;
+ min_term_length_for_stemming = 6;
+ max_difference_term_stem_lengths = 2;
+ max_difference_match_stem_lengths = 2;
+ break;
+
+ case Matching.AGGRESSIVE:
+ allow_stemming = true;
+ min_term_length_for_stemming = 4;
+ max_difference_term_stem_lengths = 4;
+ max_difference_match_stem_lengths = 3;
+ break;
+
+ case Matching.HORIZON:
+ allow_stemming = true;
+ min_term_length_for_stemming = 0;
+ max_difference_term_stem_lengths = int.MAX;
+ max_difference_match_stem_lengths = int.MAX;
+ break;
+
+ default:
+ assert_not_reached();
+ }
}
public void add_search_term(string? field, SearchTerm term) {
diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala
index 47ce2ac..f59af5a 100644
--- a/src/engine/imap-engine/imap-engine-generic-account.vala
+++ b/src/engine/imap-engine/imap-engine-generic-account.vala
@@ -824,8 +824,8 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
return yield local.fetch_email_async(check_id(email_id), required_fields, cancellable);
}
- public override Geary.SearchQuery open_search(string query) {
- return new ImapDB.SearchQuery(local, query);
+ public override Geary.SearchQuery open_search(string query, SearchQuery.Matching matching) {
+ return new ImapDB.SearchQuery(local, query, matching);
}
public override async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]