[geary/wip/720361-stemming] Fix searching for exact phrases, i.e. ["archive"]
- From: Jim Nelson <jnelson src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/wip/720361-stemming] Fix searching for exact phrases, i.e. ["archive"]
- Date: Thu, 11 Dec 2014 23:09:18 +0000 (UTC)
commit 7c146beb7d8c6c8a9d51e63a7812e2ac3c159509
Author: Jim Nelson <jim yorba org>
Date: Thu Dec 11 15:08:26 2014 -0800
Fix searching for exact phrases, i.e. ["archive"]
src/engine/imap-db/imap-db-account.vala | 57 +++++++++++++++------------
src/engine/imap-db/imap-db-search-term.vala | 6 +++
2 files changed, 38 insertions(+), 25 deletions(-)
---
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index d589113..52315af 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -901,7 +901,6 @@ private class Geary.ImapDB.Account : BaseObject {
if (terms == null || terms.size == 0)
continue;
- StringBuilder builder = new StringBuilder("(");
// Each SearchTerm is an AND but the SQL texts within it are ORed ... this allows for
// each user term to be AND but the variants of each term are or. So, if terms are
// [party] and [eventful] and stems are [parti] and [event], the search would be:
@@ -912,28 +911,30 @@ private class Geary.ImapDB.Account : BaseObject {
// but a broader search of the original term (such as event* and eventful*) but do both
// to determine from each hit result which term caused the hit, and if it's too greedy
// a match of the stemmed variant, it can be stripped from the results.
- bool is_first_term = true;
+ //
+ // Note that this uses SQLite's "standard" query syntax for MATCH, where AND is implied
+ // (and would be treated as search term if included), parentheses are not allowed, and
+ // OR has a higher precedence than AND.
+ StringBuilder builder = new StringBuilder();
foreach (SearchTerm term in terms) {
if (term.sql.size == 0)
continue;
- if (!is_first_term)
- builder.append(") AND (");
-
- bool is_first_sql = true;
- foreach (string sql in term.sql) {
- if (!is_first_sql)
- builder.append(" OR ");
-
- builder.append_printf("%s ", sql);
- is_first_sql = false;
+ if (term.is_exact) {
+ builder.append_printf("%s ", term.parsed);
+ } else {
+ bool is_first_sql = true;
+ foreach (string sql in term.sql) {
+ if (!is_first_sql)
+ builder.append(" OR ");
+
+ builder.append_printf("%s ", sql);
+ is_first_sql = false;
+ }
}
-
- is_first_term = false;
}
- builder.append(")");
- phrases.set((field == null ? "MessageSearchTable" : field), builder.str);
+ phrases.set(field ?? "MessageSearchTable", builder.str);
}
return phrases;
@@ -1068,13 +1069,18 @@ private class Geary.ImapDB.Account : BaseObject {
if (search_results.size == 0)
return null;
+ // if any of the search terms is exact-match (no prefix matching) or none have stemmed
+ // variants, then don't do stripping of "greedy" stemmed variants (as in both cases, there
+ // are none)
+ if (traverse<SearchTerm>(query.get_all_terms()).any(term => term.stemmed == null || term.is_exact))
+ return search_results;
+
//
// Strip out search results that only contain a hit due to "greedy" matching of the stemmed
// variants on all search terms
//
- Gee.HashSet<string> stripped_matches = new Gee.HashSet<string>();
- Gee.HashSet<string> accepted_matches = new Gee.HashSet<string>();
+ int prestripped_results = search_results.size;
Gee.Iterator<ImapDB.SearchEmailIdentifier> iter = search_results.iterator();
while (iter.next()) {
Gee.Collection<string>? matches = yield get_search_matches_async(query,
@@ -1085,25 +1091,28 @@ private class Geary.ImapDB.Account : BaseObject {
continue;
}
+ // For each matched string in this message, retain the message in the search results
+ // if it prefix-matches any of the straight-up parsed terms or matches a stemmed
+ // variant (with only MAX_MATCH_LENGTH_STEMMED_VARIANT differences in their lengths,
+ // i.e. not a "greedy" match)
bool good_match_found = false;
foreach (string match in matches) {
foreach (SearchTerm term in query.get_all_terms()) {
+ // if prefix-matches parsed term, then don't strip
if (match.has_prefix(term.parsed)) {
good_match_found = true;
break;
}
+ // if prefix-matches stemmed term w/o doing so greedily, then don't strip
if (term.stemmed != null && match.has_prefix(term.stemmed)) {
int diff = match.length - term.stemmed.length;
if (diff <= MAX_MATCH_LENGTH_STEMMED_VARIANT) {
good_match_found = true;
- accepted_matches.add(match);
break;
}
-
- stripped_matches.add(match);
}
}
@@ -1115,10 +1124,8 @@ private class Geary.ImapDB.Account : BaseObject {
iter.remove();
}
- foreach (string accepted_match in accepted_matches)
- debug("Accepted with \"%s\": %s", query.raw, accepted_match);
- foreach (string stripped_match in stripped_matches)
- debug("Stripped from \"%s\": %s", query.raw, stripped_match);
+ debug("Stripped %d emails from search for [%s] due to greedy stem matching",
+ prestripped_results - search_results.size, query.raw);
return (search_results.size == 0 ? null : search_results);
}
diff --git a/src/engine/imap-db/imap-db-search-term.vala b/src/engine/imap-db/imap-db-search-term.vala
index a1fe8d0..4cf83a5 100644
--- a/src/engine/imap-db/imap-db-search-term.vala
+++ b/src/engine/imap-db/imap-db-search-term.vala
@@ -46,6 +46,12 @@ private class Geary.ImapDB.SearchTerm : BaseObject {
*/
public Gee.List<string> sql { get; private set; default = new Gee.ArrayList<string>(); }
+ /**
+ * Returns true if the {@link parsed} term is exact-match only (i.e. starts with quotes) and
+ * there is no {@link stemmed} variant.
+ */
+ public bool is_exact { get { return parsed.has_prefix("\"") && stemmed == null; } }
+
public SearchTerm(string original, string parsed, string? stemmed, TextToMatch text_to_match) {
this.original = original;
this.parsed = parsed;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]