[geary/wip/720361-stemming] Fix searching for exact phrases, i.e. ["archive"]



commit 7c146beb7d8c6c8a9d51e63a7812e2ac3c159509
Author: Jim Nelson <jim yorba org>
Date:   Thu Dec 11 15:08:26 2014 -0800

    Fix searching for exact phrases, i.e. ["archive"]

 src/engine/imap-db/imap-db-account.vala     |   57 +++++++++++++++------------
 src/engine/imap-db/imap-db-search-term.vala |    6 +++
 2 files changed, 38 insertions(+), 25 deletions(-)
---
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index d589113..52315af 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -901,7 +901,6 @@ private class Geary.ImapDB.Account : BaseObject {
             if (terms == null || terms.size == 0)
                 continue;
             
-            StringBuilder builder = new StringBuilder("(");
             // Each SearchTerm is an AND but the SQL text within in are OR ... this allows for
             // each user term to be AND but the variants of each term are or.  So, if terms are
             // [party] and [eventful] and stems are [parti] and [event], the search would be:
@@ -912,28 +911,30 @@ private class Geary.ImapDB.Account : BaseObject {
             // but a broader search of the original term (such as event* and eventful*) but do both
             // to determine from each hit result which term caused the hit, and if it's too greedy
             // a match of the stemmed variant, it can be stripped from the results.
-            bool is_first_term = true;
+            //
+            // Note that this uses SQLite's "standard" query syntax for MATCH, where AND is implied
+            // (and would be treated as search term if included), parentheses are not allowed, and
+            // OR has a higher precedence than AND.
+            StringBuilder builder = new StringBuilder();
             foreach (SearchTerm term in terms) {
                 if (term.sql.size == 0)
                     continue;
                 
-                if (!is_first_term)
-                    builder.append(") AND (");
-                
-                bool is_first_sql = true;
-                foreach (string sql in term.sql) {
-                    if (!is_first_sql)
-                        builder.append(" OR ");
-                    
-                    builder.append_printf("%s ", sql);
-                    is_first_sql = false;
+                if (term.is_exact) {
+                    builder.append_printf("%s ", term.parsed);
+                } else {
+                    bool is_first_sql = true;
+                    foreach (string sql in term.sql) {
+                        if (!is_first_sql)
+                            builder.append(" OR ");
+                        
+                        builder.append_printf("%s ", sql);
+                        is_first_sql = false;
+                    }
                 }
-                
-                is_first_term = false;
             }
-            builder.append(")");
             
-            phrases.set((field == null ? "MessageSearchTable" : field), builder.str);
+            phrases.set(field ?? "MessageSearchTable", builder.str);
         }
         
         return phrases;
@@ -1068,13 +1069,18 @@ private class Geary.ImapDB.Account : BaseObject {
         if (search_results.size == 0)
             return null;
         
+        // if any of the search terms is exact-match (no prefix matching) or none have stemmed
+        // variants, then don't do stripping of "greedy" stemmed variants (as in both cases, there
+        // are none)
+        if (traverse<SearchTerm>(query.get_all_terms()).any(term => term.stemmed == null || term.is_exact))
+            return search_results;
+        
         //
         // Strip out search results that only contain a hit due to "greedy" matching of the stemmed
         // variants on all search terms
         //
         
-        Gee.HashSet<string> stripped_matches = new Gee.HashSet<string>();
-        Gee.HashSet<string> accepted_matches = new Gee.HashSet<string>();
+        int prestripped_results = search_results.size;
         Gee.Iterator<ImapDB.SearchEmailIdentifier> iter = search_results.iterator();
         while (iter.next()) {
             Gee.Collection<string>? matches = yield get_search_matches_async(query,
@@ -1085,25 +1091,28 @@ private class Geary.ImapDB.Account : BaseObject {
                 continue;
             }
             
+            // For each matched string in this message, retain the message in the search results
+            // if it prefix-matches any of the straight-up parsed terms or matches a stemmed
+            // variant (with only MAX_MATCH_LENGTH_STEMMED_VARIANT differences in their lengths,
+            // i.e. not a "greedy" match)
             bool good_match_found = false;
             foreach (string match in matches) {
                 foreach (SearchTerm term in query.get_all_terms()) {
+                    // if prefix-matches parsed term, then don't strip
                     if (match.has_prefix(term.parsed)) {
                         good_match_found = true;
                         
                         break;
                     }
                     
+                    // if prefix-matches stemmed term w/o doing so greedily, then don't strip
                     if (term.stemmed != null && match.has_prefix(term.stemmed)) {
                         int diff = match.length - term.stemmed.length;
                         if (diff <= MAX_MATCH_LENGTH_STEMMED_VARIANT) {
                             good_match_found = true;
-                            accepted_matches.add(match);
                             
                             break;
                         }
-                        
-                        stripped_matches.add(match);
                     }
                 }
                 
@@ -1115,10 +1124,8 @@ private class Geary.ImapDB.Account : BaseObject {
                 iter.remove();
         }
         
-        foreach (string accepted_match in accepted_matches)
-            debug("Accepted with \"%s\": %s", query.raw, accepted_match);
-        foreach (string stripped_match in stripped_matches)
-            debug("Stripped from \"%s\": %s", query.raw, stripped_match);
+        debug("Stripped %d emails from search for [%s] due to greedy stem matching",
+            prestripped_results - search_results.size, query.raw);
         
         return (search_results.size == 0 ? null : search_results);
     }
diff --git a/src/engine/imap-db/imap-db-search-term.vala b/src/engine/imap-db/imap-db-search-term.vala
index a1fe8d0..4cf83a5 100644
--- a/src/engine/imap-db/imap-db-search-term.vala
+++ b/src/engine/imap-db/imap-db-search-term.vala
@@ -46,6 +46,12 @@ private class Geary.ImapDB.SearchTerm : BaseObject {
      */
     public Gee.List<string> sql { get; private set; default = new Gee.ArrayList<string>(); }
     
+    /**
+     * Returns true if the {@link parsed} term is exact-match only (i.e. starts with quotes) and
+     * there is no {@link stemmed} variant.
+     */
+    public bool is_exact { get { return parsed.has_prefix("\"") && stemmed == null; } }
+    
     public SearchTerm(string original, string parsed, string? stemmed, TextToMatch text_to_match) {
         this.original = original;
         this.parsed = parsed;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]