[geary/wip/720361-stemming] First stab at this
- From: Jim Nelson <jnelson src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/wip/720361-stemming] First stab at this
- Date: Wed, 10 Dec 2014 01:41:33 +0000 (UTC)
commit 1a94ec6cf4edeb52963b5fbd2f8489f340cb0b28
Author: Jim Nelson <jim yorba org>
Date: Tue Dec 9 17:40:54 2014 -0800
First stab at this
This deletes the old search FTS table and installs a new one, so
back up your Geary data directory before using.
sql/CMakeLists.txt | 1 +
sql/version-023.sql | 21 ++++++
src/client/application/geary-controller.vala | 5 +-
src/engine/api/geary-search-query.vala | 6 ++
src/engine/imap-db/imap-db-account.vala | 95 +++++++++++++++++++++++++-
src/engine/imap-db/imap-db-database.vala | 27 +++++++-
6 files changed, 149 insertions(+), 6 deletions(-)
---
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index bbd8f91..40184ce 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -22,3 +22,4 @@ install(FILES version-019.sql DESTINATION ${SQL_DEST})
install(FILES version-020.sql DESTINATION ${SQL_DEST})
install(FILES version-021.sql DESTINATION ${SQL_DEST})
install(FILES version-022.sql DESTINATION ${SQL_DEST})
+install(FILES version-023.sql DESTINATION ${SQL_DEST})
diff --git a/sql/version-023.sql b/sql/version-023.sql
new file mode 100644
index 0000000..d282516
--- /dev/null
+++ b/sql/version-023.sql
@@ -0,0 +1,21 @@
+--
+-- Database upgrade to add the FTS tokenizer virtual table, which allows for querying the tokenizer
+-- directly for stemmed words, and to replace the stemmed FTS table with an unstemmed one. We now
+-- use the stemmer manually to generate search queries.
+--
+
+DROP TABLE MessageSearchTable;
+
+CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
+ body,
+ attachment,
+ subject,
+ from_field,
+ receivers,
+ cc,
+ bcc,
+
+ tokenize=simple,
+ prefix="2,4,6,8,10"
+);
+
diff --git a/src/client/application/geary-controller.vala b/src/client/application/geary-controller.vala
index 808f3f4..08f3fe4 100644
--- a/src/client/application/geary-controller.vala
+++ b/src/client/application/geary-controller.vala
@@ -81,7 +81,7 @@ public class GearyController : Geary.BaseObject {
private const string MOVE_MESSAGE_TOOLTIP_MULTIPLE = _("Move conversations");
private const int SELECT_FOLDER_TIMEOUT_USEC = 100 * 1000;
- private const int SEARCH_TIMEOUT_MSEC = 100;
+ private const int SEARCH_TIMEOUT_MSEC = 250;
private const string PROP_ATTEMPT_OPEN_ACCOUNT = "attempt-open-account";
@@ -2523,7 +2523,8 @@ public class GearyController : Geary.BaseObject {
// search after a quick delay when they finish typing.
if (search_timeout_id != 0)
Source.remove(search_timeout_id);
- search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout);
+
+ search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout, Priority.LOW);
}
private bool on_search_timeout() {
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 51bcbd3..557809c 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -16,6 +16,8 @@ public class Geary.SearchQuery : BaseObject {
public string raw { get; private set; }
public bool parsed { get; internal set; default = false; }
+ internal int stemming_level { get; set; default = 0; }
+
// Not using a MultiMap because we (might) need a guarantee of order.
private Gee.HashMap<string?, Gee.ArrayList<string>> field_map
= new Gee.HashMap<string?, Gee.ArrayList<string>>();
@@ -24,6 +26,10 @@ public class Geary.SearchQuery : BaseObject {
raw = query;
}
+ public void clear() {
+ field_map.clear();
+ }
+
internal void add_token(string? field, string token) {
if (!field_map.has_key(field))
field_map.set(field, new Gee.ArrayList<string>());
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index c4a370e..da66583 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -715,6 +715,65 @@ private class Geary.ImapDB.Account : BaseObject {
return null;
}
+ /**
+ * This method is used to convert an unquoted user-entered search term into a search term best
+ * suited for the SQLite FTS table.
+ *
+ * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
+ * needs when coupled with prefix-matching of all unquoted terms (see
+ * https://bugzilla.gnome.org/show_bug.cgi?id=713179). This method is a heuristic designed to
+ * dampen that aggressiveness without losing the benefits of stemming entirely.
+ *
+ * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
+ * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
+ * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
+ * is still installed in Geary.
+ *
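+ * For each term: a term of five bytes or fewer is returned unstemmed. Otherwise TokenizerTable is
+ * queried for its stem, and the stem is returned only if it is non-empty and no more than
+ * stemming_level bytes shorter than the term; in every other case the original term is returned.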
+ */
+ private string stem_search_term(string term, int stemming_level) {
+ int term_length = term.length;
+ if (term_length <= 5)
+ return term;
+
+ string? stemmed = null;
+ try {
+ Db.Statement stmt = db.prepare("""
+ SELECT token
+ FROM TokenizerTable
+ WHERE input=?
+ """);
+ stmt.bind_string(0, term);
+
+ // get stemmed string; if no result, fall through
+ Db.Result result = stmt.exec();
+ if (!result.finished)
+ stemmed = result.string_at(0);
+ else
+ message("No stemmed term returned for \"%s\"", term);
+ } catch (Error err) {
+ message("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
+
+ // fall-through
+ }
+
+ if (String.is_empty(stemmed)) {
+ message("Empty stemmed term returned for \"%s\"", term);
+
+ return term;
+ }
+
+ int stemmed_length = stemmed.length;
+ int diff = term_length - stemmed_length;
+
+ message("TERM->STEM: \"%s\" -> \"%s\" (diff=%d, stemming_level=%d)", term, stemmed, diff,
+ stemming_level);
+
+ return (diff <= stemming_level) ? stemmed : term;
+ }
+
private void prepare_search_query(Geary.SearchQuery query) {
if (query.parsed)
return;
@@ -775,7 +834,7 @@ private class Geary.ImapDB.Account : BaseObject {
if (parts.length > 1)
field = extract_field_from_token(parts, ref s);
- s = "\"" + s + "*\"";
+ s = "\"%s*\"".printf(query.stemming_level > 0 ? stem_search_term(s, query.stemming_level) :
s);
}
if (in_quote && quotes % 2 != 0)
@@ -867,9 +926,29 @@ private class Geary.ImapDB.Account : BaseObject {
public async Gee.Collection<Geary.EmailIdentifier>? search_async(Geary.SearchQuery query,
int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
- Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws
Error {
+ Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null)
+ throws Error {
check_open();
+ for (;;) {
+ Gee.Collection<Geary.EmailIdentifier>? results = yield do_search_async(query,
+ limit, offset, folder_blacklist, search_ids, cancellable);
+ if (results != null && results.size > 0)
+ return results;
+
+ if (query.stemming_level >= 2)
+ return null;
+
+ query.stemming_level = query.stemming_level + 1;
+ query.parsed = false;
+ query.clear();
+ debug("RETYING SEARCH WITH STEMMING LEVEL %d: NO RESULTS FOUND", query.stemming_level);
+ }
+ }
+
+ private async Gee.Collection<Geary.EmailIdentifier>? do_search_async(Geary.SearchQuery query,
+ int limit, int offset, Gee.Collection<Geary.FolderPath?>? folder_blacklist,
+ Gee.Collection<Geary.EmailIdentifier>? search_ids, Cancellable? cancellable) throws Error {
Gee.HashMap<string, string> query_phrases = get_query_phrases(query);
if (query_phrases.size == 0)
return null;
@@ -912,6 +991,12 @@ private class Geary.ImapDB.Account : BaseObject {
if (limit > 0)
sql.append(" LIMIT ? OFFSET ?");
+ StringBuilder builder = new StringBuilder();
+ foreach (string key in query_phrases.keys)
+ builder.append_printf("%s ", query_phrases[key]);
+
+ debug("\nSEARCH:\n%s\nPHRASES:%s\n", sql.str, builder.str);
+
Db.Statement stmt = cx.prepare(sql.str);
int bind_index = sql_bind_query_phrases(stmt, 0, query_phrases);
if (limit > 0) {
@@ -975,6 +1060,12 @@ private class Geary.ImapDB.Account : BaseObject {
sql.append(")");
sql_add_query_phrases(sql, query_phrases);
+ StringBuilder builder = new StringBuilder();
+ foreach (string key in query_phrases.keys)
+ builder.append_printf("%s ", query_phrases[key]);
+
+ debug("\nMATCHES:\n%s\nPHRASES:%s\n", sql.str, builder.str);
+
Db.Statement stmt = cx.prepare(sql.str);
sql_bind_query_phrases(stmt, 0, query_phrases);
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index 704d730..533209f 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -107,7 +107,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
break;
case 22:
- post_rebuild_attachments();
+ post_upgrade_rebuild_attachments();
+ break;
+
+ case 23:
+ post_upgrade_add_tokenizer_table();
break;
}
}
@@ -407,7 +411,7 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
// Version 22
- private void post_rebuild_attachments() {
+ private void post_upgrade_rebuild_attachments() {
try {
exec_transaction(Db.TransactionType.RW, (cx) => {
Db.Statement stmt = cx.prepare("""
@@ -471,6 +475,25 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
}
+ // Version 23
+ private void post_upgrade_add_tokenizer_table() {
+ try {
+ string stemmer = find_appropriate_search_stemmer();
+ debug("Creating tokenizer table using %s stemmer", stemmer);
+
+ // This can't go in the .sql file because the table's schema (the stemmer
+ // algorithm) is determined at runtime.
+ exec("""
+ CREATE VIRTUAL TABLE TokenizerTable USING fts3tokenize(
+ unicodesn,
+ "stemmer=%s"
+ );
+ """.printf(stemmer));
+ } catch (Error e) {
+ error("Error creating tokenizer table: %s", e.message);
+ }
+ }
+
private void on_prepare_database_connection(Db.Connection cx) throws Error {
cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
cx.set_foreign_keys(true);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]