[geary/mjog/334-libstemmer: 111/113] ImapDb.SearchQuery: Use libstemmer for stemming
- From: Michael Gratton <mjog src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/mjog/334-libstemmer: 111/113] ImapDb.SearchQuery: Use libstemmer for stemming
- Date: Wed, 14 Oct 2020 12:13:47 +0000 (UTC)
commit 968bc1a9e82d275ffbfd3825d3aa9e4597d9bd4b
Author: Michael Gratton <mike vee net>
Date: Sun Sep 13 16:27:51 2020 +1000
ImapDb.SearchQuery: Use libstemmer for stemming
Instead of shipping our own forked stemmer via SQLite, add a dependency
on libstemmer and use that instead.
.gitlab-ci.yml | 10 +--
BUILDING.md | 8 +--
bindings/vapi/libstemmer.vapi | 38 ++++++++++
meson.build | 8 +++
src/engine/imap-db/imap-db-search-query.vala | 102 ++++++++++++++++-----------
src/engine/meson.build | 1 +
src/meson.build | 1 +
7 files changed, 118 insertions(+), 50 deletions(-)
---
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 82f6ab3bd..80be0f435 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,9 +26,9 @@ variables:
meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel
glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel
gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool
- libappstream-glib-devel libgee-devel libhandy1-devel libpeas-devel
- libsecret-devel libunwind-devel libxml2-devel libytnef-devel
- sqlite-devel webkitgtk4-devel
+ libappstream-glib-devel libgee-devel libhandy1-devel
+ libpeas-devel libsecret-devel libstemmer-devel libunwind-devel
+ libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz
# Ubuntu packages
@@ -37,8 +37,8 @@ variables:
itstool libappstream-glib-dev libenchant-2-dev libfolks-dev
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev
- libhandy-1-dev libjson-glib-dev libmessaging-menu-dev
- libpeas-dev libsecret-1-dev libsqlite3-dev libunwind-dev
+ libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev
+ libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb
diff --git a/BUILDING.md b/BUILDING.md
index c00b19018..ff2c7e46e 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -91,8 +91,8 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \
gnome-online-accounts-devel gspell-devel gsound-devel \
gtk3-devel iso-codes-devel itstool json-glib-devel \
libappstream-glib-devel libgee-devel libhandy1-devel \
- libpeas-devel libsecret-devel libunwind-devel libxml2-devel \
- libytnef-devel sqlite-devel webkitgtk4-devel
+ libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \
+ libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
```
Installing dependencies on Ubuntu/Debian
@@ -107,8 +107,8 @@ sudo apt-get install meson build-essential valac \
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \
libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \
- libsqlite3-dev libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev \
- libytnef0-dev
+ libsqlite3-dev libstemmer-dev libunwind-dev \
+ libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
```
And for Ubuntu Messaging Menu integration:
diff --git a/bindings/vapi/libstemmer.vapi b/bindings/vapi/libstemmer.vapi
new file mode 100644
index 000000000..1d2e01c5c
--- /dev/null
+++ b/bindings/vapi/libstemmer.vapi
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2010-2013 Giulio Paci <giuliopaci gmail com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+[CCode (cheader_filename = "libstemmer.h", lower_case_cprefix = "sb_", cprefix="sb_")]
+namespace SnowBall {
+ [Compact]
+ [CCode (cname = "struct sb_stemmer", free_function="sb_stemmer_delete")]
+ public class Stemmer {
+ public unowned string stem(string word, int size);
+ public int length();
+ public Stemmer(string language, string encoding = "UTF_8");
+ [CCode (array_length = false, array_null_terminated = true)]
+ public static unowned string[] list();
+ }
+}
\ No newline at end of file
diff --git a/meson.build b/meson.build
index 84cb43f9b..85f256e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -93,6 +93,7 @@ libmath = cc.find_library('m')
libpeas = dependency('libpeas-1.0', version: '>= 1.24.0')
libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0')
libsecret = dependency('libsecret-1', version: '>= 0.11')
+libstemmer_dep = cc.find_library('stemmer')
libsoup = dependency('libsoup-2.4', version: '>= 2.48')
libunwind_dep = dependency(
'libunwind', version: '>= 1.1', required: get_option('libunwind')
@@ -122,6 +123,13 @@ if libunwind_dep.found()
)
endif
+libstemmer = declare_dependency(
+ dependencies: [
+ valac.find_library('libstemmer', dirs: [vapi_dir]),
+ libstemmer_dep,
+ ],
+)
+
# Optional dependencies
appstream_util = find_program('appstream-util', required: false)
desktop_file_validate = find_program('desktop-file-validate', required: false)
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
index 85f556fa8..968ac6eac 100644
--- a/src/engine/imap-db/imap-db-search-query.vala
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -318,6 +318,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
// A list of all search terms, regardless of search op field name
private Gee.ArrayList<Term> all = new Gee.ArrayList<Term>();
+ private SnowBall.Stemmer stemmer;
+
+
public async SearchQuery(Geary.Account owner,
ImapDB.Account local,
string query,
@@ -325,6 +328,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
GLib.Cancellable? cancellable) {
base(owner, query, strategy);
this.account = local;
+ this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer());
switch (strategy) {
case Strategy.EXACT:
@@ -611,20 +615,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
}
/**
- * This method is used to convert an unquoted user-entered search terms into a stemmed search
- * term.
+ * Converts unquoted search terms into a stemmed search term.
*
- * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
- * tastes when coupled with prefix-matching of all unquoted terms (see
- * https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy
- * designed to dampen that aggressiveness without losing the benefits of stemming entirely.
+ * Prior experience with the Snowball stemmer indicates it is too
+ * aggressive for our tastes when coupled with prefix-matching of
+ * all unquoted terms (see
+ * https://bugzilla.gnome.org/show_bug.cgi?id=713179).
*
- * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
- * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
- * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
- * is still installed in Geary. Thus, we are now in the position to search for the original
- * term and its stemmed variant, then do post-search processing to strip results which are
- * too "greedy" due to prefix-matching the stemmed variant.
+ * This method is part of a larger strategy designed to dampen
+ * that aggressiveness without losing the benefits of stemming
+ * entirely: The database's FTS table uses no stemming, but
+ * libstemmer is used to generate stemmed search terms.
+ * Post-search processing is then to strip results which are too
+ * "greedy" due to prefix-matching the stemmed variant.
*
* Some heuristics are in place simply to determine if stemming should occur:
*
@@ -647,36 +650,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
if (term_length < this.min_term_length_for_stemming)
return null;
- string? stemmed = null;
- try {
- yield this.account.db.exec_transaction_async(RO,
- (cx, cancellable) => {
- Db.Statement stmt = cx.prepare("""
- SELECT token
- FROM TokenizerTable
- WHERE input=?
- """);
- stmt.bind_string(0, term);
-
- // get stemmed string; if no result, fall through
- Db.Result result = stmt.exec(cancellable);
- if (!result.finished) {
- stemmed = result.string_at(0);
- } else {
- debug("No stemmed term returned for \"%s\"", term);
- }
- return COMMIT;
- }, cancellable
- );
- } catch (Error err) {
- debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
-
- // fall-through
- }
-
+ string? stemmed = this.stemmer.stem(term, term.length);
if (String.is_empty(stemmed)) {
debug("Empty stemmed term returned for \"%s\"", term);
-
return null;
}
@@ -693,8 +669,52 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
}
debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
-
return stemmed;
}
+ private string find_appropriate_search_stemmer() {
+ // Unfortunately, the stemmer library only accepts the full language
+ // name for the stemming algorithm. This translates between the user's
+ // preferred language ISO 639-1 code and our available stemmers.
+ // FIXME: the available list here is determined by what's included in
+ // src/sqlite3-unicodesn/CMakeLists.txt. We should pass that list in
+ // instead of hardcoding it here.
+ foreach (string l in Intl.get_language_names()) {
+ switch (l) {
+ case "ar": return "arabic";
+ case "eu": return "basque";
+ case "ca": return "catalan";
+ case "da": return "danish";
+ case "nl": return "dutch";
+ case "en": return "english";
+ case "fi": return "finnish";
+ case "fr": return "french";
+ case "de": return "german";
+ case "el": return "greek";
+ case "hi": return "hindi";
+ case "hu": return "hungarian";
+ case "id": return "indonesian";
+ case "ga": return "irish";
+ case "it": return "italian";
+ case "lt": return "lithuanian";
+ case "ne": return "nepali";
+ case "no": return "norwegian";
+ case "pt": return "portuguese";
+ case "ro": return "romanian";
+ case "ru": return "russian";
+ case "sr": return "serbian";
+ case "es": return "spanish";
+ case "sv": return "swedish";
+ case "ta": return "tamil";
+ case "tr": return "turkish";
+ }
+ }
+
+ // Default to English because it seems to be on average the language
+ // most likely to be present in emails, regardless of the user's
+ // language setting. This is not an exact science, and search results
+ // should be ok either way in most cases.
+ return "english";
+ }
+
}
diff --git a/src/engine/meson.build b/src/engine/meson.build
index 1133f7b8b..377e22d2d 100644
--- a/src/engine/meson.build
+++ b/src/engine/meson.build
@@ -323,6 +323,7 @@ engine_dependencies = [
glib,
gmime,
libmath,
+ libstemmer,
libxml,
posix,
sqlite
diff --git a/src/meson.build b/src/meson.build
index 14f08c186..eef8c4762 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -150,6 +150,7 @@ foreach dep : valadoc_dependencies
valadoc_dep_args += '--pkg'
valadoc_dep_args += dep.name()
endforeach
+valadoc_dep_args += [ '--pkg', 'libstemmer' ]
valadoc_dep_args += [ '--pkg', 'posix' ]
valadoc_vapidir_args = []
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]