[geary/mjog/search-update: 43/43] Util.Email.SearchExpressionFactory: Use ICU for word breaking
- From: Michael Gratton <mjog src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/mjog/search-update: 43/43] Util.Email.SearchExpressionFactory: Use ICU for word breaking
- Date: Tue, 19 Jan 2021 09:50:35 +0000 (UTC)
commit 642bf00e88de4e317977754ef377ed7aafe08461
Author: Michael Gratton <mike vee net>
Date: Tue Jan 19 20:42:26 2021 +1100
Util.Email.SearchExpressionFactory: Use ICU for word breaking
Implement search query text word segmentation using ICU, so that
languages that don't use spaces for word delimiters are correctly
tokenised.
src/client/util/util-email.vala | 65 ++++++++++++++++++++++-------------
test/client/util/util-email-test.vala | 30 ++++++++++------
2 files changed, 62 insertions(+), 33 deletions(-)
---
diff --git a/src/client/util/util-email.vala b/src/client/util/util-email.vala
index 554dbe72b..c94af33b8 100644
--- a/src/client/util/util-email.vala
+++ b/src/client/util/util-email.vala
@@ -1,6 +1,6 @@
/*
* Copyright 2016 Software Freedom Conservancy Inc.
- * Copyright 2019 Michael Gratton <mike vee net>
+ * Copyright 2019-2021 Michael Gratton <mike vee net>
*
* This software is licensed under the GNU Lesser General Public License
* (version 2.1 or later). See the COPYING file in this distribution.
@@ -350,6 +350,10 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
private class Tokeniser {
+ [Flags]
+ private enum CharStatus { NONE, IN_WORD, END_WORD; }
+
+
// These characters are chosen for being commonly used to
// continue a single word (such as extended last names,
// i.e. "Lars-Eric") or in terms commonly searched for in an
@@ -365,7 +369,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
}
public bool is_at_word {
- get { return (this.attrs[this.current_c].is_word_start == 1); }
+ get { return CharStatus.IN_WORD in this.char_status[this.current_pos]; }
}
public bool is_at_quote {
@@ -380,30 +384,51 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
private int next_pos = 0;
private unichar c = 0;
- private int current_c = -1;
- private Pango.LogAttr[] attrs;
+ private CharStatus[] char_status;
- public Tokeniser(string query, Pango.Language language) {
+ public Tokeniser(string query) {
this.query = query;
// Break up search string into individual words and/or
- // operators. Can't simply break on space or non-alphanumeric
- // chars since some languages don't use spaces, so use Pango
- // for its support for the Unicode UAX #29 word boundary spec.
- this.attrs = new Pango.LogAttr[query.char_count() + 1];
- Pango.get_log_attrs(
- query, query.length, -1, language, this.attrs
+ // operators. Can't simply break on space or
+ // non-alphanumeric chars since some languages don't use
+ // spaces, so use ICU for its support for the Unicode UAX
+ // #29 word boundary spec and dictionary-based breaking
+ // for languages that do not use spaces for word breaks.
+
+ this.char_status = new CharStatus[query.length + 1];
+
+ var icu_err = Icu.ErrorCode.ZERO_ERROR;
+ var icu_text = Icu.Text.open_utf8(null, this.query.data, ref icu_err);
+ var word_breaker = Icu.BreakIterator.open(
+ WORD, "en", null, -1, ref icu_err
);
+ word_breaker.set_utext(icu_text, ref icu_err);
+
+ int32 prev_index = 0;
+ var current_index = word_breaker.first();
+ var status = 0;
+ while (current_index != Icu.BreakIterator.DONE) {
+ status = word_breaker.rule_status;
+ if (!(status >= Icu.BreakIterator.WordBreak.NONE &&
+ status < Icu.BreakIterator.WordBreak.NONE_LIMIT)) {
+ for (int i = prev_index; i < current_index; i++) {
+ this.char_status[i] |= IN_WORD;
+ }
+ this.char_status[current_index] |= END_WORD;
+ }
+
+ prev_index = current_index;
+ current_index = word_breaker.next();
+ }
consume_char();
}
public void consume_char() {
var current_pos = this.next_pos;
- if (this.query.get_next_char(ref this.next_pos, out this.c)) {
- this.current_c++;
- }
+ this.query.get_next_char(ref this.next_pos, out this.c);
this.current_pos = current_pos;
}
@@ -415,13 +440,11 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
public string consume_word() {
var start = this.current_pos;
- // the attr.is_word_end value applies to the first char
- // after then end of a word, so need to move one past the
- // end of the current word to determine where it ends
consume_char();
while (this.has_next &&
+ this.c != OPERATOR_SEPARATOR &&
(this.c in CONTINUATION_CHARS ||
- this.attrs[this.current_c].is_word_end != 1)) {
+ !(CharStatus.END_WORD in this.char_status[this.current_pos]))) {
consume_char();
}
return this.query.slice(start, this.current_pos);
@@ -446,10 +469,6 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
public Geary.AccountInformation account { get; private set; }
- public Pango.Language language {
- get; set; default = Pango.Language.get_default();
- }
-
// Maps of localised search operator names and values to their
// internal forms
private Gee.Map<string,FactoryContext> text_operators =
@@ -470,7 +489,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
/** Constructs a search expression from the given query string. */
public Gee.List<Geary.SearchQuery.Term> parse_query(string query) {
var operands = new Gee.LinkedList<Geary.SearchQuery.Term>();
- var tokens = new Tokeniser(query, this.language);
+ var tokens = new Tokeniser(query);
while (tokens.has_next) {
if (tokens.is_at_word) {
Geary.SearchQuery.Term? op = null;
diff --git a/test/client/util/util-email-test.vala b/test/client/util/util-email-test.vala
index 01605480c..b3e45d7d7 100644
--- a/test/client/util/util-email-test.vala
+++ b/test/client/util/util-email-test.vala
@@ -195,18 +195,28 @@ public class Util.Email.Test : TestCase {
this.config.get_search_strategy(),
this.account
);
- test_article.language = Pango.Language.from_string("th");
- var multiple = test_article.parse_query("ภาษาไทย");
- assert_collection(multiple).size(2);
- assert_true(multiple[0] is Geary.SearchQuery.EmailTextTerm);
- assert_true(multiple[1] is Geary.SearchQuery.EmailTextTerm);
+ var thai = test_article.parse_query("ภาษาไทย");
+ assert_collection(thai).size(2);
+ assert_true(thai[0] is Geary.SearchQuery.EmailTextTerm);
+ assert_true(thai[1] is Geary.SearchQuery.EmailTextTerm);
assert_collection(
- ((Geary.SearchQuery.EmailTextTerm) multiple[0]).terms
+ ((Geary.SearchQuery.EmailTextTerm) thai[0]).terms
).size(1).contains("ภาษา");
assert_collection(
- ((Geary.SearchQuery.EmailTextTerm) multiple[1]).terms
+ ((Geary.SearchQuery.EmailTextTerm) thai[1]).terms
).size(1).contains("ไทย");
+
+ var chinese = test_article.parse_query("男子去");
+ assert_collection(chinese).size(2);
+ assert_true(chinese[0] is Geary.SearchQuery.EmailTextTerm);
+ assert_true(chinese[1] is Geary.SearchQuery.EmailTextTerm);
+ assert_collection(
+ ((Geary.SearchQuery.EmailTextTerm) chinese[0]).terms
+ ).size(1).contains("男子");
+ assert_collection(
+ ((Geary.SearchQuery.EmailTextTerm) chinese[1]).terms
+ ).size(1).contains("去");
}
public void multiple_search_terms() throws GLib.Error {
@@ -277,10 +287,10 @@ public class Util.Email.Test : TestCase {
var simple_body = test_article.parse_query("body:hello");
assert_collection(simple_body).size(1);
- assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm);
+ assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm, "type");
var text_body = simple_body[0] as Geary.SearchQuery.EmailTextTerm;
- assert_true(text_body.target == BODY);
- assert_true(text_body.matching_strategy == CONSERVATIVE);
+ assert_true(text_body.target == BODY, "target");
+ assert_true(text_body.matching_strategy == CONSERVATIVE, "strategy");
assert_collection(text_body.terms).size(1).contains("hello");
var simple_body_quoted = test_article.parse_query("body:\"hello\"");
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]