[geary/mjog/search-update: 43/43] Util.Email.SearchExpressionFactory: Use ICU for word breaking




commit 642bf00e88de4e317977754ef377ed7aafe08461
Author: Michael Gratton <mike vee net>
Date:   Tue Jan 19 20:42:26 2021 +1100

    Util.Email.SearchExpressionFactory: Use ICU for word breaking
    
    Implement search query text word segmentation using ICU, so that
    languages that don't use spaces for word delimiters are correctly
    tokenised.

 src/client/util/util-email.vala       | 65 ++++++++++++++++++++++-------------
 test/client/util/util-email-test.vala | 30 ++++++++++------
 2 files changed, 62 insertions(+), 33 deletions(-)
---
diff --git a/src/client/util/util-email.vala b/src/client/util/util-email.vala
index 554dbe72b..c94af33b8 100644
--- a/src/client/util/util-email.vala
+++ b/src/client/util/util-email.vala
@@ -1,6 +1,6 @@
 /*
  * Copyright 2016 Software Freedom Conservancy Inc.
- * Copyright 2019 Michael Gratton <mike vee net>
+ * Copyright 2019-2021 Michael Gratton <mike vee net>
  *
  * This software is licensed under the GNU Lesser General Public License
  * (version 2.1 or later). See the COPYING file in this distribution.
@@ -350,6 +350,10 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
     private class Tokeniser {
 
 
+        [Flags]
+        private enum CharStatus { NONE, IN_WORD, END_WORD; }
+
+
         // These characters are chosen for being commonly used to
         // continue a single word (such as extended last names,
         // i.e. "Lars-Eric") or in terms commonly searched for in an
@@ -365,7 +369,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
         }
 
         public bool is_at_word {
-            get { return (this.attrs[this.current_c].is_word_start == 1); }
+            get { return CharStatus.IN_WORD in this.char_status[this.current_pos]; }
         }
 
         public bool is_at_quote {
@@ -380,30 +384,51 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
         private int next_pos = 0;
 
         private unichar c = 0;
-        private int current_c = -1;
-        private Pango.LogAttr[] attrs;
+        private CharStatus[] char_status;
 
 
-        public Tokeniser(string query, Pango.Language language) {
+        public Tokeniser(string query) {
             this.query = query;
 
             // Break up search string into individual words and/or
-            // operators. Can't simply break on space or non-alphanumeric
-            // chars since some languages don't use spaces, so use Pango
-            // for its support for the Unicode UAX #29 word boundary spec.
-            this.attrs = new Pango.LogAttr[query.char_count() + 1];
-            Pango.get_log_attrs(
-                query, query.length, -1, language, this.attrs
+            // operators. Can't simply break on space or
+            // non-alphanumeric chars since some languages don't use
+            // spaces, so use ICU for its support for the Unicode UAX
+            // #29 word boundary spec and dictionary-based breaking
+            // for languages that do not use spaces for word breaks.
+
+            this.char_status = new CharStatus[query.length + 1];
+
+            var icu_err = Icu.ErrorCode.ZERO_ERROR;
+            var icu_text = Icu.Text.open_utf8(null, this.query.data, ref icu_err);
+            var word_breaker = Icu.BreakIterator.open(
+                WORD, "en", null, -1, ref icu_err
             );
+            word_breaker.set_utext(icu_text, ref icu_err);
+
+            int32 prev_index = 0;
+            var current_index = word_breaker.first();
+            var status = 0;
+            while (current_index != Icu.BreakIterator.DONE) {
+                status = word_breaker.rule_status;
+                if (!(status >= Icu.BreakIterator.WordBreak.NONE &&
+                      status < Icu.BreakIterator.WordBreak.NONE_LIMIT)) {
+                    for (int i = prev_index; i < current_index; i++) {
+                        this.char_status[i] |= IN_WORD;
+                    }
+                    this.char_status[current_index] |= END_WORD;
+                }
+
+                prev_index = current_index;
+                current_index = word_breaker.next();
+            }
 
             consume_char();
         }
 
         public void consume_char() {
             var current_pos = this.next_pos;
-            if (this.query.get_next_char(ref this.next_pos, out this.c)) {
-                this.current_c++;
-            }
+            this.query.get_next_char(ref this.next_pos, out this.c);
             this.current_pos = current_pos;
         }
 
@@ -415,13 +440,11 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
 
         public string consume_word() {
             var start = this.current_pos;
-            // the attr.is_word_end value applies to the first char
-            // after then end of a word, so need to move one past the
-            // end of the current word to determine where it ends
             consume_char();
             while (this.has_next &&
+                   this.c != OPERATOR_SEPARATOR &&
                    (this.c in CONTINUATION_CHARS ||
-                    this.attrs[this.current_c].is_word_end != 1)) {
+                    !(CharStatus.END_WORD in this.char_status[this.current_pos]))) {
                 consume_char();
             }
             return this.query.slice(start, this.current_pos);
@@ -446,10 +469,6 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
 
     public Geary.AccountInformation account { get; private set; }
 
-    public Pango.Language language {
-        get; set; default = Pango.Language.get_default();
-    }
-
     // Maps of localised search operator names and values to their
     // internal forms
     private Gee.Map<string,FactoryContext> text_operators =
@@ -470,7 +489,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
     /** Constructs a search expression from the given query string. */
     public Gee.List<Geary.SearchQuery.Term> parse_query(string query) {
         var operands = new Gee.LinkedList<Geary.SearchQuery.Term>();
-        var tokens = new Tokeniser(query, this.language);
+        var tokens = new Tokeniser(query);
         while (tokens.has_next) {
             if (tokens.is_at_word) {
                 Geary.SearchQuery.Term? op = null;
diff --git a/test/client/util/util-email-test.vala b/test/client/util/util-email-test.vala
index 01605480c..b3e45d7d7 100644
--- a/test/client/util/util-email-test.vala
+++ b/test/client/util/util-email-test.vala
@@ -195,18 +195,28 @@ public class Util.Email.Test : TestCase {
             this.config.get_search_strategy(),
             this.account
         );
-        test_article.language = Pango.Language.from_string("th");
 
-        var multiple = test_article.parse_query("ภาษาไทย");
-        assert_collection(multiple).size(2);
-        assert_true(multiple[0] is Geary.SearchQuery.EmailTextTerm);
-        assert_true(multiple[1] is Geary.SearchQuery.EmailTextTerm);
+        var thai = test_article.parse_query("ภาษาไทย");
+        assert_collection(thai).size(2);
+        assert_true(thai[0] is Geary.SearchQuery.EmailTextTerm);
+        assert_true(thai[1] is Geary.SearchQuery.EmailTextTerm);
         assert_collection(
-            ((Geary.SearchQuery.EmailTextTerm) multiple[0]).terms
+            ((Geary.SearchQuery.EmailTextTerm) thai[0]).terms
         ).size(1).contains("ภาษา");
         assert_collection(
-            ((Geary.SearchQuery.EmailTextTerm) multiple[1]).terms
+            ((Geary.SearchQuery.EmailTextTerm) thai[1]).terms
         ).size(1).contains("ไทย");
+
+        var chinese = test_article.parse_query("男子去");
+        assert_collection(chinese).size(2);
+        assert_true(chinese[0] is Geary.SearchQuery.EmailTextTerm);
+        assert_true(chinese[1] is Geary.SearchQuery.EmailTextTerm);
+        assert_collection(
+            ((Geary.SearchQuery.EmailTextTerm) chinese[0]).terms
+        ).size(1).contains("男子");
+        assert_collection(
+            ((Geary.SearchQuery.EmailTextTerm) chinese[1]).terms
+        ).size(1).contains("去");
     }
 
     public void multiple_search_terms() throws GLib.Error {
@@ -277,10 +287,10 @@ public class Util.Email.Test : TestCase {
 
         var simple_body = test_article.parse_query("body:hello");
         assert_collection(simple_body).size(1);
-        assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm);
+        assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm, "type");
         var text_body = simple_body[0] as Geary.SearchQuery.EmailTextTerm;
-        assert_true(text_body.target == BODY);
-        assert_true(text_body.matching_strategy == CONSERVATIVE);
+        assert_true(text_body.target == BODY, "target");
+        assert_true(text_body.matching_strategy == CONSERVATIVE, "strategy");
         assert_collection(text_body.terms).size(1).contains("hello");
 
         var simple_body_quoted = test_article.parse_query("body:\"hello\"");


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]