[geary/wip/composer-folks] Ensure Geary.ContactStoreImpl handles non english searches



commit 4e1aa3251402c4207b5bf3af446aa07642fe28a8
Author: Michael Gratton <mike vee net>
Date:   Sat Jun 22 14:11:10 2019 +1000

    Ensure Geary.ContactStoreImpl handles non english searches
    
    Fix DB impl actually do UTF-8 case-insensitive search matching. Add
    some unit tests.

 src/engine/common/common-contact-store-impl.vala   | 10 +--
 src/engine/db/db.vala                              | 11 +++
 src/engine/imap-db/imap-db-database.vala           | 64 +++++++++++----
 .../common/common-contact-store-impl-test.vala     | 91 ++++++++++++++++++----
 test/engine/imap-db/imap-db-database-test.vala     |  4 +-
 5 files changed, 146 insertions(+), 34 deletions(-)
---
diff --git a/src/engine/common/common-contact-store-impl.vala 
b/src/engine/common/common-contact-store-impl.vala
index 433213bd..912fee2e 100644
--- a/src/engine/common/common-contact-store-impl.vala
+++ b/src/engine/common/common-contact-store-impl.vala
@@ -98,19 +98,19 @@ internal class Geary.ContactStoreImpl : BaseObject, Geary.ContactStore {
                                                       GLib.Cancellable? cancellable)
         throws GLib.Error {
         Gee.Collection<Contact> contacts = new Gee.LinkedList<Contact>();
-        string normalised_query = query.make_valid().normalize().down();
+        string normalised_query = Geary.Db.normalise_case_insensitive_query(query);
         if (!String.is_empty(normalised_query)) {
             normalised_query = normalised_query + "%";
             Db.Statement stmt = cx.prepare("""
                 SELECT * FROM ContactTable
                 WHERE highest_importance >= ? AND (
-                    real_name LIKE ? COLLATE UTF8ICASE OR
-                    normalized_email LIKE ? COLLATE UTF8ICASE
+                    UTF8FOLD(real_name) LIKE ? OR
+                    UTF8FOLD(email) LIKE ?
                 )
                 ORDER BY highest_importance DESC,
                          real_name IS NULL,
-                         real_name COLLATE UTF8ICASE,
-                         email COLLATE UTF8ICASE
+                         real_name COLLATE UTF8COLL,
+                         email COLLATE UTF8COLL
                 LIMIT ?
             """);
             stmt.bind_uint(0, min_importance);
diff --git a/src/engine/db/db.vala b/src/engine/db/db.vala
index ff362496..b062ee9a 100644
--- a/src/engine/db/db.vala
+++ b/src/engine/db/db.vala
@@ -81,6 +81,17 @@ public bool set_shared_cache_mode(bool enabled) {
     return sqlite3_enable_shared_cache(enabled ? 1 : 0) == Sqlite.OK;
 }
 
+/** Standard transformation for case-insensitive string values. */
+public inline string normalise_case_insensitive_query(string text) {
+    // This would be a place to do transliteration to improve query
+    // results, for example normalising `á` to `a`. The built-in GLib
+    // method `string.to_ascii()` does this but is too strong: It will
+    // convert e.g. CJK chars to `?`. The `string.tokenize_and_fold`
+    // function may work better but the calling interface is all
+    // wrong.
+    return text.normalize().casefold();
+}
+
 private void check_cancelled(string? method, Cancellable? cancellable) throws IOError {
     if (cancellable != null && cancellable.is_cancelled())
         throw new IOError.CANCELLED("%s cancelled", !String.is_empty(method) ? method : "Operation");
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index cbf65d65..91e2f01c 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -5,22 +5,45 @@
  * (version 2.1 or later).  See the COPYING file in this distribution.
  */
 
-[CCode (cname = "g_utf8_casefold")]
-extern string utf8_casefold(string data, ssize_t len);
+[CCode (cname = "g_utf8_collate_key")]
+extern string utf8_collate_key(string data, ssize_t len);
 extern int sqlite3_unicodesn_register_tokenizer(Sqlite.Database db);
 
 private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
 
-    /** Name of UTF-8 case-sensitive SQLite collation function name. */
-    public const string UTF8_CASE_INSENSITIVE_COLLATION = "UTF8ICASE";
 
-    private static int case_insensitive_collation(int a_len, void* a_bytes,
-                                                  int b_len, void* b_bytes) {
-        string a_str = utf8_casefold((string) a_bytes, a_len).collate_key();
-        string b_str = utf8_casefold((string) b_bytes, b_len).collate_key();
-        return strcmp(a_str, b_str);
+    /** SQLite UTF-8 case-insensitive, transliterating function name. */
+    public const string UTF8_CASE_INSENSITIVE_FN = "UTF8FOLD";
+
+    /** SQLite UTF-8 collation name. */
+    public const string UTF8_COLLATE = "UTF8COLL";
+
+
+    private static void utf8_transliterate_fold(Sqlite.Context context,
+                                                Sqlite.Value[] values) {
+        string? text = values[0].to_text();
+        if (text != null) {
+            context.result_text(Geary.Db.normalise_case_insensitive_query(text));
+        } else {
+            context.result_value(values[0]);
+        }
+    }
+
+    private static int utf8_collate(int a_len, void* a_bytes,
+                                    int b_len, void* b_bytes) {
+        // Don't need to normalise, collate_key() will do it for us
+        string? a_str = null;
+        if (a_bytes != null) {
+            a_str = utf8_collate_key((string) a_bytes, a_len);
+        }
+        string? b_str = null;
+        if (b_bytes != null) {
+            b_str = utf8_collate_key((string) b_bytes, b_len);
+        }
+        return GLib.strcmp(a_str, b_str);
     }
 
+
     internal GLib.File attachments_path;
 
     private const int OPEN_PUMP_EVENT_LOOP_MSEC = 100;
@@ -598,14 +621,29 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
         cx.set_recursive_triggers(true);
         cx.set_synchronous(Db.SynchronousMode.NORMAL);
         sqlite3_unicodesn_register_tokenizer(cx.db);
+
+        if (cx.db.create_function(
+                UTF8_CASE_INSENSITIVE_FN,
+                1, // n args
+                Sqlite.UTF8,
+                null,
+                Database.utf8_transliterate_fold,
+                null,
+                null
+            ) != Sqlite.OK) {
+            throw new DatabaseError.GENERAL(
+                "Failed to register function %s",
+                UTF8_CASE_INSENSITIVE_FN
+            );
+        }
+
         if (cx.db.create_collation(
-                UTF8_CASE_INSENSITIVE_COLLATION,
+                UTF8_COLLATE,
                 Sqlite.UTF8,
-                Database.case_insensitive_collation
+                Database.utf8_collate
             ) != Sqlite.OK) {
             throw new DatabaseError.GENERAL(
-                "Failed to register collation function %s",
-                UTF8_CASE_INSENSITIVE_COLLATION
+                "Failed to register collation %s", UTF8_COLLATE
             );
         }
     }
diff --git a/test/engine/common/common-contact-store-impl-test.vala 
b/test/engine/common/common-contact-store-impl-test.vala
index 99530c87..b0a323ae 100644
--- a/test/engine/common/common-contact-store-impl-test.vala
+++ b/test/engine/common/common-contact-store-impl-test.vala
@@ -20,6 +20,8 @@ class Geary.ContactStoreImplTest : TestCase {
         add_test("search_no_match", search_no_match);
         add_test("search_email_match", search_email_match);
         add_test("search_name_match", search_name_match);
+        add_test("search_utf8_latin_names", search_utf8_latin_names);
+        add_test("search_utf8_multi_byte_names", search_utf8_multi_byte_names);
         add_test("update_new_contact", update_new_contact);
         add_test("update_existing_contact", update_existing_contact);
     }
@@ -45,20 +47,20 @@ class Geary.ContactStoreImplTest : TestCase {
         this.db.open.end(async_result());
 
         this.db.exec("""
-INSERT INTO ContactTable (
-    id,
-    normalized_email,
-    real_name,
-    email,
-    highest_importance
-) VALUES (
-    1,
-    'test example com',
-    'Test Name',
-    'Test example com',
-    50
-);
-""");
+            INSERT INTO ContactTable (
+                id,
+                normalized_email,
+                real_name,
+                email,
+                highest_importance
+            ) VALUES (
+                1,
+                'test example com',
+                'Test Name',
+                'Test example com',
+                50
+            );
+        """);
 
         this.test_article = new ContactStoreImpl(this.db);
     }
@@ -152,6 +154,67 @@ INSERT INTO ContactTable (
         assert_false(search_hit.flags.always_load_remote_images(), "Existing flags");
     }
 
+    public void search_utf8_latin_names() throws GLib.Error {
+        this.db.exec("""
+            INSERT INTO ContactTable (
+                real_name,
+                email,
+                normalized_email,
+                highest_importance
+            ) VALUES (
+                'Germán',
+                'latin example com',
+                'latin example com',
+                50
+            );
+        """);
+        test_article.search.begin(
+            "germá",
+            0,
+            10,
+            null,
+            (obj, ret) => { async_complete(ret); }
+        );
+        Gee.Collection<Contact> results = test_article.search.end(
+            async_result()
+        );
+        assert_int(1, results.size, "results.size");
+
+        Contact search_hit = Collection.get_first(results);
+        assert_string("Germán", search_hit.real_name, "Existing real_name");
+    }
+
+    public void search_utf8_multi_byte_names() throws GLib.Error {
+        this.db.exec("""
+            INSERT INTO ContactTable (
+                real_name,
+                email,
+                normalized_email,
+                highest_importance
+            ) VALUES (
+                '年収1億円目指せ',
+                'cjk example com',
+                'cjk example com',
+                50
+            );
+        """);
+
+        test_article.search.begin(
+            "年収",
+            0,
+            10,
+            null,
+            (obj, ret) => { async_complete(ret); }
+        );
+        Gee.Collection<Contact> results = test_article.search.end(
+            async_result()
+        );
+        assert_int(1, results.size, "results.size");
+
+        Contact search_hit = Collection.get_first(results);
+        assert_string("年収1億円目指せ", search_hit.real_name, "Existing real_name");
+    }
+
     public void update_new_contact() throws GLib.Error {
         Contact not_persisted = new Contact(
             "New example com",
diff --git a/test/engine/imap-db/imap-db-database-test.vala b/test/engine/imap-db/imap-db-database-test.vala
index f937456e..44a4845f 100644
--- a/test/engine/imap-db/imap-db-database-test.vala
+++ b/test/engine/imap-db/imap-db-database-test.vala
@@ -147,10 +147,10 @@ class Geary.ImapDB.DatabaseTest : TestCase {
             INSERT INTO Test (test_str) VALUES ('BB');
             INSERT INTO Test (test_str) VALUES ('🤯');
         """);
-        string[] expected = { "🤯", "BB", "B", "a" };
+        string[] expected = { "a", "BB", "B", "🤯" };
 
         Db.Result result = db.query(
-            "SELECT test_str FROM Test ORDER BY test_str COLLATE UTF8ICASE DESC"
+            "SELECT test_str FROM Test ORDER BY test_str COLLATE UTF8COLL DESC"
         );
 
         int i = 0;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]