[gtk/emoji-data-cldr: 3/5] Do case-folding and tokenization when creating emoji.data

From: Matthias Clasen <matthiasc src gnome org>
To: commits-list gnome org
Cc:
Subject: [gtk/emoji-data-cldr: 3/5] Do case-folding and tokenization when creating emoji.data
Date: Sun, 4 Oct 2020 05:05:57 +0000 (UTC)

commit 1d9d5fcf3956e7ee3ae2b8c3c6a84e8a3d438015
Author: Matthias Clasen <mclasen redhat com>
Date:   Wed Apr 15 00:28:53 2020 -0400

    Do case-folding and tokenization when creating emoji.data
    
    We can avoid doing the extra work of case-folding and
    tokenization whenever we filter in the Emoji chooser.

 gtk/emoji/convert-emoji.c |  26 +++++++++++++++++++++++++-
 gtk/emoji/emoji.data      | Bin 55307 -> 152041 bytes
 gtk/gtkemojichooser.c     |  41 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 61 insertions(+), 6 deletions(-)
---
diff --git a/gtk/emoji/convert-emoji.c b/gtk/emoji/convert-emoji.c
index 89d3480e38..2e89f5a3d6 100644
--- a/gtk/emoji/convert-emoji.c
+++ b/gtk/emoji/convert-emoji.c
@@ -143,6 +143,7 @@ main (int argc, char *argv[])
       gboolean has_variations;
       JsonObject *obj2;
       JsonArray *kw;
+      char **name_tokens;
 
       i++;
 
@@ -179,17 +180,40 @@ main (int argc, char *argv[])
         return 1;
 
       g_variant_builder_init (&b2, G_VARIANT_TYPE ("as"));
+      name_tokens = g_str_tokenize_and_fold (name, "en", NULL);
+      for (j = 0; j < g_strv_length (name_tokens); j++)
+        g_variant_builder_add (&b2, "s", name_tokens[j]);
+
       obj2 = g_hash_table_lookup (names, name_key->str);
       if (obj2)
         {
           shortname = json_object_get_string_member (obj2, "shortname");
           kw = json_object_get_array_member (obj2, "keywords");
           for (k = 0; k < json_array_get_length (kw); k++)
-            g_variant_builder_add (&b2, "s", json_array_get_string_element (kw, k));
+            {
+              char **folded;
+              char **ascii;
+
+              folded = g_str_tokenize_and_fold (json_array_get_string_element (kw, k), "en", &ascii);
+              for (j = 0; j < g_strv_length (folded); j++)
+                {
+                  if (!g_strv_contains ((const char * const *)name_tokens, folded[j]))
+                    g_variant_builder_add (&b2, "s", folded[j]);
+                }
+              for (j = 0; j < g_strv_length (ascii); j++)
+                {
+                  if (!g_strv_contains ((const char * const *)name_tokens, ascii[j]))
+                    g_variant_builder_add (&b2, "s", ascii[j]);
+                }
+              g_strfreev (folded);
+              g_strfreev (ascii);
+            }
         }
       else
         shortname = "";
 
+      g_strfreev (name_tokens);
+
       g_variant_builder_add (&builder, "(aussas)", &b1, name, shortname, &b2);
     }
 
diff --git a/gtk/emoji/emoji.data b/gtk/emoji/emoji.data
index 15d093d933..5b9298db16 100644
Binary files a/gtk/emoji/emoji.data and b/gtk/emoji/emoji.data differ
diff --git a/gtk/gtkemojichooser.c b/gtk/gtkemojichooser.c
index 55a17cc9ca..c58fe51bb2 100644
--- a/gtk/gtkemojichooser.c
+++ b/gtk/gtkemojichooser.c
@@ -710,6 +710,31 @@ adj_value_changed (GtkAdjustment *adj,
     }
 }
 
+static gboolean /* TRUE when every term token is a prefix of at least one hit token */
+match_tokens (const char **term_tokens, /* NULL-terminated array of tokens to look for */
+              const char **hit_tokens) /* NULL-terminated array of tokens to search in */
+{
+  int i, j;
+  gboolean matched;
+
+  matched = TRUE; /* stays TRUE if term_tokens is empty, so an empty term list matches */
+
+  for (i = 0; term_tokens[i]; i++)
+    {
+      for (j = 0; hit_tokens[j]; j++)
+        if (g_str_has_prefix (hit_tokens[j], term_tokens[i]))
+          goto one_matched; /* this term is satisfied; jump over the failure path below */
+
+      matched = FALSE; /* reached only when no hit token starts with term_tokens[i] */
+      break; /* one unmatched term decides the result; remaining terms are not checked */
+
+one_matched:
+      continue; /* move on to the next term token */
+    }
+
+  return matched;
+}
+
 static gboolean
 filter_func (GtkFlowBoxChild *child,
              gpointer         data)
@@ -720,8 +745,9 @@ filter_func (GtkFlowBoxChild *child,
   const char *text;
   const char *name;
   const char **keywords;
+  char **term_tokens;
+  char **name_tokens;
   gboolean res;
-  int i;
 
   res = TRUE;
 
@@ -735,12 +761,17 @@ filter_func (GtkFlowBoxChild *child,
   if (!emoji_data)
     goto out;
 
+  term_tokens = g_str_tokenize_and_fold (text, "en", NULL);
+  
   g_variant_get_child (emoji_data, 1, "&s", &name);
-  res = g_str_match_string (text, name, TRUE);
-
+  name_tokens = g_str_tokenize_and_fold (name, "en", NULL);
   g_variant_get_child (emoji_data, 3, "^a&s", &keywords);
-  for (i = 0; !res && keywords[i]; i++)
-    res = g_str_match_string (text, keywords[i], TRUE);
+
+  res = match_tokens ((const char **)term_tokens, (const char **)name_tokens) ||
+        match_tokens ((const char **)term_tokens, keywords);
+
+  g_strfreev (term_tokens);
+  g_strfreev (name_tokens);
 
 out:
   if (res)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]