[gtk/input-tweaks: 7/7] composetable: Allow multiple dead keys

From: Matthias Clasen <matthiasc src gnome org>
To: commits-list gnome org
Cc:
Subject: [gtk/input-tweaks: 7/7] composetable: Allow multiple dead keys
Date: Thu, 29 Jul 2021 20:42:57 +0000 (UTC)

commit 5c37290a1697e445cca54e3eb859b2a823cab600
Author: Matthias Clasen <mclasen redhat com>
Date:   Thu Jul 29 16:36:25 2021 -0400

    composetable: Allow multiple dead keys
    
    Remove the limitation on the number of dead keys
    that we match, and allow the result to be multiple
    characters.
    
    Regenerate the builtin sequences, since this allows
    us to eliminate more dead key sequences.
    
    Update tests to match.
    
    Fixes: #10

 gtk/compose/chars                     | Bin 1573 -> 1242 bytes
 gtk/compose/gtkcomposedata.h          |   4 +-
 gtk/compose/sequences                 | Bin 33042 -> 32894 bytes
 gtk/gtkcomposetable.c                 | 128 ++++++++--------------------------
 testsuite/gtk/compose/system.expected | 113 +-----------------------------
 testsuite/gtk/composetable.c          |   5 +-
 6 files changed, 38 insertions(+), 212 deletions(-)
---
diff --git a/gtk/compose/chars b/gtk/compose/chars
index 9f9b15314f..f3897230ea 100644
Binary files a/gtk/compose/chars and b/gtk/compose/chars differ
diff --git a/gtk/compose/gtkcomposedata.h b/gtk/compose/gtkcomposedata.h
index 057347a36b..442696218c 100644
--- a/gtk/compose/gtkcomposedata.h
+++ b/gtk/compose/gtkcomposedata.h
@@ -3,7 +3,7 @@
 
 #define MAX_SEQ_LEN 5
 #define N_INDEX_SIZE 30
-#define DATA_SIZE 16521
-#define N_CHARS 1572
+#define DATA_SIZE 16447
+#define N_CHARS 1241
 
 #endif
diff --git a/gtk/compose/sequences b/gtk/compose/sequences
index 1db7a6cddb..39984d4125 100644
Binary files a/gtk/compose/sequences and b/gtk/compose/sequences differ
diff --git a/gtk/gtkcomposetable.c b/gtk/gtkcomposetable.c
index ef8d01266a..464bf1876a 100644
--- a/gtk/gtkcomposetable.c
+++ b/gtk/gtkcomposetable.c
@@ -1462,77 +1462,6 @@ gtk_compose_table_foreach (const GtkComposeTable      *table,
 #define IS_DEAD_KEY(k) \
     ((k) >= GDK_KEY_dead_grave && (k) <= GDK_KEY_dead_greek)
 
-/* This function receives a sequence of Unicode characters and tries to
- * normalize it (NFC). We check for the case where the resulting string
- * has length 1 (single character).
- * NFC normalisation normally rearranges diacritic marks, unless these
- * belong to the same Canonical Combining Class.
- * If they belong to the same canonical combining class, we produce all
- * permutations of the diacritic marks, then attempt to normalize.
- */
-static gboolean
-check_normalize_nfc (gunichar *combination_buffer,
-                     int       n_compose)
-{
-  gunichar *combination_buffer_temp;
-  char *combination_utf8_temp = NULL;
-  char *nfc_temp = NULL;
-  int n_combinations;
-  gunichar temp_swap;
-  int i;
-
-  combination_buffer_temp = g_alloca (n_compose * sizeof (gunichar));
-
-  n_combinations = 1;
-
-  for (i = 1; i < n_compose; i++)
-     n_combinations *= i;
-
-  /* Xorg reuses dead_tilde for the perispomeni diacritic mark.
-   * We check if base character belongs to Greek Unicode block,
-   * and if so, we replace tilde with perispomeni.
-   */
-  if (combination_buffer[0] >= 0x390 && combination_buffer[0] <= 0x3FF)
-    {
-      for (i = 1; i < n_compose; i++ )
-        if (combination_buffer[i] == 0x303)
-          combination_buffer[i] = 0x342;
-    }
-
-  memcpy (combination_buffer_temp, combination_buffer, n_compose * sizeof (gunichar) );
-
-  for (i = 0; i < n_combinations; i++)
-    {
-      g_unicode_canonical_ordering (combination_buffer_temp, n_compose);
-      combination_utf8_temp = g_ucs4_to_utf8 (combination_buffer_temp, n_compose, NULL, NULL, NULL);
-      nfc_temp = g_utf8_normalize (combination_utf8_temp, -1, G_NORMALIZE_NFC);
-
-      if (g_utf8_strlen (nfc_temp, -1) == 1)
-        {
-          memcpy (combination_buffer, combination_buffer_temp, n_compose * sizeof (gunichar) );
-
-          g_free (combination_utf8_temp);
-          g_free (nfc_temp);
-
-          return TRUE;
-        }
-
-      g_free (combination_utf8_temp);
-      g_free (nfc_temp);
-
-      if (n_compose > 2)
-        {
-          temp_swap = combination_buffer_temp[i % (n_compose - 1) + 1];
-          combination_buffer_temp[i % (n_compose - 1) + 1] = combination_buffer_temp[(i+1) % (n_compose - 1) + 1];
-          combination_buffer_temp[(i+1) % (n_compose - 1) + 1] = temp_swap;
-        }
-      else
-        break;
-    }
-
-  return FALSE;
-}
-
 gboolean
 gtk_check_algorithmically (const guint16 *compose_buffer,
                            int            n_compose,
@@ -1540,40 +1469,49 @@ gtk_check_algorithmically (const guint16 *compose_buffer,
 
 {
   int i;
-  gunichar *combination_buffer;
-  char *combination_utf8, *nfc;
-
-  combination_buffer = alloca (sizeof (gunichar) * (n_compose + 1));
 
   g_string_set_size (output, 0);
 
   for (i = 0; i < n_compose && IS_DEAD_KEY (compose_buffer[i]); i++)
     ;
 
-  /* Allow at most 2 dead keys */
-  if (i > 2)
-    return FALSE;
-
-  /* Can't combine if there's no base character */
+  /* Can't combine if there's no base character: incomplete sequence */
   if (i == n_compose)
     return TRUE;
 
   if (i > 0 && i == n_compose - 1)
     {
-      combination_buffer[0] = gdk_keyval_to_unicode (compose_buffer[i]);
-      combination_buffer[n_compose] = 0;
+      GString *input;
+      char *nfc;
+      gunichar ch;
+
+      ch = gdk_keyval_to_unicode (compose_buffer[i]);
+
+      /* We don't allow combining with non-letters */
+      if (!g_unichar_isalpha (ch))
+        return FALSE;
+
+      input = g_string_sized_new (4 * n_compose);
+
+      g_string_append_unichar (input, ch);
+
       i--;
       while (i >= 0)
         {
           switch (compose_buffer[i])
             {
 #define CASE(keysym, unicode) \
-            case GDK_KEY_dead_##keysym: combination_buffer[i+1] = unicode; break
+            case GDK_KEY_dead_##keysym: g_string_append_unichar (input, unicode); break
 
             CASE (grave, 0x0300);
             CASE (acute, 0x0301);
             CASE (circumflex, 0x0302);
-            CASE (tilde, 0x0303);       /* Also used with perispomeni, 0x342. */
+            case GDK_KEY_dead_tilde:
+              if (g_unichar_get_script (ch) == G_UNICODE_SCRIPT_GREEK)
+                g_string_append_unichar (input, 0x342); /* combining perispomeni */
+              else
+                g_string_append_unichar (input, 0x303); /* combining tilde */
+              break;
             CASE (macron, 0x0304);
             CASE (breve, 0x0306);
             CASE (abovedot, 0x0307);
@@ -1591,7 +1529,7 @@ gtk_check_algorithmically (const guint16 *compose_buffer,
             CASE (horn, 0x031B);        /* Legacy use for psili, 0x313 (or 0x343). */
             CASE (stroke, 0x335);
             CASE (abovecomma, 0x0313);  /* Equivalent to psili */
-            CASE (abovereversedcomma, 0x0314); /* Equivalent to dasia */
+            CASE (abovereversedcomma, 0x0314);   /* Equivalent to dasia */
             CASE (doublegrave, 0x30F);
             CASE (belowring, 0x325);
             CASE (belowmacron, 0x331);
@@ -1619,26 +1557,20 @@ gtk_check_algorithmically (const guint16 *compose_buffer,
             CASE (capital_schwa, 0x1DEA);
 #undef CASE
             default:
-              combination_buffer[i+1] = gdk_keyval_to_unicode (compose_buffer[i]);
+              g_string_append_unichar (input, gdk_keyval_to_unicode (compose_buffer[i]));
             }
           i--;
         }
 
-      /* If the buffer normalizes to a single character, then modify the order
-       * of combination_buffer accordingly, if necessary, and return TRUE.
-       */
-      if (check_normalize_nfc (combination_buffer, n_compose))
-        {
-          combination_utf8 = g_ucs4_to_utf8 (combination_buffer, -1, NULL, NULL, NULL);
-          nfc = g_utf8_normalize (combination_utf8, -1, G_NORMALIZE_NFC);
+      nfc = g_utf8_normalize (input->str, input->len, G_NORMALIZE_NFC);
 
-          g_string_assign (output, nfc);
+      g_string_assign (output, nfc);
 
-          g_free (combination_utf8);
-          g_free (nfc);
+      g_free (nfc);
 
-          return TRUE;
-        }
+      g_string_free (input, TRUE);
+
+      return TRUE;
     }
 
   return FALSE;
diff --git a/testsuite/gtk/compose/system.expected b/testsuite/gtk/compose/system.expected
index aac5ac82ea..4b0ebd4420 100644
--- a/testsuite/gtk/compose/system.expected
+++ b/testsuite/gtk/compose/system.expected
@@ -1,14 +1,12 @@
-# n_sequences: 4909
+# n_sequences: 4802
 # max_seq_len: 5
 # n_index_size: 30
-# data_size: 16521
-# n_chars: 1572
+# data_size: 16231
+# n_chars: 1241
 <U7ae> <U7e9> : "ΐ" # U390
 <U7ae> <U7f5> : "ΰ" # U3b0
 <Ufe50> <U20> : "`" # U60
-<Ufe50> <U4d> : "M̀"
 <Ufe50> <U56> : "Ǜ" # U1db
-<Ufe50> <U6d> : "m̀"
 <Ufe50> <U76> : "ǜ" # U1dc
 <Ufe50> <Ua0> : "̀" # U300
 <Ufe50> <U186> : "Ɔ̀"
@@ -23,16 +21,6 @@
 <Ufe50> <U269> : "ɩ̀"
 <Ufe50> <U28a> : "ʊ̀"
 <Ufe50> <U28b> : "ʋ̀"
-<Ufe50> <U3bd> : "Ŋ̀"
-<Ufe50> <U3bf> : "ŋ̀"
-<Ufe50> <U6c1> : "а̀"
-<Ufe50> <U6cf> : "о̀"
-<Ufe50> <U6d2> : "р̀"
-<Ufe50> <U6d5> : "у̀"
-<Ufe50> <U6e1> : "А̀"
-<Ufe50> <U6ef> : "О̀"
-<Ufe50> <U6f2> : "Р̀"
-<Ufe50> <U6f5> : "У̀"
 <Ufe50> <U1f00> : "ἂ" # U1f02
 <Ufe50> <U1f01> : "ἃ" # U1f03
 <Ufe50> <U1f08> : "Ἂ" # U1f0a
@@ -131,9 +119,7 @@
 <Ufe50> <Uff20> <Uaf> <U65> : "ḕ" # U1e15
 <Ufe50> <Uff20> <Uaf> <U6f> : "ṑ" # U1e51
 <Ufe51> <U20> : "'" # U27
-<Ufe51> <U4a> : "J́"
 <Ufe51> <U56> : "Ǘ" # U1d7
-<Ufe51> <U6a> : "j́"
 <Ufe51> <U76> : "ǘ" # U1d8
 <Ufe51> <Ua0> : "́" # U301
 <Ufe51> <U186> : "Ɔ́"
@@ -148,28 +134,7 @@
 <Ufe51> <U269> : "ɩ́"
 <Ufe51> <U28a> : "ʊ́"
 <Ufe51> <U28b> : "ʋ́"
-<Ufe51> <U3bd> : "Ŋ́"
-<Ufe51> <U3bf> : "ŋ́"
-<Ufe51> <U6c0> : "ю́"
-<Ufe51> <U6c1> : "а́"
-<Ufe51> <U6c5> : "е́"
-<Ufe51> <U6c9> : "и́"
-<Ufe51> <U6cf> : "о́"
-<Ufe51> <U6d1> : "я́"
-<Ufe51> <U6d2> : "р́"
-<Ufe51> <U6d5> : "у́"
-<Ufe51> <U6d9> : "ы́"
-<Ufe51> <U6dc> : "э́"
 <Ufe51> <U6e0> : "Ю́́"
-<Ufe51> <U6e1> : "А́"
-<Ufe51> <U6e5> : "Е́"
-<Ufe51> <U6e9> : "И́"
-<Ufe51> <U6ef> : "О́"
-<Ufe51> <U6f1> : "Я́"
-<Ufe51> <U6f2> : "Р́"
-<Ufe51> <U6f5> : "У́"
-<Ufe51> <U6f9> : "Ы́"
-<Ufe51> <U6fc> : "Э́"
 <Ufe51> <U1f00> : "ἄ" # U1f04
 <Ufe51> <U1f01> : "ἅ" # U1f05
 <Ufe51> <U1f08> : "Ἄ" # U1f0c
@@ -327,18 +292,6 @@
 <Ufe52> <U269> : "ɩ̂"
 <Ufe52> <U28a> : "ʊ̂"
 <Ufe52> <U28b> : "ʋ̂"
-<Ufe52> <U6c1> : "а̂"
-<Ufe52> <U6c5> : "е̂"
-<Ufe52> <U6c9> : "и̂"
-<Ufe52> <U6cf> : "о̂"
-<Ufe52> <U6d2> : "р̂"
-<Ufe52> <U6d5> : "у̂"
-<Ufe52> <U6e1> : "А̂"
-<Ufe52> <U6e5> : "Е̂"
-<Ufe52> <U6e9> : "И̂"
-<Ufe52> <U6ef> : "О̂"
-<Ufe52> <U6f2> : "Р̂"
-<Ufe52> <U6f5> : "У̂"
 <Ufe52> <U1ea0> : "Ậ" # U1eac
 <Ufe52> <U1ea1> : "ậ" # U1ead
 <Ufe52> <U1eb8> : "Ệ" # U1ec6
@@ -528,21 +481,11 @@
 <Ufe54> <U269> : "ɩ̄"
 <Ufe54> <U28a> : "ʊ̄"
 <Ufe54> <U28b> : "ʋ̄"
-<Ufe54> <U6c1> : "а̄"
-<Ufe54> <U6c5> : "е̄"
-<Ufe54> <U6cf> : "о̄"
-<Ufe54> <U6d2> : "р̄"
-<Ufe54> <U6e1> : "А̄"
-<Ufe54> <U6e5> : "Е̄"
-<Ufe54> <U6ef> : "О̄"
-<Ufe54> <U6f2> : "Р̄"
 <Ufe54> <U1e36> : "Ḹ" # U1e38
 <Ufe54> <U1e37> : "ḹ" # U1e39
 <Ufe54> <U1e5a> : "Ṝ" # U1e5c
 <Ufe54> <U1e5b> : "ṝ" # U1e5d
 <Ufe54> <Ufe54> : "¯" # Uaf
-<Ufe54> <Ufe57> <U55> : "Ǖ" # U1d5
-<Ufe54> <Ufe57> <U75> : "ǖ" # U1d6
 <Ufe54> <Ufe8c> <U41> : "Ᾱ" # U1fb9
 <Ufe54> <Ufe8c> <U49> : "Ῑ" # U1fd9
 <Ufe54> <Ufe8c> <U55> : "Ῡ" # U1fe9
@@ -632,8 +575,6 @@
 <Ufe57> <U4e9> : "ӫ" # U4eb
 <Ufe57> <Ufe57> : "¨" # Ua8
 <Ufe57> <Ufe51> <U20> : "΅" # U385
-<Ufe57> <Ufe54> <U55> : "Ṻ" # U1e7a
-<Ufe57> <Ufe54> <U75> : "ṻ" # U1e7b
 <Ufe57> <Ufe6c> <U3d> : "⩷" # U2a77
 <Ufe57> <Uff20> <U5f> <U55> : "Ṻ" # U1e7a
 <Ufe57> <Uff20> <U5f> <U75> : "ṻ" # U1e7b
@@ -787,42 +728,6 @@
 <Ufe5d> <Ufe53> <U1f61> : "ᾧ" # U1fa7
 <Ufe5d> <Ufe53> <U1f68> : "ᾮ" # U1fae
 <Ufe5d> <Ufe53> <U1f69> : "ᾯ" # U1faf
-<Ufe5d> <Ufe50> <Ufe64> <U7c1> : "ᾊ" # U1f8a
-<Ufe5d> <Ufe50> <Ufe64> <U7c7> : "ᾚ" # U1f9a
-<Ufe5d> <Ufe50> <Ufe64> <U7d9> : "ᾪ" # U1faa
-<Ufe5d> <Ufe50> <Ufe64> <U7e1> : "ᾂ" # U1f82
-<Ufe5d> <Ufe50> <Ufe64> <U7e7> : "ᾒ" # U1f92
-<Ufe5d> <Ufe50> <Ufe64> <U7f9> : "ᾢ" # U1fa2
-<Ufe5d> <Ufe50> <Ufe65> <U7c1> : "ᾋ" # U1f8b
-<Ufe5d> <Ufe50> <Ufe65> <U7c7> : "ᾛ" # U1f9b
-<Ufe5d> <Ufe50> <Ufe65> <U7d9> : "ᾫ" # U1fab
-<Ufe5d> <Ufe50> <Ufe65> <U7e1> : "ᾃ" # U1f83
-<Ufe5d> <Ufe50> <Ufe65> <U7e7> : "ᾓ" # U1f93
-<Ufe5d> <Ufe50> <Ufe65> <U7f9> : "ᾣ" # U1fa3
-<Ufe5d> <Ufe51> <Ufe64> <U7c1> : "ᾌ" # U1f8c
-<Ufe5d> <Ufe51> <Ufe64> <U7c7> : "ᾜ" # U1f9c
-<Ufe5d> <Ufe51> <Ufe64> <U7d9> : "ᾬ" # U1fac
-<Ufe5d> <Ufe51> <Ufe64> <U7e1> : "ᾄ" # U1f84
-<Ufe5d> <Ufe51> <Ufe64> <U7e7> : "ᾔ" # U1f94
-<Ufe5d> <Ufe51> <Ufe64> <U7f9> : "ᾤ" # U1fa4
-<Ufe5d> <Ufe51> <Ufe65> <U7c1> : "ᾍ" # U1f8d
-<Ufe5d> <Ufe51> <Ufe65> <U7c7> : "ᾝ" # U1f9d
-<Ufe5d> <Ufe51> <Ufe65> <U7d9> : "ᾭ" # U1fad
-<Ufe5d> <Ufe51> <Ufe65> <U7e1> : "ᾅ" # U1f85
-<Ufe5d> <Ufe51> <Ufe65> <U7e7> : "ᾕ" # U1f95
-<Ufe5d> <Ufe51> <Ufe65> <U7f9> : "ᾥ" # U1fa5
-<Ufe5d> <Ufe53> <Ufe64> <U7c1> : "ᾎ" # U1f8e
-<Ufe5d> <Ufe53> <Ufe64> <U7c7> : "ᾞ" # U1f9e
-<Ufe5d> <Ufe53> <Ufe64> <U7d9> : "ᾮ" # U1fae
-<Ufe5d> <Ufe53> <Ufe64> <U7e1> : "ᾆ" # U1f86
-<Ufe5d> <Ufe53> <Ufe64> <U7e7> : "ᾖ" # U1f96
-<Ufe5d> <Ufe53> <Ufe64> <U7f9> : "ᾦ" # U1fa6
-<Ufe5d> <Ufe53> <Ufe65> <U7c1> : "ᾏ" # U1f8f
-<Ufe5d> <Ufe53> <Ufe65> <U7c7> : "ᾟ" # U1f9f
-<Ufe5d> <Ufe53> <Ufe65> <U7d9> : "ᾯ" # U1faf
-<Ufe5d> <Ufe53> <Ufe65> <U7e1> : "ᾇ" # U1f87
-<Ufe5d> <Ufe53> <Ufe65> <U7e7> : "ᾗ" # U1f97
-<Ufe5d> <Ufe53> <Ufe65> <U7f9> : "ᾧ" # U1fa7
 <Ufe5d> <Uff20> <U27> <U7e1> : "ᾴ" # U1fb4
 <Ufe5d> <Uff20> <U27> <U7e7> : "ῄ" # U1fc4
 <Ufe5d> <Uff20> <U27> <U7f9> : "ῴ" # U1ff4
@@ -1175,18 +1080,6 @@
 <Ufe63> <Ufe8c> <U72> : "ϼ" # U3fc
 <Ufe66> <U474> : "Ѷ" # U476
 <Ufe66> <U475> : "ѷ" # U477
-<Ufe66> <U6c1> : "а̏"
-<Ufe66> <U6c5> : "е̏"
-<Ufe66> <U6c9> : "и̏"
-<Ufe66> <U6cf> : "о̏"
-<Ufe66> <U6d2> : "р̏"
-<Ufe66> <U6d5> : "у̏"
-<Ufe66> <U6e1> : "А̏"
-<Ufe66> <U6e5> : "Е̏"
-<Ufe66> <U6e9> : "И̏"
-<Ufe66> <U6ef> : "О̏"
-<Ufe66> <U6f2> : "Р̏"
-<Ufe66> <U6f5> : "У̏"
 <Ufe67> <U7c> : "⫰" # U2af0
 <Ufe6a> <U2b> : "⨦" # U2a26
 <Ufe6c> <Ufe57> <U3d> : "⩷" # U2a77
diff --git a/testsuite/gtk/composetable.c b/testsuite/gtk/composetable.c
index 5adb9caca4..dec41af467 100644
--- a/testsuite/gtk/composetable.c
+++ b/testsuite/gtk/composetable.c
@@ -341,7 +341,7 @@ match_algorithmic (void)
 
   ret = gtk_check_algorithmically (buffer, 3, output);
   g_assert_true (ret);
-  g_assert_cmpstr (output->str, ==, "ἇ");
+  g_assert_cmpstr (output->str, ==, "ᾶ\xcc\x94");
 
   buffer[0] = GDK_KEY_dead_perispomeni;
   buffer[1] = GDK_KEY_dead_dasia;
@@ -379,7 +379,8 @@ match_algorithmic (void)
   buffer[2] = GDK_KEY_dead_grave;
 
   ret = gtk_check_algorithmically (buffer, 3, output);
-  g_assert_false (ret);
+  g_assert_true (ret);
+  g_assert_cmpstr (output->str, ==, "");
 
   buffer[0] = GDK_KEY_dead_diaeresis;
   buffer[1] = GDK_KEY_a;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]