[pango] break: Stop lumping ZWJ with Extend



commit 287d6ecd5f1a8683e2163bcde4bf36815082534f
Author: Matthias Clasen <mclasen redhat com>
Date:   Tue Aug 15 19:11:29 2017 -0400

    break: Stop lumping ZWJ with Extend
    
    ZWJ is treated as its own class in TR29. Lumping it together
    with Extend and then manually checking wc == 0x200d in
    various places was causing us to inadvertently insert
    grapheme breaks in the middle of Emoji ZWJ sequences,
    where they are not supposed to be.
    
    Add test cases to verify this.

 pango/break.c            |   37 ++++++++++++++++++++++++++-----------
 tests/EmojiBreakTest.txt |   11 ++++++++---
 2 files changed, 34 insertions(+), 14 deletions(-)
---
diff --git a/pango/break.c b/pango/break.c
index b210e1d..ccf1394 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -496,6 +496,7 @@ pango_default_break (const gchar   *text,
     GB_Other,
     GB_ControlCRLF,
     GB_Extend,
+    GB_ZWJ,
     GB_Prepend,
     GB_SpacingMark,
     GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
@@ -673,9 +674,14 @@ pango_default_break (const gchar   *text,
        switch ((int) type)
          {
          case G_UNICODE_FORMAT:
-           if (wc == 0x200C || wc == 0x200D)
+           if (G_UNLIKELY (wc == 0x200C))
              {
-               GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
+               GB_type = GB_Extend;
+               break;
+             }
+           if (G_UNLIKELY (wc == 0x200D))
+             {
+               GB_type = GB_ZWJ;
                break;
              }
             if (G_UNLIKELY((wc >= 0x600 && wc <= 0x605) ||
@@ -766,8 +772,10 @@ pango_default_break (const gchar   *text,
                            (wc >= 0x1F930 && wc <= 0x1F939) ||
                            (wc >= 0x1F93D && wc <= 0x1F93E) ||
                            (wc >= 0x1F9D1 && wc <= 0x1F9DD)))
-              GB_type = GB_E_Base;
-
+              {
+                GB_type = GB_E_Base;
+                break;
+              }
             if (G_UNLIKELY(wc == 0x2640 ||
                            wc == 0x2642 ||
                            (wc >= 0x2695 && wc <= 0x2696) ||
@@ -788,11 +796,15 @@ pango_default_break (const gchar   *text,
                            wc == 0x1F5E8 ||
                            wc == 0x1F680 ||
                            wc == 0x1F692))
-              GB_type = GB_Glue_After_Zwj;
-
+              {
+                GB_type = GB_Glue_After_Zwj;
+                break;
+              }
             if (G_UNLIKELY(wc >= 0x1F466 && wc <= 0x1F469))
-              GB_type = GB_E_Base_GAZ;
-
+              {
+                GB_type = GB_E_Base_GAZ;
+                break;
+              }
             if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
               {
                 if (prev_GB_type == GB_RI_Odd)
@@ -801,6 +813,7 @@ pango_default_break (const gchar   *text,
                   GB_type = GB_RI_Odd;
                 else
                   GB_type = GB_RI_Odd;
+                break;
               }
             break;
 
@@ -814,7 +827,7 @@ pango_default_break (const gchar   *text,
 
        /* We apply Rules GB1 and GB2 at the end of the function */
        if (wc == '\n' && prev_wc == '\r')
-         is_grapheme_boundary = FALSE; /* Rule GB3 */
+          is_grapheme_boundary = FALSE; /* Rule GB3 */
        else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
          is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
        else if (GB_type == GB_InHangulSyllable)
@@ -826,6 +839,8 @@ pango_default_break (const gchar   *text,
              GB_type = prev_GB_type;
            is_grapheme_boundary = FALSE; /* Rule GB9 */
           }
+        else if (GB_type == GB_ZWJ)
+         is_grapheme_boundary = FALSE; /* Rule GB9 */
        else if (GB_type == GB_SpacingMark)
          is_grapheme_boundary = FALSE; /* Rule GB9a */
        else if (prev_GB_type == GB_Prepend)
@@ -838,13 +853,13 @@ pango_default_break (const gchar   *text,
             else
               is_grapheme_boundary = TRUE;
           }
-       else if (prev_wc == 0x200D &&
+       else if (prev_GB_type == GB_ZWJ &&
                  (GB_type == GB_Glue_After_Zwj || GB_type == GB_E_Base_GAZ))
          is_grapheme_boundary = FALSE; /* Rule GB11 */
        else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
          is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
        else
-         is_grapheme_boundary = TRUE;  /* Rule GB999 */
+         is_grapheme_boundary = TRUE; /* Rule GB999 */
 
        prev_GB_type = GB_type;
 
diff --git a/tests/EmojiBreakTest.txt b/tests/EmojiBreakTest.txt
index d41b647..3840ea1 100644
--- a/tests/EmojiBreakTest.txt
+++ b/tests/EmojiBreakTest.txt
@@ -1,3 +1,8 @@
-÷ 1F3CC × FE0F × 200D ÷ 2642 × FE0F ÷
-÷ 1F3CC × 200D ÷ 2642 ÷
-# Lines: 2
+÷ 1F3CC × FE0F × 200D × 2642 × FE0F ÷
+÷ 1F3CC × 200D × 2642 ÷
+÷ 1F468 × 200D × 2695 × FE0F ÷ # man health worker
+÷ 1F468 × 1F3FC × 200D × 2695 × FE0F ÷ # man health worker: medium-light skin tone
+÷ 1F468 × 200D × 1F469 × 200D × 1F467 × 200D × 1F466 ÷ # family: man, woman, girl, boy
+÷ 1F1E6 × 1F1FA ÷ # Australia
+÷ 0031 × FE0F × 20E3 ÷ # keycap: 1
+# Lines: 7


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]