[pango/emoji-ragel: 2/2] [emoji] Port to new Ragel-based iterator, based on Chromium again



commit f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da
Author: Behdad Esfahbod <behdad behdad org>
Date:   Mon Oct 15 14:53:31 2018 -0700

    [emoji] Port to new Ragel-based iterator, based on Chromium again
    
    There are apparently a couple of regressions in this.  I'm working with
    Dominik to fix them on the Chrome side and will push the fixes here afterwards.

 pango/emoji_presentation_scanner.c  | 497 ++++++++++++++++++++++++++++++++++++
 pango/emoji_presentation_scanner.rl |  96 +++++++
 pango/pango-emoji-private.h         |   7 +
 pango/pango-emoji.c                 | 256 ++++++++++---------
 4 files changed, 740 insertions(+), 116 deletions(-)
---
diff --git a/pango/emoji_presentation_scanner.c b/pango/emoji_presentation_scanner.c
new file mode 100644
index 00000000..43872abb
--- /dev/null
+++ b/pango/emoji_presentation_scanner.c
@@ -0,0 +1,497 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+
+static const char _emoji_presentation_actions[] = {
+       0, 1, 0, 1, 1, 1, 5, 1,
+       6, 1, 7, 1, 8, 1, 9, 2,
+       2, 3, 2, 2, 4, 0
+};
+
+static const char _emoji_presentation_key_offsets[] = {
+       0, 3, 8, 9, 13, 15, 22, 26,
+       33, 42, 52, 63, 71, 82, 92, 103,
+       115, 116, 121, 0
+};
+
+static const unsigned char _emoji_presentation_trans_keys[] = {
+       9u, 10u, 12u, 3u, 7u, 13u, 0u, 2u,
+       6u, 10u, 12u, 8u, 9u, 14u, 15u, 2u,
+       3u, 6u, 7u, 13u, 0u, 1u, 9u, 10u,
+       11u, 12u, 2u, 3u, 6u, 7u, 13u, 0u,
+       1u, 2u, 3u, 6u, 7u, 10u, 12u, 13u,
+       0u, 1u, 2u, 3u, 6u, 7u, 9u, 10u,
+       12u, 13u, 0u, 1u, 2u, 3u, 4u, 6u,
+       7u, 9u, 10u, 12u, 13u, 0u, 1u, 2u,
+       3u, 6u, 7u, 10u, 13u, 0u, 1u, 2u,
+       3u, 6u, 7u, 9u, 10u, 12u, 13u, 14u,
+       0u, 1u, 2u, 3u, 4u, 6u, 7u, 10u,
+       12u, 13u, 0u, 1u, 2u, 3u, 6u, 7u,
+       9u, 10u, 11u, 12u, 13u, 0u, 1u, 2u,
+       3u, 4u, 6u, 7u, 9u, 10u, 11u, 12u,
+       13u, 0u, 1u, 6u, 10u, 11u, 12u, 8u,
+       9u, 2u, 3u, 6u, 7u, 9u, 10u, 11u,
+       12u, 13u, 14u, 0u, 1u, 0u
+};
+
+static const char _emoji_presentation_single_lengths[] = {
+       3, 3, 1, 2, 2, 5, 4, 5,
+       7, 8, 9, 6, 9, 8, 9, 10,
+       1, 3, 10, 0
+};
+
+static const char _emoji_presentation_range_lengths[] = {
+       0, 1, 0, 1, 0, 1, 0, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       0, 1, 1, 0
+};
+
+static const char _emoji_presentation_index_offsets[] = {
+       0, 4, 9, 11, 15, 18, 25, 30,
+       37, 46, 56, 67, 75, 86, 96, 107,
+       119, 121, 126, 0
+};
+
+static const char _emoji_presentation_trans_cond_spaces[] = {
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, 0
+};
+
+static const short _emoji_presentation_trans_offsets[] = {
+       0, 1, 2, 3, 4, 5, 6, 7,
+       8, 9, 10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23,
+       24, 25, 26, 27, 28, 29, 30, 31,
+       32, 33, 34, 35, 36, 37, 38, 39,
+       40, 41, 42, 43, 44, 45, 46, 47,
+       48, 49, 50, 51, 52, 53, 54, 55,
+       56, 57, 58, 59, 60, 61, 62, 63,
+       64, 65, 66, 67, 68, 69, 70, 71,
+       72, 73, 74, 75, 76, 77, 78, 79,
+       80, 81, 82, 83, 84, 85, 86, 87,
+       88, 89, 90, 91, 92, 93, 94, 95,
+       96, 97, 98, 99, 100, 101, 102, 103,
+       104, 105, 106, 107, 108, 109, 110, 111,
+       112, 113, 114, 115, 116, 117, 118, 119,
+       120, 121, 122, 123, 124, 125, 126, 127,
+       128, 129, 130, 131, 132, 133, 134, 135,
+       136, 137, 138, 139, 140, 141, 142, 143,
+       144, 145, 146, 147, 148, 149, 150, 151,
+       152, 153, 154, 155, 0
+};
+
+static const char _emoji_presentation_trans_lengths[] = {
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 0
+};
+
+static const char _emoji_presentation_cond_keys[] = {
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_cond_targs[] = {
+       7, 1, 11, 5, 13, 8, 8, 8,
+       5, 7, 5, 1, 11, 7, 5, 4,
+       7, 5, 14, 15, 16, 17, 18, 6,
+       5, 7, 1, 5, 11, 5, 9, 10,
+       2, 3, 12, 0, 5, 9, 10, 2,
+       3, 1, 11, 12, 0, 5, 9, 10,
+       2, 3, 7, 1, 11, 12, 0, 5,
+       9, 10, 11, 2, 3, 7, 1, 11,
+       12, 0, 5, 9, 10, 2, 3, 1,
+       12, 0, 5, 9, 10, 2, 3, 7,
+       1, 11, 12, 4, 0, 5, 9, 10,
+       11, 2, 3, 1, 11, 12, 0, 5,
+       9, 10, 2, 3, 7, 1, 5, 11,
+       12, 0, 5, 9, 10, 11, 2, 3,
+       7, 1, 5, 11, 12, 0, 5, 7,
+       5, 1, 5, 11, 7, 5, 9, 10,
+       2, 3, 7, 1, 5, 11, 12, 4,
+       0, 5, 5, 5, 5, 5, 5, 5,
+       5, 5, 5, 5, 5, 5, 5, 5,
+       5, 5, 5, 5, 0
+};
+
+static const char _emoji_presentation_cond_actions[] = {
+       15, 0, 15, 11, 15, 15, 15, 15,
+       13, 15, 11, 0, 15, 15, 11, 0,
+       15, 11, 15, 15, 0, 18, 15, 18,
+       5, 15, 0, 5, 15, 9, 15, 15,
+       0, 0, 15, 0, 7, 15, 15, 0,
+       0, 0, 15, 15, 0, 7, 15, 15,
+       0, 0, 15, 0, 15, 15, 0, 7,
+       15, 15, 15, 0, 0, 15, 0, 15,
+       15, 0, 7, 15, 15, 0, 0, 0,
+       15, 0, 7, 15, 15, 0, 0, 15,
+       0, 15, 15, 0, 0, 7, 15, 15,
+       15, 0, 0, 0, 15, 15, 0, 7,
+       15, 15, 0, 0, 15, 0, 5, 15,
+       15, 0, 7, 15, 15, 15, 0, 0,
+       15, 0, 5, 15, 15, 0, 7, 15,
+       9, 0, 5, 15, 15, 9, 15, 15,
+       0, 0, 15, 0, 5, 15, 15, 0,
+       0, 7, 11, 13, 11, 11, 11, 9,
+       7, 7, 7, 7, 7, 7, 7, 7,
+       7, 9, 9, 7, 0
+};
+
+static const char _emoji_presentation_to_state_actions[] = {
+       0, 0, 0, 0, 0, 1, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0
+};
+
+static const char _emoji_presentation_from_state_actions[] = {
+       0, 0, 0, 0, 0, 3, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_spaces[] = {
+       -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, -1,
+       -1, -1, -1, -1, -1, -1, -1, 0
+};
+
+static const char _emoji_presentation_eof_cond_key_offs[] = {
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_key_lens[] = {
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_keys[] = {
+       0
+};
+
+static const short _emoji_presentation_eof_trans[] = {
+       139, 140, 141, 142, 143, 0, 144, 145,
+       146, 147, 148, 149, 150, 151, 152, 153,
+       154, 155, 156, 0
+};
+
+static const char _emoji_presentation_nfa_targs[] = {
+       0, 0
+};
+
+static const char _emoji_presentation_nfa_offsets[] = {
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0
+};
+
+static const char _emoji_presentation_nfa_push_actions[] = {
+       0, 0
+};
+
+static const char _emoji_presentation_nfa_pop_trans[] = {
+       0, 0
+};
+
+static const int emoji_presentation_start = 5;
+
+static const int emoji_presentation_en_text_and_emoji_run = 5;
+
+
+
+
+
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+unsigned buffer_size,
+unsigned cursor,
+unsigned* last,
+unsigned* end)
+{
+       const unsigned char *p = buffer + cursor;
+       const unsigned char *pe, *eof, *ts, *te;
+       unsigned act;
+       int cs;
+       pe = eof = buffer + buffer_size;
+       
+       
+       {
+               cs = (int)emoji_presentation_start;
+               ts = 0;
+               te = 0;
+               act = 0;
+       }
+       
+       {
+               int _cpc;
+               int _klen;const char * _cekeys;unsigned int _trans = 0;const unsigned char * _keys;const char * _acts;unsigned int _nacts;       {
+                       if ( p == pe )
+                       goto _test_eof;
+                       _resume:  {
+                               _acts = ( _emoji_presentation_actions + (_emoji_presentation_from_state_actions[cs]));
+                               _nacts = (unsigned int)(*( _acts));
+                               _acts += 1;
+                               while ( _nacts > 0 ) {
+                                       switch ( (*( _acts)) ) {
+                                               case 1:  {
+                                                       {
+                                                               #line 1 "NONE"
+                                                               {ts = p;}}
+                                                       break; }
+                                       }
+                                       _nacts -= 1;
+                                       _acts += 1;
+                               }
+                               
+                               _keys = ( _emoji_presentation_trans_keys + (_emoji_presentation_key_offsets[cs]));
+                               _trans = (unsigned int)_emoji_presentation_index_offsets[cs];
+                               
+                               _klen = (int)_emoji_presentation_single_lengths[cs];
+                               if ( _klen > 0 ) {
+                                       const unsigned char *_lower = _keys;
+                                       const unsigned char *_upper = _keys + _klen - 1;
+                                       const unsigned char *_mid;
+                                       while ( 1 ) {
+                                               if ( _upper < _lower )
+                                               break;
+                                               
+                                               _mid = _lower + ((_upper-_lower) >> 1);
+                                               if ( ( (*( p))) < (*( _mid)) )
+                                               _upper = _mid - 1;
+                                               else if ( ( (*( p))) > (*( _mid)) )
+                                               _lower = _mid + 1;
+                                               else {
+                                                       _trans += (unsigned int)(_mid - _keys);
+                                                       goto _match;
+                                               }
+                                       }
+                                       _keys += _klen;
+                                       _trans += (unsigned int)_klen;
+                               }
+                               
+                               _klen = (int)_emoji_presentation_range_lengths[cs];
+                               if ( _klen > 0 ) {
+                                       const unsigned char *_lower = _keys;
+                                       const unsigned char *_upper = _keys + (_klen<<1) - 2;
+                                       const unsigned char *_mid;
+                                       while ( 1 ) {
+                                               if ( _upper < _lower )
+                                               break;
+                                               
+                                               _mid = _lower + (((_upper-_lower) >> 1) & ~1);
+                                               if ( ( (*( p))) < (*( _mid)) )
+                                               _upper = _mid - 2;
+                                               else if ( ( (*( p))) > (*( _mid + 1)) )
+                                               _lower = _mid + 2;
+                                               else {
+                                                       _trans += (unsigned int)((_mid - _keys)>>1);
+                                                       goto _match;
+                                               }
+                                       }
+                                       _trans += (unsigned int)_klen;
+                               }
+                               
+                               _match:  {
+                                       goto _match_cond;
+                               }
+                       }
+                       _match_cond:  {
+                               cs = (int)_emoji_presentation_cond_targs[_trans];
+                               
+                               if ( _emoji_presentation_cond_actions[_trans] == 0 )
+                               goto _again;
+                               
+                               _acts = ( _emoji_presentation_actions + (_emoji_presentation_cond_actions[_trans]));
+                               _nacts = (unsigned int)(*( _acts));
+                               _acts += 1;
+                               while ( _nacts > 0 ) {
+                                       switch ( (*( _acts)) )
+                                       {
+                                               case 2:  {
+                                                       {
+                                                               #line 1 "NONE"
+                                                               {te = p+1;}}
+                                                       break; }
+                                               case 3:  {
+                                                       {
+                                                               #line 71 "emoji_presentation_scanner.rl"
+                                                               {act = 1;}}
+                                                       break; }
+                                               case 4:  {
+                                                       {
+                                                               #line 72 "emoji_presentation_scanner.rl"
+                                                               {act = 2;}}
+                                                       break; }
+                                               case 5:  {
+                                                       {
+                                                               #line 72 "emoji_presentation_scanner.rl"
+                                                               {te = p+1;{
+                                                                               #line 72 "emoji_presentation_scanner.rl"
+                                                                               found_text_presentation_sequence }}}
+                                                       break; }
+                                               case 6:  {
+                                                       {
+                                                               #line 71 "emoji_presentation_scanner.rl"
+                                                               {te = p;p = p - 1;{
+                                                                               #line 71 "emoji_presentation_scanner.rl"
+                                                                               found_emoji_presentation_sequence }}}
+                                                       break; }
+                                               case 7:  {
+                                                       {
+                                                               #line 72 "emoji_presentation_scanner.rl"
+                                                               {te = p;p = p - 1;{
+                                                                               #line 72 "emoji_presentation_scanner.rl"
+                                                                               found_text_presentation_sequence }}}
+                                                       break; }
+                                               case 8:  {
+                                                       {
+                                                               #line 71 "emoji_presentation_scanner.rl"
+                                                               {p = ((te))-1;
+                                                                       {
+                                                                               #line 71 "emoji_presentation_scanner.rl"
+                                                                               found_emoji_presentation_sequence }}}
+                                                       break; }
+                                               case 9:  {
+                                                       {
+                                                               #line 1 "NONE"
+                                                               {switch( act ) {
+                                                                               case 1:  {
+                                                                                       p = ((te))-1;
+                                                                                       {
+                                                                                               #line 71 "emoji_presentation_scanner.rl"
+                                                                                               found_emoji_presentation_sequence } break; }
+                                                                               case 2:  {
+                                                                                       p = ((te))-1;
+                                                                                       {
+                                                                                               #line 72 "emoji_presentation_scanner.rl"
+                                                                                               found_text_presentation_sequence } break; }
+                                                                       }}
+                                                       }
+                                                       break; }
+                                       }
+                                       _nacts -= 1;
+                                       _acts += 1;
+                               }
+                               
+                               
+                       }
+                       _again:  {
+                               _acts = ( _emoji_presentation_actions + (_emoji_presentation_to_state_actions[cs]));
+                               _nacts = (unsigned int)(*( _acts));
+                               _acts += 1;
+                               while ( _nacts > 0 ) {
+                                       switch ( (*( _acts)) ) {
+                                               case 0:  {
+                                                       {
+                                                               #line 1 "NONE"
+                                                               {ts = 0;}}
+                                                       break; }
+                                       }
+                                       _nacts -= 1;
+                                       _acts += 1;
+                               }
+                               
+                               p += 1;
+                               if ( p != pe )
+                               goto _resume;
+                       }
+                       _test_eof:  { {}
+                               if ( p == eof )
+                               {
+                                       if ( _emoji_presentation_eof_cond_spaces[cs] != -1 ) {
+                                               _cekeys = ( _emoji_presentation_eof_cond_keys + (_emoji_presentation_eof_cond_key_offs[cs]));
+                                               _klen = (int)_emoji_presentation_eof_cond_key_lens[cs];
+                                               _cpc = 0;
+                                               {
+                                                       const char *_lower = _cekeys;
+                                                       const char *_upper = _cekeys + _klen - 1;
+                                                       const char *_mid;
+                                                       while ( 1 ) {
+                                                               if ( _upper < _lower )
+                                                               break;
+                                                               
+                                                               _mid = _lower + ((_upper-_lower) >> 1);
+                                                               if ( _cpc < (int)(*( _mid)) )
+                                                               _upper = _mid - 1;
+                                                               else if ( _cpc > (int)(*( _mid)) )
+                                                               _lower = _mid + 1;
+                                                               else {
+                                                                       goto _ok;
+                                                               }
+                                                       }
+                                                       cs = -1;
+                                                       goto _out;
+                                               }
+                                               _ok: {}
+                                       }
+                                       if ( _emoji_presentation_eof_trans[cs] > 0 ) {
+                                               _trans = (unsigned int)_emoji_presentation_eof_trans[cs] - 1;
+                                               goto _match_cond;
+                                       }
+                               }
+                               
+                       }
+                       _out:  { {}
+                       }
+               }
+       }
+       
+       return FALSE;
+}
+
diff --git a/pango/emoji_presentation_scanner.rl b/pango/emoji_presentation_scanner.rl
new file mode 100644
index 00000000..5eea495a
--- /dev/null
+++ b/pango/emoji_presentation_scanner.rl
@@ -0,0 +1,96 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+%%{
+  machine emoji_presentation;
+  alphtype unsigned char;
+  write data noerror nofinal noentry;
+}%%
+
+%%{
+
+EMOJI = 0;
+EMOJI_TEXT_PRESENTATION = 1;
+EMOJI_EMOJI_PRESENTATION = 2;
+EMOJI_MODIFIER_BASE = 3;
+EMOJI_MODIFIER = 4;
+EMOJI_VS_BASE = 5;
+REGIONAL_INDICATOR = 6;
+KEYCAP_BASE = 7;
+COMBINING_ENCLOSING_KEYCAP = 8;
+COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9;
+ZWJ = 10;
+VS15 = 11;
+VS16 = 12;
+TAG_BASE = 13;
+TAG_SEQUENCE = 14;
+TAG_TERM = 15;
+
+any_emoji =  EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION |  KEYCAP_BASE |
+  EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI;
+
+emoji_combining_encloding_circle_backslash_sequence = any_emoji
+  COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+
+# This could be sharper than any_emoji by restricting this only to valid
+# variation sequences:
+# https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt
+# However, implementing
+# https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is
+# sufficient for our purposes here.
+emoji_presentation_sequence = any_emoji VS16;
+
+emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER;
+
+emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR;
+
+# Here we only allow the valid tag sequences
+# https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of
+# all well-formed ones defined in
+# https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
+emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM;
+
+emoji_keycap_sequence = KEYCAP_BASE COMBINING_ENCLOSING_KEYCAP;
+
+emoji_zwj_element =  emoji_presentation_sequence | emoji_modifier_sequence | any_emoji;
+
+emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+;
+
+emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE |
+  emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence |
+  emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence |
+  emoji_combining_encloding_circle_backslash_sequence;
+
+emoji_run = emoji_presentation+;
+
+text_presentation_emoji = any_emoji VS15;
+text_run = text_presentation_emoji | any;
+
+text_and_emoji_run := |*
+emoji_run => { found_emoji_presentation_sequence };
+text_run => { found_text_presentation_sequence };
+*|;
+
+}%%
+
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+                         unsigned buffer_size,
+                         unsigned cursor,
+                         unsigned* last,
+                         unsigned* end)
+{
+  const unsigned char *p = buffer + cursor;
+  const unsigned char *pe, *eof, *ts, *te;
+  unsigned act;
+  int cs;
+  pe = eof = buffer + buffer_size;
+
+  %%{
+    write init;
+    write exec;
+  }%%
+  return FALSE;
+}
+
diff --git a/pango/pango-emoji-private.h b/pango/pango-emoji-private.h
index eb8a52a7..a360b37a 100644
--- a/pango/pango-emoji-private.h
+++ b/pango/pango-emoji-private.h
@@ -33,6 +33,13 @@ struct _PangoEmojiIter
   const gchar *start;
   const gchar *end;
   gboolean is_emoji;
+
+  const gchar *token_start;
+  const gchar *token_end;
+
+  const unsigned char *types;
+  unsigned int n_chars;
+  unsigned int cursor;
 };
 
 PangoEmojiIter *
diff --git a/pango/pango-emoji.c b/pango/pango-emoji.c
index 29472452..46ab5b3f 100644
--- a/pango/pango-emoji.c
+++ b/pango/pango-emoji.c
@@ -18,11 +18,27 @@
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 02111-1307, USA.
  *
- * Implementation of pango_emoji_iter is derived from Chromium:
+ * Implementation of pango_emoji_iter is based on Chromium's Ragel-based
+ * parser:
  *
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/FontFallbackPriority.h
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/text/CharacterEmoji.cpp
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/SymbolsIterator.cpp
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577
+ *
+ * The grammar file emoji_presentation_scanner.rl was just modified to
+ * adapt the function signature and variables to our use case.  The
+ * grammar itself was NOT modified:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/emoji_presentation_scanner.rl
+ *
+ * The emoji_presentation_scanner.c is generated from .rl file by
+ * running ragel on it.
+ *
+ * The categorization is also based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h
+ *
+ * The iterator next() is based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/symbols_iterator.cc
  *
  * // Copyright 2015 The Chromium Authors. All rights reserved.
  * // Use of this source code is governed by a BSD-style license that can be
@@ -105,62 +121,110 @@ _pango_Is_Regional_Indicator (gunichar ch)
 
 const gunichar kCombiningEnclosingCircleBackslashCharacter = 0x20E0;
 const gunichar kCombiningEnclosingKeycapCharacter = 0x20E3;
-const gunichar kEyeCharacter = 0x1F441;
-const gunichar kFemaleSignCharacter = 0x2640;
-const gunichar kLeftSpeechBubbleCharacter = 0x1F5E8;
-const gunichar kMaleSignCharacter = 0x2642;
-const gunichar kRainbowCharacter = 0x1F308;
-const gunichar kStaffOfAesculapiusCharacter = 0x2695;
 const gunichar kVariationSelector15Character = 0xFE0E;
 const gunichar kVariationSelector16Character = 0xFE0F;
-const gunichar kWavingWhiteFlagCharacter = 0x1F3F3;
 const gunichar kZeroWidthJoinerCharacter = 0x200D;
 
-
-typedef enum {
-  PANGO_EMOJI_TYPE_INVALID,
-  PANGO_EMOJI_TYPE_TEXT, /* For regular non-symbols text */
-  PANGO_EMOJI_TYPE_EMOJI_TEXT, /* For emoji in text presentaiton */
-  PANGO_EMOJI_TYPE_EMOJI_EMOJI /* For emoji in emoji presentation */
-} PangoEmojiType;
-
-static PangoEmojiType
-_pango_get_emoji_type (gunichar codepoint)
+enum PangoEmojiScannerCategory {
+  EMOJI = 0,
+  EMOJI_TEXT_PRESENTATION = 1,
+  EMOJI_EMOJI_PRESENTATION = 2,
+  EMOJI_MODIFIER_BASE = 3,
+  EMOJI_MODIFIER = 4,
+  EMOJI_VS_BASE = 5,
+  REGIONAL_INDICATOR = 6,
+  KEYCAP_BASE = 7,
+  COMBINING_ENCLOSING_KEYCAP = 8,
+  COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9,
+  ZWJ = 10,
+  VS15 = 11,
+  VS16 = 12,
+  TAG_BASE = 13,
+  TAG_SEQUENCE = 14,
+  TAG_TERM = 15,
+  kMaxEmojiScannerCategory = 16
+};
+
+static unsigned char
+_pango_EmojiSegmentationCategory (gunichar codepoint)
 {
-  /* Those should only be Emoji presentation as combinations of two. */
-  if (_pango_Is_Emoji_Keycap_Base (codepoint) ||
-      _pango_Is_Regional_Indicator (codepoint))
-    return PANGO_EMOJI_TYPE_TEXT;
-
+  /* Specific ones first. */
   if (codepoint == kCombiningEnclosingKeycapCharacter)
-    return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
-  if (_pango_Is_Emoji_Emoji_Default (codepoint) ||
-      _pango_Is_Emoji_Modifier_Base (codepoint) ||
-      _pango_Is_Emoji_Modifier (codepoint))
-    return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
+    return COMBINING_ENCLOSING_KEYCAP;
+  if (codepoint == kCombiningEnclosingCircleBackslashCharacter)
+    return COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+  if (codepoint == kZeroWidthJoinerCharacter)
+    return ZWJ;
+  if (codepoint == kVariationSelector15Character)
+    return VS15;
+  if (codepoint == kVariationSelector16Character)
+    return VS16;
+  if (codepoint == 0x1F3F4)
+    return TAG_BASE;
+  if ((codepoint >= 0xE0030 && codepoint <= 0xE0039) ||
+      (codepoint >= 0xE0061 && codepoint <= 0xE007A))
+    return TAG_SEQUENCE;
+  if (codepoint == 0xE007F)
+    return TAG_TERM;
+  if (_pango_Is_Emoji_Modifier_Base (codepoint))
+    return EMOJI_MODIFIER_BASE;
+  if (_pango_Is_Emoji_Modifier (codepoint))
+    return EMOJI_MODIFIER;
+  if (_pango_Is_Regional_Indicator (codepoint))
+    return REGIONAL_INDICATOR;
+  if (_pango_Is_Emoji_Keycap_Base (codepoint))
+    return KEYCAP_BASE;
+
+  if (_pango_Is_Emoji_Emoji_Default (codepoint))
+    return EMOJI_EMOJI_PRESENTATION;
   if (_pango_Is_Emoji_Text_Default (codepoint))
-    return PANGO_EMOJI_TYPE_EMOJI_TEXT;
+    return EMOJI_TEXT_PRESENTATION;
+  if (_pango_Is_Emoji (codepoint))
+    return EMOJI;
 
-  return PANGO_EMOJI_TYPE_TEXT;
+  /* Ragel state machine will interpret unknown category as "any". */
+  return kMaxEmojiScannerCategory;
 }
 
+#define found_text_presentation_sequence
+#define found_emoji_presentation_sequence                                \
+  {                                                                      \
+    if (0) g_print ("emoji %ld..%ld\n", ts - buffer, te - buffer);       \
+    *last = ts - buffer;                                                 \
+    *end = te - buffer;                                                  \
+    return TRUE;                                                         \
+  }
+
+#include "emoji_presentation_scanner.c"
+
 
 PangoEmojiIter *
 _pango_emoji_iter_init (PangoEmojiIter *iter,
                        const char     *text,
                        int             length)
 {
-  iter->text_start = text;
+  unsigned int n_chars = g_utf8_strlen (text, length);
+  unsigned char *types = g_malloc (n_chars);
+  unsigned int i;
+  const char *p;
+
+  p = text;
+  for (i = 0; i < n_chars; i++)
+  {
+    types[i] = _pango_EmojiSegmentationCategory (g_utf8_get_char (p));
+    p = g_utf8_next_char (p);
+  }
+
+  iter->text_start = iter->start = iter->end = iter->token_start = iter->token_end = text;
   if (length >= 0)
     iter->text_end = text + length;
   else
     iter->text_end = text + strlen (text);
+  iter->is_emoji = FALSE;
 
-  iter->start = text;
-  iter->end = text;
-  iter->is_emoji = (gboolean) 2; /* HACK */
+  iter->types = types;
+  iter->n_chars = n_chars;
+  iter->cursor = 0;
 
   _pango_emoji_iter_next (iter);
 
@@ -170,102 +234,62 @@ _pango_emoji_iter_init (PangoEmojiIter *iter,
 void
 _pango_emoji_iter_fini (PangoEmojiIter *iter)
 {
+  g_free (iter->types);
 }
 
-#define PANGO_EMOJI_TYPE_IS_EMOJI(typ) ((typ) == PANGO_EMOJI_TYPE_EMOJI_EMOJI)
-
 gboolean
 _pango_emoji_iter_next (PangoEmojiIter *iter)
 {
-  PangoEmojiType current_emoji_type = PANGO_EMOJI_TYPE_INVALID;
-
-  if (iter->end == iter->text_end)
+  if (iter->end >= iter->text_end)
     return FALSE;
 
   iter->start = iter->end;
 
-  for (; iter->end < iter->text_end; iter->end = g_utf8_next_char (iter->end))
+  /* The scan_emoji_presentation scanner function returns false when it reaches
+   * the end of the buffer and has not discovered any emoji runs in between. For
+   * Emoji runs, it returns true, and token_start and token_end are set to the
+   * start and end of the emoji sequence. This means, it may skip over text runs
+   * in between, see below. */
+  if (iter->start >= iter->token_end)
     {
-      gunichar ch = g_utf8_get_char (iter->end);
-
-    /* Except at the beginning, ZWJ just carries over the emoji or neutral
-     * text type, VS15 & VS16 we just carry over as well, since we already
-     * resolved those through lookahead. Also, don't downgrade to text
-     * presentation for emoji that are part of a ZWJ sequence, example
-     * U+1F441 U+200D U+1F5E8, eye (text presentation) + ZWJ + left speech
-     * bubble, see below. */
-    if ((!(ch == kZeroWidthJoinerCharacter && !iter->is_emoji) &&
-        ch != kVariationSelector15Character &&
-        ch != kVariationSelector16Character &&
-        ch != kCombiningEnclosingCircleBackslashCharacter &&
-        !_pango_Is_Regional_Indicator(ch) &&
-        !((ch == kLeftSpeechBubbleCharacter ||
-           ch == kRainbowCharacter ||
-           ch == kMaleSignCharacter ||
-           ch == kFemaleSignCharacter ||
-           ch == kStaffOfAesculapiusCharacter) &&
-          !iter->is_emoji)) ||
-       current_emoji_type == PANGO_EMOJI_TYPE_INVALID) {
-      current_emoji_type = _pango_get_emoji_type (ch);
+      /* We need to scan further. */
+      unsigned int token_start, token_end;
+      if (!scan_emoji_presentation (iter->types, iter->n_chars, iter->cursor,
+                                   &token_start, &token_end))
+       {
+         /* The scanner returned false, which means it has reached the end of the
+          * buffer without discovering any emoji segments in between. */
+         iter->end = iter->text_end;
+         iter->is_emoji = FALSE;
+
+         return TRUE;
+       };
+      /* Ugly... */
+      g_assert (iter->cursor <= token_start && token_start < token_end && token_end <= iter->n_chars);
+      iter->token_start = g_utf8_offset_to_pointer (iter->token_end, token_start - iter->cursor);
+      iter->token_end   = g_utf8_offset_to_pointer (iter->token_end, token_end   - iter->cursor);
+      iter->cursor = token_end;
     }
 
-    if (g_utf8_next_char (iter->end) < iter->text_end) /* Optimize. */
+  if (iter->start < iter->token_start)
     {
-      gunichar peek_char = g_utf8_get_char (g_utf8_next_char (iter->end));
-
-      /* Variation Selectors */
-      if (current_emoji_type ==
-             PANGO_EMOJI_TYPE_EMOJI_EMOJI &&
-         peek_char == kVariationSelector15Character) {
-       current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_TEXT;
-      }
-
-      if ((current_emoji_type ==
-              PANGO_EMOJI_TYPE_EMOJI_TEXT ||
-          _pango_Is_Emoji_Keycap_Base(ch)) &&
-         peek_char == kVariationSelector16Character) {
-       current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-      }
-
-      /* Combining characters Keycap... */
-      if (_pango_Is_Emoji_Keycap_Base(ch) &&
-         peek_char == kCombiningEnclosingKeycapCharacter) {
-       current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-      };
-
-      /* Regional indicators */
-      if (_pango_Is_Regional_Indicator(ch) &&
-         _pango_Is_Regional_Indicator(peek_char)) {
-       current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-      }
-
-      /* Upgrade text presentation emoji to emoji presentation when followed by
-       * ZWJ, Example U+1F441 U+200D U+1F5E8, eye + ZWJ + left speech bubble. */
-      if ((ch == kEyeCharacter ||
-          ch == kWavingWhiteFlagCharacter) &&
-         peek_char == kZeroWidthJoinerCharacter) {
-       current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-      }
+      /* The scanner function has progressed to the next emoji segment, but we
+       * need to return the text segment over which it had skipped. */
+      iter->end = iter->token_start;
+      iter->is_emoji = FALSE;
+      return TRUE;
     }
 
-    if (iter->is_emoji == (gboolean) 2)
-      iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
-    if (iter->is_emoji == PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type))
+  if (iter->start >= iter->token_start && iter->start < iter->token_end)
     {
-      iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
-
-      /* Make sure we make progress.  Weird sequences, like a VC15 followed
-       * by VC16, can trick us into stalling otherwise. */
-      if (iter->start == iter->end)
-        iter->end = g_utf8_next_char (iter->end);
-
+      /* Now our cursor has reached the emoji segment, and we can return it. */
+      iter->end = iter->token_end;
+      iter->is_emoji = TRUE;
       return TRUE;
     }
-  }
-
-  iter->is_emoji = PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
 
-  return TRUE;
+  g_assert_not_reached ();
+  return FALSE;
 }
 
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]