[pango/emoji-ragel: 2/2] [emoji] Port to new Ragel-based iterator, based on Chromium again
- From: Behdad Esfahbod <behdad src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [pango/emoji-ragel: 2/2] [emoji] Port to new Ragel-based iterator, based on Chromium again
- Date: Mon, 15 Oct 2018 22:36:58 +0000 (UTC)
commit f8ca9ca5ed3198e7fbab7381e232f9f20ecd93da
Author: Behdad Esfahbod <behdad behdad org>
Date: Mon Oct 15 14:53:31 2018 -0700
[emoji] Port to new Ragel-based iterator, based on Chromium again
There are apparently a couple of regressions in this. I'm working with Dominik
to fix them on the Chrome side and will push here afterwards.
pango/emoji_presentation_scanner.c | 497 ++++++++++++++++++++++++++++++++++++
pango/emoji_presentation_scanner.rl | 96 +++++++
pango/pango-emoji-private.h | 7 +
pango/pango-emoji.c | 256 ++++++++++---------
4 files changed, 740 insertions(+), 116 deletions(-)
---
diff --git a/pango/emoji_presentation_scanner.c b/pango/emoji_presentation_scanner.c
new file mode 100644
index 00000000..43872abb
--- /dev/null
+++ b/pango/emoji_presentation_scanner.c
@@ -0,0 +1,497 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+
+static const char _emoji_presentation_actions[] = {
+ 0, 1, 0, 1, 1, 1, 5, 1,
+ 6, 1, 7, 1, 8, 1, 9, 2,
+ 2, 3, 2, 2, 4, 0
+};
+
+static const char _emoji_presentation_key_offsets[] = {
+ 0, 3, 8, 9, 13, 15, 22, 26,
+ 33, 42, 52, 63, 71, 82, 92, 103,
+ 115, 116, 121, 0
+};
+
+static const unsigned char _emoji_presentation_trans_keys[] = {
+ 9u, 10u, 12u, 3u, 7u, 13u, 0u, 2u,
+ 6u, 10u, 12u, 8u, 9u, 14u, 15u, 2u,
+ 3u, 6u, 7u, 13u, 0u, 1u, 9u, 10u,
+ 11u, 12u, 2u, 3u, 6u, 7u, 13u, 0u,
+ 1u, 2u, 3u, 6u, 7u, 10u, 12u, 13u,
+ 0u, 1u, 2u, 3u, 6u, 7u, 9u, 10u,
+ 12u, 13u, 0u, 1u, 2u, 3u, 4u, 6u,
+ 7u, 9u, 10u, 12u, 13u, 0u, 1u, 2u,
+ 3u, 6u, 7u, 10u, 13u, 0u, 1u, 2u,
+ 3u, 6u, 7u, 9u, 10u, 12u, 13u, 14u,
+ 0u, 1u, 2u, 3u, 4u, 6u, 7u, 10u,
+ 12u, 13u, 0u, 1u, 2u, 3u, 6u, 7u,
+ 9u, 10u, 11u, 12u, 13u, 0u, 1u, 2u,
+ 3u, 4u, 6u, 7u, 9u, 10u, 11u, 12u,
+ 13u, 0u, 1u, 6u, 10u, 11u, 12u, 8u,
+ 9u, 2u, 3u, 6u, 7u, 9u, 10u, 11u,
+ 12u, 13u, 14u, 0u, 1u, 0u
+};
+
+static const char _emoji_presentation_single_lengths[] = {
+ 3, 3, 1, 2, 2, 5, 4, 5,
+ 7, 8, 9, 6, 9, 8, 9, 10,
+ 1, 3, 10, 0
+};
+
+static const char _emoji_presentation_range_lengths[] = {
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 1, 1, 0
+};
+
+static const char _emoji_presentation_index_offsets[] = {
+ 0, 4, 9, 11, 15, 18, 25, 30,
+ 37, 46, 56, 67, 75, 86, 96, 107,
+ 119, 121, 126, 0
+};
+
+static const char _emoji_presentation_trans_cond_spaces[] = {
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, 0
+};
+
+static const short _emoji_presentation_trans_offsets[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 0
+};
+
+static const char _emoji_presentation_trans_lengths[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0
+};
+
+static const char _emoji_presentation_cond_keys[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_cond_targs[] = {
+ 7, 1, 11, 5, 13, 8, 8, 8,
+ 5, 7, 5, 1, 11, 7, 5, 4,
+ 7, 5, 14, 15, 16, 17, 18, 6,
+ 5, 7, 1, 5, 11, 5, 9, 10,
+ 2, 3, 12, 0, 5, 9, 10, 2,
+ 3, 1, 11, 12, 0, 5, 9, 10,
+ 2, 3, 7, 1, 11, 12, 0, 5,
+ 9, 10, 11, 2, 3, 7, 1, 11,
+ 12, 0, 5, 9, 10, 2, 3, 1,
+ 12, 0, 5, 9, 10, 2, 3, 7,
+ 1, 11, 12, 4, 0, 5, 9, 10,
+ 11, 2, 3, 1, 11, 12, 0, 5,
+ 9, 10, 2, 3, 7, 1, 5, 11,
+ 12, 0, 5, 9, 10, 11, 2, 3,
+ 7, 1, 5, 11, 12, 0, 5, 7,
+ 5, 1, 5, 11, 7, 5, 9, 10,
+ 2, 3, 7, 1, 5, 11, 12, 4,
+ 0, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 0
+};
+
+static const char _emoji_presentation_cond_actions[] = {
+ 15, 0, 15, 11, 15, 15, 15, 15,
+ 13, 15, 11, 0, 15, 15, 11, 0,
+ 15, 11, 15, 15, 0, 18, 15, 18,
+ 5, 15, 0, 5, 15, 9, 15, 15,
+ 0, 0, 15, 0, 7, 15, 15, 0,
+ 0, 0, 15, 15, 0, 7, 15, 15,
+ 0, 0, 15, 0, 15, 15, 0, 7,
+ 15, 15, 15, 0, 0, 15, 0, 15,
+ 15, 0, 7, 15, 15, 0, 0, 0,
+ 15, 0, 7, 15, 15, 0, 0, 15,
+ 0, 15, 15, 0, 0, 7, 15, 15,
+ 15, 0, 0, 0, 15, 15, 0, 7,
+ 15, 15, 0, 0, 15, 0, 5, 15,
+ 15, 0, 7, 15, 15, 15, 0, 0,
+ 15, 0, 5, 15, 15, 0, 7, 15,
+ 9, 0, 5, 15, 15, 9, 15, 15,
+ 0, 0, 15, 0, 5, 15, 15, 0,
+ 0, 7, 11, 13, 11, 11, 11, 9,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 9, 9, 7, 0
+};
+
+static const char _emoji_presentation_to_state_actions[] = {
+ 0, 0, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_from_state_actions[] = {
+ 0, 0, 0, 0, 0, 3, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_spaces[] = {
+ -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, 0
+};
+
+static const char _emoji_presentation_eof_cond_key_offs[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_key_lens[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_eof_cond_keys[] = {
+ 0
+};
+
+static const short _emoji_presentation_eof_trans[] = {
+ 139, 140, 141, 142, 143, 0, 144, 145,
+ 146, 147, 148, 149, 150, 151, 152, 153,
+ 154, 155, 156, 0
+};
+
+static const char _emoji_presentation_nfa_targs[] = {
+ 0, 0
+};
+
+static const char _emoji_presentation_nfa_offsets[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+};
+
+static const char _emoji_presentation_nfa_push_actions[] = {
+ 0, 0
+};
+
+static const char _emoji_presentation_nfa_pop_trans[] = {
+ 0, 0
+};
+
+static const int emoji_presentation_start = 5;
+
+static const int emoji_presentation_en_text_and_emoji_run = 5;
+
+
+
+
+
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+unsigned buffer_size,
+unsigned cursor,
+unsigned* last,
+unsigned* end)
+{
+ const unsigned char *p = buffer + cursor;
+ const unsigned char *pe, *eof, *ts, *te;
+ unsigned act;
+ int cs;
+ pe = eof = buffer + buffer_size;
+
+
+ {
+ cs = (int)emoji_presentation_start;
+ ts = 0;
+ te = 0;
+ act = 0;
+ }
+
+ {
+ int _cpc;
+ int _klen;const char * _cekeys;unsigned int _trans = 0;const unsigned char * _keys;const char
* _acts;unsigned int _nacts; {
+ if ( p == pe )
+ goto _test_eof;
+ _resume: {
+ _acts = ( _emoji_presentation_actions +
(_emoji_presentation_from_state_actions[cs]));
+ _nacts = (unsigned int)(*( _acts));
+ _acts += 1;
+ while ( _nacts > 0 ) {
+ switch ( (*( _acts)) ) {
+ case 1: {
+ {
+ #line 1 "NONE"
+ {ts = p;}}
+ break; }
+ }
+ _nacts -= 1;
+ _acts += 1;
+ }
+
+ _keys = ( _emoji_presentation_trans_keys +
(_emoji_presentation_key_offsets[cs]));
+ _trans = (unsigned int)_emoji_presentation_index_offsets[cs];
+
+ _klen = (int)_emoji_presentation_single_lengths[cs];
+ if ( _klen > 0 ) {
+ const unsigned char *_lower = _keys;
+ const unsigned char *_upper = _keys + _klen - 1;
+ const unsigned char *_mid;
+ while ( 1 ) {
+ if ( _upper < _lower )
+ break;
+
+ _mid = _lower + ((_upper-_lower) >> 1);
+ if ( ( (*( p))) < (*( _mid)) )
+ _upper = _mid - 1;
+ else if ( ( (*( p))) > (*( _mid)) )
+ _lower = _mid + 1;
+ else {
+ _trans += (unsigned int)(_mid - _keys);
+ goto _match;
+ }
+ }
+ _keys += _klen;
+ _trans += (unsigned int)_klen;
+ }
+
+ _klen = (int)_emoji_presentation_range_lengths[cs];
+ if ( _klen > 0 ) {
+ const unsigned char *_lower = _keys;
+ const unsigned char *_upper = _keys + (_klen<<1) - 2;
+ const unsigned char *_mid;
+ while ( 1 ) {
+ if ( _upper < _lower )
+ break;
+
+ _mid = _lower + (((_upper-_lower) >> 1) & ~1);
+ if ( ( (*( p))) < (*( _mid)) )
+ _upper = _mid - 2;
+ else if ( ( (*( p))) > (*( _mid + 1)) )
+ _lower = _mid + 2;
+ else {
+ _trans += (unsigned int)((_mid - _keys)>>1);
+ goto _match;
+ }
+ }
+ _trans += (unsigned int)_klen;
+ }
+
+ _match: {
+ goto _match_cond;
+ }
+ }
+ _match_cond: {
+ cs = (int)_emoji_presentation_cond_targs[_trans];
+
+ if ( _emoji_presentation_cond_actions[_trans] == 0 )
+ goto _again;
+
+ _acts = ( _emoji_presentation_actions +
(_emoji_presentation_cond_actions[_trans]));
+ _nacts = (unsigned int)(*( _acts));
+ _acts += 1;
+ while ( _nacts > 0 ) {
+ switch ( (*( _acts)) )
+ {
+ case 2: {
+ {
+ #line 1 "NONE"
+ {te = p+1;}}
+ break; }
+ case 3: {
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ {act = 1;}}
+ break; }
+ case 4: {
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ {act = 2;}}
+ break; }
+ case 5: {
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ {te = p+1;{
+ #line 72
"emoji_presentation_scanner.rl"
+
found_text_presentation_sequence }}}
+ break; }
+ case 6: {
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ {te = p;p = p - 1;{
+ #line 71
"emoji_presentation_scanner.rl"
+
found_emoji_presentation_sequence }}}
+ break; }
+ case 7: {
+ {
+ #line 72 "emoji_presentation_scanner.rl"
+ {te = p;p = p - 1;{
+ #line 72
"emoji_presentation_scanner.rl"
+
found_text_presentation_sequence }}}
+ break; }
+ case 8: {
+ {
+ #line 71 "emoji_presentation_scanner.rl"
+ {p = ((te))-1;
+ {
+ #line 71
"emoji_presentation_scanner.rl"
+
found_emoji_presentation_sequence }}}
+ break; }
+ case 9: {
+ {
+ #line 1 "NONE"
+ {switch( act ) {
+ case 1: {
+ p = ((te))-1;
+ {
+ #line 71
"emoji_presentation_scanner.rl"
+
found_emoji_presentation_sequence } break; }
+ case 2: {
+ p = ((te))-1;
+ {
+ #line 72
"emoji_presentation_scanner.rl"
+
found_text_presentation_sequence } break; }
+ }}
+ }
+ break; }
+ }
+ _nacts -= 1;
+ _acts += 1;
+ }
+
+
+ }
+ _again: {
+ _acts = ( _emoji_presentation_actions +
(_emoji_presentation_to_state_actions[cs]));
+ _nacts = (unsigned int)(*( _acts));
+ _acts += 1;
+ while ( _nacts > 0 ) {
+ switch ( (*( _acts)) ) {
+ case 0: {
+ {
+ #line 1 "NONE"
+ {ts = 0;}}
+ break; }
+ }
+ _nacts -= 1;
+ _acts += 1;
+ }
+
+ p += 1;
+ if ( p != pe )
+ goto _resume;
+ }
+ _test_eof: { {}
+ if ( p == eof )
+ {
+ if ( _emoji_presentation_eof_cond_spaces[cs] != -1 ) {
+ _cekeys = ( _emoji_presentation_eof_cond_keys +
(_emoji_presentation_eof_cond_key_offs[cs]));
+ _klen = (int)_emoji_presentation_eof_cond_key_lens[cs];
+ _cpc = 0;
+ {
+ const char *_lower = _cekeys;
+ const char *_upper = _cekeys + _klen - 1;
+ const char *_mid;
+ while ( 1 ) {
+ if ( _upper < _lower )
+ break;
+
+ _mid = _lower + ((_upper-_lower) >> 1);
+ if ( _cpc < (int)(*( _mid)) )
+ _upper = _mid - 1;
+ else if ( _cpc > (int)(*( _mid)) )
+ _lower = _mid + 1;
+ else {
+ goto _ok;
+ }
+ }
+ cs = -1;
+ goto _out;
+ }
+ _ok: {}
+ }
+ if ( _emoji_presentation_eof_trans[cs] > 0 ) {
+ _trans = (unsigned int)_emoji_presentation_eof_trans[cs] - 1;
+ goto _match_cond;
+ }
+ }
+
+ }
+ _out: { {}
+ }
+ }
+ }
+
+ return FALSE;
+}
+
diff --git a/pango/emoji_presentation_scanner.rl b/pango/emoji_presentation_scanner.rl
new file mode 100644
index 00000000..5eea495a
--- /dev/null
+++ b/pango/emoji_presentation_scanner.rl
@@ -0,0 +1,96 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+%%{
+ machine emoji_presentation;
+ alphtype unsigned char;
+ write data noerror nofinal noentry;
+}%%
+
+%%{
+
+EMOJI = 0;
+EMOJI_TEXT_PRESENTATION = 1;
+EMOJI_EMOJI_PRESENTATION = 2;
+EMOJI_MODIFIER_BASE = 3;
+EMOJI_MODIFIER = 4;
+EMOJI_VS_BASE = 5;
+REGIONAL_INDICATOR = 6;
+KEYCAP_BASE = 7;
+COMBINING_ENCLOSING_KEYCAP = 8;
+COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9;
+ZWJ = 10;
+VS15 = 11;
+VS16 = 12;
+TAG_BASE = 13;
+TAG_SEQUENCE = 14;
+TAG_TERM = 15;
+
+any_emoji = EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION | KEYCAP_BASE |
+ EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI;
+
+emoji_combining_encloding_circle_backslash_sequence = any_emoji
+ COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+
+# This could be sharper than any_emoji by restricting this only to valid
+# variation sequences:
+# https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt
+# However, implementing
+# https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is
+# sufficient for our purposes here.
+emoji_presentation_sequence = any_emoji VS16;
+
+emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER;
+
+emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR;
+
+# Here we only allow the valid tag sequences
+# https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of
+# all well-formed ones defined in
+# https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
+emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM;
+
+emoji_keycap_sequence = KEYCAP_BASE COMBINING_ENCLOSING_KEYCAP;
+
+emoji_zwj_element = emoji_presentation_sequence | emoji_modifier_sequence | any_emoji;
+
+emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+;
+
+emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE |
+ emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence |
+ emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence |
+ emoji_combining_encloding_circle_backslash_sequence;
+
+emoji_run = emoji_presentation+;
+
+text_presentation_emoji = any_emoji VS15;
+text_run = text_presentation_emoji | any;
+
+text_and_emoji_run := |*
+emoji_run => { found_emoji_presentation_sequence };
+text_run => { found_text_presentation_sequence };
+*|;
+
+}%%
+
+static gboolean
+scan_emoji_presentation (const unsigned char* buffer,
+ unsigned buffer_size,
+ unsigned cursor,
+ unsigned* last,
+ unsigned* end)
+{
+ const unsigned char *p = buffer + cursor;
+ const unsigned char *pe, *eof, *ts, *te;
+ unsigned act;
+ int cs;
+ pe = eof = buffer + buffer_size;
+
+ %%{
+ write init;
+ write exec;
+ }%%
+ return FALSE;
+}
+
diff --git a/pango/pango-emoji-private.h b/pango/pango-emoji-private.h
index eb8a52a7..a360b37a 100644
--- a/pango/pango-emoji-private.h
+++ b/pango/pango-emoji-private.h
@@ -33,6 +33,13 @@ struct _PangoEmojiIter
const gchar *start;
const gchar *end;
gboolean is_emoji;
+
+ const gchar *token_start;
+ const gchar *token_end;
+
+ const unsigned char *types;
+ unsigned int n_chars;
+ unsigned int cursor;
};
PangoEmojiIter *
diff --git a/pango/pango-emoji.c b/pango/pango-emoji.c
index 29472452..46ab5b3f 100644
--- a/pango/pango-emoji.c
+++ b/pango/pango-emoji.c
@@ -18,11 +18,27 @@
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*
- * Implementation of pango_emoji_iter is derived from Chromium:
+ * Implementation of pango_emoji_iter is based on Chromium's Ragel-based
+ * parser:
*
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/FontFallbackPriority.h
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/text/CharacterEmoji.cpp
- * https://cs.chromium.org/chromium/src/third_party/WebKit/Source/platform/fonts/SymbolsIterator.cpp
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577
+ *
+ * The grammar file emoji_presentation_scanner.rl was just modified to
+ * adapt the function signature and variables to our use case. The
+ * grammar itself was NOT modified:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/emoji_presentation_scanner.rl
+ *
+ * The emoji_presentation_scanner.c is generated from .rl file by
+ * running ragel on it.
+ *
+ * The categorization is also based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h
+ *
+ * The iterator next() is based on:
+ *
+ * https://chromium-review.googlesource.com/c/chromium/src/+/1264577/3/third_party/blink/renderer/platform/fonts/symbols_iterator.cc
*
* // Copyright 2015 The Chromium Authors. All rights reserved.
* // Use of this source code is governed by a BSD-style license that can be
@@ -105,62 +121,110 @@ _pango_Is_Regional_Indicator (gunichar ch)
const gunichar kCombiningEnclosingCircleBackslashCharacter = 0x20E0;
const gunichar kCombiningEnclosingKeycapCharacter = 0x20E3;
-const gunichar kEyeCharacter = 0x1F441;
-const gunichar kFemaleSignCharacter = 0x2640;
-const gunichar kLeftSpeechBubbleCharacter = 0x1F5E8;
-const gunichar kMaleSignCharacter = 0x2642;
-const gunichar kRainbowCharacter = 0x1F308;
-const gunichar kStaffOfAesculapiusCharacter = 0x2695;
const gunichar kVariationSelector15Character = 0xFE0E;
const gunichar kVariationSelector16Character = 0xFE0F;
-const gunichar kWavingWhiteFlagCharacter = 0x1F3F3;
const gunichar kZeroWidthJoinerCharacter = 0x200D;
-
-typedef enum {
- PANGO_EMOJI_TYPE_INVALID,
- PANGO_EMOJI_TYPE_TEXT, /* For regular non-symbols text */
- PANGO_EMOJI_TYPE_EMOJI_TEXT, /* For emoji in text presentaiton */
- PANGO_EMOJI_TYPE_EMOJI_EMOJI /* For emoji in emoji presentation */
-} PangoEmojiType;
-
-static PangoEmojiType
-_pango_get_emoji_type (gunichar codepoint)
+enum PangoEmojiScannerCategory {
+ EMOJI = 0,
+ EMOJI_TEXT_PRESENTATION = 1,
+ EMOJI_EMOJI_PRESENTATION = 2,
+ EMOJI_MODIFIER_BASE = 3,
+ EMOJI_MODIFIER = 4,
+ EMOJI_VS_BASE = 5,
+ REGIONAL_INDICATOR = 6,
+ KEYCAP_BASE = 7,
+ COMBINING_ENCLOSING_KEYCAP = 8,
+ COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9,
+ ZWJ = 10,
+ VS15 = 11,
+ VS16 = 12,
+ TAG_BASE = 13,
+ TAG_SEQUENCE = 14,
+ TAG_TERM = 15,
+ kMaxEmojiScannerCategory = 16
+};
+
+static unsigned char
+_pango_EmojiSegmentationCategory (gunichar codepoint)
{
- /* Those should only be Emoji presentation as combinations of two. */
- if (_pango_Is_Emoji_Keycap_Base (codepoint) ||
- _pango_Is_Regional_Indicator (codepoint))
- return PANGO_EMOJI_TYPE_TEXT;
-
+ /* Specific ones first. */
if (codepoint == kCombiningEnclosingKeycapCharacter)
- return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
- if (_pango_Is_Emoji_Emoji_Default (codepoint) ||
- _pango_Is_Emoji_Modifier_Base (codepoint) ||
- _pango_Is_Emoji_Modifier (codepoint))
- return PANGO_EMOJI_TYPE_EMOJI_EMOJI;
-
+ return COMBINING_ENCLOSING_KEYCAP;
+ if (codepoint == kCombiningEnclosingCircleBackslashCharacter)
+ return COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
+ if (codepoint == kZeroWidthJoinerCharacter)
+ return ZWJ;
+ if (codepoint == kVariationSelector15Character)
+ return VS15;
+ if (codepoint == kVariationSelector16Character)
+ return VS16;
+ if (codepoint == 0x1F3F4)
+ return TAG_BASE;
+ if ((codepoint >= 0xE0030 && codepoint <= 0xE0039) ||
+ (codepoint >= 0xE0061 && codepoint <= 0xE007A))
+ return TAG_SEQUENCE;
+ if (codepoint == 0xE007F)
+ return TAG_TERM;
+ if (_pango_Is_Emoji_Modifier_Base (codepoint))
+ return EMOJI_MODIFIER_BASE;
+ if (_pango_Is_Emoji_Modifier (codepoint))
+ return EMOJI_MODIFIER;
+ if (_pango_Is_Regional_Indicator (codepoint))
+ return REGIONAL_INDICATOR;
+ if (_pango_Is_Emoji_Keycap_Base (codepoint))
+ return KEYCAP_BASE;
+
+ if (_pango_Is_Emoji_Emoji_Default (codepoint))
+ return EMOJI_EMOJI_PRESENTATION;
if (_pango_Is_Emoji_Text_Default (codepoint))
- return PANGO_EMOJI_TYPE_EMOJI_TEXT;
+ return EMOJI_TEXT_PRESENTATION;
+ if (_pango_Is_Emoji (codepoint))
+ return EMOJI;
- return PANGO_EMOJI_TYPE_TEXT;
+ /* Ragel state machine will interpret unknown category as "any". */
+ return kMaxEmojiScannerCategory;
}
+#define found_text_presentation_sequence
+#define found_emoji_presentation_sequence \
+ { \
+ if (0) g_print ("emoji %ld..%ld\n", ts - buffer, te - buffer); \
+ *last = ts - buffer; \
+ *end = te - buffer; \
+ return TRUE; \
+ }
+
+#include "emoji_presentation_scanner.c"
+
PangoEmojiIter *
_pango_emoji_iter_init (PangoEmojiIter *iter,
const char *text,
int length)
{
- iter->text_start = text;
+ unsigned int n_chars = g_utf8_strlen (text, length);
+ unsigned char *types = g_malloc (n_chars);
+ unsigned int i;
+ const char *p;
+
+ p = text;
+ for (i = 0; i < n_chars; i++)
+ {
+ types[i] = _pango_EmojiSegmentationCategory (g_utf8_get_char (p));
+ p = g_utf8_next_char (p);
+ }
+
+ iter->text_start = iter->start = iter->end = iter->token_start = iter->token_end = text;
if (length >= 0)
iter->text_end = text + length;
else
iter->text_end = text + strlen (text);
+ iter->is_emoji = FALSE;
- iter->start = text;
- iter->end = text;
- iter->is_emoji = (gboolean) 2; /* HACK */
+ iter->types = types;
+ iter->n_chars = n_chars;
+ iter->cursor = 0;
_pango_emoji_iter_next (iter);
@@ -170,102 +234,62 @@ _pango_emoji_iter_init (PangoEmojiIter *iter,
void
_pango_emoji_iter_fini (PangoEmojiIter *iter)
{
+ g_free (iter->types);
}
-#define PANGO_EMOJI_TYPE_IS_EMOJI(typ) ((typ) == PANGO_EMOJI_TYPE_EMOJI_EMOJI)
-
gboolean
_pango_emoji_iter_next (PangoEmojiIter *iter)
{
- PangoEmojiType current_emoji_type = PANGO_EMOJI_TYPE_INVALID;
-
- if (iter->end == iter->text_end)
+ if (iter->end >= iter->text_end)
return FALSE;
iter->start = iter->end;
- for (; iter->end < iter->text_end; iter->end = g_utf8_next_char (iter->end))
+ /* The scan_emoji_presentation scanner function returns FALSE when it reaches
+ * the end of the buffer and has not discovered any emoji runs in between. For
+ * emoji runs, it returns TRUE, and token_start and token_end are set to the
+ * start and end of the emoji sequence. This means it may skip over text runs
+ * in between, see below. */
+ if (iter->start >= iter->token_end)
{
- gunichar ch = g_utf8_get_char (iter->end);
-
- /* Except at the beginning, ZWJ just carries over the emoji or neutral
- * text type, VS15 & VS16 we just carry over as well, since we already
- * resolved those through lookahead. Also, don't downgrade to text
- * presentation for emoji that are part of a ZWJ sequence, example
- * U+1F441 U+200D U+1F5E8, eye (text presentation) + ZWJ + left speech
- * bubble, see below. */
- if ((!(ch == kZeroWidthJoinerCharacter && !iter->is_emoji) &&
- ch != kVariationSelector15Character &&
- ch != kVariationSelector16Character &&
- ch != kCombiningEnclosingCircleBackslashCharacter &&
- !_pango_Is_Regional_Indicator(ch) &&
- !((ch == kLeftSpeechBubbleCharacter ||
- ch == kRainbowCharacter ||
- ch == kMaleSignCharacter ||
- ch == kFemaleSignCharacter ||
- ch == kStaffOfAesculapiusCharacter) &&
- !iter->is_emoji)) ||
- current_emoji_type == PANGO_EMOJI_TYPE_INVALID) {
- current_emoji_type = _pango_get_emoji_type (ch);
+ /* We need to scan further. */
+ unsigned int token_start, token_end;
+ if (!scan_emoji_presentation (iter->types, iter->n_chars, iter->cursor,
+ &token_start, &token_end))
+ {
+ /* The scanner returned false, which means it has reached the end of the
+ * buffer without discovering any emoji segments in between. */
+ iter->end = iter->text_end;
+ iter->is_emoji = FALSE;
+
+ return TRUE;
+ };
+ /* Ugly... */
+ g_assert (iter->cursor <= token_start && token_start < token_end && token_end <= iter->n_chars);
+ iter->token_start = g_utf8_offset_to_pointer (iter->token_end, token_start - iter->cursor);
+ iter->token_end = g_utf8_offset_to_pointer (iter->token_end, token_end - iter->cursor);
+ iter->cursor = token_end;
}
- if (g_utf8_next_char (iter->end) < iter->text_end) /* Optimize. */
+ if (iter->start < iter->token_start)
{
- gunichar peek_char = g_utf8_get_char (g_utf8_next_char (iter->end));
-
- /* Variation Selectors */
- if (current_emoji_type ==
- PANGO_EMOJI_TYPE_EMOJI_EMOJI &&
- peek_char == kVariationSelector15Character) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_TEXT;
- }
-
- if ((current_emoji_type ==
- PANGO_EMOJI_TYPE_EMOJI_TEXT ||
- _pango_Is_Emoji_Keycap_Base(ch)) &&
- peek_char == kVariationSelector16Character) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
-
- /* Combining characters Keycap... */
- if (_pango_Is_Emoji_Keycap_Base(ch) &&
- peek_char == kCombiningEnclosingKeycapCharacter) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- };
-
- /* Regional indicators */
- if (_pango_Is_Regional_Indicator(ch) &&
- _pango_Is_Regional_Indicator(peek_char)) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
-
- /* Upgrade text presentation emoji to emoji presentation when followed by
- * ZWJ, Example U+1F441 U+200D U+1F5E8, eye + ZWJ + left speech bubble. */
- if ((ch == kEyeCharacter ||
- ch == kWavingWhiteFlagCharacter) &&
- peek_char == kZeroWidthJoinerCharacter) {
- current_emoji_type = PANGO_EMOJI_TYPE_EMOJI_EMOJI;
- }
+ /* The scanner function has progressed to the next emoji segment, but we
+ * need to return the text segment over which it had skipped. */
+ iter->end = iter->token_start;
+ iter->is_emoji = FALSE;
+ return TRUE;
}
- if (iter->is_emoji == (gboolean) 2)
- iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
- if (iter->is_emoji == PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type))
+ if (iter->start >= iter->token_start && iter->start < iter->token_end)
{
- iter->is_emoji = !PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
-
- /* Make sure we make progress. Weird sequences, like a VC15 followed
- * by VC16, can trick us into stalling otherwise. */
- if (iter->start == iter->end)
- iter->end = g_utf8_next_char (iter->end);
-
+ /* Now our cursor has reached the emoji segment, and we can return it. */
+ iter->end = iter->token_end;
+ iter->is_emoji = TRUE;
return TRUE;
}
- }
-
- iter->is_emoji = PANGO_EMOJI_TYPE_IS_EMOJI (current_emoji_type);
- return TRUE;
+ g_assert_not_reached ();
+ return FALSE;
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]