[gtk/wip/otte/json: 7/21] jsonparser: Split parsing strings from unescaping them
- From: Benjamin Otte <otte src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gtk/wip/otte/json: 7/21] jsonparser: Split parsing strings from unescaping them
- Date: Sun, 5 Dec 2021 21:10:00 +0000 (UTC)
commit d9e9e50f294274b4c7945f1a8d88347f3fc1cb1c
Author: Benjamin Otte <otte redhat com>
Date: Mon Nov 29 02:07:23 2021 +0100
jsonparser: Split parsing strings from unescaping them
gtk/json/gtkjsonparser.c | 252 +++++++++++++++++++++++++++++------------------
1 file changed, 155 insertions(+), 97 deletions(-)
---
diff --git a/gtk/json/gtkjsonparser.c b/gtk/json/gtkjsonparser.c
index ce8c112dcf..b3307713d2 100644
--- a/gtk/json/gtkjsonparser.c
+++ b/gtk/json/gtkjsonparser.c
@@ -61,6 +61,7 @@ struct _GtkJsonParser
typedef enum {
WHITESPACE = (1 << 0),
STRING_ELEMENT = (1 << 1),
+ STRING_MARKER = (1 << 2),
} JsonCharacterType;
static const guchar json_character_table[256] = {
@@ -70,7 +71,7 @@ static const guchar json_character_table[256] = {
[' '] = WHITESPACE | STRING_ELEMENT,
[' '] = STRING_ELEMENT,
['!'] = STRING_ELEMENT,
- ['"'] = 0,
+ ['"'] = STRING_MARKER,
['#'] = STRING_ELEMENT,
['$'] = STRING_ELEMENT,
['%'] = STRING_ELEMENT,
@@ -128,7 +129,7 @@ static const guchar json_character_table[256] = {
['Y'] = STRING_ELEMENT,
['Z'] = STRING_ELEMENT,
['['] = STRING_ELEMENT,
- ['\\'] = 0,
+ ['\\'] = STRING_MARKER,
[']'] = STRING_ELEMENT,
['^'] = STRING_ELEMENT,
['_'] = STRING_ELEMENT,
@@ -175,7 +176,21 @@ json_skip_characters (const guchar *start,
for (s = start; s < end; s++)
{
- if ((json_character_table[*s] & type) != type)
+ if (!(json_character_table[*s] & type))
+ break;
+ }
+ return s;
+}
+
+static const guchar *
+json_find_character (const guchar *start,
+ JsonCharacterType type)
+{
+ const guchar *s;
+
+ for (s = start; ; s++)
+ {
+ if ((json_character_table[*s] & type))
break;
}
return s;
@@ -333,29 +348,124 @@ decode_utf16_surrogate_pair (gunichar first,
| (second & 0x3ff);
}
+static gsize
+gtk_json_unescape_char (const guchar *json_escape,
+ char out_data[6],
+ gsize *out_len)
+{
+ switch (json_escape[1])
+ {
+ case '"':
+ case '\\':
+ case '/':
+ out_data[0] = json_escape[1];
+ *out_len = 1;
+ return 2;
+ case 'b':
+ out_data[0] = '\b';
+ *out_len = 1;
+ return 2;
+ case 'f':
+ out_data[0] = '\f';
+ *out_len = 1;
+ return 2;
+ case 'n':
+ out_data[0] = '\n';
+ *out_len = 1;
+ return 2;
+ case 'r':
+ out_data[0] = '\r';
+ *out_len = 1;
+ return 2;
+ case 't':
+ out_data[0] = '\t';
+ *out_len = 1;
+ return 2;
+ case 'u':
+ {
+ gunichar unichar = (g_ascii_xdigit_value (json_escape[2]) << 12) |
+ (g_ascii_xdigit_value (json_escape[3]) << 8) |
+ (g_ascii_xdigit_value (json_escape[4]) << 4) |
+ (g_ascii_xdigit_value (json_escape[5]));
+ gsize result = 6;
+
+ /* resolve UTF-16 surrogates for Unicode characters not in the BMP,
+ * as per ECMA 404, § 9, "String"
+ */
+ if (g_unichar_type (unichar) == G_UNICODE_SURROGATE)
+ {
+ unichar = decode_utf16_surrogate_pair (unichar,
+ (g_ascii_xdigit_value (json_escape[8]) << 12) |
+ (g_ascii_xdigit_value (json_escape[9]) << 8) |
+ (g_ascii_xdigit_value (json_escape[10]) << 4) |
+ (g_ascii_xdigit_value (json_escape[11])));
+ result += 6;
+ }
+ *out_len = g_unichar_to_utf8 (unichar, out_data);
+ return result;
+ }
+ default:
+ g_assert_not_reached ();
+ return 0;
+ }
+}
+
+/* The escaped string MUST be valid json, so it must begin
+ * with " and end with " and must not contain any invalid
+ * escape codes.
+ * This function is meant to be fast
+ */
+static char *
+gtk_json_unescape_string (const guchar *escaped)
+{
+ char buf[6];
+ gsize buf_size;
+ GString *string;
+ const guchar *last, *s;
+
+ string = NULL;
+
+ g_assert (*escaped == '"');
+ last = escaped + 1;
+ for (s = json_find_character (last, STRING_MARKER);
+ *s != '"';
+ s = json_find_character (last, STRING_MARKER))
+ {
+ g_assert (*s == '\\');
+ if (string == NULL)
+ string = g_string_new (NULL);
+ g_string_append_len (string, (const char *) last, s - last);
+ last = s + gtk_json_unescape_char (s, buf, &buf_size);
+ g_string_append_len (string, buf, buf_size);
+ }
+
+ if (string)
+ {
+ g_string_append_len (string, (const char *) last, s - last);
+ return g_string_free (string, FALSE);
+ }
+ else
+ {
+ return g_strndup ((const char *) last, s - last);
+ }
+}
+
static gboolean
gtk_json_reader_parse_string (GtkJsonReader *reader,
- char **out_string,
GError **error)
{
- GString *string = NULL;
- const guchar *last;
-
if (!gtk_json_reader_try_char (reader, '"'))
{
gtk_json_set_syntax_error (error, "Not a string");
return FALSE;
}
- last = reader->data;
reader->data = json_skip_characters (reader->data, reader->end, STRING_ELEMENT);
while (gtk_json_reader_remaining (reader))
{
if (*reader->data < 0x20)
{
- if (string)
- g_string_free (string, TRUE);
gtk_json_set_syntax_error (error, "Disallowed control character in string literal");
return FALSE;
}
@@ -364,8 +474,6 @@ gtk_json_reader_parse_string (GtkJsonReader *reader,
gunichar c = g_utf8_get_char_validated ((const char *) reader->data, reader->end - reader->data);
if (c == (gunichar) -2 || c == (gunichar) -1)
{
- if (string)
- g_string_free (string, TRUE);
gtk_json_set_syntax_error (error, "Invalid UTF-8");
return FALSE;
}
@@ -373,16 +481,6 @@ gtk_json_reader_parse_string (GtkJsonReader *reader,
}
else if (*reader->data == '"')
{
- if (out_string)
- {
- if (string)
- {
- g_string_append_len (string, (const char *) last, reader->data - last);
- *out_string = g_string_free (string, FALSE);
- }
- else
- *out_string = g_strndup ((const char *) last, reader->data - last);
- }
reader->data++;
return TRUE;
}
@@ -390,50 +488,27 @@ gtk_json_reader_parse_string (GtkJsonReader *reader,
{
if (gtk_json_reader_remaining (reader) < 2)
goto end;
- if (out_string)
- {
- if (!string)
- string = g_string_new (NULL);
- g_string_append_len (string, (const char *) last, reader->data - last);
- }
reader->data++;
switch (*reader->data)
{
case '"':
case '\\':
case '/':
- if (string)
- g_string_append_c (string, *reader->data);
- break;
case 'b':
- if (string)
- g_string_append_c (string, '\b');
- break;
case 'f':
- if (string)
- g_string_append_c (string, '\f');
- break;
case 'n':
- if (string)
- g_string_append_c (string, '\n');
- break;
case 'r':
- if (string)
- g_string_append_c (string, '\r');
- break;
case 't':
- if (string)
- g_string_append_c (string, '\t');
break;
+
case 'u':
+ /* lots of work necessary to validate the unicode escapes here */
if (gtk_json_reader_remaining (reader) < 5 ||
!g_ascii_isxdigit (reader->data[1]) ||
!g_ascii_isxdigit (reader->data[2]) ||
!g_ascii_isxdigit (reader->data[3]) ||
!g_ascii_isxdigit (reader->data[4]))
{
- if (string)
- g_string_free (string, TRUE);
gtk_json_set_syntax_error (error, "Invalid Unicode escape sequence");
return FALSE;
}
@@ -448,49 +523,47 @@ gtk_json_reader_parse_string (GtkJsonReader *reader,
/* resolve UTF-16 surrogates for Unicode characters not in the BMP,
* as per ECMA 404, § 9, "String"
*/
- if (g_unichar_type (unichar) == G_UNICODE_SURROGATE &&
- gtk_json_reader_remaining (reader) >= 7 &&
- reader->data[1] == '\\' &&
- reader->data[2] == 'u' &&
- g_ascii_isxdigit (reader->data[3]) &&
- g_ascii_isxdigit (reader->data[4]) &&
- g_ascii_isxdigit (reader->data[5]) &&
- g_ascii_isxdigit (reader->data[6]))
+ if (g_unichar_type (unichar) == G_UNICODE_SURROGATE)
{
- unichar = decode_utf16_surrogate_pair (unichar,
- (g_ascii_xdigit_value (reader->data[3]) << 12) |
- (g_ascii_xdigit_value (reader->data[4]) << 8) |
- (g_ascii_xdigit_value (reader->data[5]) << 4) |
- (g_ascii_xdigit_value (reader->data[6])));
- reader->data += 6;
+ if (gtk_json_reader_remaining (reader) >= 7 &&
+ reader->data[1] == '\\' &&
+ reader->data[2] == 'u' &&
+ g_ascii_isxdigit (reader->data[3]) &&
+ g_ascii_isxdigit (reader->data[4]) &&
+ g_ascii_isxdigit (reader->data[5]) &&
+ g_ascii_isxdigit (reader->data[6]))
+ {
+ unichar = decode_utf16_surrogate_pair (unichar,
+ (g_ascii_xdigit_value (reader->data[3]) << 12) |
+ (g_ascii_xdigit_value (reader->data[4]) << 8) |
+ (g_ascii_xdigit_value (reader->data[5]) << 4) |
+ (g_ascii_xdigit_value (reader->data[6])));
+ reader->data += 6;
+ }
+ else
+ {
+ unichar = 0;
+ }
if (unichar == 0)
{
- if (string)
- g_string_free (string, TRUE);
- gtk_json_set_syntax_error (error, "Invalid UTF-16 surrogate pair");
+ gtk_json_parser_syntax_error (self, "Invalid UTF-16 surrogate pair");
return FALSE;
}
}
- if (string)
- g_string_append_unichar (string, unichar);
}
break;
default:
- if (string)
- g_string_free (string, TRUE);
gtk_json_set_syntax_error (error, "Unknown escape sequence");
return FALSE;
}
reader->data++;
}
- last = reader->data;
+
reader->data = json_skip_characters (reader->data, reader->end, STRING_ELEMENT);
}
end:
- if (string)
- g_string_free (string, TRUE);
gtk_json_set_syntax_error (error, "Unterminated string literal");
return FALSE;
}
@@ -659,7 +732,7 @@ gtk_json_parser_skip_value (GtkJsonParser *self)
switch (*self->reader.data)
{
case '"':
- return gtk_json_reader_parse_string (&self->reader, NULL, &self->error);
+ return gtk_json_reader_parse_string (&self->reader, &self->error);
case '-':
case '0':
@@ -754,7 +827,7 @@ gtk_json_parser_next (GtkJsonParser *self)
gtk_json_reader_skip_whitespace (&self->reader);
self->block->member_name = self->reader.data;
- if (!gtk_json_reader_parse_string (&self->reader, NULL, &self->error))
+ if (!gtk_json_reader_parse_string (&self->reader, &self->error))
return FALSE;
gtk_json_reader_skip_whitespace (&self->reader);
if (!gtk_json_reader_try_char (&self->reader, ':'))
@@ -863,9 +936,6 @@ gtk_json_parser_get_error (GtkJsonParser *self)
char *
gtk_json_parser_get_member_name (GtkJsonParser *self)
{
- GtkJsonReader reader;
- char *result;
-
if (self->error)
return NULL;
@@ -875,13 +945,7 @@ gtk_json_parser_get_member_name (GtkJsonParser *self)
if (self->block->member_name == NULL)
return NULL;
- gtk_json_reader_init (&reader,
- self->block->member_name,
- self->reader.end - self->block->member_name);
- if (!gtk_json_reader_parse_string (&reader, &result, NULL))
- return NULL;
-
- return result;
+ return gtk_json_unescape_string (self->block->member_name);
}
gboolean
@@ -959,28 +1023,22 @@ guint gtk_json_parser_get_uint (GtkJsonParser
char *
gtk_json_parser_get_string (GtkJsonParser *self)
{
- char *result;
-
if (self->error)
return g_strdup ("");
if (self->block->value == NULL)
return g_strdup ("");
- if (gtk_json_parser_has_skipped_value (self))
- {
- GtkJsonReader reader;
- gtk_json_reader_init (&reader, self->block->value, self->reader.end - self->block->value);
- if (!gtk_json_reader_parse_string (&reader, &result, NULL))
- return g_strdup ("");
- }
- else
+ if (!gtk_json_parser_skip_value (self))
+ return FALSE;
+
+ if (*self->block->value != '"')
{
- if (!gtk_json_reader_parse_string (&self->reader, &result, &self->error))
- return g_strdup ("");
+ gtk_json_parser_value_error (self, "Expected a string");
+ return g_strdup ("");
}
- return result;
+ return gtk_json_unescape_string (self->block->value);
}
gboolean
@@ -1007,7 +1065,7 @@ gtk_json_parser_start_object (GtkJsonParser *self)
return TRUE;
self->block->member_name = self->reader.data;
- if (!gtk_json_reader_parse_string (&self->reader, NULL, &self->error))
+ if (!gtk_json_reader_parse_string (&self->reader, &self->error))
return FALSE;
gtk_json_reader_skip_whitespace (&self->reader);
if (!gtk_json_reader_try_char (&self->reader, ':'))
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]