[gtk/wip/otte/json: 6/9] jsonparser: Prevalidate numbers and strings




commit 6e0b388442cbb36922845fde1026307404067173
Author: Benjamin Otte <otte redhat com>
Date:   Mon Nov 29 02:37:06 2021 +0100

    jsonparser: Prevalidate numbers and strings
    
    Validation and parsing are 2 separate steps anyway.

 gtk/json/gtkjsonparser.c | 345 +++++++++++++++++++++++------------------------
 1 file changed, 165 insertions(+), 180 deletions(-)
---
diff --git a/gtk/json/gtkjsonparser.c b/gtk/json/gtkjsonparser.c
index b044b9331e..93ec8cf6a6 100644
--- a/gtk/json/gtkjsonparser.c
+++ b/gtk/json/gtkjsonparser.c
@@ -217,27 +217,6 @@ gtk_json_parser_value_error (GtkJsonParser *self,
   va_end (args);
 }
 
-static void
-gtk_json_set_syntax_error (GError     **error,
-                           const char  *format,
-                           ...) G_GNUC_PRINTF(2, 3);
-static void
-gtk_json_set_syntax_error (GError     **error,
-                           const char  *format,
-                           ...)
-{
-  va_list args;
-
-  if (error == NULL)
-    return;
-
-  va_start (args, format);
-  *error = g_error_new_valist (G_FILE_ERROR,
-                               G_FILE_ERROR_FAILED,
-                               format, args);
-  va_end (args);
-}
-
 static void
 gtk_json_parser_syntax_error (GtkJsonParser *self,
                               const char  *format,
@@ -451,45 +430,44 @@ gtk_json_unescape_string (const guchar *escaped)
 }
 
 static gboolean
-gtk_json_reader_parse_string (GtkJsonReader  *reader,
-                              GError        **error)
+gtk_json_parser_parse_string (GtkJsonParser *self)
 {
-  if (!gtk_json_reader_try_char (reader, '"'))
+  if (!gtk_json_reader_try_char (&self->reader, '"'))
     {
-      gtk_json_set_syntax_error (error, "Not a string");
+      gtk_json_parser_syntax_error (self, "Not a string");
       return FALSE;
     }
 
-  reader->data = json_skip_characters (reader->data, reader->end, STRING_ELEMENT);
+  self->reader.data = json_skip_characters (self->reader.data, self->reader.end, STRING_ELEMENT);
 
-  while (gtk_json_reader_remaining (reader))
+  while (gtk_json_reader_remaining (&self->reader))
     {
-      if (*reader->data < 0x20)
+      if (*self->reader.data < 0x20)
         {
-          gtk_json_set_syntax_error (error, "Disallowed control character in string literal");
+          gtk_json_parser_syntax_error (self, "Disallowed control character in string literal");
           return FALSE;
         }
-      else if (*reader->data > 127)
+      else if (*self->reader.data > 127)
         {
-          gunichar c = g_utf8_get_char_validated ((const char *) reader->data, reader->end - reader->data);
+          gunichar c = g_utf8_get_char_validated ((const char *) self->reader.data, 
gtk_json_reader_remaining (&self->reader));
           if (c == (gunichar) -2 || c == (gunichar) -1)
             {
-              gtk_json_set_syntax_error (error, "Invalid UTF-8");
+              gtk_json_parser_syntax_error (self, "Invalid UTF-8");
               return FALSE;
             }
-          reader->data = (const guchar *) g_utf8_next_char ((const char *) reader->data);
+          self->reader.data = (const guchar *) g_utf8_next_char ((const char *) self->reader.data);
         }
-      else if (*reader->data == '"')
+      else if (*self->reader.data == '"')
         {
-          reader->data++;
+          self->reader.data++;
           return TRUE;
         }
-      else if (*reader->data == '\\')
+      else if (*self->reader.data == '\\')
         {
-          if (gtk_json_reader_remaining (reader) < 2)
+          if (gtk_json_reader_remaining (&self->reader) < 2)
             goto end;
-          reader->data++;
-          switch (*reader->data)
+          self->reader.data++;
+          switch (*self->reader.data)
             {
             case '"':
             case '\\':
@@ -503,108 +481,166 @@ gtk_json_reader_parse_string (GtkJsonReader  *reader,
 
             case 'u':
               /* lots of work necessary to validate the unicode escapes here */
-              if (gtk_json_reader_remaining (reader) < 5 ||
-                  !g_ascii_isxdigit (reader->data[1]) ||
-                  !g_ascii_isxdigit (reader->data[2]) ||
-                  !g_ascii_isxdigit (reader->data[3]) ||
-                  !g_ascii_isxdigit (reader->data[4]))
+              if (gtk_json_reader_remaining (&self->reader) < 5 ||
+                  !g_ascii_isxdigit (self->reader.data[1]) ||
+                  !g_ascii_isxdigit (self->reader.data[2]) ||
+                  !g_ascii_isxdigit (self->reader.data[3]) ||
+                  !g_ascii_isxdigit (self->reader.data[4]))
                 {
-                  gtk_json_set_syntax_error (error, "Invalid Unicode escape sequence");
+                  gtk_json_parser_syntax_error (self, "Invalid Unicode escape sequence");
                   return FALSE;
                 }
               else
                 {
-                  gunichar unichar = (g_ascii_xdigit_value (reader->data[1]) << 12) |
-                                     (g_ascii_xdigit_value (reader->data[2]) <<  8) |
-                                     (g_ascii_xdigit_value (reader->data[3]) <<  4) |
-                                     (g_ascii_xdigit_value (reader->data[4]));
+                  gunichar unichar = (g_ascii_xdigit_value (self->reader.data[1]) << 12) |
+                                     (g_ascii_xdigit_value (self->reader.data[2]) <<  8) |
+                                     (g_ascii_xdigit_value (self->reader.data[3]) <<  4) |
+                                     (g_ascii_xdigit_value (self->reader.data[4]));
 
-                  reader->data += 4;
+                  self->reader.data += 4;
                   /* resolve UTF-16 surrogates for Unicode characters not in the BMP,
                    * as per ECMA 404, ยง 9, "String"
                    */
                   if (g_unichar_type (unichar) == G_UNICODE_SURROGATE &&
-                      gtk_json_reader_remaining (reader) >= 7 &&
-                      reader->data[1] == '\\' &&
-                      reader->data[2] == 'u' &&
-                      g_ascii_isxdigit (reader->data[3]) &&
-                      g_ascii_isxdigit (reader->data[4]) &&
-                      g_ascii_isxdigit (reader->data[5]) &&
-                      g_ascii_isxdigit (reader->data[6]))
+                      gtk_json_reader_remaining (&self->reader) >= 7 &&
+                      self->reader.data[1] == '\\' &&
+                      self->reader.data[2] == 'u' &&
+                      g_ascii_isxdigit (self->reader.data[3]) &&
+                      g_ascii_isxdigit (self->reader.data[4]) &&
+                      g_ascii_isxdigit (self->reader.data[5]) &&
+                      g_ascii_isxdigit (self->reader.data[6]))
                     {
                       unichar = decode_utf16_surrogate_pair (unichar,
-                                                             (g_ascii_xdigit_value (reader->data[3]) << 12) |
-                                                             (g_ascii_xdigit_value (reader->data[4]) <<  8) |
-                                                             (g_ascii_xdigit_value (reader->data[5]) <<  4) |
-                                                             (g_ascii_xdigit_value (reader->data[6])));
-                      reader->data += 6;
-
-                      if (unichar == 0)
-                        {
-                          gtk_json_set_syntax_error (error, "Invalid UTF-16 surrogate pair");
-                          return FALSE;
-                        }
+                                                             (g_ascii_xdigit_value (self->reader.data[3]) << 
12) |
+                                                             (g_ascii_xdigit_value (self->reader.data[4]) << 
 8) |
+                                                             (g_ascii_xdigit_value (self->reader.data[5]) << 
 4) |
+                                                             (g_ascii_xdigit_value (self->reader.data[6])));
+                      self->reader.data += 6;
+                    }
+                  else
+                    {
+                      unichar = 0;
+                    }
+
+                  if (unichar == 0)
+                    {
+                      gtk_json_parser_syntax_error (self, "Invalid UTF-16 surrogate pair");
+                      return FALSE;
                     }
                 }
               break;
             default:
-              gtk_json_set_syntax_error (error, "Unknown escape sequence");
+              gtk_json_parser_syntax_error (self, "Unknown escape sequence");
               return FALSE;
             }
-          reader->data++;
+          self->reader.data++;
         }
 
-      reader->data = json_skip_characters (reader->data, reader->end, STRING_ELEMENT);
+      self->reader.data = json_skip_characters (self->reader.data, self->reader.end, STRING_ELEMENT);
     }
 
 end:
-  gtk_json_set_syntax_error (error, "Unterminated string literal");
+  gtk_json_parser_syntax_error (self, "Unterminated string literal");
   return FALSE;
 }
 
 static gboolean
-gtk_json_reader_parse_number (GtkJsonReader  *reader,
-                              GError        **error)
+gtk_json_parser_parse_number (GtkJsonParser *self)
 {
   /* sign */
-  gtk_json_reader_try_char (reader, '-');
+  gtk_json_reader_try_char (&self->reader, '-');
 
   /* integer part */
-  if (!gtk_json_reader_try_char (reader, '0'))
+  if (!gtk_json_reader_try_char (&self->reader, '0'))
     {
-      if (reader->data >= reader->end ||
-          !g_ascii_isdigit (*reader->data))
+      if (gtk_json_reader_is_eof (&self->reader) ||
+          !g_ascii_isdigit (*self->reader.data))
         goto out;
 
-      reader->data++;
+      self->reader.data++;
 
-      while (reader->data < reader->end && g_ascii_isdigit (*reader->data))
-        reader->data++;
+      while (!gtk_json_reader_is_eof (&self->reader) && g_ascii_isdigit (*self->reader.data))
+        self->reader.data++;
     }
 
   /* fractional part */
-  if (gtk_json_reader_remaining (reader) >= 2 && *reader->data == '.' && g_ascii_isdigit (reader->data[1]))
+  if (gtk_json_reader_remaining (&self->reader) >= 2 && *self->reader.data == '.' && g_ascii_isdigit 
(self->reader.data[1]))
     {
-      reader->data += 2;
+      self->reader.data += 2;
 
-      while (reader->data < reader->end && g_ascii_isdigit (*reader->data))
-        reader->data++;
+      while (!gtk_json_reader_is_eof (&self->reader) && g_ascii_isdigit (*self->reader.data))
+        self->reader.data++;
     }
 
   /* exponent */
-  if (gtk_json_reader_remaining (reader) >= 2 && (reader->data[0] == 'e' || reader->data[0] == 'E') &&
-      (g_ascii_isdigit (reader->data[1]) ||
-       (gtk_json_reader_remaining (reader) >= 3 && (reader->data[1] == '+' || reader->data[1] == '-') && 
g_ascii_isdigit (reader->data[2]))))
+  if (gtk_json_reader_remaining (&self->reader) >= 2 && (self->reader.data[0] == 'e' || self->reader.data[0] 
== 'E') &&
+      (g_ascii_isdigit (self->reader.data[1]) ||
+       (gtk_json_reader_remaining (&self->reader) >= 3 && (self->reader.data[1] == '+' || 
self->reader.data[1] == '-') && g_ascii_isdigit (self->reader.data[2]))))
     {
-      reader->data += 2;
+      self->reader.data += 2;
 
-      while (reader->data < reader->end && g_ascii_isdigit (*reader->data))
-        reader->data++;
+      while (!gtk_json_reader_is_eof (&self->reader) && g_ascii_isdigit (*self->reader.data))
+        self->reader.data++;
     }
   return TRUE;
 
 out:
-  gtk_json_set_syntax_error (error, "Not a valid number");
+  gtk_json_parser_syntax_error (self, "Not a valid number");
+  return FALSE;
+}
+
+static gboolean
+gtk_json_parser_parse_value (GtkJsonParser *self)
+{
+  if (gtk_json_reader_is_eof (&self->reader))
+    {
+      gtk_json_parser_syntax_error (self, "Unexpected end of file");
+      return FALSE;
+    }
+
+  switch (*self->reader.data)
+  {
+    case '"':
+      return gtk_json_parser_parse_string (self);
+    
+    case '-':
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+      return gtk_json_parser_parse_number (self);
+
+    case 'f':
+      if (gtk_json_reader_try_identifier (&self->reader, "false"))
+        return TRUE;
+      break;
+
+    case 'n':
+      if (gtk_json_reader_try_identifier (&self->reader, "null"))
+        return TRUE;
+      break;
+
+    case 't':
+      if (gtk_json_reader_try_identifier (&self->reader, "true"))
+        return TRUE;
+      break;
+
+    case '{':
+    case '[':
+      /* don't preparse objects */
+      return TRUE;
+
+    default:
+      break;
+  }
+
+  gtk_json_parser_syntax_error (self, "Expected a value");
   return FALSE;
 }
 
@@ -682,10 +718,7 @@ gtk_json_parser_new_for_bytes (GBytes *bytes)
 
   gtk_json_reader_skip_whitespace (&self->reader);
   self->block->value = self->reader.data;
-  if (!gtk_json_parser_get_node (self))
-    {
-      gtk_json_parser_syntax_error (self, "Empty document");
-    }
+  gtk_json_parser_parse_value (self);
 
   return self;
 }
@@ -705,71 +738,26 @@ gtk_json_parser_free (GtkJsonParser *self)
 }
 
 static gboolean
-gtk_json_parser_has_skipped_value (GtkJsonParser *self)
+gtk_json_parser_skip_block (GtkJsonParser *self)
 {
-  return self->reader.data != self->block->value;
-}
-
-static gboolean
-gtk_json_parser_skip_value (GtkJsonParser *self)
-{
-  /* read the value already */
-  if (gtk_json_parser_has_skipped_value (self))
+  if (self->reader.data != self->block->value)
     return TRUE;
 
-  if (gtk_json_reader_is_eof (&self->reader))
+  if (*self->reader.data == '{')
     {
-      gtk_json_parser_syntax_error (self, "Unexpected end of file");
-      return FALSE;
-    }
-
-  switch (*self->reader.data)
-  {
-    case '"':
-      return gtk_json_reader_parse_string (&self->reader, &self->error);
-    
-    case '-':
-    case '0':
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-    case '8':
-    case '9':
-      return gtk_json_reader_parse_number (&self->reader, &self->error);
-
-    case 'f':
-      if (gtk_json_reader_try_identifier (&self->reader, "false"))
-        return TRUE;
-      break;
-
-    case 'n':
-      if (gtk_json_reader_try_identifier (&self->reader, "null"))
-        return TRUE;
-      break;
-
-    case 't':
-      if (gtk_json_reader_try_identifier (&self->reader, "true"))
-        return TRUE;
-      break;
-
-    case '{':
       return gtk_json_parser_start_object (self) &&
              gtk_json_parser_end (self);
-
-    case '[':
+    }
+  else if (*self->reader.data == '[')
+    {
       return gtk_json_parser_start_array (self) &&
              gtk_json_parser_end (self);
-
-    default:
-      break;
-  }
-
-  gtk_json_parser_syntax_error (self, "Unexpected character in document");
-  return FALSE;
+    }
+  else
+    {
+      g_assert_not_reached ();
+      return FALSE;
+    }
 }
 
 gboolean
@@ -778,9 +766,15 @@ gtk_json_parser_next (GtkJsonParser *self)
   if (self->error)
     return FALSE;
 
-  if (!gtk_json_parser_skip_value (self))
+  if (self->block->value == NULL)
     return FALSE;
 
+  if (!gtk_json_parser_skip_block (self))
+    {
+      g_assert (self->error);
+      return FALSE;
+    }
+
   switch (self->block->type)
     {
     case GTK_JSON_BLOCK_TOPLEVEL:
@@ -821,7 +815,7 @@ gtk_json_parser_next (GtkJsonParser *self)
       gtk_json_reader_skip_whitespace (&self->reader);
       self->block->member_name = self->reader.data;
 
-      if (!gtk_json_reader_parse_string (&self->reader, &self->error))
+      if (!gtk_json_parser_parse_string (self))
         return FALSE;
       gtk_json_reader_skip_whitespace (&self->reader);
       if (!gtk_json_reader_try_char (&self->reader, ':'))
@@ -832,11 +826,8 @@ gtk_json_parser_next (GtkJsonParser *self)
 
       gtk_json_reader_skip_whitespace (&self->reader);
       self->block->value = self->reader.data;
-      if (!gtk_json_parser_get_node (self))
-        {
-          gtk_json_parser_syntax_error (self, "No value after ','");
-          return FALSE;
-        }
+      if (!gtk_json_parser_parse_value (self))
+        return FALSE;
       break;
 
     case GTK_JSON_BLOCK_ARRAY:
@@ -861,11 +852,8 @@ gtk_json_parser_next (GtkJsonParser *self)
 
       gtk_json_reader_skip_whitespace (&self->reader);
       self->block->value = self->reader.data;
-      if (!gtk_json_parser_get_node (self))
-        {
-          gtk_json_parser_syntax_error (self, "No value after ','");
-          return FALSE;
-        }
+      if (!gtk_json_parser_parse_value (self))
+        return FALSE;
       break;
 
     default:
@@ -951,9 +939,6 @@ gtk_json_parser_get_boolean (GtkJsonParser *self)
   if (self->block->value == NULL)
     return FALSE;
 
-  if (!gtk_json_parser_skip_value (self))
-    return FALSE;
-
   if (*self->block->value == 't')
     return TRUE;
   else if (*self->block->value == 'f')
@@ -974,9 +959,6 @@ gtk_json_parser_get_number (GtkJsonParser *self)
   if (self->block->value == NULL)
     return 0;
 
-  if (!gtk_json_parser_skip_value (self))
-    return FALSE;
-
   if (!strchr ("-0123456789", *self->block->value))
     {
       gtk_json_parser_value_error (self, "Expected a number");
@@ -1023,9 +1005,6 @@ gtk_json_parser_get_string (GtkJsonParser *self)
   if (self->block->value == NULL)
     return g_strdup ("");
 
-  if (!gtk_json_parser_skip_value (self))
-    return FALSE;
-
   if (*self->block->value != '"')
     {
       gtk_json_parser_value_error (self, "Expected a string");
@@ -1059,7 +1038,7 @@ gtk_json_parser_start_object (GtkJsonParser *self)
     return TRUE;
   self->block->member_name = self->reader.data;
 
-  if (!gtk_json_reader_parse_string (&self->reader, &self->error))
+  if (!gtk_json_parser_parse_string (self))
     return FALSE;
   gtk_json_reader_skip_whitespace (&self->reader);
   if (!gtk_json_reader_try_char (&self->reader, ':'))
@@ -1070,6 +1049,8 @@ gtk_json_parser_start_object (GtkJsonParser *self)
 
   gtk_json_reader_skip_whitespace (&self->reader);
   self->block->value = self->reader.data;
+  if (!gtk_json_parser_parse_value (self))
+    return FALSE;
 
   return TRUE;
 }
@@ -1099,6 +1080,8 @@ gtk_json_parser_start_array (GtkJsonParser *self)
       return TRUE;
     }
   self->block->value = self->reader.data;
+  if (!gtk_json_parser_parse_value (self))
+    return FALSE;
 
   return TRUE;
 }
@@ -1108,11 +1091,13 @@ gtk_json_parser_end (GtkJsonParser *self)
 {
   char bracket;
 
+  g_return_val_if_fail (self != NULL, FALSE);
+
+  while (gtk_json_parser_next (self));
+
   if (self->error)
     return FALSE;
 
-  g_return_val_if_fail (self != NULL, FALSE);
-
   switch (self->block->type)
     {
     case GTK_JSON_BLOCK_OBJECT:
@@ -1126,10 +1111,10 @@ gtk_json_parser_end (GtkJsonParser *self)
       g_return_val_if_reached (FALSE);
     }
 
-  while (!gtk_json_reader_try_char (&self->reader, bracket))
+  if (!gtk_json_reader_try_char (&self->reader, bracket))
     {
-      if (!gtk_json_parser_next (self))
-        return FALSE;
+      gtk_json_parser_syntax_error (self, "No terminating '%c'", bracket);
+      return FALSE;
     }
 
   gtk_json_parser_pop_block (self);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]