[glib] Patch originally committed only to the ChangeLog & tests ...



commit 83699774fa669abfbc5c5c3dc9265308246bd4f6
Author: Michael Meeks <michael meeks novell com>
Date:   Fri May 1 15:23:23 2009 +0100

    Patch originally committed only to the ChangeLog & tests ...
    
            Bug 572508 â?? gmarkup speedup ...
    
            * glib/gmarkup.c: Various optimizations: do less allocations by
            keeping a pool of GStrings, do in-place unescaping, avoid redundant
            utf-8 validation.
---
 glib/gmarkup.c | 1129 +++++++++++++++++++++++---------------------------------
 1 files changed, 455 insertions(+), 674 deletions(-)

diff --git a/glib/gmarkup.c b/glib/gmarkup.c
index b1f4f46..35118ad 100644
--- a/glib/gmarkup.c
+++ b/glib/gmarkup.c
@@ -82,11 +82,15 @@ struct _GMarkupParseContext
    * the callback for it.
    */
   GString *partial_chunk;
+  GSList *spare_chunks;
 
   GMarkupParseState state;
   GSList *tag_stack;
-  gchar **attr_names;
-  gchar **attr_values;
+  GSList *tag_stack_gstr;
+  GSList *spare_list_nodes;
+
+  GString **attr_names;
+  GString **attr_values;
   gint cur_attr;
   gint alloc_attrs;
 
@@ -94,8 +98,6 @@ struct _GMarkupParseContext
   gssize       current_text_len;      
   const gchar *current_text_end;
 
-  GString *leftover_char_portion;
-
   /* used to save the start of the last interesting thingy */
   const gchar *start;
 
@@ -112,6 +114,39 @@ struct _GMarkupParseContext
   gpointer held_user_data;
 };
 
+/*
+ * Helpers to reduce our allocation overhead, we have
+ * a well defined allocation lifecycle.
+ */
+static GSList *
+get_list_node (GMarkupParseContext *context, gpointer data)
+{
+  GSList *node;
+  if (context->spare_list_nodes != NULL)
+    {
+      node = context->spare_list_nodes;
+      context->spare_list_nodes = g_slist_remove_link (context->spare_list_nodes, node);
+    }
+  else
+    node = g_slist_alloc();
+  node->data = data;
+  return node;
+}
+
+static void
+free_list_node (GMarkupParseContext *context, GSList *node)
+{
+  node->data = NULL;
+  context->spare_list_nodes = g_slist_concat (node, context->spare_list_nodes);
+}
+
+static inline void
+string_blank (GString *string)
+{
+  string->str[0] = '\0';
+  string->len = 0;
+}
+
 /**
  * g_markup_parse_context_new:
  * @parser: a #GMarkupParser
@@ -148,9 +183,12 @@ g_markup_parse_context_new (const GMarkupParser *parser,
   context->char_number = 1;
 
   context->partial_chunk = NULL;
+  context->spare_chunks = NULL;
+  context->spare_list_nodes = NULL;
 
   context->state = STATE_START;
   context->tag_stack = NULL;
+  context->tag_stack_gstr = NULL;
   context->attr_names = NULL;
   context->attr_values = NULL;
   context->cur_attr = -1;
@@ -159,7 +197,6 @@ g_markup_parse_context_new (const GMarkupParser *parser,
   context->current_text = NULL;
   context->current_text_len = -1;
   context->current_text_end = NULL;
-  context->leftover_char_portion = NULL;
 
   context->start = NULL;
   context->iter = NULL;
@@ -179,6 +216,14 @@ g_markup_parse_context_new (const GMarkupParser *parser,
   return context;
 }
 
+static void
+string_full_free (gpointer ptr, gpointer user_data)
+{
+  g_string_free (ptr, TRUE);
+}
+
+static void clear_attributes (GMarkupParseContext *context);
+
 /**
  * g_markup_parse_context_free:
  * @context: a #GMarkupParseContext
@@ -198,18 +243,21 @@ g_markup_parse_context_free (GMarkupParseContext *context)
   if (context->dnotify)
     (* context->dnotify) (context->user_data);
 
-  g_strfreev (context->attr_names);
-  g_strfreev (context->attr_values);
+  clear_attributes (context);
+  g_free (context->attr_names);
+  g_free (context->attr_values);
 
-  g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
+  g_slist_foreach (context->tag_stack_gstr, string_full_free, NULL);
+  g_slist_free (context->tag_stack_gstr);
   g_slist_free (context->tag_stack);
 
+  g_slist_foreach (context->spare_chunks, string_full_free, NULL);
+  g_slist_free (context->spare_chunks);
+  g_slist_free (context->spare_list_nodes);
+
   if (context->partial_chunk)
     g_string_free (context->partial_chunk, TRUE);
 
-  if (context->leftover_char_portion)
-    g_string_free (context->leftover_char_portion, TRUE);
-
   g_free (context);
 }
 
@@ -301,43 +349,101 @@ propagate_error (GMarkupParseContext  *context,
   g_propagate_error (dest, src);
 }
 
-/* To make these faster, we first use the ascii-only tests, then check
- * for the usual non-alnum name-end chars, and only then call the
- * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
- * names, so this is a reasonable hack that virtually always avoids
- * the guniprop call.
- */
 #define IS_COMMON_NAME_END_CHAR(c) \
   ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
 
 static gboolean
-is_name_start_char (const gchar *p)
+slow_name_validate (GMarkupParseContext *context, const char *name, GError **error)
 {
-  if (g_ascii_isalpha (*p) ||
-      (!IS_COMMON_NAME_END_CHAR (*p) &&
-       (*p == '_' || 
-	*p == ':' ||
-	g_unichar_isalpha (g_utf8_get_char (p)))))
-    return TRUE;
-  else
-    return FALSE;
+  const char *p = name;
+
+  if (!g_utf8_validate (name, strlen (name), NULL))
+    {
+      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
+		 _("Invalid UTF-8 encoded text in name - not valid '%s'"), name);
+      return FALSE;
+    }
+
+  if (!(g_ascii_isalpha (*p) ||
+	(!IS_COMMON_NAME_END_CHAR (*p) &&
+	 (*p == '_' || 
+	  *p == ':' ||
+	  g_unichar_isalpha (g_utf8_get_char (p))))))
+    {
+      set_error (context, error, G_MARKUP_ERROR_PARSE,
+		 _("'%s' is not a valid name "), name);
+      return FALSE;
+    }
+
+  for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p))
+    {
+      /* is_name_char */
+      if (!(g_ascii_isalnum (*p) ||
+	    (!IS_COMMON_NAME_END_CHAR (*p) &&
+	     (*p == '.' || 
+	      *p == '-' ||
+	      *p == '_' ||
+	      *p == ':' ||
+	      g_unichar_isalpha (g_utf8_get_char (p))))))
+	{
+	  set_error (context, error, G_MARKUP_ERROR_PARSE,
+		     _("'%s' is not a valid name: '%c' "), name, *p);
+	  return FALSE;
+	}
+    }
+  return TRUE;
 }
 
+/* 
+ * Use me for elements, attributes etc.
+ */
 static gboolean
-is_name_char (const gchar *p)
+name_validate (GMarkupParseContext *context, const char *name, GError **error)
 {
-  if (g_ascii_isalnum (*p) ||
-      (!IS_COMMON_NAME_END_CHAR (*p) &&
-       (*p == '.' || 
-	*p == '-' ||
-	*p == '_' ||
-	*p == ':' ||
-	g_unichar_isalpha (g_utf8_get_char (p)))))
-    return TRUE;
-  else
-    return FALSE;
+  char mask;
+  const char *p;
+
+  /* name start char */
+  p = name;
+  if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) ||
+		  !(g_ascii_isalpha (*p) || *p == '_' || *p == ':')))
+    goto slow_validate;
+  
+  for (mask = *p++; *p != '\0'; p++)
+    {
+      mask |= *p;
+
+      /* is_name_char */
+      if (G_UNLIKELY (!(g_ascii_isalnum (*p) ||
+			(!IS_COMMON_NAME_END_CHAR (*p) &&
+			 (*p == '.' || 
+			  *p == '-' ||
+			  *p == '_' ||
+			  *p == ':')))))
+	goto slow_validate;
+    }
+
+  if (mask & 0x80) /* un-common / non-ascii */
+    goto slow_validate;
+
+  return TRUE;
+
+ slow_validate:
+  return slow_name_validate (context, name, error);
 }
 
+static gboolean
+text_validate (GMarkupParseContext *context, const char *p, int len, GError **error)
+{
+  if (!g_utf8_validate (p, len, NULL))
+    {
+      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
+		 _("Invalid UTF-8 encoded text in name - not valid '%s'"), p);
+      return FALSE;
+    }
+  else
+    return TRUE;
+}
 
 static gchar*
 char_str (gunichar c,
@@ -360,7 +466,6 @@ static void
 set_unescape_error (GMarkupParseContext *context,
                     GError             **error,
                     const gchar         *remaining_text,
-                    const gchar         *remaining_text_end,
                     GMarkupError         code,
                     const gchar         *format,
                     ...)
@@ -373,7 +478,7 @@ set_unescape_error (GMarkupParseContext *context,
 
   remaining_newlines = 0;
   p = remaining_text;
-  while (p != remaining_text_end)
+  while (*p != '\0')
     {
       if (*p == '\n')
         ++remaining_newlines;
@@ -397,414 +502,199 @@ set_unescape_error (GMarkupParseContext *context,
   g_propagate_error (error, tmp_error);
 }
 
-typedef enum
-{
-  USTATE_INSIDE_TEXT,
-  USTATE_AFTER_AMPERSAND,
-  USTATE_INSIDE_ENTITY_NAME,
-  USTATE_AFTER_CHARREF_HASH
-} UnescapeState;
-
-typedef struct
-{
-  GMarkupParseContext *context;
-  GString *str;
-  UnescapeState state;
-  const gchar *text;
-  const gchar *text_end;
-  const gchar *entity_start;
-} UnescapeContext;
-
-static const gchar*
-unescape_text_state_inside_text (UnescapeContext *ucontext,
-                                 const gchar     *p,
-                                 GError         **error)
+/*
+ * re-write the GString in-place, unescaping anything that escaped.
+ * most XML does not contain entities, or escaping.
+ */
+static gboolean
+unescape_gstring_inplace (GMarkupParseContext *context,
+			  GString             *string,
+			  gboolean            *is_ascii,
+			  GError             **error)
 {
-  const gchar *start;
+  char mask, *to;
+  int line_num = 1;
+  const char *from;
   gboolean normalize_attribute;
 
-  if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
-      ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
+  *is_ascii = FALSE;
+
+  /* are we unescaping an attribute or not ? */
+  if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
+      context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
     normalize_attribute = TRUE;
   else
     normalize_attribute = FALSE;
 
-  start = p;
-  
-  while (p != ucontext->text_end)
-    {
-      if (*p == '&')
-        {
-          break;
-        }
-      else if (normalize_attribute && (*p == '\t' || *p == '\n'))
-        {
-          g_string_append_len (ucontext->str, start, p - start);
-          g_string_append_c (ucontext->str, ' ');
-          p = g_utf8_next_char (p);
-          start = p;
-        }
-      else if (*p == '\r')
-        {
-          g_string_append_len (ucontext->str, start, p - start);
-          g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
-          p = g_utf8_next_char (p);
-          if (p != ucontext->text_end && *p == '\n')
-            p = g_utf8_next_char (p);
-          start = p;
-        }
-      else
-        p = g_utf8_next_char (p);
-    }
-  
-  if (p != start)
-    g_string_append_len (ucontext->str, start, p - start);
-  
-  if (p != ucontext->text_end && *p == '&')
-    {
-      p = g_utf8_next_char (p);
-      ucontext->state = USTATE_AFTER_AMPERSAND;
-    }
-
-  return p;
-}
-
-static const gchar*
-unescape_text_state_after_ampersand (UnescapeContext *ucontext,
-                                     const gchar     *p,
-                                     GError         **error)
-{
-  ucontext->entity_start = NULL;
-  
-  if (*p == '#')
-    {
-      p = g_utf8_next_char (p);
-
-      ucontext->entity_start = p;
-      ucontext->state = USTATE_AFTER_CHARREF_HASH;
-    }
-  else if (!is_name_start_char (p))
-    {
-      if (*p == ';')
-        {
-          set_unescape_error (ucontext->context, error,
-                              p, ucontext->text_end,
-                              G_MARKUP_ERROR_PARSE,
-                              _("Empty entity '&;' seen; valid "
-                                "entities are: &amp; &quot; &lt; &gt; &apos;"));
-        }
-      else
-        {
-          gchar buf[8];
-
-          set_unescape_error (ucontext->context, error,
-                              p, ucontext->text_end,
-                              G_MARKUP_ERROR_PARSE,
-                              _("Character '%s' is not valid at "
-                                "the start of an entity name; "
-                                "the & character begins an entity; "
-                                "if this ampersand isn't supposed "
-                                "to be an entity, escape it as "
-                                "&amp;"),
-                              utf8_str (p, buf));
-        }
-    }
-  else
-    {
-      ucontext->entity_start = p;
-      ucontext->state = USTATE_INSIDE_ENTITY_NAME;
-    }
-
-  return p;
-}
-
-static const gchar*
-unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
-                                        const gchar     *p,
-                                        GError         **error)
-{
-  while (p != ucontext->text_end)
-    {
-      if (*p == ';')
-        break;
-      else if (!is_name_char (p))
-        {
-          gchar ubuf[8];
-
-          set_unescape_error (ucontext->context, error,
-                              p, ucontext->text_end,
-                              G_MARKUP_ERROR_PARSE,
-                              _("Character '%s' is not valid "
-                                "inside an entity name"),
-                              utf8_str (p, ubuf));
-          break;
-        }
-
-      p = g_utf8_next_char (p);
-    }
-
-  if (ucontext->context->state != STATE_ERROR)
-    {
-      if (p != ucontext->text_end)
-        {
-	  gint len = p - ucontext->entity_start;
-
-          /* move to after semicolon */
-          p = g_utf8_next_char (p);
-          ucontext->state = USTATE_INSIDE_TEXT;
-
-          if (strncmp (ucontext->entity_start, "lt", len) == 0)
-            g_string_append_c (ucontext->str, '<');
-          else if (strncmp (ucontext->entity_start, "gt", len) == 0)
-            g_string_append_c (ucontext->str, '>');
-          else if (strncmp (ucontext->entity_start, "amp", len) == 0)
-            g_string_append_c (ucontext->str, '&');
-          else if (strncmp (ucontext->entity_start, "quot", len) == 0)
-            g_string_append_c (ucontext->str, '"');
-          else if (strncmp (ucontext->entity_start, "apos", len) == 0)
-            g_string_append_c (ucontext->str, '\'');
-          else
-            {
-	      gchar *name;
-
-	      name = g_strndup (ucontext->entity_start, len);
-              set_unescape_error (ucontext->context, error,
-                                  p, ucontext->text_end,
-                                  G_MARKUP_ERROR_PARSE,
-                                  _("Entity name '%s' is not known"),
-                                  name);
-	      g_free (name);
-            }
-        }
-      else
-        {
-          set_unescape_error (ucontext->context, error,
-                              /* give line number of the & */
-                              ucontext->entity_start, ucontext->text_end,
-                              G_MARKUP_ERROR_PARSE,
-                              _("Entity did not end with a semicolon; "
-                                "most likely you used an ampersand "
-                                "character without intending to start "
-                                "an entity - escape ampersand as &amp;"));
-        }
-    }
-#undef MAX_ENT_LEN
-
-  return p;
-}
-
-static const gchar*
-unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
-                                        const gchar     *p,
-                                        GError         **error)
-{
-  gboolean is_hex = FALSE;
-  const char *start;
-
-  start = ucontext->entity_start;
-
-  if (*p == 'x')
-    {
-      is_hex = TRUE;
-      p = g_utf8_next_char (p);
-      start = p;
-    }
-
-  while (p != ucontext->text_end && *p != ';')
-    p = g_utf8_next_char (p);
-
-  if (p != ucontext->text_end)
+  /*
+   * Meeks' theorum: unescaping can only shrink text.
+   * for &lt; etc. this is obvious, for &#xffff; more
+   * thought is required, but this is patently so.
+   */
+  mask = 0;
+  for (from = to = string->str; *from != '\0'; from++, to++) 
     {
-      g_assert (*p == ';');
+      *to = *from;
+
+      mask |= *to;
+      if (*to == '\n')
+	line_num++;
+      if (normalize_attribute && (*to == '\t' || *to == '\n'))
+	*to = ' ';
+      if (*to == '\r')
+	{
+	  *to = normalize_attribute ? ' ' : '\n';
+	  if (from[1] == '\n')
+	    from++;
+	}
+      if (*from == '&')
+	{
+	  from++;
+	  if (*from == '#')
+	    {
+	      gboolean is_hex = FALSE;
+	      gulong l;
+	      gchar *end = NULL;
 
-      /* digit is between start and p */
+	      from++;
 
-      if (start != p)
-        {
-          gulong l;
-          gchar *end = NULL;
-                    
-          errno = 0;
-          if (is_hex)
-            l = strtoul (start, &end, 16);
-          else
-            l = strtoul (start, &end, 10);
+	      if (*from == 'x')
+		{
+		  is_hex = TRUE;
+		  from++;
+		}
 
-          if (end != p || errno != 0)
-            {
-              set_unescape_error (ucontext->context, error,
-                                  start, ucontext->text_end,
-                                  G_MARKUP_ERROR_PARSE,
-                                  _("Failed to parse '%-.*s', which "
-                                    "should have been a digit "
-                                    "inside a character reference "
-                                    "(&#234; for example) - perhaps "
-                                    "the digit is too large"),
-                                  p - start, start);
-            }
-          else
-            {
-              /* characters XML 1.1 permits */
-              if ((0 < l && l <= 0xD7FF) ||
-                  (0xE000 <= l && l <= 0xFFFD) ||
-                  (0x10000 <= l && l <= 0x10FFFF))
-                {
-                  gchar buf[8];
-                  g_string_append (ucontext->str, char_str (l, buf));
-                }
-              else
-                {
-                  set_unescape_error (ucontext->context, error,
-                                      start, ucontext->text_end,
-                                      G_MARKUP_ERROR_PARSE,
-                                      _("Character reference '%-.*s' does not "
-					"encode a permitted character"),
-                                      p - start, start);
-                }
+	      /* digit is between start and p */
+	      errno = 0;
+	      if (is_hex)
+		l = strtoul (from, &end, 16);
+	      else
+		l = strtoul (from, &end, 10);
+	      
+	      if (end == from || errno != 0)
+		{
+		  set_unescape_error (context, error,
+				      from, G_MARKUP_ERROR_PARSE,
+				      _("Failed to parse '%-.*s', which "
+					"should have been a digit "
+					"inside a character reference "
+					"(&#234; for example) - perhaps "
+					"the digit is too large"),
+				      end - from, from);
+		  return FALSE;
+		}
+	      else if (*end != ';')
+		{
+		  set_unescape_error (context, error,
+				      from, G_MARKUP_ERROR_PARSE,
+				      _("Character reference did not end with a "
+					"semicolon; "
+					"most likely you used an ampersand "
+					"character without intending to start "
+					"an entity - escape ampersand as &amp;"));
+		  return FALSE;
+		}
+	      else
+		{
+		  /* characters XML 1.1 permits */
+		  if ((0 < l && l <= 0xD7FF) ||
+		      (0xE000 <= l && l <= 0xFFFD) ||
+		      (0x10000 <= l && l <= 0x10FFFF))
+		    {
+		      gchar buf[8];
+		      char_str (l, buf);
+		      strcpy (to, buf);
+		      to += strlen (buf) - 1;
+		      from = end;
+		      if (l >= 0x80) /* not ascii */
+			mask |= 0x80;
+		    }
+		  else
+		    {
+		      set_unescape_error (context, error,
+					  from, G_MARKUP_ERROR_PARSE,
+					  _("Character reference '%-.*s' does not "
+					    "encode a permitted character"),
+					  end - from, from);
+		      return FALSE;
+		    }
+		}
             }
 
-          /* Move to next state */
-          p = g_utf8_next_char (p); /* past semicolon */
-          ucontext->state = USTATE_INSIDE_TEXT;
-        }
-      else
-        {
-          set_unescape_error (ucontext->context, error,
-                              start, ucontext->text_end,
-                              G_MARKUP_ERROR_PARSE,
-                              _("Empty character reference; "
-                                "should include a digit such as "
-                                "&#454;"));
-        }
-    }
-  else
-    {
-      set_unescape_error (ucontext->context, error,
-                          start, ucontext->text_end,
-                          G_MARKUP_ERROR_PARSE,
-                          _("Character reference did not end with a "
-                            "semicolon; "
-                            "most likely you used an ampersand "
-                            "character without intending to start "
-                            "an entity - escape ampersand as &amp;"));
+          else if (strncmp (from, "lt;", 3) == 0)
+	    {
+	      *to = '<';
+	      from += 2;
+	    }
+          else if (strncmp (from, "gt;", 3) == 0)
+	    {
+	      *to = '>';
+	      from += 2;
+	    }
+          else if (strncmp (from, "amp;", 4) == 0)
+	    {
+	      *to = '&';
+	      from += 3;
+	    }
+          else if (strncmp (from, "quot;", 5) == 0)
+	    {
+	      *to = '"';
+	      from += 4;
+	    }
+	  else if (strncmp (from, "apos;", 5) == 0)
+	    {
+	      *to = '\'';
+	      from += 4;
+	    }
+	  else
+	    {
+	      if (*from == ';')
+		set_unescape_error (context, error,
+				    from, G_MARKUP_ERROR_PARSE,
+				    _("Empty entity '&;' seen; valid "
+				      "entities are: &amp; &quot; &lt; &gt; &apos;"));
+	      else
+		{
+		  const char *end = strchr (from, ';');
+		  if (end)
+		    set_unescape_error (context, error,
+					from, G_MARKUP_ERROR_PARSE,
+					_("Entity name '%-.*s' is not known"),
+					end-from, from);
+		  else
+		    set_unescape_error (context, error,
+					from, G_MARKUP_ERROR_PARSE,
+					_("Entity did not end with a semicolon; "
+					  "most likely you used an ampersand "
+					  "character without intending to start "
+					  "an entity - escape ampersand as &amp;"));
+		}
+	      return FALSE;
+	    }
+	}
     }
 
-  return p;
-}
+  g_assert (to - string->str <= string->len);
+  if (to - string->str != string->len)
+    g_string_truncate (string, to - string->str);
 
-static gboolean
-unescape_text (GMarkupParseContext *context,
-               const gchar         *text,
-               const gchar         *text_end,
-               GString            **unescaped,
-               GError             **error)
-{
-  UnescapeContext ucontext;
-  const gchar *p;
-
-  ucontext.context = context;
-  ucontext.text = text;
-  ucontext.text_end = text_end;
-  ucontext.entity_start = NULL;
-  
-  ucontext.str = g_string_sized_new (text_end - text);
-
-  ucontext.state = USTATE_INSIDE_TEXT;
-  p = text;
-
-  while (p != text_end && context->state != STATE_ERROR)
-    {
-      g_assert (p < text_end);
-      
-      switch (ucontext.state)
-        {
-        case USTATE_INSIDE_TEXT:
-          {
-            p = unescape_text_state_inside_text (&ucontext,
-                                                 p,
-                                                 error);
-          }
-          break;
-
-        case USTATE_AFTER_AMPERSAND:
-          {
-            p = unescape_text_state_after_ampersand (&ucontext,
-                                                     p,
-                                                     error);
-          }
-          break;
-
-
-        case USTATE_INSIDE_ENTITY_NAME:
-          {
-            p = unescape_text_state_inside_entity_name (&ucontext,
-                                                        p,
-                                                        error);
-          }
-          break;
-
-        case USTATE_AFTER_CHARREF_HASH:
-          {
-            p = unescape_text_state_after_charref_hash (&ucontext,
-                                                        p,
-                                                        error);
-          }
-          break;
+  *is_ascii = !(mask & 0x80);
 
-        default:
-          g_assert_not_reached ();
-          break;
-        }
-    }
-
-  if (context->state != STATE_ERROR) 
-    {
-      switch (ucontext.state) 
-	{
-	case USTATE_INSIDE_TEXT:
-	  break;
-	case USTATE_AFTER_AMPERSAND:
-	case USTATE_INSIDE_ENTITY_NAME:
-	  set_unescape_error (context, error,
-			      NULL, NULL,
-			      G_MARKUP_ERROR_PARSE,
-			      _("Unfinished entity reference"));
-	  break;
-	case USTATE_AFTER_CHARREF_HASH:
-	  set_unescape_error (context, error,
-			      NULL, NULL,
-			      G_MARKUP_ERROR_PARSE,
-			      _("Unfinished character reference"));
-	  break;
-	}
-    }
-
-  if (context->state == STATE_ERROR)
-    {
-      g_string_free (ucontext.str, TRUE);
-      *unescaped = NULL;
-      return FALSE;
-    }
-  else
-    {
-      *unescaped = ucontext.str;
-      return TRUE;
-    }
+  return TRUE;
 }
 
 static inline gboolean
 advance_char (GMarkupParseContext *context)
 {  
-  context->iter = g_utf8_next_char (context->iter);
-  context->char_number += 1;
+  context->iter++;
+  context->char_number++;
 
-  if (context->iter == context->current_text_end)
-    {
+  if (G_UNLIKELY (context->iter == context->current_text_end))
       return FALSE;
-    }
-  else if (*context->iter == '\n')
+
+  else if (G_UNLIKELY (*context->iter == '\n'))
     {
-      context->line_number += 1;
+      context->line_number++;
       context->char_number = 1;
     }
   
@@ -833,37 +723,62 @@ advance_to_name_end (GMarkupParseContext *context)
 {
   do
     {
-      if (!is_name_char (context->iter))
+      if (IS_COMMON_NAME_END_CHAR (*(context->iter)))
         return;
+      if (xml_isspace (*(context->iter)))
+	return;
     }
   while (advance_char (context));
 }
 
 static void
+release_chunk (GMarkupParseContext *context, GString *str)
+{
+  GSList *node;
+  if (!str)
+    return;
+  if (str->allocated_len > 256)
+    { /* large strings are unusual and worth freeing */
+      g_string_free (str, TRUE);
+      return;
+    }
+  string_blank (str);
+  node = get_list_node (context, str);
+  context->spare_chunks = g_slist_concat (node, context->spare_chunks);
+}
+
+static void
 add_to_partial (GMarkupParseContext *context,
                 const gchar         *text_start,
                 const gchar         *text_end)
 {
   if (context->partial_chunk == NULL)
-    context->partial_chunk = g_string_sized_new (text_end - text_start);
+    { /* allocate a new chunk to parse into */
 
-  if (text_start != text_end)
-    g_string_append_len (context->partial_chunk, text_start,
-                         text_end - text_start);
+      if (context->spare_chunks != NULL)
+	{
+	  GSList *node = context->spare_chunks;
+	  context->spare_chunks = g_slist_remove_link (context->spare_chunks, node);
+	  context->partial_chunk = node->data;
+	  free_list_node (context, node);
+	}
+      else
+	context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start));
+    }
 
-  /* Invariant here that partial_chunk exists */
+  if (text_start != text_end)
+    g_string_insert_len (context->partial_chunk, -1,
+			 text_start, text_end - text_start);
 }
 
-static void
+static inline void
 truncate_partial (GMarkupParseContext *context)
 {
   if (context->partial_chunk != NULL)
-    {
-      context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
-    }
+    string_blank (context->partial_chunk);
 }
 
-static const gchar*
+static inline const gchar*
 current_element (GMarkupParseContext *context)
 {
   return context->tag_stack->data;
@@ -891,6 +806,30 @@ pop_subparser_stack (GMarkupParseContext *context)
 }
 
 static void
+push_partial_as_tag (GMarkupParseContext *context)
+{
+  GString *str = context->partial_chunk;
+  /* sadly, this is exported by gmarkup_get_element_stack as-is */
+  context->tag_stack = g_slist_concat (get_list_node (context, str->str), context->tag_stack);
+  context->tag_stack_gstr = g_slist_concat (get_list_node (context, str), context->tag_stack_gstr);
+  context->partial_chunk = NULL;
+}
+
+static void
+pop_tag (GMarkupParseContext *context)
+{
+  GSList *nodea, *nodeb;
+
+  nodea = context->tag_stack;
+  nodeb = context->tag_stack_gstr;
+  release_chunk (context, nodeb->data);
+  context->tag_stack = g_slist_remove_link (context->tag_stack, nodea);
+  context->tag_stack_gstr = g_slist_remove_link (context->tag_stack_gstr, nodeb);
+  free_list_node (context, nodea);
+  free_list_node (context, nodeb);
+}
+
+static void
 possibly_finish_subparser (GMarkupParseContext *context)
 {
   if (current_element (context) == context->subparser_element)
@@ -916,60 +855,42 @@ static const gchar*
 current_attribute (GMarkupParseContext *context)
 {
   g_assert (context->cur_attr >= 0);
-  return context->attr_names[context->cur_attr];
-}
-
-static void
-find_current_text_end (GMarkupParseContext *context)
-{
-  /* This function must be safe (non-segfaulting) on invalid UTF8.
-   * It assumes the string starts with a character start
-   */
-  const gchar *end = context->current_text + context->current_text_len;
-  const gchar *p;
-  const gchar *next;
-
-  g_assert (context->current_text_len > 0);
-
-  p = g_utf8_find_prev_char (context->current_text, end);
-
-  g_assert (p != NULL); /* since current_text was a char start */
-
-  /* p is now the start of the last character or character portion. */
-  g_assert (p != end);
-  next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
-
-  if (next == end)
-    {
-      /* whole character */
-      context->current_text_end = end;
-    }
-  else
-    {
-      /* portion */
-      context->leftover_char_portion = g_string_new_len (p, end - p);
-      context->current_text_len -= (end - p);
-      context->current_text_end = p;
-    }
+  return context->attr_names[context->cur_attr]->str;
 }
 
-
 static void
-add_attribute (GMarkupParseContext *context, char *name)
+add_attribute (GMarkupParseContext *context, GString *str)
 {
   if (context->cur_attr + 2 >= context->alloc_attrs)
     {
       context->alloc_attrs += 5; /* silly magic number */
-      context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
-      context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
+      context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs);
+      context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs);
     }
   context->cur_attr++;
-  context->attr_names[context->cur_attr] = name;
+  context->attr_names[context->cur_attr] = str;
   context->attr_values[context->cur_attr] = NULL;
   context->attr_names[context->cur_attr+1] = NULL;
   context->attr_values[context->cur_attr+1] = NULL;
 }
 
+static void
+clear_attributes (GMarkupParseContext *context)
+{
+  /* Go ahead and free the attributes. */
+  for (; context->cur_attr >= 0; context->cur_attr--)
+    {
+      int pos = context->cur_attr;
+      release_chunk (context, context->attr_names[pos]);
+      release_chunk (context, context->attr_values[pos]);
+      context->attr_names[pos] = context->attr_values[pos] = NULL;
+    }
+  g_assert (context->cur_attr == -1);
+  g_assert (context->attr_names == NULL ||
+	    context->attr_names[0] == NULL);
+  g_assert (context->attr_values == NULL ||
+	    context->attr_values[0] == NULL);
+}
 /**
  * g_markup_parse_context_parse:
  * @context: a #GMarkupParseContext
@@ -994,8 +915,6 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                               gssize               text_len,
                               GError             **error)
 {
-  const gchar *first_invalid;
-  
   g_return_val_if_fail (context != NULL, FALSE);
   g_return_val_if_fail (text != NULL, FALSE);
   g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
@@ -1009,130 +928,16 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
   
   context->parsing = TRUE;
   
-  if (context->leftover_char_portion)
-    {
-      const gchar *first_char;
-
-      if ((*text & 0xc0) != 0x80)
-        first_char = text;
-      else
-        first_char = g_utf8_find_next_char (text, text + text_len);
-
-      if (first_char)
-        {
-          /* leftover_char_portion was completed. Parse it. */
-          GString *portion = context->leftover_char_portion;
-          
-          g_string_append_len (context->leftover_char_portion,
-                               text, first_char - text);
-
-          /* hacks to allow recursion */
-          context->parsing = FALSE;
-          context->leftover_char_portion = NULL;
-          
-          if (!g_markup_parse_context_parse (context,
-                                             portion->str, portion->len,
-                                             error))
-            {
-              g_assert (context->state == STATE_ERROR);
-            }
-          
-          g_string_free (portion, TRUE);
-          context->parsing = TRUE;
-
-          /* Skip the fraction of char that was in this text */
-          text_len -= (first_char - text);
-          text = first_char;
-        }
-      else
-        {
-          /* another little chunk of the leftover char; geez
-           * someone is inefficient.
-           */
-          g_string_append_len (context->leftover_char_portion,
-                               text, text_len);
-
-          if (context->leftover_char_portion->len > 7)
-            {
-              /* The leftover char portion is too big to be
-               * a UTF-8 character
-               */
-              set_error_literal (context,
-                                 error,
-                                 G_MARKUP_ERROR_BAD_UTF8,
-                                 _("Invalid UTF-8 encoded text - overlong sequence"));
-            }
-          
-          goto finished;
-        }
-    }
 
   context->current_text = text;
   context->current_text_len = text_len;
+  context->current_text_end = context->current_text + text_len;
   context->iter = context->current_text;
   context->start = context->iter;
 
-  /* Nothing left after finishing the leftover char, or nothing
-   * passed in to begin with.
-   */
   if (context->current_text_len == 0)
     goto finished;
 
-  /* find_current_text_end () assumes the string starts at
-   * a character start, so we need to validate at least
-   * that much. It doesn't assume any following bytes
-   * are valid.
-   */
-  if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
-    {
-      set_error_literal (context,
-                         error,
-                         G_MARKUP_ERROR_BAD_UTF8,
-                         _("Invalid UTF-8 encoded text - not a start char"));
-      goto finished;
-    }
-
-  /* Initialize context->current_text_end, possibly adjusting
-   * current_text_len, and add any leftover char portion
-   */
-  find_current_text_end (context);
-
-  /* Validate UTF8 (must be done after we find the end, since
-   * we could have a trailing incomplete char)
-   */
-  if (!g_utf8_validate (context->current_text,
-			context->current_text_len,
-			&first_invalid))
-    {
-      gint newlines = 0;
-      const gchar *p, *q;
-      gchar *current_text_dup;
-
-      q = p = context->current_text;
-      while (p != first_invalid)
-        {
-          if (*p == '\n')
-            {
-              ++newlines;
-              q = p + 1;
-              context->char_number = 1;
-            }
-          ++p;
-        }
-
-      context->line_number += newlines;
-      context->char_number += g_utf8_strlen (q, first_invalid - q);
-
-      current_text_dup = g_strndup (context->current_text, context->current_text_len);
-      set_error (context,
-                 error,
-                 G_MARKUP_ERROR_BAD_UTF8,
-                 _("Invalid UTF-8 encoded text - not valid '%s'"),
-                 current_text_dup);
-      g_free (current_text_dup);
-      goto finished;
-    }
-
   while (context->iter != context->current_text_end)
     {
       switch (context->state)
@@ -1191,7 +996,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 
               context->state = STATE_AFTER_CLOSE_TAG_SLASH;
             }
-          else if (is_name_start_char (context->iter))
+          else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
             {
               context->state = STATE_INSIDE_OPEN_TAG_NAME;
 
@@ -1247,7 +1052,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
             tmp_error = NULL;
             if (context->parser->end_element)
               (* context->parser->end_element) (context,
-                                                context->tag_stack->data,
+						current_element (context),
                                                 context->user_data,
                                                 &tmp_error);
 
@@ -1279,10 +1084,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                                current_element (context));
                   }
               }
-
-            g_free (context->tag_stack->data);
-            context->tag_stack = g_slist_delete_link (context->tag_stack,
-                                                      context->tag_stack);
+	    pop_tag (context);
           }
           break;
 
@@ -1309,12 +1111,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                * if any; push it on the stack; enter next state.
                */
               add_to_partial (context, context->start, context->iter);
-              context->tag_stack =
-                g_slist_prepend (context->tag_stack,
-                                 g_string_free (context->partial_chunk,
-                                                FALSE));
-
-              context->partial_chunk = NULL;
+	      push_partial_as_tag (context);
 
               context->state = STATE_BETWEEN_ATTRIBUTES;
               context->start = NULL;
@@ -1345,7 +1142,10 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 	      /* The name has ended. Combine it with the partial chunk
 	       * if any; push it on the stack; enter next state.
 	       */
-              add_attribute (context, g_string_free (context->partial_chunk, FALSE));
+	      if (!name_validate (context, context->partial_chunk->str, error))
+		break;
+
+	      add_attribute (context, context->partial_chunk);
 	      
               context->partial_chunk = NULL;
               context->start = NULL;
@@ -1387,11 +1187,10 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                 }
               else if (*context->iter == '>')
                 {
-
                   advance_char (context);
                   context->state = STATE_AFTER_CLOSE_ANGLE;
                 }
-              else if (is_name_start_char (context->iter))
+              else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
                 {
                   context->state = STATE_INSIDE_ATTRIBUTE_NAME;
                   /* start of attribute name */
@@ -1419,44 +1218,35 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
               if (context->state == STATE_AFTER_ELISION_SLASH ||
                   context->state == STATE_AFTER_CLOSE_ANGLE)
                 {
-                  const gchar *start_name;
-		  /* Ugly, but the current code expects an empty array instead of NULL */
-		  const gchar *empty = NULL;
-                  const gchar **attr_names =  &empty;
-                  const gchar **attr_values = &empty;
+		  int i;
+		  const gchar *start_name;
+                  const gchar **attr_names;
+                  const gchar **attr_values;
                   GError *tmp_error;
 
-                  /* Call user callback for element start */
-                  start_name = current_element (context);
-
-		  if (context->cur_attr >= 0)
+		  attr_names = g_newa (const gchar *, context->cur_attr + 2);
+		  attr_values = g_newa (const gchar *, context->cur_attr + 2);
+		  for (i = 0; i < context->cur_attr + 1; i++)
 		    {
-		      attr_names = (const gchar**)context->attr_names;
-		      attr_values = (const gchar**)context->attr_values;
+		      attr_names[i] = context->attr_names[i]->str;
+		      attr_values[i] = context->attr_values[i]->str;
 		    }
+		  attr_names[i] = NULL;
+		  attr_values[i] = NULL;
 
+                  /* Call user callback for element start */
                   tmp_error = NULL;
-                  if (context->parser->start_element)
+		  start_name = current_element (context);
+
+                  if (context->parser->start_element &&
+		      name_validate (context, start_name, error))
                     (* context->parser->start_element) (context,
-                                                        start_name,
+							start_name,
                                                         (const gchar **)attr_names,
                                                         (const gchar **)attr_values,
                                                         context->user_data,
                                                         &tmp_error);
-
-                  /* Go ahead and free the attributes. */
-		  for (; context->cur_attr >= 0; context->cur_attr--)
-		    {
-		      int pos = context->cur_attr;
-		      g_free (context->attr_names[pos]);
-		      g_free (context->attr_values[pos]);
-		      context->attr_names[pos] = context->attr_values[pos] = NULL;
-		    }
-                  g_assert (context->cur_attr == -1);
-                  g_assert (context->attr_names == NULL ||
-                            context->attr_names[0] == NULL);
-                  g_assert (context->attr_values == NULL ||
-                            context->attr_values[0] == NULL);
+		  clear_attributes (context);
                   
                   if (tmp_error != NULL)
                     propagate_error (context, error, tmp_error);
@@ -1531,25 +1321,22 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
             }
           else
             {
+	      gboolean is_ascii;
               /* The value has ended at the quote mark. Combine it
                * with the partial chunk if any; set it for the current
                * attribute.
                */
-              GString *unescaped;
-              
               add_to_partial (context, context->start, context->iter);
 
               g_assert (context->cur_attr >= 0);
               
-              if (unescape_text (context,
-                                 context->partial_chunk->str,
-                                 context->partial_chunk->str +
-                                 context->partial_chunk->len,
-                                 &unescaped,
-                                 error))
+              if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) &&
+		  (is_ascii || text_validate (context, context->partial_chunk->str,
+					      context->partial_chunk->len, error)))
                 {
                   /* success, advance past quote and set state. */
-                  context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
+                  context->attr_values[context->cur_attr] = context->partial_chunk;
+		  context->partial_chunk = NULL;
                   advance_char (context);
                   context->state = STATE_BETWEEN_ATTRIBUTES;
                   context->start = NULL;
@@ -1576,30 +1363,25 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 
           if (context->iter != context->current_text_end)
             {
-              GString *unescaped = NULL;
+	      gboolean is_ascii;
 
               /* The text has ended at the open angle. Call the text
                * callback.
                */
               
-              if (unescape_text (context,
-                                 context->partial_chunk->str,
-                                 context->partial_chunk->str +
-                                 context->partial_chunk->len,
-                                 &unescaped,
-                                 error))
+              if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) &&
+		  (is_ascii || text_validate (context, context->partial_chunk->str,
+					      context->partial_chunk->len, error)))
                 {
                   GError *tmp_error = NULL;
 
                   if (context->parser->text)
                     (*context->parser->text) (context,
-                                              unescaped->str,
-                                              unescaped->len,
+					      context->partial_chunk->str,
+					      context->partial_chunk->len,
                                               context->user_data,
                                               &tmp_error);
                   
-                  g_string_free (unescaped, TRUE);
-
                   if (tmp_error == NULL)
                     {
                       /* advance past open angle and set state. */
@@ -1618,7 +1400,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 
         case STATE_AFTER_CLOSE_TAG_SLASH:
           /* Possible next state: INSIDE_CLOSE_TAG_NAME */
-          if (is_name_start_char (context->iter))
+          if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
             {
               context->state = STATE_INSIDE_CLOSE_TAG_NAME;
 
@@ -1656,13 +1438,9 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 	  
 	  if (context->iter != context->current_text_end)
 	    {
-	      gchar *close_name;
+	      GString *close_name;
 
-	      /* The name has ended. Combine it with the partial chunk
-	       * if any; check that it matches stack top and pop
-	       * stack; invoke proper callback; enter next state.
-	       */
-	      close_name = g_string_free (context->partial_chunk, FALSE);
+	      close_name = context->partial_chunk;
 	      context->partial_chunk = NULL;
               
 	      if (*context->iter != '>')
@@ -1676,7 +1454,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 			       "the close element name '%s'; the allowed "
 			       "character is '>'"),
 			     utf8_str (context->iter, buf),
-			     close_name);
+			     close_name->str);
 		}
 	      else if (context->tag_stack == NULL)
 		{
@@ -1685,16 +1463,16 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 			     G_MARKUP_ERROR_PARSE,
 			     _("Element '%s' was closed, no element "
 			       "is currently open"),
-			     close_name);
+			     close_name->str);
 		}
-	      else if (strcmp (close_name, current_element (context)) != 0)
+	      else if (strcmp (close_name->str, current_element (context)) != 0)
 		{
 		  set_error (context,
 			     error,
 			     G_MARKUP_ERROR_PARSE,
 			     _("Element '%s' was closed, but the currently "
 			       "open element is '%s'"),
-			     close_name,
+			     close_name->str,
 			     current_element (context));
 		}
 	      else
@@ -1710,22 +1488,18 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 		  tmp_error = NULL;
 		  if (context->parser->end_element)
 		    (* context->parser->end_element) (context,
-						      close_name,
+						      close_name->str,
 						      context->user_data,
 						      &tmp_error);
 		  
                   ensure_no_outstanding_subparser (context);
-		  
-		  /* Pop the tag stack */
-		  g_free (context->tag_stack->data);
-		  context->tag_stack = g_slist_delete_link (context->tag_stack,
-							    context->tag_stack);
+		  pop_tag (context);
 		  
 		  if (tmp_error)
                     propagate_error (context, error, tmp_error);
                 }
-	      
-              g_free (close_name);
+	      context->partial_chunk = close_name;
+	      truncate_partial (context);
             }
           break;
 	  
@@ -1784,14 +1558,22 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
 	      if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
 		  strncmp (context->partial_chunk->str, "<![CDATA[", 9) == 0)
 		{
-		  if (context->parser->text)
+		  if (context->parser->text &&
+		      text_validate (context,
+				     context->partial_chunk->str + 9,
+				     context->partial_chunk->len - 12,
+				     error))
 		    (*context->parser->text) (context,
 					      context->partial_chunk->str + 9,
 					      context->partial_chunk->len - 12,
 					      context->user_data,
 					      &tmp_error);
 		}
-	      else if (context->parser->passthrough)
+	      else if (context->parser->passthrough &&
+		       text_validate (context,
+				      context->partial_chunk->str,
+				      context->partial_chunk->len,
+				      error))
                 (*context->parser->passthrough) (context,
                                                  context->partial_chunk->str,
                                                  context->partial_chunk->len,
@@ -1999,7 +1781,6 @@ G_CONST_RETURN GSList *
 g_markup_parse_context_get_element_stack (GMarkupParseContext *context)
 {
   g_return_val_if_fail (context != NULL, NULL);
-
   return context->tag_stack;
 }
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]