[glib] Patch originally committed only to the ChangeLog & tests ...
- From: Michael Meeks <michael src gnome org>
- To: svn-commits-list gnome org
- Subject: [glib] Patch originally committed only to the ChangeLog & tests ...
- Date: Fri, 1 May 2009 10:23:22 -0400 (EDT)
commit 83699774fa669abfbc5c5c3dc9265308246bd4f6
Author: Michael Meeks <michael meeks novell com>
Date: Fri May 1 15:23:23 2009 +0100
Patch originally committed only to the ChangeLog & tests ...
Bug 572508 – gmarkup speedup ...
* glib/gmarkup.c: Various optimizations: do less allocations by
keeping a pool of GStrings, do in-place unescaping, avoid redundant
utf-8 validation.
---
glib/gmarkup.c | 1129 +++++++++++++++++++++++---------------------------------
1 files changed, 455 insertions(+), 674 deletions(-)
diff --git a/glib/gmarkup.c b/glib/gmarkup.c
index b1f4f46..35118ad 100644
--- a/glib/gmarkup.c
+++ b/glib/gmarkup.c
@@ -82,11 +82,15 @@ struct _GMarkupParseContext
* the callback for it.
*/
GString *partial_chunk;
+ GSList *spare_chunks;
GMarkupParseState state;
GSList *tag_stack;
- gchar **attr_names;
- gchar **attr_values;
+ GSList *tag_stack_gstr;
+ GSList *spare_list_nodes;
+
+ GString **attr_names;
+ GString **attr_values;
gint cur_attr;
gint alloc_attrs;
@@ -94,8 +98,6 @@ struct _GMarkupParseContext
gssize current_text_len;
const gchar *current_text_end;
- GString *leftover_char_portion;
-
/* used to save the start of the last interesting thingy */
const gchar *start;
@@ -112,6 +114,39 @@ struct _GMarkupParseContext
gpointer held_user_data;
};
+/*
+ * Helpers to reduce our allocation overhead, we have
+ * a well defined allocation lifecycle.
+ */
+static GSList *
+get_list_node (GMarkupParseContext *context, gpointer data)
+{
+ GSList *node;
+ if (context->spare_list_nodes != NULL)
+ {
+ node = context->spare_list_nodes;
+ context->spare_list_nodes = g_slist_remove_link (context->spare_list_nodes, node);
+ }
+ else
+ node = g_slist_alloc();
+ node->data = data;
+ return node;
+}
+
+static void
+free_list_node (GMarkupParseContext *context, GSList *node)
+{
+ node->data = NULL;
+ context->spare_list_nodes = g_slist_concat (node, context->spare_list_nodes);
+}
+
+static inline void
+string_blank (GString *string)
+{
+ string->str[0] = '\0';
+ string->len = 0;
+}
+
/**
* g_markup_parse_context_new:
* @parser: a #GMarkupParser
@@ -148,9 +183,12 @@ g_markup_parse_context_new (const GMarkupParser *parser,
context->char_number = 1;
context->partial_chunk = NULL;
+ context->spare_chunks = NULL;
+ context->spare_list_nodes = NULL;
context->state = STATE_START;
context->tag_stack = NULL;
+ context->tag_stack_gstr = NULL;
context->attr_names = NULL;
context->attr_values = NULL;
context->cur_attr = -1;
@@ -159,7 +197,6 @@ g_markup_parse_context_new (const GMarkupParser *parser,
context->current_text = NULL;
context->current_text_len = -1;
context->current_text_end = NULL;
- context->leftover_char_portion = NULL;
context->start = NULL;
context->iter = NULL;
@@ -179,6 +216,14 @@ g_markup_parse_context_new (const GMarkupParser *parser,
return context;
}
+static void
+string_full_free (gpointer ptr, gpointer user_data)
+{
+ g_string_free (ptr, TRUE);
+}
+
+static void clear_attributes (GMarkupParseContext *context);
+
/**
* g_markup_parse_context_free:
* @context: a #GMarkupParseContext
@@ -198,18 +243,21 @@ g_markup_parse_context_free (GMarkupParseContext *context)
if (context->dnotify)
(* context->dnotify) (context->user_data);
- g_strfreev (context->attr_names);
- g_strfreev (context->attr_values);
+ clear_attributes (context);
+ g_free (context->attr_names);
+ g_free (context->attr_values);
- g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
+ g_slist_foreach (context->tag_stack_gstr, string_full_free, NULL);
+ g_slist_free (context->tag_stack_gstr);
g_slist_free (context->tag_stack);
+ g_slist_foreach (context->spare_chunks, string_full_free, NULL);
+ g_slist_free (context->spare_chunks);
+ g_slist_free (context->spare_list_nodes);
+
if (context->partial_chunk)
g_string_free (context->partial_chunk, TRUE);
- if (context->leftover_char_portion)
- g_string_free (context->leftover_char_portion, TRUE);
-
g_free (context);
}
@@ -301,43 +349,101 @@ propagate_error (GMarkupParseContext *context,
g_propagate_error (dest, src);
}
-/* To make these faster, we first use the ascii-only tests, then check
- * for the usual non-alnum name-end chars, and only then call the
- * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
- * names, so this is a reasonable hack that virtually always avoids
- * the guniprop call.
- */
#define IS_COMMON_NAME_END_CHAR(c) \
((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
static gboolean
-is_name_start_char (const gchar *p)
+slow_name_validate (GMarkupParseContext *context, const char *name, GError **error)
{
- if (g_ascii_isalpha (*p) ||
- (!IS_COMMON_NAME_END_CHAR (*p) &&
- (*p == '_' ||
- *p == ':' ||
- g_unichar_isalpha (g_utf8_get_char (p)))))
- return TRUE;
- else
- return FALSE;
+ const char *p = name;
+
+ if (!g_utf8_validate (name, strlen (name), NULL))
+ {
+ set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
+ _("Invalid UTF-8 encoded text in name - not valid '%s'"), name);
+ return FALSE;
+ }
+
+ if (!(g_ascii_isalpha (*p) ||
+ (!IS_COMMON_NAME_END_CHAR (*p) &&
+ (*p == '_' ||
+ *p == ':' ||
+ g_unichar_isalpha (g_utf8_get_char (p))))))
+ {
+ set_error (context, error, G_MARKUP_ERROR_PARSE,
+ _("'%s' is not a valid name "), name);
+ return FALSE;
+ }
+
+ for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p))
+ {
+ /* is_name_char */
+ if (!(g_ascii_isalnum (*p) ||
+ (!IS_COMMON_NAME_END_CHAR (*p) &&
+ (*p == '.' ||
+ *p == '-' ||
+ *p == '_' ||
+ *p == ':' ||
+ g_unichar_isalpha (g_utf8_get_char (p))))))
+ {
+ set_error (context, error, G_MARKUP_ERROR_PARSE,
+ _("'%s' is not a valid name: '%c' "), name, *p);
+ return FALSE;
+ }
+ }
+ return TRUE;
}
+/*
+ * Use me for elements, attributes etc.
+ */
static gboolean
-is_name_char (const gchar *p)
+name_validate (GMarkupParseContext *context, const char *name, GError **error)
{
- if (g_ascii_isalnum (*p) ||
- (!IS_COMMON_NAME_END_CHAR (*p) &&
- (*p == '.' ||
- *p == '-' ||
- *p == '_' ||
- *p == ':' ||
- g_unichar_isalpha (g_utf8_get_char (p)))))
- return TRUE;
- else
- return FALSE;
+ char mask;
+ const char *p;
+
+ /* name start char */
+ p = name;
+ if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) ||
+ !(g_ascii_isalpha (*p) || *p == '_' || *p == ':')))
+ goto slow_validate;
+
+ for (mask = *p++; *p != '\0'; p++)
+ {
+ mask |= *p;
+
+ /* is_name_char */
+ if (G_UNLIKELY (!(g_ascii_isalnum (*p) ||
+ (!IS_COMMON_NAME_END_CHAR (*p) &&
+ (*p == '.' ||
+ *p == '-' ||
+ *p == '_' ||
+ *p == ':')))))
+ goto slow_validate;
+ }
+
+ if (mask & 0x80) /* un-common / non-ascii */
+ goto slow_validate;
+
+ return TRUE;
+
+ slow_validate:
+ return slow_name_validate (context, name, error);
}
+static gboolean
+text_validate (GMarkupParseContext *context, const char *p, int len, GError **error)
+{
+ if (!g_utf8_validate (p, len, NULL))
+ {
+ set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
+ _("Invalid UTF-8 encoded text in name - not valid '%s'"), p);
+ return FALSE;
+ }
+ else
+ return TRUE;
+}
static gchar*
char_str (gunichar c,
@@ -360,7 +466,6 @@ static void
set_unescape_error (GMarkupParseContext *context,
GError **error,
const gchar *remaining_text,
- const gchar *remaining_text_end,
GMarkupError code,
const gchar *format,
...)
@@ -373,7 +478,7 @@ set_unescape_error (GMarkupParseContext *context,
remaining_newlines = 0;
p = remaining_text;
- while (p != remaining_text_end)
+ while (*p != '\0')
{
if (*p == '\n')
++remaining_newlines;
@@ -397,414 +502,199 @@ set_unescape_error (GMarkupParseContext *context,
g_propagate_error (error, tmp_error);
}
-typedef enum
-{
- USTATE_INSIDE_TEXT,
- USTATE_AFTER_AMPERSAND,
- USTATE_INSIDE_ENTITY_NAME,
- USTATE_AFTER_CHARREF_HASH
-} UnescapeState;
-
-typedef struct
-{
- GMarkupParseContext *context;
- GString *str;
- UnescapeState state;
- const gchar *text;
- const gchar *text_end;
- const gchar *entity_start;
-} UnescapeContext;
-
-static const gchar*
-unescape_text_state_inside_text (UnescapeContext *ucontext,
- const gchar *p,
- GError **error)
+/*
+ * re-write the GString in-place, unescaping anything that escaped.
+ * most XML does not contain entities, or escaping.
+ */
+static gboolean
+unescape_gstring_inplace (GMarkupParseContext *context,
+ GString *string,
+ gboolean *is_ascii,
+ GError **error)
{
- const gchar *start;
+ char mask, *to;
+ int line_num = 1;
+ const char *from;
gboolean normalize_attribute;
- if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
- ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
+ *is_ascii = FALSE;
+
+ /* are we unescaping an attribute or not ? */
+ if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
+ context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
normalize_attribute = TRUE;
else
normalize_attribute = FALSE;
- start = p;
-
- while (p != ucontext->text_end)
- {
- if (*p == '&')
- {
- break;
- }
- else if (normalize_attribute && (*p == '\t' || *p == '\n'))
- {
- g_string_append_len (ucontext->str, start, p - start);
- g_string_append_c (ucontext->str, ' ');
- p = g_utf8_next_char (p);
- start = p;
- }
- else if (*p == '\r')
- {
- g_string_append_len (ucontext->str, start, p - start);
- g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
- p = g_utf8_next_char (p);
- if (p != ucontext->text_end && *p == '\n')
- p = g_utf8_next_char (p);
- start = p;
- }
- else
- p = g_utf8_next_char (p);
- }
-
- if (p != start)
- g_string_append_len (ucontext->str, start, p - start);
-
- if (p != ucontext->text_end && *p == '&')
- {
- p = g_utf8_next_char (p);
- ucontext->state = USTATE_AFTER_AMPERSAND;
- }
-
- return p;
-}
-
-static const gchar*
-unescape_text_state_after_ampersand (UnescapeContext *ucontext,
- const gchar *p,
- GError **error)
-{
- ucontext->entity_start = NULL;
-
- if (*p == '#')
- {
- p = g_utf8_next_char (p);
-
- ucontext->entity_start = p;
- ucontext->state = USTATE_AFTER_CHARREF_HASH;
- }
- else if (!is_name_start_char (p))
- {
- if (*p == ';')
- {
- set_unescape_error (ucontext->context, error,
- p, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Empty entity '&;' seen; valid "
- "entities are: &amp; &quot; &lt; &gt; &apos;"));
- }
- else
- {
- gchar buf[8];
-
- set_unescape_error (ucontext->context, error,
- p, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Character '%s' is not valid at "
- "the start of an entity name; "
- "the & character begins an entity; "
- "if this ampersand isn't supposed "
- "to be an entity, escape it as "
- "&amp;"),
- utf8_str (p, buf));
- }
- }
- else
- {
- ucontext->entity_start = p;
- ucontext->state = USTATE_INSIDE_ENTITY_NAME;
- }
-
- return p;
-}
-
-static const gchar*
-unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
- const gchar *p,
- GError **error)
-{
- while (p != ucontext->text_end)
- {
- if (*p == ';')
- break;
- else if (!is_name_char (p))
- {
- gchar ubuf[8];
-
- set_unescape_error (ucontext->context, error,
- p, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Character '%s' is not valid "
- "inside an entity name"),
- utf8_str (p, ubuf));
- break;
- }
-
- p = g_utf8_next_char (p);
- }
-
- if (ucontext->context->state != STATE_ERROR)
- {
- if (p != ucontext->text_end)
- {
- gint len = p - ucontext->entity_start;
-
- /* move to after semicolon */
- p = g_utf8_next_char (p);
- ucontext->state = USTATE_INSIDE_TEXT;
-
- if (strncmp (ucontext->entity_start, "lt", len) == 0)
- g_string_append_c (ucontext->str, '<');
- else if (strncmp (ucontext->entity_start, "gt", len) == 0)
- g_string_append_c (ucontext->str, '>');
- else if (strncmp (ucontext->entity_start, "amp", len) == 0)
- g_string_append_c (ucontext->str, '&');
- else if (strncmp (ucontext->entity_start, "quot", len) == 0)
- g_string_append_c (ucontext->str, '"');
- else if (strncmp (ucontext->entity_start, "apos", len) == 0)
- g_string_append_c (ucontext->str, '\'');
- else
- {
- gchar *name;
-
- name = g_strndup (ucontext->entity_start, len);
- set_unescape_error (ucontext->context, error,
- p, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Entity name '%s' is not known"),
- name);
- g_free (name);
- }
- }
- else
- {
- set_unescape_error (ucontext->context, error,
- /* give line number of the & */
- ucontext->entity_start, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Entity did not end with a semicolon; "
- "most likely you used an ampersand "
- "character without intending to start "
- "an entity - escape ampersand as &amp;"));
- }
- }
-#undef MAX_ENT_LEN
-
- return p;
-}
-
-static const gchar*
-unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
- const gchar *p,
- GError **error)
-{
- gboolean is_hex = FALSE;
- const char *start;
-
- start = ucontext->entity_start;
-
- if (*p == 'x')
- {
- is_hex = TRUE;
- p = g_utf8_next_char (p);
- start = p;
- }
-
- while (p != ucontext->text_end && *p != ';')
- p = g_utf8_next_char (p);
-
- if (p != ucontext->text_end)
+ /*
+ * Meeks' theorum: unescaping can only shrink text.
+ * for &lt; etc. this is obvious, for &#xffff; more
+ * thought is required, but this is patently so.
+ */
+ mask = 0;
+ for (from = to = string->str; *from != '\0'; from++, to++)
{
- g_assert (*p == ';');
+ *to = *from;
+
+ mask |= *to;
+ if (*to == '\n')
+ line_num++;
+ if (normalize_attribute && (*to == '\t' || *to == '\n'))
+ *to = ' ';
+ if (*to == '\r')
+ {
+ *to = normalize_attribute ? ' ' : '\n';
+ if (from[1] == '\n')
+ from++;
+ }
+ if (*from == '&')
+ {
+ from++;
+ if (*from == '#')
+ {
+ gboolean is_hex = FALSE;
+ gulong l;
+ gchar *end = NULL;
- /* digit is between start and p */
+ from++;
- if (start != p)
- {
- gulong l;
- gchar *end = NULL;
-
- errno = 0;
- if (is_hex)
- l = strtoul (start, &end, 16);
- else
- l = strtoul (start, &end, 10);
+ if (*from == 'x')
+ {
+ is_hex = TRUE;
+ from++;
+ }
- if (end != p || errno != 0)
- {
- set_unescape_error (ucontext->context, error,
- start, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Failed to parse '%-.*s', which "
- "should have been a digit "
- "inside a character reference "
- "(&#234; for example) - perhaps "
- "the digit is too large"),
- p - start, start);
- }
- else
- {
- /* characters XML 1.1 permits */
- if ((0 < l && l <= 0xD7FF) ||
- (0xE000 <= l && l <= 0xFFFD) ||
- (0x10000 <= l && l <= 0x10FFFF))
- {
- gchar buf[8];
- g_string_append (ucontext->str, char_str (l, buf));
- }
- else
- {
- set_unescape_error (ucontext->context, error,
- start, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Character reference '%-.*s' does not "
- "encode a permitted character"),
- p - start, start);
- }
+ /* digit is between start and p */
+ errno = 0;
+ if (is_hex)
+ l = strtoul (from, &end, 16);
+ else
+ l = strtoul (from, &end, 10);
+
+ if (end == from || errno != 0)
+ {
+ set_unescape_error (context, error,
+ from, G_MARKUP_ERROR_PARSE,
+ _("Failed to parse '%-.*s', which "
+ "should have been a digit "
+ "inside a character reference "
+ "(&#234; for example) - perhaps "
+ "the digit is too large"),
+ end - from, from);
+ return FALSE;
+ }
+ else if (*end != ';')
+ {
+ set_unescape_error (context, error,
+ from, G_MARKUP_ERROR_PARSE,
+ _("Character reference did not end with a "
+ "semicolon; "
+ "most likely you used an ampersand "
+ "character without intending to start "
+ "an entity - escape ampersand as &amp;"));
+ return FALSE;
+ }
+ else
+ {
+ /* characters XML 1.1 permits */
+ if ((0 < l && l <= 0xD7FF) ||
+ (0xE000 <= l && l <= 0xFFFD) ||
+ (0x10000 <= l && l <= 0x10FFFF))
+ {
+ gchar buf[8];
+ char_str (l, buf);
+ strcpy (to, buf);
+ to += strlen (buf) - 1;
+ from = end;
+ if (l >= 0x80) /* not ascii */
+ mask |= 0x80;
+ }
+ else
+ {
+ set_unescape_error (context, error,
+ from, G_MARKUP_ERROR_PARSE,
+ _("Character reference '%-.*s' does not "
+ "encode a permitted character"),
+ end - from, from);
+ return FALSE;
+ }
+ }
}
- /* Move to next state */
- p = g_utf8_next_char (p); /* past semicolon */
- ucontext->state = USTATE_INSIDE_TEXT;
- }
- else
- {
- set_unescape_error (ucontext->context, error,
- start, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Empty character reference; "
- "should include a digit such as "
- "&#454;"));
- }
- }
- else
- {
- set_unescape_error (ucontext->context, error,
- start, ucontext->text_end,
- G_MARKUP_ERROR_PARSE,
- _("Character reference did not end with a "
- "semicolon; "
- "most likely you used an ampersand "
- "character without intending to start "
- "an entity - escape ampersand as &amp;"));
+ else if (strncmp (from, "lt;", 3) == 0)
+ {
+ *to = '<';
+ from += 2;
+ }
+ else if (strncmp (from, "gt;", 3) == 0)
+ {
+ *to = '>';
+ from += 2;
+ }
+ else if (strncmp (from, "amp;", 4) == 0)
+ {
+ *to = '&';
+ from += 3;
+ }
+ else if (strncmp (from, "quot;", 5) == 0)
+ {
+ *to = '"';
+ from += 4;
+ }
+ else if (strncmp (from, "apos;", 5) == 0)
+ {
+ *to = '\'';
+ from += 4;
+ }
+ else
+ {
+ if (*from == ';')
+ set_unescape_error (context, error,
+ from, G_MARKUP_ERROR_PARSE,
+ _("Empty entity '&;' seen; valid "
+ "entities are: &amp; &quot; &lt; &gt; &apos;"));
+ else
+ {
+ const char *end = strchr (from, ';');
+ if (end)
+ set_unescape_error (context, error,
+ from, G_MARKUP_ERROR_PARSE,
+ _("Entity name '%-.*s' is not known"),
+ end-from, from);
+ else
+ set_unescape_error (context, error,
+ from, G_MARKUP_ERROR_PARSE,
+ _("Entity did not end with a semicolon; "
+ "most likely you used an ampersand "
+ "character without intending to start "
+ "an entity - escape ampersand as &amp;"));
+ }
+ return FALSE;
+ }
+ }
}
- return p;
-}
+ g_assert (to - string->str <= string->len);
+ if (to - string->str != string->len)
+ g_string_truncate (string, to - string->str);
-static gboolean
-unescape_text (GMarkupParseContext *context,
- const gchar *text,
- const gchar *text_end,
- GString **unescaped,
- GError **error)
-{
- UnescapeContext ucontext;
- const gchar *p;
-
- ucontext.context = context;
- ucontext.text = text;
- ucontext.text_end = text_end;
- ucontext.entity_start = NULL;
-
- ucontext.str = g_string_sized_new (text_end - text);
-
- ucontext.state = USTATE_INSIDE_TEXT;
- p = text;
-
- while (p != text_end && context->state != STATE_ERROR)
- {
- g_assert (p < text_end);
-
- switch (ucontext.state)
- {
- case USTATE_INSIDE_TEXT:
- {
- p = unescape_text_state_inside_text (&ucontext,
- p,
- error);
- }
- break;
-
- case USTATE_AFTER_AMPERSAND:
- {
- p = unescape_text_state_after_ampersand (&ucontext,
- p,
- error);
- }
- break;
-
-
- case USTATE_INSIDE_ENTITY_NAME:
- {
- p = unescape_text_state_inside_entity_name (&ucontext,
- p,
- error);
- }
- break;
-
- case USTATE_AFTER_CHARREF_HASH:
- {
- p = unescape_text_state_after_charref_hash (&ucontext,
- p,
- error);
- }
- break;
+ *is_ascii = !(mask & 0x80);
- default:
- g_assert_not_reached ();
- break;
- }
- }
-
- if (context->state != STATE_ERROR)
- {
- switch (ucontext.state)
- {
- case USTATE_INSIDE_TEXT:
- break;
- case USTATE_AFTER_AMPERSAND:
- case USTATE_INSIDE_ENTITY_NAME:
- set_unescape_error (context, error,
- NULL, NULL,
- G_MARKUP_ERROR_PARSE,
- _("Unfinished entity reference"));
- break;
- case USTATE_AFTER_CHARREF_HASH:
- set_unescape_error (context, error,
- NULL, NULL,
- G_MARKUP_ERROR_PARSE,
- _("Unfinished character reference"));
- break;
- }
- }
-
- if (context->state == STATE_ERROR)
- {
- g_string_free (ucontext.str, TRUE);
- *unescaped = NULL;
- return FALSE;
- }
- else
- {
- *unescaped = ucontext.str;
- return TRUE;
- }
+ return TRUE;
}
static inline gboolean
advance_char (GMarkupParseContext *context)
{
- context->iter = g_utf8_next_char (context->iter);
- context->char_number += 1;
+ context->iter++;
+ context->char_number++;
- if (context->iter == context->current_text_end)
- {
+ if (G_UNLIKELY (context->iter == context->current_text_end))
return FALSE;
- }
- else if (*context->iter == '\n')
+
+ else if (G_UNLIKELY (*context->iter == '\n'))
{
- context->line_number += 1;
+ context->line_number++;
context->char_number = 1;
}
@@ -833,37 +723,62 @@ advance_to_name_end (GMarkupParseContext *context)
{
do
{
- if (!is_name_char (context->iter))
+ if (IS_COMMON_NAME_END_CHAR (*(context->iter)))
return;
+ if (xml_isspace (*(context->iter)))
+ return;
}
while (advance_char (context));
}
static void
+release_chunk (GMarkupParseContext *context, GString *str)
+{
+ GSList *node;
+ if (!str)
+ return;
+ if (str->allocated_len > 256)
+ { /* large strings are unusual and worth freeing */
+ g_string_free (str, TRUE);
+ return;
+ }
+ string_blank (str);
+ node = get_list_node (context, str);
+ context->spare_chunks = g_slist_concat (node, context->spare_chunks);
+}
+
+static void
add_to_partial (GMarkupParseContext *context,
const gchar *text_start,
const gchar *text_end)
{
if (context->partial_chunk == NULL)
- context->partial_chunk = g_string_sized_new (text_end - text_start);
+ { /* allocate a new chunk to parse into */
- if (text_start != text_end)
- g_string_append_len (context->partial_chunk, text_start,
- text_end - text_start);
+ if (context->spare_chunks != NULL)
+ {
+ GSList *node = context->spare_chunks;
+ context->spare_chunks = g_slist_remove_link (context->spare_chunks, node);
+ context->partial_chunk = node->data;
+ free_list_node (context, node);
+ }
+ else
+ context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start));
+ }
- /* Invariant here that partial_chunk exists */
+ if (text_start != text_end)
+ g_string_insert_len (context->partial_chunk, -1,
+ text_start, text_end - text_start);
}
-static void
+static inline void
truncate_partial (GMarkupParseContext *context)
{
if (context->partial_chunk != NULL)
- {
- context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
- }
+ string_blank (context->partial_chunk);
}
-static const gchar*
+static inline const gchar*
current_element (GMarkupParseContext *context)
{
return context->tag_stack->data;
@@ -891,6 +806,30 @@ pop_subparser_stack (GMarkupParseContext *context)
}
static void
+push_partial_as_tag (GMarkupParseContext *context)
+{
+ GString *str = context->partial_chunk;
+ /* sadly, this is exported by gmarkup_get_element_stack as-is */
+ context->tag_stack = g_slist_concat (get_list_node (context, str->str), context->tag_stack);
+ context->tag_stack_gstr = g_slist_concat (get_list_node (context, str), context->tag_stack_gstr);
+ context->partial_chunk = NULL;
+}
+
+static void
+pop_tag (GMarkupParseContext *context)
+{
+ GSList *nodea, *nodeb;
+
+ nodea = context->tag_stack;
+ nodeb = context->tag_stack_gstr;
+ release_chunk (context, nodeb->data);
+ context->tag_stack = g_slist_remove_link (context->tag_stack, nodea);
+ context->tag_stack_gstr = g_slist_remove_link (context->tag_stack_gstr, nodeb);
+ free_list_node (context, nodea);
+ free_list_node (context, nodeb);
+}
+
+static void
possibly_finish_subparser (GMarkupParseContext *context)
{
if (current_element (context) == context->subparser_element)
@@ -916,60 +855,42 @@ static const gchar*
current_attribute (GMarkupParseContext *context)
{
g_assert (context->cur_attr >= 0);
- return context->attr_names[context->cur_attr];
-}
-
-static void
-find_current_text_end (GMarkupParseContext *context)
-{
- /* This function must be safe (non-segfaulting) on invalid UTF8.
- * It assumes the string starts with a character start
- */
- const gchar *end = context->current_text + context->current_text_len;
- const gchar *p;
- const gchar *next;
-
- g_assert (context->current_text_len > 0);
-
- p = g_utf8_find_prev_char (context->current_text, end);
-
- g_assert (p != NULL); /* since current_text was a char start */
-
- /* p is now the start of the last character or character portion. */
- g_assert (p != end);
- next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
-
- if (next == end)
- {
- /* whole character */
- context->current_text_end = end;
- }
- else
- {
- /* portion */
- context->leftover_char_portion = g_string_new_len (p, end - p);
- context->current_text_len -= (end - p);
- context->current_text_end = p;
- }
+ return context->attr_names[context->cur_attr]->str;
}
-
static void
-add_attribute (GMarkupParseContext *context, char *name)
+add_attribute (GMarkupParseContext *context, GString *str)
{
if (context->cur_attr + 2 >= context->alloc_attrs)
{
context->alloc_attrs += 5; /* silly magic number */
- context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
- context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
+ context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs);
+ context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs);
}
context->cur_attr++;
- context->attr_names[context->cur_attr] = name;
+ context->attr_names[context->cur_attr] = str;
context->attr_values[context->cur_attr] = NULL;
context->attr_names[context->cur_attr+1] = NULL;
context->attr_values[context->cur_attr+1] = NULL;
}
+static void
+clear_attributes (GMarkupParseContext *context)
+{
+ /* Go ahead and free the attributes. */
+ for (; context->cur_attr >= 0; context->cur_attr--)
+ {
+ int pos = context->cur_attr;
+ release_chunk (context, context->attr_names[pos]);
+ release_chunk (context, context->attr_values[pos]);
+ context->attr_names[pos] = context->attr_values[pos] = NULL;
+ }
+ g_assert (context->cur_attr == -1);
+ g_assert (context->attr_names == NULL ||
+ context->attr_names[0] == NULL);
+ g_assert (context->attr_values == NULL ||
+ context->attr_values[0] == NULL);
+}
/**
* g_markup_parse_context_parse:
* @context: a #GMarkupParseContext
@@ -994,8 +915,6 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
gssize text_len,
GError **error)
{
- const gchar *first_invalid;
-
g_return_val_if_fail (context != NULL, FALSE);
g_return_val_if_fail (text != NULL, FALSE);
g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
@@ -1009,130 +928,16 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
context->parsing = TRUE;
- if (context->leftover_char_portion)
- {
- const gchar *first_char;
-
- if ((*text & 0xc0) != 0x80)
- first_char = text;
- else
- first_char = g_utf8_find_next_char (text, text + text_len);
-
- if (first_char)
- {
- /* leftover_char_portion was completed. Parse it. */
- GString *portion = context->leftover_char_portion;
-
- g_string_append_len (context->leftover_char_portion,
- text, first_char - text);
-
- /* hacks to allow recursion */
- context->parsing = FALSE;
- context->leftover_char_portion = NULL;
-
- if (!g_markup_parse_context_parse (context,
- portion->str, portion->len,
- error))
- {
- g_assert (context->state == STATE_ERROR);
- }
-
- g_string_free (portion, TRUE);
- context->parsing = TRUE;
-
- /* Skip the fraction of char that was in this text */
- text_len -= (first_char - text);
- text = first_char;
- }
- else
- {
- /* another little chunk of the leftover char; geez
- * someone is inefficient.
- */
- g_string_append_len (context->leftover_char_portion,
- text, text_len);
-
- if (context->leftover_char_portion->len > 7)
- {
- /* The leftover char portion is too big to be
- * a UTF-8 character
- */
- set_error_literal (context,
- error,
- G_MARKUP_ERROR_BAD_UTF8,
- _("Invalid UTF-8 encoded text - overlong sequence"));
- }
-
- goto finished;
- }
- }
context->current_text = text;
context->current_text_len = text_len;
+ context->current_text_end = context->current_text + text_len;
context->iter = context->current_text;
context->start = context->iter;
- /* Nothing left after finishing the leftover char, or nothing
- * passed in to begin with.
- */
if (context->current_text_len == 0)
goto finished;
- /* find_current_text_end () assumes the string starts at
- * a character start, so we need to validate at least
- * that much. It doesn't assume any following bytes
- * are valid.
- */
- if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
- {
- set_error_literal (context,
- error,
- G_MARKUP_ERROR_BAD_UTF8,
- _("Invalid UTF-8 encoded text - not a start char"));
- goto finished;
- }
-
- /* Initialize context->current_text_end, possibly adjusting
- * current_text_len, and add any leftover char portion
- */
- find_current_text_end (context);
-
- /* Validate UTF8 (must be done after we find the end, since
- * we could have a trailing incomplete char)
- */
- if (!g_utf8_validate (context->current_text,
- context->current_text_len,
- &first_invalid))
- {
- gint newlines = 0;
- const gchar *p, *q;
- gchar *current_text_dup;
-
- q = p = context->current_text;
- while (p != first_invalid)
- {
- if (*p == '\n')
- {
- ++newlines;
- q = p + 1;
- context->char_number = 1;
- }
- ++p;
- }
-
- context->line_number += newlines;
- context->char_number += g_utf8_strlen (q, first_invalid - q);
-
- current_text_dup = g_strndup (context->current_text, context->current_text_len);
- set_error (context,
- error,
- G_MARKUP_ERROR_BAD_UTF8,
- _("Invalid UTF-8 encoded text - not valid '%s'"),
- current_text_dup);
- g_free (current_text_dup);
- goto finished;
- }
-
while (context->iter != context->current_text_end)
{
switch (context->state)
@@ -1191,7 +996,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
context->state = STATE_AFTER_CLOSE_TAG_SLASH;
}
- else if (is_name_start_char (context->iter))
+ else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
{
context->state = STATE_INSIDE_OPEN_TAG_NAME;
@@ -1247,7 +1052,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
tmp_error = NULL;
if (context->parser->end_element)
(* context->parser->end_element) (context,
- context->tag_stack->data,
+ current_element (context),
context->user_data,
&tmp_error);
@@ -1279,10 +1084,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
current_element (context));
}
}
-
- g_free (context->tag_stack->data);
- context->tag_stack = g_slist_delete_link (context->tag_stack,
- context->tag_stack);
+ pop_tag (context);
}
break;
@@ -1309,12 +1111,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
* if any; push it on the stack; enter next state.
*/
add_to_partial (context, context->start, context->iter);
- context->tag_stack =
- g_slist_prepend (context->tag_stack,
- g_string_free (context->partial_chunk,
- FALSE));
-
- context->partial_chunk = NULL;
+ push_partial_as_tag (context);
context->state = STATE_BETWEEN_ATTRIBUTES;
context->start = NULL;
@@ -1345,7 +1142,10 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
/* The name has ended. Combine it with the partial chunk
* if any; push it on the stack; enter next state.
*/
- add_attribute (context, g_string_free (context->partial_chunk, FALSE));
+ if (!name_validate (context, context->partial_chunk->str, error))
+ break;
+
+ add_attribute (context, context->partial_chunk);
context->partial_chunk = NULL;
context->start = NULL;
@@ -1387,11 +1187,10 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
}
else if (*context->iter == '>')
{
-
advance_char (context);
context->state = STATE_AFTER_CLOSE_ANGLE;
}
- else if (is_name_start_char (context->iter))
+ else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
{
context->state = STATE_INSIDE_ATTRIBUTE_NAME;
/* start of attribute name */
@@ -1419,44 +1218,35 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
if (context->state == STATE_AFTER_ELISION_SLASH ||
context->state == STATE_AFTER_CLOSE_ANGLE)
{
- const gchar *start_name;
- /* Ugly, but the current code expects an empty array instead of NULL */
- const gchar *empty = NULL;
- const gchar **attr_names = &empty;
- const gchar **attr_values = &empty;
+ int i;
+ const gchar *start_name;
+ const gchar **attr_names;
+ const gchar **attr_values;
GError *tmp_error;
- /* Call user callback for element start */
- start_name = current_element (context);
-
- if (context->cur_attr >= 0)
+ attr_names = g_newa (const gchar *, context->cur_attr + 2);
+ attr_values = g_newa (const gchar *, context->cur_attr + 2);
+ for (i = 0; i < context->cur_attr + 1; i++)
{
- attr_names = (const gchar**)context->attr_names;
- attr_values = (const gchar**)context->attr_values;
+ attr_names[i] = context->attr_names[i]->str;
+ attr_values[i] = context->attr_values[i]->str;
}
+ attr_names[i] = NULL;
+ attr_values[i] = NULL;
+ /* Call user callback for element start */
tmp_error = NULL;
- if (context->parser->start_element)
+ start_name = current_element (context);
+
+ if (context->parser->start_element &&
+ name_validate (context, start_name, error))
(* context->parser->start_element) (context,
- start_name,
+ start_name,
(const gchar **)attr_names,
(const gchar **)attr_values,
context->user_data,
&tmp_error);
-
- /* Go ahead and free the attributes. */
- for (; context->cur_attr >= 0; context->cur_attr--)
- {
- int pos = context->cur_attr;
- g_free (context->attr_names[pos]);
- g_free (context->attr_values[pos]);
- context->attr_names[pos] = context->attr_values[pos] = NULL;
- }
- g_assert (context->cur_attr == -1);
- g_assert (context->attr_names == NULL ||
- context->attr_names[0] == NULL);
- g_assert (context->attr_values == NULL ||
- context->attr_values[0] == NULL);
+ clear_attributes (context);
if (tmp_error != NULL)
propagate_error (context, error, tmp_error);
@@ -1531,25 +1321,22 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
}
else
{
+ gboolean is_ascii;
/* The value has ended at the quote mark. Combine it
* with the partial chunk if any; set it for the current
* attribute.
*/
- GString *unescaped;
-
add_to_partial (context, context->start, context->iter);
g_assert (context->cur_attr >= 0);
- if (unescape_text (context,
- context->partial_chunk->str,
- context->partial_chunk->str +
- context->partial_chunk->len,
- &unescaped,
- error))
+ if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) &&
+ (is_ascii || text_validate (context, context->partial_chunk->str,
+ context->partial_chunk->len, error)))
{
/* success, advance past quote and set state. */
- context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
+ context->attr_values[context->cur_attr] = context->partial_chunk;
+ context->partial_chunk = NULL;
advance_char (context);
context->state = STATE_BETWEEN_ATTRIBUTES;
context->start = NULL;
@@ -1576,30 +1363,25 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
if (context->iter != context->current_text_end)
{
- GString *unescaped = NULL;
+ gboolean is_ascii;
/* The text has ended at the open angle. Call the text
* callback.
*/
- if (unescape_text (context,
- context->partial_chunk->str,
- context->partial_chunk->str +
- context->partial_chunk->len,
- &unescaped,
- error))
+ if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) &&
+ (is_ascii || text_validate (context, context->partial_chunk->str,
+ context->partial_chunk->len, error)))
{
GError *tmp_error = NULL;
if (context->parser->text)
(*context->parser->text) (context,
- unescaped->str,
- unescaped->len,
+ context->partial_chunk->str,
+ context->partial_chunk->len,
context->user_data,
&tmp_error);
- g_string_free (unescaped, TRUE);
-
if (tmp_error == NULL)
{
/* advance past open angle and set state. */
@@ -1618,7 +1400,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
case STATE_AFTER_CLOSE_TAG_SLASH:
/* Possible next state: INSIDE_CLOSE_TAG_NAME */
- if (is_name_start_char (context->iter))
+ if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
{
context->state = STATE_INSIDE_CLOSE_TAG_NAME;
@@ -1656,13 +1438,9 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
if (context->iter != context->current_text_end)
{
- gchar *close_name;
+ GString *close_name;
- /* The name has ended. Combine it with the partial chunk
- * if any; check that it matches stack top and pop
- * stack; invoke proper callback; enter next state.
- */
- close_name = g_string_free (context->partial_chunk, FALSE);
+ close_name = context->partial_chunk;
context->partial_chunk = NULL;
if (*context->iter != '>')
@@ -1676,7 +1454,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
"the close element name '%s'; the allowed "
"character is '>'"),
utf8_str (context->iter, buf),
- close_name);
+ close_name->str);
}
else if (context->tag_stack == NULL)
{
@@ -1685,16 +1463,16 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
G_MARKUP_ERROR_PARSE,
_("Element '%s' was closed, no element "
"is currently open"),
- close_name);
+ close_name->str);
}
- else if (strcmp (close_name, current_element (context)) != 0)
+ else if (strcmp (close_name->str, current_element (context)) != 0)
{
set_error (context,
error,
G_MARKUP_ERROR_PARSE,
_("Element '%s' was closed, but the currently "
"open element is '%s'"),
- close_name,
+ close_name->str,
current_element (context));
}
else
@@ -1710,22 +1488,18 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
tmp_error = NULL;
if (context->parser->end_element)
(* context->parser->end_element) (context,
- close_name,
+ close_name->str,
context->user_data,
&tmp_error);
ensure_no_outstanding_subparser (context);
-
- /* Pop the tag stack */
- g_free (context->tag_stack->data);
- context->tag_stack = g_slist_delete_link (context->tag_stack,
- context->tag_stack);
+ pop_tag (context);
if (tmp_error)
propagate_error (context, error, tmp_error);
}
-
- g_free (close_name);
+ context->partial_chunk = close_name;
+ truncate_partial (context);
}
break;
@@ -1784,14 +1558,22 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
strncmp (context->partial_chunk->str, "<![CDATA[", 9) == 0)
{
- if (context->parser->text)
+ if (context->parser->text &&
+ text_validate (context,
+ context->partial_chunk->str + 9,
+ context->partial_chunk->len - 12,
+ error))
(*context->parser->text) (context,
context->partial_chunk->str + 9,
context->partial_chunk->len - 12,
context->user_data,
&tmp_error);
}
- else if (context->parser->passthrough)
+ else if (context->parser->passthrough &&
+ text_validate (context,
+ context->partial_chunk->str,
+ context->partial_chunk->len,
+ error))
(*context->parser->passthrough) (context,
context->partial_chunk->str,
context->partial_chunk->len,
@@ -1999,7 +1781,6 @@ G_CONST_RETURN GSList *
g_markup_parse_context_get_element_stack (GMarkupParseContext *context)
{
g_return_val_if_fail (context != NULL, NULL);
-
return context->tag_stack;
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]