[glib/wip/sadiq/fixes: 78/79] gmarkup: Optimize g_markup_escape_text()
- From: Mohammed Sadiq <pksadiq src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [glib/wip/sadiq/fixes: 78/79] gmarkup: Optimize g_markup_escape_text()
- Date: Wed, 3 Oct 2018 02:58:09 +0000 (UTC)
commit 714e4a3750dfbdc35c3c14b2cbf165056d2663dd
Author: Mohammed Sadiq <sadiq sadiqpk org>
Date: Sat Sep 15 00:06:25 2018 +0530
gmarkup: Optimize g_markup_escape_text()
The string @text is expected to be valid UTF-8. Instead of decoding
every unichar, scan the text byte by byte and escape where required. If a
non-ASCII UTF-8 control char is found, decode that unichar and escape it.
* In the best case (all chars are printable), the function is ~5 times faster.
* When every char is an XML escape char (<, >, ', ", &), it is ~5% faster.
Fixes https://gitlab.gnome.org/GNOME/glib/issues/1376
glib/gmarkup.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 61 insertions(+), 14 deletions(-)
---
diff --git a/glib/gmarkup.c b/glib/gmarkup.c
index 43bb0c7f8..41a04978c 100644
--- a/glib/gmarkup.c
+++ b/glib/gmarkup.c
@@ -2154,62 +2154,109 @@ g_markup_parse_context_pop (GMarkupParseContext *context)
return user_data;
}
+#define APPEND_TEXT_AND_SEEK(_str, _start, _end) \
+ G_STMT_START { \
+ if (_end > _start) \
+ g_string_append_len (_str, _start, _end - _start); \
+ _start = ++_end; \
+ } G_STMT_END
+
+/*
+ * https://www.w3.org/TR/REC-xml/ defines the set of valid
+ * characters as:
+ * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ *
+ * That is, from non-ASCII UTF-8 character set, only 0xC280 - 0xC284 and
+ * 0xC286 - 0xC29F have to be escaped (excluding the surrogate blocks).
+ * The corresponding code points are [0x80-0x84] and [0x86-0x9F]; 0x7F is ASCII.
+ *
+ * So instead of using costly g_utf8_next_char or similar UTF8 functions, it's
+ * better to read each byte, and make an exception for 0xC2XX.
+ */
static void
append_escaped_text (GString *str,
const gchar *text,
gssize length)
{
- const gchar *p;
+ const gchar *p, *pending;
const gchar *end;
- gunichar c;
- p = text;
+ p = pending = text;
end = text + length;
- while (p < end)
+ while (p < end && pending < end)
{
- const gchar *next;
- next = g_utf8_next_char (p);
+ guchar c = (guchar) *pending;
- switch (*p)
+ switch (c)
{
case '&':
+ APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&amp;");
break;
case '<':
+ APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&lt;");
break;
case '>':
+ APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&gt;");
break;
case '\'':
+ APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&apos;");
break;
case '"':
+ APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&quot;");
break;
default:
- c = g_utf8_get_char (p);
if ((0x1 <= c && c <= 0x8) ||
(0xb <= c && c <= 0xc) ||
(0xe <= c && c <= 0x1f) ||
- (0x7f <= c && c <= 0x84) ||
- (0x86 <= c && c <= 0x9f))
- g_string_append_printf (str, "&#x%x;", c);
+ (c == 0x7f))
+ {
+ APPEND_TEXT_AND_SEEK (str, p, pending);
+ g_string_append_printf (str, "&#x%x;", c);
+ }
+ /* The UTF-8 control characters to escape begin with the 0xc2 byte */
+ else if (c == 0xc2)
+ {
+ gunichar u = g_utf8_get_char (pending);
+
+ if ((0x7f < u && u <= 0x84) ||
+ (0x86 <= u && u <= 0x9f))
+ {
+ APPEND_TEXT_AND_SEEK (str, p, pending);
+ g_string_append_printf (str, "&#x%x;", u);
+
+ /*
+ * The escaped character is two bytes long, but the macro above
+ * only stepped past its first byte (0xc2). Advance p once more
+ * so that the trailing byte is not copied to the output.
+ */
+ p++;
+ }
+ else
+ pending++;
+ }
else
- g_string_append_len (str, p, next - p);
+ pending++;
break;
}
-
- p = next;
}
+
+ if (pending > p)
+ g_string_append_len (str, p, pending - p);
}
+#undef APPEND_TEXT_AND_SEEK
+
/**
* g_markup_escape_text:
* @text: some valid UTF-8 text
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]