[tepl/wip/icu: 4/5] icu: write tepl_utils_markup_escape_text()



commit 71f4742f6027b2c09fa8731f54880badac8135f0
Author: Sébastien Wilmet <swilmet gnome org>
Date:   Fri May 29 22:08:34 2020 +0200

    icu: write tepl_utils_markup_escape_text()

 docs/reference/tepl-sections.txt |  1 +
 tepl/tepl-utils.c                | 68 ++++++++++++++++++++++++++++++++++++++++
 tepl/tepl-utils.h                |  3 ++
 testsuite/test-utils.c           | 35 +++++++++++++++++++++
 4 files changed, 107 insertions(+)
---
diff --git a/docs/reference/tepl-sections.txt b/docs/reference/tepl-sections.txt
index 13fc4e9..96f5441 100644
--- a/docs/reference/tepl-sections.txt
+++ b/docs/reference/tepl-sections.txt
@@ -445,6 +445,7 @@ tepl_notebook_get_type
 tepl_utils_str_middle_truncate
 tepl_utils_str_end_truncate
 tepl_utils_str_replace
+tepl_utils_markup_escape_text
 tepl_utils_get_file_extension
 tepl_utils_get_file_shortname
 tepl_utils_replace_home_dir_with_tilde
diff --git a/tepl/tepl-utils.c b/tepl/tepl-utils.c
index ceb5baa..d49c42f 100644
--- a/tepl/tepl-utils.c
+++ b/tepl/tepl-utils.c
@@ -10,6 +10,7 @@
 #include "tepl-utils.h"
 #include <string.h>
 #include "tepl-application-window.h"
+#include "tepl-icu.h"
 
 /**
  * SECTION:utils
@@ -159,6 +160,73 @@ tepl_utils_str_replace (const gchar *string,
        return ret;
 }
 
+/**
+ * tepl_utils_markup_escape_text:
+ * @src: a nul-terminated UTF-8 string.
+ *
+ * The same as g_markup_escape_text(), but with an implementation that fully
+ * supports round-trip integrity. I.e. when #GMarkupParser or any other XML
+ * parser decodes/unescapes the string, the exact same string as @src is
+ * brought back, as long as @src is a valid UTF-8 string.
+ *
+ * The other difference from g_markup_escape_text() is that the @length
+ * parameter is not present for tepl_utils_markup_escape_text().
+ *
+ * # g_markup_escape_text() doesn't fully support round-trip integrity
+ *
+ * In fact, g_markup_escape_text() doesn't escape the tab, newline and
+ * carriage return characters. And the #GMarkupParser correctly processes
+ * whitespace and line endings according to the [XML rules for normalization of
+ * line endings and attribute values](https://www.w3.org/TR/xml/#AVNormalize).
+ *
+ * For example `"\t"` (a tab) after a round-trip through g_markup_escape_text()
+ * and #GMarkupParser becomes a simple space.
+ *
+ * Returns: (transfer full) (nullable): a newly allocated string with the
+ * escaped text, or %NULL if @src is not a valid UTF-8 string or if an internal
+ * conversion step fails. Free with g_free() when no longer needed.
+ * Since: 5.0
+ */
+gchar *
+tepl_utils_markup_escape_text (const gchar *src)
+{
+       UChar *src_uchars;
+       UTransliterator *trans;
+       UChar *dest_uchars = NULL;
+       gchar *dest = NULL;
+
+       src_uchars = _tepl_icu_strFromUTF8Simple (src);
+       if (src_uchars == NULL)
+       {
+               return NULL; /* nothing else has been acquired yet */
+       }
+
+       trans = _tepl_icu_trans_open_xml_escape ();
+       if (trans == NULL)
+       {
+               goto out;
+       }
+
+       dest_uchars = _tepl_icu_trans_transUCharsSimple (trans, src_uchars);
+       if (dest_uchars == NULL)
+       {
+               goto out;
+       }
+
+       dest = _tepl_icu_strToUTF8Simple (dest_uchars);
+
+out: /* shared cleanup; dest is still NULL on every failure path */
+       g_free (src_uchars);
+       g_free (dest_uchars); /* may still be NULL here */
+
+       if (trans != NULL)
+       {
+               utrans_close (trans);
+       }
+
+       return dest;
+}
+
 static gint
 get_extension_position (const gchar *filename)
 {
diff --git a/tepl/tepl-utils.h b/tepl/tepl-utils.h
index e0759ee..88369f8 100644
--- a/tepl/tepl-utils.h
+++ b/tepl/tepl-utils.h
@@ -29,6 +29,9 @@ gchar *               tepl_utils_str_replace                          (const gchar *string,
                                                                 const gchar *search,
                                                                 const gchar *replacement);
 
+_TEPL_EXTERN
+gchar *                tepl_utils_markup_escape_text                   (const gchar *src);
+
 /* File utilities */
 
 _TEPL_EXTERN
diff --git a/testsuite/test-utils.c b/testsuite/test-utils.c
index 5f3c77f..9f22855 100644
--- a/testsuite/test-utils.c
+++ b/testsuite/test-utils.c
@@ -51,6 +51,40 @@ test_str_replace (void)
        g_free (result);
 }
 
+static void
+check_markup_escape_text (const gchar *src,
+                         const gchar *expected_dest)
+{
+       gchar *received_dest;
+
+       received_dest = tepl_utils_markup_escape_text (src);
+       g_assert_cmpstr (received_dest, ==, expected_dest);
+       g_free (received_dest); /* the returned string is (transfer full) */
+}
+
+static void
+test_markup_escape_text (void)
+{
+       check_markup_escape_text ("", "");
+       check_markup_escape_text ("123ASCIIabc.,;/_-:", "123ASCIIabc.,;/_-:");
+       check_markup_escape_text ("é", "&#xE9;");
+       check_markup_escape_text ("\t", "&#x9;"); // tab must be escaped to survive a round-trip.
+       check_markup_escape_text ("ẞ", "&#x1E9E;"); // 3-byte UTF-8 char (é above is 2-byte).
+
+       {
+               gchar *dest;
+
+               /* If this changes in the future, maybe g_markup_escape_text()
+                * has been modified to fully support round-trip integrity, in
+                * which case tepl_utils_markup_escape_text() is no longer
+                * useful.
+                */
+               dest = g_markup_escape_text ("\t", -1);
+               g_assert_cmpstr (dest, ==, "\t");
+               g_free (dest);
+       }
+}
+
 static void
 test_get_file_extension (void)
 {
@@ -191,6 +225,7 @@ main (int    argc,
        g_test_add_func ("/utils/str-middle-truncate", test_str_middle_truncate);
        g_test_add_func ("/utils/str-end-truncate", test_str_end_truncate);
        g_test_add_func ("/utils/str-replace", test_str_replace);
+       g_test_add_func ("/utils/markup-escape-text", test_markup_escape_text);
        g_test_add_func ("/utils/get-file-extension", test_get_file_extension);
        g_test_add_func ("/utils/get-file-shortname", test_get_file_shortname);
        g_test_add_func ("/utils/replace-home-dir-with-tilde", test_replace_home_dir_with_tilde);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]