[tepl/wip/icu: 3/5] icu: write some _tepl_icu utils functions



commit e5443b1e943397a1d50e2cc0b4cc52a4b2ad5ec4
Author: Sébastien Wilmet <swilmet gnome org>
Date:   Fri May 29 16:19:35 2020 +0200

    icu: write some _tepl_icu utils functions

 po/POTFILES.in        |   1 +
 tepl/meson.build      |   2 +
 tepl/tepl-icu.c       | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++
 tepl/tepl-icu.h       |  47 +++++++++
 testsuite/meson.build |   1 +
 testsuite/test-icu.c  | 118 ++++++++++++++++++++++
 6 files changed, 434 insertions(+)
---
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 5faab17..872c893 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -8,6 +8,7 @@ tepl/tepl-file.c
 tepl/tepl-file-loader.c
 tepl/tepl-file-saver.c
 tepl/tepl-goto-line-bar.c
+tepl/tepl-icu.c
 tepl/tepl-info-bar.c
 tepl/tepl-init.c
 tepl/tepl-io-error-info-bars.c
diff --git a/tepl/meson.build b/tepl/meson.build
index 6a9c8ae..226d913 100644
--- a/tepl/meson.build
+++ b/tepl/meson.build
@@ -64,6 +64,7 @@ tepl_public_c_files = [
 
 TEPL_PRIVATE_HEADERS = [
   'tepl-close-confirm-dialog-single.h',
+  'tepl-icu.h',
   'tepl-io-error-info-bar.h',
   'tepl-metadata-attic.h',
   'tepl-metadata-parser.h',
@@ -76,6 +77,7 @@ TEPL_PRIVATE_HEADERS = [
 
 tepl_private_c_files = [
   'tepl-close-confirm-dialog-single.c',
+  'tepl-icu.c',
   'tepl-io-error-info-bar.c',
   'tepl-metadata-attic.c',
   'tepl-metadata-parser.c',
diff --git a/tepl/tepl-icu.c b/tepl/tepl-icu.c
new file mode 100644
index 0000000..c4978c5
--- /dev/null
+++ b/tepl/tepl-icu.c
@@ -0,0 +1,265 @@
+/* SPDX-FileCopyrightText: 2020 - Sébastien Wilmet <swilmet gnome org>
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include "tepl-icu.h"
+
+/* Wrapper around u_strFromUTF8() that handles the pre-flighting.
+ *
+ * @pDestLength: (out) (optional): where to store the length of the result, or
+ *   NULL.
+ * @src: the UTF-8 source string.
+ * @srcLength: the length of @src, or -1 if @src is nul-terminated.
+ * @pErrorCode: (inout) (optional): where to store the ICU error code, or NULL
+ *   to ignore it.
+ *
+ * u_strFromUTF8() is called a first time with an empty destination to learn
+ * the required length, then a second time with a buffer of that length plus
+ * one element for the terminating nul.
+ *
+ * Returns: (transfer full) (nullable): the newly-allocated buffer with the
+ * right size, or NULL if the pre-flighting failed. Free with g_free() when no
+ * longer needed.
+ */
+UChar *
+_tepl_icu_strFromUTF8 (int32_t    *pDestLength,
+                      const char *src,
+                      int32_t     srcLength,
+                      UErrorCode *pErrorCode)
+{
+       int32_t my_DestLength = 0;
+       UErrorCode my_ErrorCode = U_ZERO_ERROR;
+       UChar *dest = NULL;
+
+       /* Pre-flighting: with a zero capacity, a successful measurement is
+        * reported as a buffer overflow or as a "not terminated" warning.
+        */
+       u_strFromUTF8 (NULL, 0, &my_DestLength,
+                      src, srcLength,
+                      &my_ErrorCode);
+
+       if (my_ErrorCode != U_BUFFER_OVERFLOW_ERROR &&
+           my_ErrorCode != U_STRING_NOT_TERMINATED_WARNING)
+       {
+               if (pDestLength != NULL)
+               {
+                       *pDestLength = my_DestLength;
+               }
+               if (pErrorCode != NULL)
+               {
+                       *pErrorCode = my_ErrorCode;
+               }
+
+               return NULL;
+       }
+
+       dest = g_new0 (UChar, my_DestLength + 1);
+
+       /* ICU requires a valid error code pointer. The early return above
+        * accepts a NULL @pErrorCode, so accept it here too by falling back to
+        * the local error code.
+        */
+       if (pErrorCode == NULL)
+       {
+               my_ErrorCode = U_ZERO_ERROR;
+               pErrorCode = &my_ErrorCode;
+       }
+
+       u_strFromUTF8 (dest, my_DestLength + 1, pDestLength,
+                      src, srcLength,
+                      pErrorCode);
+
+       return dest;
+}
+
+/* Wrapper around u_strToUTF8() that handles the pre-flighting.
+ *
+ * @pDestLength: (out) (optional): where to store the length of the result, or
+ *   NULL.
+ * @src: the UTF-16 source string.
+ * @srcLength: the length of @src, or -1 if @src is nul-terminated.
+ * @pErrorCode: (inout) (optional): where to store the ICU error code, or NULL
+ *   to ignore it.
+ *
+ * u_strToUTF8() is called a first time with an empty destination to learn the
+ * required length, then a second time with a buffer of that length plus one
+ * byte for the terminating nul.
+ *
+ * Returns: (transfer full) (nullable): the newly-allocated string with the
+ * right size, or NULL if the pre-flighting failed. Free with g_free() when no
+ * longer needed.
+ */
+char *
+_tepl_icu_strToUTF8 (int32_t     *pDestLength,
+                    const UChar *src,
+                    int32_t      srcLength,
+                    UErrorCode  *pErrorCode)
+{
+       int32_t my_DestLength = 0;
+       UErrorCode my_ErrorCode = U_ZERO_ERROR;
+       char *dest = NULL;
+
+       /* Pre-flighting: with a zero capacity, a successful measurement is
+        * reported as a buffer overflow or as a "not terminated" warning.
+        */
+       u_strToUTF8 (NULL, 0, &my_DestLength,
+                    src, srcLength,
+                    &my_ErrorCode);
+
+       if (my_ErrorCode != U_BUFFER_OVERFLOW_ERROR &&
+           my_ErrorCode != U_STRING_NOT_TERMINATED_WARNING)
+       {
+               if (pDestLength != NULL)
+               {
+                       *pDestLength = my_DestLength;
+               }
+               if (pErrorCode != NULL)
+               {
+                       *pErrorCode = my_ErrorCode;
+               }
+
+               return NULL;
+       }
+
+       dest = g_malloc0 (my_DestLength + 1);
+
+       /* ICU requires a valid error code pointer. The early return above
+        * accepts a NULL @pErrorCode, so accept it here too by falling back to
+        * the local error code.
+        */
+       if (pErrorCode == NULL)
+       {
+               my_ErrorCode = U_ZERO_ERROR;
+               pErrorCode = &my_ErrorCode;
+       }
+
+       u_strToUTF8 (dest, my_DestLength + 1, pDestLength,
+                    src, srcLength,
+                    pErrorCode);
+
+       return dest;
+}
+
+/* Converts a nul-terminated UTF-8 string to UTF-16 with
+ * _tepl_icu_strFromUTF8(), hiding the ICU error code from the caller.
+ *
+ * Returns: (transfer full) (nullable): a nul-terminated UTF-16 string, or NULL
+ * if the conversion reported a failure. Free with g_free() when no longer
+ * needed.
+ */
+UChar *
+_tepl_icu_strFromUTF8Simple (const char *utf8_str)
+{
+       UErrorCode status = U_ZERO_ERROR;
+       UChar *result;
+
+       result = _tepl_icu_strFromUTF8 (NULL, utf8_str, -1, &status);
+
+       if (U_SUCCESS (status))
+       {
+               return result;
+       }
+
+       /* On failure a buffer may still have been allocated. */
+       g_free (result);
+       return NULL;
+}
+
+/* Converts a nul-terminated UTF-16 string to UTF-8 with _tepl_icu_strToUTF8(),
+ * hiding the ICU error code from the caller.
+ *
+ * Returns: (transfer full) (nullable): a nul-terminated UTF-8 string, or NULL
+ * if the conversion reported a failure. Free with g_free() when no longer
+ * needed.
+ */
+char *
+_tepl_icu_strToUTF8Simple (const UChar *uchars)
+{
+       UErrorCode status = U_ZERO_ERROR;
+       char *result;
+
+       result = _tepl_icu_strToUTF8 (NULL, uchars, -1, &status);
+
+       if (U_SUCCESS (status))
+       {
+               return result;
+       }
+
+       /* On failure a buffer may still have been allocated. */
+       g_free (result);
+       return NULL;
+}
+
+/* Duplicates a nul-terminated UTF-16 string.
+ *
+ * Returns: (transfer full) (nullable): a copy of @uchars, or NULL if @uchars
+ * is NULL. Free with g_free() when no longer needed.
+ */
+UChar *
+_tepl_icu_strdup (const UChar *uchars)
+{
+       int32_t n_uchars;
+       UChar *dup;
+
+       if (uchars == NULL)
+       {
+               return NULL;
+       }
+
+       /* The count includes the terminating nul. */
+       n_uchars = u_strlen (uchars) + 1;
+
+       dup = g_new0 (UChar, n_uchars);
+       u_strncpy (dup, uchars, n_uchars);
+
+       return dup;
+}
+
+/* A wrapper around utrans_openU() that takes the transliterator ID as a
+ * nul-terminated UTF-8 string. The transliterator is opened in the forward
+ * direction, without custom rules and without a parse error output.
+ *
+ * Returns: the new transliterator, or NULL if @utf8_id could not be converted
+ * to UTF-16 or if ICU reported a failure. Close with utrans_close().
+ */
+UTransliterator *
+_tepl_icu_trans_openUSimple (const char *utf8_id)
+{
+       UErrorCode status = U_ZERO_ERROR;
+       UTransliterator *trans;
+       UChar *id;
+
+       id = _tepl_icu_strFromUTF8Simple (utf8_id);
+       g_return_val_if_fail (id != NULL, NULL);
+
+       trans = utrans_openU (id, -1,
+                             UTRANS_FORWARD,
+                             NULL, 0,
+                             NULL, &status);
+
+       /* The UTF-16 ID is only needed for the call above. */
+       g_free (id);
+
+       if (U_SUCCESS (status))
+       {
+               return trans;
+       }
+
+       g_warn_if_reached ();
+
+       if (trans != NULL)
+       {
+               utrans_close (trans);
+       }
+
+       return NULL;
+}
+
+/* Opens the transliterator used to escape text as XML hexadecimal character
+ * references.
+ *
+ * Not all the characters are escaped: certain printable ASCII characters are
+ * kept as is, so that the XML content is a bit easier to understand when
+ * reading/debugging it.
+ *
+ * The ICU transliterator/transform can be tested easily with the uconv
+ * command, including a round-trip:
+ * $ echo -n -e '\t' | uconv -x '[^a-zA-Z0-9.,;/_\x2D\x3A] Any-Hex/XML' | uconv -x 'Hex-Any/XML'
+ *
+ * In the ID below, "\\x2D" is '-' and "\\x3A" is ':'.
+ *
+ * Returns: the result of _tepl_icu_trans_openUSimple() for that ID.
+ */
+UTransliterator *
+_tepl_icu_trans_open_xml_escape (void)
+{
+       static const char xml_escape_id[] = "[^a-zA-Z0-9.,;/_\\x2D\\x3A] Any-Hex/XML";
+
+       return _tepl_icu_trans_openUSimple (xml_escape_id);
+}
+
+/* Like utrans_transUChars(), but simpler to use.
+ * @src must be non-NULL and nul-terminated, and is not modified.
+ *
+ * The transform is run twice: once on a throw-away copy of @src to learn the
+ * length of the result (pre-flighting), then once in a buffer that is large
+ * enough to hold the result and its terminating nul.
+ *
+ * Returns: (transfer full) (nullable): the transformed string, as a
+ * newly-allocated nul-terminated buffer of the right size, or NULL on failure.
+ * Free with g_free() when no longer needed.
+ */
+UChar *
+_tepl_icu_trans_transUCharsSimple (const UTransliterator *trans,
+                                  const UChar           *src)
+{
+       UChar *src_copy;
+       int32_t src_length;
+       int32_t text_length;
+       int32_t limit;
+       int32_t dest_capacity;
+       UChar *dest;
+       UErrorCode error_code = U_ZERO_ERROR;
+
+       /* _tepl_icu_strdup() accepts NULL but u_strlen() below does not. */
+       g_return_val_if_fail (src != NULL, NULL);
+
+       /* Pre-flighting */
+
+       src_copy = _tepl_icu_strdup (src);
+       src_length = u_strlen (src);
+       text_length = src_length;
+
+       limit = src_length;
+
+       utrans_transUChars (trans,
+                           src_copy, &text_length, src_length + 1,
+                           0, &limit,
+                           &error_code);
+
+       g_free (src_copy);
+
+       /* A buffer overflow is expected when the result is longer than @src:
+        * text_length then holds the needed length. Warnings are not failures,
+        * so only the other failures abort here.
+        */
+       if (error_code != U_BUFFER_OVERFLOW_ERROR &&
+           U_FAILURE (error_code))
+       {
+               g_warn_if_reached ();
+               return NULL;
+       }
+
+       /* Do the real transform */
+
+       dest_capacity = MAX (text_length + 1, src_length + 1);
+       dest = g_new0 (UChar, dest_capacity);
+       u_strncpy (dest, src, src_length + 1);
+
+       limit = src_length;
+       error_code = U_ZERO_ERROR;
+
+       utrans_transUChars (trans,
+                           dest, NULL, dest_capacity,
+                           0, &limit,
+                           &error_code);
+
+       if (U_FAILURE (error_code))
+       {
+               g_warn_if_reached ();
+               g_free (dest);
+               return NULL;
+       }
+
+       return dest;
+}
diff --git a/tepl/tepl-icu.h b/tepl/tepl-icu.h
new file mode 100644
index 0000000..3d48d15
--- /dev/null
+++ b/tepl/tepl-icu.h
@@ -0,0 +1,47 @@
+/* SPDX-FileCopyrightText: 2020 - Sébastien Wilmet <swilmet gnome org>
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef TEPL_ICU_H
+#define TEPL_ICU_H
+
+#include <glib.h>
+#include <unicode/ustring.h>
+#include <unicode/utrans.h>
+
+G_BEGIN_DECLS
+
+/* Wrapper around u_strFromUTF8() that handles the pre-flighting. Returns a
+ * newly-allocated buffer (free with g_free()), or NULL.
+ */
+G_GNUC_INTERNAL
+UChar *                        _tepl_icu_strFromUTF8                   (int32_t    *pDestLength,
+                                                                const char *src,
+                                                                int32_t     srcLength,
+                                                                UErrorCode *pErrorCode);
+
+/* Wrapper around u_strToUTF8() that handles the pre-flighting. Returns a
+ * newly-allocated string (free with g_free()), or NULL.
+ */
+G_GNUC_INTERNAL
+char *                 _tepl_icu_strToUTF8                     (int32_t     *pDestLength,
+                                                                const UChar *src,
+                                                                int32_t      srcLength,
+                                                                UErrorCode  *pErrorCode);
+
+/* Nul-terminated UTF-8 to UTF-16; returns NULL on failure. Free with
+ * g_free().
+ */
+G_GNUC_INTERNAL
+UChar *                        _tepl_icu_strFromUTF8Simple             (const char *utf8_str);
+
+/* Nul-terminated UTF-16 to UTF-8; returns NULL on failure. Free with
+ * g_free().
+ */
+G_GNUC_INTERNAL
+char *                 _tepl_icu_strToUTF8Simple               (const UChar *uchars);
+
+/* Returns a copy of @uchars, or NULL if @uchars is NULL. Free with g_free(). */
+G_GNUC_INTERNAL
+UChar *                        _tepl_icu_strdup                        (const UChar *uchars);
+
+/* Wrapper around utrans_openU() taking the ID as UTF-8; returns NULL on
+ * failure.
+ */
+G_GNUC_INTERNAL
+UTransliterator *      _tepl_icu_trans_openUSimple             (const char *utf8_id);
+
+/* Opens the "Any-Hex/XML" transliterator used for escaping XML content. */
+G_GNUC_INTERNAL
+UTransliterator *      _tepl_icu_trans_open_xml_escape         (void);
+
+/* Like utrans_transUChars(), but returns the result in a newly-allocated
+ * nul-terminated buffer (free with g_free()), or NULL on failure. @src is not
+ * modified.
+ */
+G_GNUC_INTERNAL
+UChar *                        _tepl_icu_trans_transUCharsSimple       (const UTransliterator *trans,
+                                                                const UChar           *src);
+
+G_END_DECLS
+
+#endif /* TEPL_ICU_H */
diff --git a/testsuite/meson.build b/testsuite/meson.build
index e4577c4..c0bbe87 100644
--- a/testsuite/meson.build
+++ b/testsuite/meson.build
@@ -3,6 +3,7 @@ unit_tests = [
   'test-file-loader',
   'test-file-saver',
   'test-fold-region',
+  'test-icu',
   'test-info-bar',
   'test-metadata',
   'test-metadata-manager',
diff --git a/testsuite/test-icu.c b/testsuite/test-icu.c
new file mode 100644
index 0000000..9f0a545
--- /dev/null
+++ b/testsuite/test-icu.c
@@ -0,0 +1,118 @@
+/* SPDX-FileCopyrightText: 2020 - Sébastien Wilmet <swilmet gnome org>
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include "tepl/tepl-icu.h"
+
+/* Converts @utf8_str to UTF-16 and back with the functions that expose the
+ * ICU error code, and checks that the round-trip gives back the same string.
+ *
+ * If @expect_success is FALSE, only checks that the first conversion reports a
+ * failure.
+ */
+static void
+check_str_from_and_to_utf8_raw (const gchar *utf8_str,
+                               gboolean     expect_success)
+{
+       UChar *uchars;
+       gchar *utf8_str_after_round_trip;
+       UErrorCode error_code = U_ZERO_ERROR;
+
+       uchars = _tepl_icu_strFromUTF8 (NULL, utf8_str, -1, &error_code);
+       if (expect_success)
+       {
+               g_assert_true (U_SUCCESS (error_code));
+       }
+       else
+       {
+               g_assert_true (U_FAILURE (error_code));
+               g_free (uchars);
+               return;
+       }
+
+       error_code = U_ZERO_ERROR;
+       utf8_str_after_round_trip = _tepl_icu_strToUTF8 (NULL, uchars, -1, &error_code);
+       /* The error code of the way back must be checked too. */
+       g_assert_true (U_SUCCESS (error_code));
+       g_assert_cmpstr (utf8_str, ==, utf8_str_after_round_trip);
+
+       g_free (uchars);
+       g_free (utf8_str_after_round_trip);
+}
+
+/* Same round-trip check as the "raw" variant, but with the *Simple() functions
+ * that report a failure by returning NULL.
+ */
+static void
+check_str_from_and_to_utf8_simple (const gchar *utf8_str,
+                                  gboolean     expect_success)
+{
+       gchar *utf8_str_after_round_trip;
+       UChar *uchars;
+
+       uchars = _tepl_icu_strFromUTF8Simple (utf8_str);
+
+       if (!expect_success)
+       {
+               /* Nothing was allocated, so nothing to free. */
+               g_assert_true (uchars == NULL);
+               return;
+       }
+
+       g_assert_true (uchars != NULL);
+
+       utf8_str_after_round_trip = _tepl_icu_strToUTF8Simple (uchars);
+       g_assert_cmpstr (utf8_str, ==, utf8_str_after_round_trip);
+
+       g_free (utf8_str_after_round_trip);
+       g_free (uchars);
+}
+
+/* Runs the UTF-8 <-> UTF-16 round-trip check on @utf8_str with both sets of
+ * functions: first the ones exposing the ICU error code, then the *Simple()
+ * ones.
+ */
+static void
+check_str_from_and_to_utf8 (const gchar *utf8_str,
+                           gboolean     expect_success)
+{
+       check_str_from_and_to_utf8_raw (utf8_str, expect_success);
+       check_str_from_and_to_utf8_simple (utf8_str, expect_success);
+}
+
+/* Unit test for the UTF-8 <-> UTF-16 conversion functions. */
+static void
+test_str_from_and_to_utf8 (void)
+{
+       static const struct
+       {
+               const gchar *utf8_str;
+               gboolean expect_success;
+       } test_cases[] =
+       {
+               { NULL, FALSE },
+               { "", TRUE },
+               { "ASCII", TRUE },
+               { "À ski", TRUE },
+
+               /* Not valid UTF-8. */
+               { "\xFF", FALSE }
+       };
+       gsize i;
+
+       for (i = 0; i < G_N_ELEMENTS (test_cases); i++)
+       {
+               check_str_from_and_to_utf8 (test_cases[i].utf8_str,
+                                           test_cases[i].expect_success);
+       }
+}
+
+/* Unit test for _tepl_icu_strdup(): the copy of a UTF-16 string must convert
+ * back to the UTF-8 string it was created from.
+ */
+static void
+test_strdup (void)
+{
+       UChar *original = _tepl_icu_strFromUTF8Simple ("Évo");
+       UChar *duplicate = _tepl_icu_strdup (original);
+       gchar *utf8_str = _tepl_icu_strToUTF8Simple (duplicate);
+
+       g_assert_cmpstr (utf8_str, ==, "Évo");
+
+       g_free (utf8_str);
+       g_free (duplicate);
+       g_free (original);
+}
+
+/* Unit test checking that the XML-escape transliterator can be opened. */
+static void
+test_trans_open (void)
+{
+       UTransliterator *transliterator;
+
+       transliterator = _tepl_icu_trans_open_xml_escape ();
+       g_assert_true (transliterator != NULL);
+       utrans_close (transliterator);
+}
+
+/* Entry point of the test program: registers the unit tests of the
+ * _tepl_icu_*() functions with the GLib test framework and runs them.
+ */
+int
+main (int    argc,
+      char **argv)
+{
+       g_test_init (&argc, &argv, NULL);
+
+       /* One test path per group of functions. */
+       g_test_add_func ("/icu/str_from_and_to_utf8", test_str_from_and_to_utf8);
+       g_test_add_func ("/icu/strdup", test_strdup);
+       g_test_add_func ("/icu/trans_open", test_trans_open);
+
+       /* The result of g_test_run() is the exit status of the program. */
+       return g_test_run ();
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]