[tepl/wip/icu: 3/5] icu: write some _tepl_icu utils functions
- From: Sébastien Wilmet <swilmet src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tepl/wip/icu: 3/5] icu: write some _tepl_icu utils functions
- Date: Sat, 30 May 2020 11:18:11 +0000 (UTC)
commit e5443b1e943397a1d50e2cc0b4cc52a4b2ad5ec4
Author: Sébastien Wilmet <swilmet gnome org>
Date: Fri May 29 16:19:35 2020 +0200
icu: write some _tepl_icu utils functions
po/POTFILES.in | 1 +
tepl/meson.build | 2 +
tepl/tepl-icu.c | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++
tepl/tepl-icu.h | 47 +++++++++
testsuite/meson.build | 1 +
testsuite/test-icu.c | 118 ++++++++++++++++++++++
6 files changed, 434 insertions(+)
---
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 5faab17..872c893 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -8,6 +8,7 @@ tepl/tepl-file.c
tepl/tepl-file-loader.c
tepl/tepl-file-saver.c
tepl/tepl-goto-line-bar.c
+tepl/tepl-icu.c
tepl/tepl-info-bar.c
tepl/tepl-init.c
tepl/tepl-io-error-info-bars.c
diff --git a/tepl/meson.build b/tepl/meson.build
index 6a9c8ae..226d913 100644
--- a/tepl/meson.build
+++ b/tepl/meson.build
@@ -64,6 +64,7 @@ tepl_public_c_files = [
TEPL_PRIVATE_HEADERS = [
'tepl-close-confirm-dialog-single.h',
+ 'tepl-icu.h',
'tepl-io-error-info-bar.h',
'tepl-metadata-attic.h',
'tepl-metadata-parser.h',
@@ -76,6 +77,7 @@ TEPL_PRIVATE_HEADERS = [
tepl_private_c_files = [
'tepl-close-confirm-dialog-single.c',
+ 'tepl-icu.c',
'tepl-io-error-info-bar.c',
'tepl-metadata-attic.c',
'tepl-metadata-parser.c',
diff --git a/tepl/tepl-icu.c b/tepl/tepl-icu.c
new file mode 100644
index 0000000..c4978c5
--- /dev/null
+++ b/tepl/tepl-icu.c
@@ -0,0 +1,265 @@
+/* SPDX-FileCopyrightText: 2020 - Sébastien Wilmet <swilmet gnome org>
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include "tepl-icu.h"
+
+/* Wrapper around u_strFromUTF8() that handles the pre-flighting: a first call
+ * with a NULL destination computes the required length, then a second call
+ * converts into a buffer of that length plus one UChar for the nul terminator.
+ *
+ * @pDestLength: (out) (optional): receives the length reported by
+ *   u_strFromUTF8().
+ * @src: the UTF-8 string to convert.
+ * @srcLength: forwarded as is to u_strFromUTF8() (the callers in this file pass
+ *   -1 for nul-terminated strings).
+ * @pErrorCode: (inout) (optional): receives the ICU error code. May be NULL if
+ *   the caller is only interested in the NULL/non-NULL result.
+ *
+ * Returns: (transfer full) (nullable): the newly-allocated buffer with the
+ * right size, or NULL on failure. Free with g_free() when no longer needed.
+ */
+UChar *
+_tepl_icu_strFromUTF8 (int32_t    *pDestLength,
+		       const char *src,
+		       int32_t     srcLength,
+		       UErrorCode *pErrorCode)
+{
+	int32_t preflight_length = 0;
+	UErrorCode preflight_error = U_ZERO_ERROR;
+	UErrorCode fallback_error = U_ZERO_ERROR;
+	UChar *dest;
+
+	/* ICU functions need a valid UErrorCode pointer, so never forward a
+	 * NULL one.
+	 */
+	if (pErrorCode == NULL)
+	{
+		pErrorCode = &fallback_error;
+	}
+
+	u_strFromUTF8 (NULL, 0, &preflight_length,
+		       src, srcLength,
+		       &preflight_error);
+
+	/* With a zero capacity, the pre-flighting is expected to end with one
+	 * of these two codes. Anything else means that there is nothing to
+	 * convert (e.g. invalid input).
+	 */
+	if (preflight_error != U_BUFFER_OVERFLOW_ERROR &&
+	    preflight_error != U_STRING_NOT_TERMINATED_WARNING)
+	{
+		if (pDestLength != NULL)
+		{
+			*pDestLength = preflight_length;
+		}
+
+		*pErrorCode = preflight_error;
+		return NULL;
+	}
+
+	dest = g_new0 (UChar, preflight_length + 1);
+
+	u_strFromUTF8 (dest, preflight_length + 1, pDestLength,
+		       src, srcLength,
+		       pErrorCode);
+
+	/* Also covers the case where *pErrorCode was already a failure on
+	 * entry: ICU then returns without converting anything.
+	 */
+	if (U_FAILURE (*pErrorCode))
+	{
+		g_free (dest);
+		return NULL;
+	}
+
+	return dest;
+}
+
+/* Wrapper around u_strToUTF8() that handles the pre-flighting: a first call
+ * with a NULL destination computes the required length, then a second call
+ * converts into a buffer of that length plus one byte for the nul terminator.
+ *
+ * @pDestLength: (out) (optional): receives the length reported by
+ *   u_strToUTF8().
+ * @src: the UTF-16 string to convert.
+ * @srcLength: forwarded as is to u_strToUTF8() (the callers in this file pass
+ *   -1 for nul-terminated strings).
+ * @pErrorCode: (inout) (optional): receives the ICU error code. May be NULL if
+ *   the caller is only interested in the NULL/non-NULL result.
+ *
+ * Returns: (transfer full) (nullable): the newly-allocated string with the
+ * right size, or NULL on failure. Free with g_free() when no longer needed.
+ */
+char *
+_tepl_icu_strToUTF8 (int32_t     *pDestLength,
+		     const UChar *src,
+		     int32_t      srcLength,
+		     UErrorCode  *pErrorCode)
+{
+	int32_t preflight_length = 0;
+	UErrorCode preflight_error = U_ZERO_ERROR;
+	UErrorCode fallback_error = U_ZERO_ERROR;
+	char *dest;
+
+	/* ICU functions need a valid UErrorCode pointer, so never forward a
+	 * NULL one.
+	 */
+	if (pErrorCode == NULL)
+	{
+		pErrorCode = &fallback_error;
+	}
+
+	u_strToUTF8 (NULL, 0, &preflight_length,
+		     src, srcLength,
+		     &preflight_error);
+
+	/* With a zero capacity, the pre-flighting is expected to end with one
+	 * of these two codes. Anything else means that there is nothing to
+	 * convert (e.g. invalid input).
+	 */
+	if (preflight_error != U_BUFFER_OVERFLOW_ERROR &&
+	    preflight_error != U_STRING_NOT_TERMINATED_WARNING)
+	{
+		if (pDestLength != NULL)
+		{
+			*pDestLength = preflight_length;
+		}
+
+		*pErrorCode = preflight_error;
+		return NULL;
+	}
+
+	dest = g_malloc0 (preflight_length + 1);
+
+	u_strToUTF8 (dest, preflight_length + 1, pDestLength,
+		     src, srcLength,
+		     pErrorCode);
+
+	/* Also covers the case where *pErrorCode was already a failure on
+	 * entry: ICU then returns without converting anything.
+	 */
+	if (U_FAILURE (*pErrorCode))
+	{
+		g_free (dest);
+		return NULL;
+	}
+
+	return dest;
+}
+
+/* Converts a nul-terminated UTF-8 string to UTF-16, without exposing the ICU
+ * error code to the caller.
+ *
+ * Returns: (transfer full) (nullable): a nul-terminated UTF-16 string, or NULL
+ * if the conversion failed. Free with g_free() when no longer needed.
+ */
+UChar *
+_tepl_icu_strFromUTF8Simple (const char *utf8_str)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	UChar *utf16_str = _tepl_icu_strFromUTF8 (NULL, utf8_str, -1, &status);
+
+	if (U_SUCCESS (status))
+	{
+		return utf16_str;
+	}
+
+	/* Don't hand out a buffer whose content cannot be trusted. */
+	g_free (utf16_str);
+	return NULL;
+}
+
+/* Converts a nul-terminated UTF-16 string to UTF-8, without exposing the ICU
+ * error code to the caller.
+ *
+ * Returns: (transfer full) (nullable): a nul-terminated UTF-8 string, or NULL
+ * if the conversion failed. Free with g_free() when no longer needed.
+ */
+char *
+_tepl_icu_strToUTF8Simple (const UChar *uchars)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	char *result = _tepl_icu_strToUTF8 (NULL, uchars, -1, &status);
+
+	if (U_SUCCESS (status))
+	{
+		return result;
+	}
+
+	/* Don't hand out a buffer whose content cannot be trusted. */
+	g_free (result);
+	return NULL;
+}
+
+/* Duplicates a nul-terminated UTF-16 string. A NULL @uchars gives NULL.
+ *
+ * Returns: (transfer full) (nullable): a copy of @uchars. Free with g_free()
+ * when no longer needed.
+ */
+UChar *
+_tepl_icu_strdup (const UChar *uchars)
+{
+	UChar *duplicate;
+	int32_t n_units;
+
+	if (uchars == NULL)
+	{
+		return NULL;
+	}
+
+	/* Number of UChars to copy, nul terminator included. */
+	n_units = u_strlen (uchars) + 1;
+
+	duplicate = g_new0 (UChar, n_units);
+	u_strncpy (duplicate, uchars, n_units);
+
+	return duplicate;
+}
+
+/* A wrapper around utrans_openU() that takes the ID as a UTF-8 string and opens
+ * it in the forward direction, without custom rules and without parse error
+ * reporting.
+ *
+ * Returns: the opened transliterator, or NULL (after a warning) if @utf8_id
+ * could not be converted or if ICU reported a failure. Close with
+ * utrans_close().
+ */
+UTransliterator *
+_tepl_icu_trans_openUSimple (const char *utf8_id)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	UTransliterator *trans;
+	UChar *utf16_id;
+
+	utf16_id = _tepl_icu_strFromUTF8Simple (utf8_id);
+	g_return_val_if_fail (utf16_id != NULL, NULL);
+
+	trans = utrans_openU (utf16_id, -1,
+			      UTRANS_FORWARD,
+			      NULL, 0,
+			      NULL, &status);
+
+	/* The ID is not needed once the transliterator is opened. */
+	g_free (utf16_id);
+
+	if (U_SUCCESS (status))
+	{
+		return trans;
+	}
+
+	g_warn_if_reached ();
+
+	if (trans != NULL)
+	{
+		utrans_close (trans);
+	}
+
+	return NULL;
+}
+
+/* Opens the transliterator used to escape text as XML hexadecimal character
+ * references. Returns NULL if it could not be opened.
+ */
+UTransliterator *
+_tepl_icu_trans_open_xml_escape (void)
+{
+	/* The filter in front of Any-Hex/XML leaves certain printable ASCII
+	 * characters untouched instead of escaping everything, which keeps
+	 * the XML content easier to read and debug.
+	 *
+	 * To try out the transform, round-trip included, use the uconv
+	 * command:
+	 * $ echo -n -e '\t' | uconv -x '[^a-zA-Z0-9.,;/_\x2D\x3A] Any-Hex/XML' | uconv -x 'Hex-Any/XML'
+	 *
+	 * "\\x2D" is '-' and "\\x3A" is ':'.
+	 */
+	const char *xml_escape_id = "[^a-zA-Z0-9.,;/_\\x2D\\x3A] Any-Hex/XML";
+
+	return _tepl_icu_trans_openUSimple (xml_escape_id);
+}
+
+/* Like utrans_transUChars(), but simpler to use: the whole string is
+ * transformed and the result is returned in a new buffer.
+ *
+ * @trans: the transliterator to apply. Must not be NULL.
+ * @src: the string to transform. Must be non-NULL and nul-terminated; it is not
+ *   modified.
+ *
+ * Returns: (transfer full) (nullable): the transformed string, as a
+ * newly-allocated nul-terminated buffer of the right size, or NULL on failure.
+ * Free with g_free() when no longer needed.
+ */
+UChar *
+_tepl_icu_trans_transUCharsSimple (const UTransliterator *trans,
+				   const UChar           *src)
+{
+	UChar *scratch;
+	UChar *dest;
+	int32_t src_length;
+	int32_t text_length;
+	int32_t limit;
+	int32_t dest_capacity;
+	UErrorCode error_code = U_ZERO_ERROR;
+
+	/* u_strlen() below would crash on a NULL string. */
+	g_return_val_if_fail (trans != NULL, NULL);
+	g_return_val_if_fail (src != NULL, NULL);
+
+	src_length = u_strlen (src);
+
+	/* Pre-flighting: utrans_transUChars() works in place, so run it on a
+	 * throw-away copy that has no spare room. text_length then receives
+	 * the length of the transformed text, even if it didn't fit.
+	 */
+
+	scratch = _tepl_icu_strdup (src);
+	text_length = src_length;
+	limit = src_length;
+
+	utrans_transUChars (trans,
+			    scratch, &text_length, src_length + 1,
+			    0, &limit,
+			    &error_code);
+
+	g_free (scratch);
+
+	/* An overflow is the expected outcome when the text grows. Warnings
+	 * are not failures, so they need no special case.
+	 */
+	if (error_code != U_BUFFER_OVERFLOW_ERROR &&
+	    U_FAILURE (error_code))
+	{
+		g_warn_if_reached ();
+		return NULL;
+	}
+
+	/* Do the real transform, in a buffer big enough for both the source
+	 * text and the result, nul terminator included.
+	 */
+
+	dest_capacity = MAX (text_length + 1, src_length + 1);
+	dest = g_new0 (UChar, dest_capacity);
+	u_strncpy (dest, src, src_length + 1);
+
+	limit = src_length;
+	error_code = U_ZERO_ERROR;
+
+	utrans_transUChars (trans,
+			    dest, NULL, dest_capacity,
+			    0, &limit,
+			    &error_code);
+
+	if (U_FAILURE (error_code))
+	{
+		g_warn_if_reached ();
+		g_free (dest);
+		return NULL;
+	}
+
+	return dest;
+}
diff --git a/tepl/tepl-icu.h b/tepl/tepl-icu.h
new file mode 100644
index 0000000..3d48d15
--- /dev/null
+++ b/tepl/tepl-icu.h
@@ -0,0 +1,47 @@
+/* SPDX-FileCopyrightText: 2020 - Sébastien Wilmet <swilmet gnome org>
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef TEPL_ICU_H
+#define TEPL_ICU_H
+
+#include <glib.h>
+#include <unicode/ustring.h>
+#include <unicode/utrans.h>
+
+G_BEGIN_DECLS
+
+/* UTF-8 -> UTF-16 conversion with pre-flighting; the result is freed with
+ * g_free().
+ */
+G_GNUC_INTERNAL
+UChar * _tepl_icu_strFromUTF8 (int32_t *pDestLength,
+ const char *src,
+ int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+/* UTF-16 -> UTF-8 conversion with pre-flighting; the result is freed with
+ * g_free().
+ */
+G_GNUC_INTERNAL
+char * _tepl_icu_strToUTF8 (int32_t *pDestLength,
+ const UChar *src,
+ int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+/* Same as _tepl_icu_strFromUTF8() for a nul-terminated string; NULL on error. */
+G_GNUC_INTERNAL
+UChar * _tepl_icu_strFromUTF8Simple (const char *utf8_str);
+
+/* Same as _tepl_icu_strToUTF8() for a nul-terminated string; NULL on error. */
+G_GNUC_INTERNAL
+char * _tepl_icu_strToUTF8Simple (const UChar *uchars);
+
+/* Copies a nul-terminated UTF-16 string; NULL gives NULL. */
+G_GNUC_INTERNAL
+UChar * _tepl_icu_strdup (const UChar *uchars);
+
+/* Opens a forward transliterator from an ID given in UTF-8; NULL on error. */
+G_GNUC_INTERNAL
+UTransliterator * _tepl_icu_trans_openUSimple (const char *utf8_id);
+
+/* Opens the Any-Hex/XML transliterator that keeps some printable ASCII as is. */
+G_GNUC_INTERNAL
+UTransliterator * _tepl_icu_trans_open_xml_escape (void);
+
+/* Transforms a whole nul-terminated string into a newly-allocated buffer. */
+G_GNUC_INTERNAL
+UChar * _tepl_icu_trans_transUCharsSimple (const UTransliterator *trans,
+ const UChar *src);
+
+G_END_DECLS
+
+#endif /* TEPL_ICU_H */
diff --git a/testsuite/meson.build b/testsuite/meson.build
index e4577c4..c0bbe87 100644
--- a/testsuite/meson.build
+++ b/testsuite/meson.build
@@ -3,6 +3,7 @@ unit_tests = [
'test-file-loader',
'test-file-saver',
'test-fold-region',
+ 'test-icu',
'test-info-bar',
'test-metadata',
'test-metadata-manager',
diff --git a/testsuite/test-icu.c b/testsuite/test-icu.c
new file mode 100644
index 0000000..9f0a545
--- /dev/null
+++ b/testsuite/test-icu.c
@@ -0,0 +1,118 @@
+/* SPDX-FileCopyrightText: 2020 - Sébastien Wilmet <swilmet gnome org>
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include "tepl/tepl-icu.h"
+
+/* Converts @utf8_str to UTF-16 and back with the functions that report a
+ * UErrorCode. If @expect_success is FALSE, the first conversion must fail.
+ * Otherwise both conversions must succeed and give back the original string.
+ */
+static void
+check_str_from_and_to_utf8_raw (const gchar *utf8_str,
+				gboolean     expect_success)
+{
+	UChar *uchars;
+	gchar *utf8_str_after_round_trip;
+	UErrorCode error_code = U_ZERO_ERROR;
+
+	uchars = _tepl_icu_strFromUTF8 (NULL, utf8_str, -1, &error_code);
+	if (!expect_success)
+	{
+		g_assert_true (U_FAILURE (error_code));
+		g_free (uchars);
+		return;
+	}
+
+	g_assert_true (U_SUCCESS (error_code));
+	g_assert_true (uchars != NULL);
+
+	error_code = U_ZERO_ERROR;
+	utf8_str_after_round_trip = _tepl_icu_strToUTF8 (NULL, uchars, -1, &error_code);
+
+	/* The way back must be checked too, not only the resulting string. */
+	g_assert_true (U_SUCCESS (error_code));
+	g_assert_cmpstr (utf8_str, ==, utf8_str_after_round_trip);
+
+	g_free (uchars);
+	g_free (utf8_str_after_round_trip);
+}
+
+/* Same round-trip check as the raw variant, but with the *Simple() functions,
+ * which report a failure by returning NULL.
+ */
+static void
+check_str_from_and_to_utf8_simple (const gchar *utf8_str,
+				   gboolean     expect_success)
+{
+	UChar *utf16_str = _tepl_icu_strFromUTF8Simple (utf8_str);
+	gchar *round_trip;
+
+	if (!expect_success)
+	{
+		g_assert_true (utf16_str == NULL);
+		return;
+	}
+
+	g_assert_true (utf16_str != NULL);
+
+	round_trip = _tepl_icu_strToUTF8Simple (utf16_str);
+	g_assert_cmpstr (utf8_str, ==, round_trip);
+
+	g_free (round_trip);
+	g_free (utf16_str);
+}
+
+/* Runs the round-trip check on @utf8_str with both sets of functions: the ones
+ * that report a UErrorCode, and the *Simple() ones.
+ */
+static void
+check_str_from_and_to_utf8 (const gchar *utf8_str,
+ gboolean expect_success)
+{
+ check_str_from_and_to_utf8_raw (utf8_str, expect_success);
+ check_str_from_and_to_utf8_simple (utf8_str, expect_success);
+}
+
+/* Round-trips a few strings between UTF-8 and UTF-16. */
+static void
+test_str_from_and_to_utf8 (void)
+{
+	static const struct
+	{
+		const gchar *utf8_str;
+		gboolean expect_success;
+	} cases[] = {
+		{ NULL, FALSE },
+		{ "", TRUE },
+		{ "ASCII", TRUE },
+		{ "À ski", TRUE },
+		/* Not valid UTF-8. */
+		{ "\xFF", FALSE },
+	};
+	guint i;
+
+	for (i = 0; i < G_N_ELEMENTS (cases); i++)
+	{
+		check_str_from_and_to_utf8 (cases[i].utf8_str, cases[i].expect_success);
+	}
+}
+
+/* Checks that _tepl_icu_strdup() gives an independent copy with the same
+ * content, and that it accepts NULL.
+ */
+static void
+test_strdup (void)
+{
+	UChar *uchars;
+	UChar *uchars_copy;
+	gchar *utf8_str;
+
+	/* A NULL string has no copy. */
+	g_assert_true (_tepl_icu_strdup (NULL) == NULL);
+
+	uchars = _tepl_icu_strFromUTF8Simple ("Évo");
+	g_assert_true (uchars != NULL);
+
+	uchars_copy = _tepl_icu_strdup (uchars);
+	g_assert_true (uchars_copy != NULL);
+	/* It must be a new buffer, since both are freed below. */
+	g_assert_true (uchars_copy != uchars);
+
+	utf8_str = _tepl_icu_strToUTF8Simple (uchars_copy);
+	g_assert_cmpstr (utf8_str, ==, "Évo");
+
+	g_free (uchars);
+	g_free (uchars_copy);
+	g_free (utf8_str);
+}
+
+/* Opens the XML escape transliterator and runs it on two small strings, so
+ * that _tepl_icu_trans_transUCharsSimple() is exercised as well.
+ */
+static void
+test_trans_open (void)
+{
+	UTransliterator *transliterator;
+	UChar *src;
+	UChar *transformed;
+	gchar *utf8_result;
+
+	transliterator = _tepl_icu_trans_open_xml_escape ();
+	g_assert_true (transliterator != NULL);
+
+	/* ASCII letters are excluded by the filter: the text is unchanged. */
+	src = _tepl_icu_strFromUTF8Simple ("abc");
+	g_assert_true (src != NULL);
+	transformed = _tepl_icu_trans_transUCharsSimple (transliterator, src);
+	g_assert_true (transformed != NULL);
+	utf8_result = _tepl_icu_strToUTF8Simple (transformed);
+	g_assert_cmpstr (utf8_result, ==, "abc");
+	g_free (src);
+	g_free (transformed);
+	g_free (utf8_result);
+
+	/* A tab is escaped, so the result is longer than the source. Only the
+	 * delimiters of the character reference are checked, not the number
+	 * of hexadecimal digits.
+	 */
+	src = _tepl_icu_strFromUTF8Simple ("\t");
+	g_assert_true (src != NULL);
+	transformed = _tepl_icu_trans_transUCharsSimple (transliterator, src);
+	g_assert_true (transformed != NULL);
+	utf8_result = _tepl_icu_strToUTF8Simple (transformed);
+	g_assert_true (utf8_result != NULL);
+	g_assert_true (g_str_has_prefix (utf8_result, "&#x"));
+	g_assert_true (g_str_has_suffix (utf8_result, ";"));
+	g_free (src);
+	g_free (transformed);
+	g_free (utf8_result);
+
+	utrans_close (transliterator);
+}
+
+/* Registers the ICU unit tests and runs them. */
+int
+main (int    argc,
+      char **argv)
+{
+	static const struct
+	{
+		const char *path;
+		GTestFunc func;
+	} tests[] = {
+		{ "/icu/str_from_and_to_utf8", test_str_from_and_to_utf8 },
+		{ "/icu/strdup", test_strdup },
+		{ "/icu/trans_open", test_trans_open },
+	};
+	guint i;
+
+	g_test_init (&argc, &argv, NULL);
+
+	for (i = 0; i < G_N_ELEMENTS (tests); i++)
+	{
+		g_test_add_func (tests[i].path, tests[i].func);
+	}
+
+	return g_test_run ();
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]