[glib] gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions

From: Philip Withnall <pwithnall src gnome org>
To: commits-list gnome org
Cc:
Subject: [glib] gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions
Date: Fri, 19 Jan 2018 12:10:56 +0000 (UTC)
commit 81cd8154061338dfee7a9d3e23752efe190310bb
Author: Mikhail Zabaluev <mikhail zabaluev gmail com>
Date:   Sun Jan 14 16:55:03 2018 +0200

    gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions
    
    The character encoding conversion utility functions g_locale_to_utf8()
    and g_filename_to_utf8() had inconsistent behavior on producing strings
    with inner NUL bytes: in the all-UTF-8 strdup path, the input string
    validation prohibits embedded NULs, while g_convert(), using iconv(),
    can produce UTF-8 output with NUL bytes inside the output buffer.
    This, while valid UTF-8 per the Unicode standard, is not valid for
    the nul-terminated (type utf8) return value format that the *_to_utf8()
    functions are annotated with (as per discussion in bug 756128).
    
    Check the output of g_convert() for embedded NUL bytes, and if any
    are found, set the newly introduced error
    G_CONVERT_ERROR_EMBEDDED_NUL.
    
    Also document the error set by g_{locale,filename}_{from,to}_utf8()
    when the input string contains nul bytes.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=792516

 glib/gconvert.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++--------
 glib/gconvert.h |    6 +++-
 2 files changed, 76 insertions(+), 14 deletions(-)
---
diff --git a/glib/gconvert.c b/glib/gconvert.c
index 083ea17..586b53a 100644
--- a/glib/gconvert.c
+++ b/glib/gconvert.c
@@ -866,6 +866,40 @@ strdup_len (const gchar *string,
   return g_strndup (string, real_len);
 }
 
+static gchar *
+convert_to_utf8 (const gchar *opsysstring,
+                 gssize       len,
+                 const gchar *charset,
+                 gsize       *bytes_read,
+                 gsize       *bytes_written,
+                 GError     **error)
+{
+  gchar *utf8;
+  gsize outbytes;
+  /* Convert to UTF-8; keep the output length so interior NULs can be found. */
+  utf8 = g_convert (opsysstring, len, "UTF-8", charset,
+                    bytes_read, &outbytes, error);
+  if (utf8 == NULL)
+    {
+      if (bytes_written)
+        *bytes_written = 0;
+      return NULL;
+    }
+  if (memchr (utf8, '\0', outbytes) != NULL)
+    {
+      g_free (utf8);
+      if (bytes_written)
+        *bytes_written = 0;
+      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
+                           _("Embedded NUL byte in conversion output"));
+      return NULL;
+    }
+  /* No embedded NUL: hand back the string and, if requested, its length. */
+  if (bytes_written)
+    *bytes_written = outbytes;
+  return utf8;
+}
+
 /**
  * g_locale_to_utf8:
  * @opsysstring:   a string in the encoding of the current locale. On Windows
@@ -879,7 +913,7 @@ strdup_len (const gchar *string,
  *                 Even if the conversion was successful, this may be 
  *                 less than @len if there were partial characters
  *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
  *                 stored will the byte offset after the last valid
  *                 input sequence.
  * @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -890,6 +924,14 @@ strdup_len (const gchar *string,
  * Converts a string which is in the encoding used for strings by
  * the C runtime (usually the same as that used by the operating
  * system) in the [current locale][setlocale] into a UTF-8 string.
+ *
+ * If the source encoding is not UTF-8 and the conversion output contains a
+ * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
+ * function returns %NULL.
+ * If the source encoding is UTF-8, an embedded nul character is instead
+ * reported as %G_CONVERT_ERROR_ILLEGAL_SEQUENCE, for backward compatibility
+ * with earlier versions of this library. Use g_convert() to produce output
+ * that may contain embedded nul characters.
  * 
  * Returns: A newly-allocated buffer containing the converted string,
  *               or %NULL on an error, and error will be set.
@@ -906,23 +948,21 @@ g_locale_to_utf8 (const gchar  *opsysstring,
   if (g_get_charset (&charset))
     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
-    return g_convert (opsysstring, len, 
-                     "UTF-8", charset, bytes_read, bytes_written, error);
+    return convert_to_utf8 (opsysstring, len, charset,
+                            bytes_read, bytes_written, error);
 }
 
 /**
  * g_locale_from_utf8:
  * @utf8string:    a UTF-8 encoded string 
  * @len:           the length of the string, or -1 if the string is
- *                 nul-terminated (Note that some encodings may allow nul
- *                 bytes to occur inside strings. In that case, using -1
- *                 for the @len parameter is unsafe)
+ *                 nul-terminated.
  * @bytes_read: (out) (optional): location to store the number of bytes in the
  *                 input string that were successfully converted, or %NULL.
  *                 Even if the conversion was successful, this may be 
  *                 less than @len if there were partial characters
  *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
  *                 stored will the byte offset after the last valid
  *                 input sequence.
  * @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -934,7 +974,12 @@ g_locale_to_utf8 (const gchar  *opsysstring,
  * the C runtime (usually the same as that used by the operating
  * system) in the [current locale][setlocale]. On Windows this means
  * the system codepage.
- * 
+ *
+ * The input string should not contain nul characters even if the @len
+ * argument is positive. A nul character found inside the string may result
+ * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
+ * input that may contain embedded nul characters.
+ *
  * Returns: A newly-allocated buffer containing the converted string,
  *               or %NULL on an error, and error will be set.
  **/
@@ -1126,7 +1171,7 @@ get_filename_charset (const gchar **filename_charset)
  *                 Even if the conversion was successful, this may be 
  *                 less than @len if there were partial characters
  *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
  *                 stored will the byte offset after the last valid
  *                 input sequence.
  * @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -1138,6 +1183,14 @@ get_filename_charset (const gchar **filename_charset)
  * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
  * for filenames; on other platforms, this function indirectly depends on 
  * the [current locale][setlocale].
+ *
+ * If the source encoding is not UTF-8 and the conversion output contains a
+ * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
+ * function returns %NULL.
+ * If the source encoding is UTF-8, an embedded nul character is instead
+ * reported as %G_CONVERT_ERROR_ILLEGAL_SEQUENCE, for backward compatibility
+ * with earlier versions of this library. Use g_convert() to produce output
+ * that may contain embedded nul characters.
  * 
  * Returns: The converted string, or %NULL on an error.
  **/
@@ -1155,8 +1208,8 @@ g_filename_to_utf8 (const gchar *opsysstring,
   if (get_filename_charset (&charset))
     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
-    return g_convert (opsysstring, len, 
-                     "UTF-8", charset, bytes_read, bytes_written, error);
+    return convert_to_utf8 (opsysstring, len, charset,
+                            bytes_read, bytes_written, error);
 }
 
 /**
@@ -1169,7 +1222,7 @@ g_filename_to_utf8 (const gchar *opsysstring,
  *                 Even if the conversion was successful, this may be 
  *                 less than @len if there were partial characters
  *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
  *                 stored will the byte offset after the last valid
  *                 input sequence.
  * @bytes_written: (out): the number of bytes stored in the output buffer (not 
@@ -1181,7 +1234,12 @@ g_filename_to_utf8 (const gchar *opsysstring,
  * filenames. Note that on Windows GLib uses UTF-8 for filenames;
  * on other platforms, this function indirectly depends on the 
  * [current locale][setlocale].
- * 
+ *
+ * The input string should not contain nul characters even if the @len
+ * argument is positive. A nul character found inside the string may result
+ * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Note that nul bytes are
+ * prohibited in all filename encodings that GLib is known to work with.
+ *
  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
  *               The converted string, or %NULL on an error.
  **/
diff --git a/glib/gconvert.h b/glib/gconvert.h
index f064e41..ea93006 100644
--- a/glib/gconvert.h
+++ b/glib/gconvert.h
@@ -43,6 +43,9 @@ G_BEGIN_DECLS
  * @G_CONVERT_ERROR_BAD_URI: URI is invalid.
  * @G_CONVERT_ERROR_NOT_ABSOLUTE_PATH: Pathname is not an absolute path.
  * @G_CONVERT_ERROR_NO_MEMORY: No memory available. Since: 2.40
+ * @G_CONVERT_ERROR_EMBEDDED_NUL: An embedded NUL character is present in
+ *     conversion output where a NUL-terminated string is expected.
+ *     Since: 2.56
  *
  * Error codes returned by character set conversion routines.
  */
@@ -54,7 +57,8 @@ typedef enum
   G_CONVERT_ERROR_PARTIAL_INPUT,
   G_CONVERT_ERROR_BAD_URI,
   G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
-  G_CONVERT_ERROR_NO_MEMORY
+  G_CONVERT_ERROR_NO_MEMORY,
+  G_CONVERT_ERROR_EMBEDDED_NUL
 } GConvertError;
 
 /**
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]