[pygobject] Add support for bytes and non-utf-8 file names.



commit d6e46f778ea2bfede89f4fe2422b80998ed7fec8
Author: Christoph Reiter <creiter src gnome org>
Date:   Fri Jun 3 15:51:37 2016 +0200

    Add support for bytes and non-utf-8 file names.
    
    Py2+Unix: Convert unicode to bytes using the fsencoding. Pass bytes as is.
    Returns path as bytes as is.
    
    Py2+Windows: Convert unicode to wtf-8. Pass bytes as is.
    Returns path as bytes as is.
    
    Py3+Unix: Convert str using os.fsencode so that the surrogateescape handler
    can restore the real path if the source was a Python API such as os.listdir,
    sys.argv, etc. Pass bytes as is. Return str decoded using os.fsdecode so that
    it can be passed to Python APIs such as open, os.listdir, etc.
    
    Py3+Windows: Convert str to wtf-8. Decode bytes using the fsencoding first.
    Returns str + surrogates.
    
    This change makes anything taking filenames on Python 3 behave the same
    as Python functions like listdir() or open(). Compared to Python 3 builtin
    functions, which return the same type that was passed in, we always return str.
    
    One remaining problem is that glib assumes that Windows paths are utf-16,
    while they are just arrays of 16-bit values, and as a result utf-8 is not
    enough to represent all possible paths. We use wtf-8 here instead (utf-8 with
    lone surrogates), which allows us to convert all paths, but glib functions
    accessing the fs will fail for paths that contain lone surrogates.
    
    PyUnicode_EncodeFSDefault was added in CPython 3.2 so bump the requirement.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=746564

 configure.ac                    |    2 +-
 gi/pygi-basictype.c             |  186 +++++++++++++++++++++++++++++++--------
 gi/pyglib-python-compat.h       |    2 +
 tests/compathelper.py           |    4 +
 tests/gimarshallingtestsextra.c |   14 +++
 tests/gimarshallingtestsextra.h |    1 +
 tests/test_gi.py                |  152 +++++++++++++++++++++++++++++++-
 7 files changed, 320 insertions(+), 41 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index b8a811e..b9f1adc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -13,7 +13,7 @@ AC_PREREQ([2.68])
 #   $ ./configure --with-python=~/my-patched-python/python
 
 m4_define(python_min_ver, 2.7)
-m4_define(python3_min_ver, 3.1)
+m4_define(python3_min_ver, 3.2)
 
 dnl the pygobject version number
 m4_define(pygobject_major_version, 3)
diff --git a/gi/pygi-basictype.c b/gi/pygi-basictype.c
index 4a5e112..6d4e64e 100644
--- a/gi/pygi-basictype.c
+++ b/gi/pygi-basictype.c
@@ -246,53 +246,167 @@ _pygi_marshal_from_py_utf8 (PyObject          *py_arg,
     return TRUE;
 }
 
-static gboolean
-_pygi_marshal_from_py_filename (PyObject          *py_arg,
-                                GIArgument        *arg,
-                                gpointer          *cleanup_data)
+G_GNUC_UNUSED static gboolean
+_pygi_marshal_from_py_filename_unix (PyObject          *py_arg,
+                                     GIArgument        *arg,
+                                     gpointer          *cleanup_data)
 {
-    gchar *string_;
-    GError *error = NULL;
-    PyObject *tmp = NULL;
+    gchar *filename;
 
     if (py_arg == Py_None) {
         arg->v_pointer = NULL;
         return TRUE;
     }
 
-    if (PyUnicode_Check (py_arg)) {
-        tmp = PyUnicode_AsUTF8String (py_arg);
-        if (!tmp)
+    if (PYGLIB_PyBytes_Check (py_arg)) {
+        char *buffer;
+
+        if (PYGLIB_PyBytes_AsStringAndSize (py_arg, &buffer, NULL) == -1)
             return FALSE;
 
-        string_ = PYGLIB_PyBytes_AsString (tmp);
-    }
+        filename = g_strdup (buffer);
+    } else if (PyUnicode_Check (py_arg)) {
+        PyObject *bytes;
+        char *buffer;
+
 #if PY_VERSION_HEX < 0x03000000
-    else if (PyString_Check (py_arg)) {
-        string_ = PyString_AsString (py_arg);
-    }
+        bytes = PyUnicode_AsEncodedString (py_arg, Py_FileSystemDefaultEncoding,
+                                           NULL);
+#else
+        bytes = PyUnicode_EncodeFSDefault (py_arg);
 #endif
-    else {
-        PyErr_Format (PyExc_TypeError, "Must be string, not %s",
+
+        if (!bytes)
+            return FALSE;
+
+        if (PYGLIB_PyBytes_AsStringAndSize (bytes, &buffer, NULL) == -1) {
+            Py_DECREF (bytes);
+            return FALSE;
+        }
+
+        filename = g_strdup (buffer);
+        Py_DECREF (bytes);
+    } else {
+        PyErr_Format (PyExc_TypeError, "Must be bytes, not %s",
                       py_arg->ob_type->tp_name);
         return FALSE;
     }
 
-    arg->v_string = g_filename_from_utf8 (string_, -1, NULL, NULL, &error);
-    Py_XDECREF (tmp);
+    arg->v_string = filename;
+    *cleanup_data = filename;
+    return TRUE;
+}
 
-    if (arg->v_string == NULL) {
-        PyErr_SetString (PyExc_Exception, error->message);
-        g_error_free (error);
-        /* TODO: Convert the error to an exception. */
+G_GNUC_UNUSED static gboolean
+_pygi_marshal_from_py_filename_win32 (PyObject          *py_arg,
+                                      GIArgument        *arg,
+                                      gpointer          *cleanup_data)
+{
+    gchar *filename;
+
+    if (py_arg == Py_None) {
+        arg->v_pointer = NULL;
+        return TRUE;
+    }
+
+#if PY_VERSION_HEX < 0x03000000
+    if (PYGLIB_PyBytes_Check (py_arg)) {
+        char *buffer;
+
+        if (PYGLIB_PyBytes_AsStringAndSize (py_arg, &buffer, NULL) == -1)
+            return FALSE;
+
+        filename = g_strdup (buffer);
+    } else if (PyUnicode_Check (py_arg)) {
+        PyObject *bytes;
+        char *buffer;
+
+        bytes = PyUnicode_AsUTF8String (py_arg);
+        if (!bytes)
+            return FALSE;
+
+        if (PYGLIB_PyBytes_AsStringAndSize (bytes, &buffer, NULL) == -1) {
+            Py_DECREF (bytes);
+            return FALSE;
+        }
+
+        filename = g_strdup (buffer);
+        Py_DECREF (bytes);
+    } else {
+        PyErr_Format (PyExc_TypeError, "Must be unicode, not %s",
+                      py_arg->ob_type->tp_name);
         return FALSE;
     }
+#else
+    if (PYGLIB_PyBytes_Check (py_arg)) {
+        PyObject *uni_arg;
+        gboolean result;
+        char *buffer;
 
-    *cleanup_data = arg->v_string;
+        if (PYGLIB_PyBytes_AsStringAndSize (py_arg, &buffer, NULL) == -1)
+            return FALSE;
+
+        uni_arg = PyUnicode_DecodeFSDefault (buffer);
+        if (!uni_arg)
+            return FALSE;
+        result = _pygi_marshal_from_py_filename_win32 (uni_arg, arg, cleanup_data);
+        Py_DECREF (uni_arg);
+        return result;
+    } else if (PyUnicode_Check (py_arg)) {
+        PyObject *bytes, *temp_uni;
+        char *buffer;
+
+        /* The roundtrip merges lone surrogates, so we get the same output as
+         * with Py 2. Requires 3.4+ because of https://bugs.python.org/issue27971
+         * Separated lone surrogates can occur when concatenating two paths.
+         */
+        bytes = PyUnicode_AsEncodedString (py_arg, "utf-16-le", "surrogatepass");
+        if (!bytes)
+            return FALSE;
+        temp_uni = PyUnicode_FromEncodedObject (bytes, "utf-16-le", "surrogatepass");
+        Py_DECREF (bytes);
+        if (!temp_uni)
+            return FALSE;
+        /* glib uses utf-8, so encode to that and allow surrogates so we can
+         * represent all possible path values
+         */
+        bytes = PyUnicode_AsEncodedString (temp_uni, "utf-8", "surrogatepass");
+        Py_DECREF (temp_uni);
+        if (!bytes)
+            return FALSE;
+
+        if (PYGLIB_PyBytes_AsStringAndSize (bytes, &buffer, NULL) == -1) {
+            Py_DECREF (bytes);
+            return FALSE;
+        }
+
+        filename = g_strdup (buffer);
+        Py_DECREF (bytes);
+    } else {
+        PyErr_Format (PyExc_TypeError, "Must be str, not %s",
+                      py_arg->ob_type->tp_name);
+        return FALSE;
+    }
+#endif
+
+    arg->v_string = filename;
+    *cleanup_data = filename;
     return TRUE;
 }
 
 static gboolean
+_pygi_marshal_from_py_filename (PyObject          *py_arg,
+                                GIArgument        *arg,
+                                gpointer          *cleanup_data)
+{
+#ifdef G_OS_WIN32
+    return _pygi_marshal_from_py_filename_win32 (py_arg, arg, cleanup_data);
+#else
+    return _pygi_marshal_from_py_filename_unix (py_arg, arg, cleanup_data);
+#endif
+}
+
+static gboolean
 _pygi_marshal_from_py_long (PyObject   *object,   /* in */
                             GIArgument *arg,      /* out */
                             GITypeTag   type_tag,
@@ -617,23 +731,23 @@ _pygi_marshal_to_py_utf8 (GIArgument *arg)
 static PyObject *
 _pygi_marshal_to_py_filename (GIArgument *arg)
 {
-    gchar *string = NULL;
-    PyObject *py_obj = NULL;
-    GError *error = NULL;
+    PyObject *py_obj;
 
     if (arg->v_string == NULL) {
         Py_RETURN_NONE;
     }
 
-    string = g_filename_to_utf8 (arg->v_string, -1, NULL, NULL, &error);
-    if (string == NULL) {
-        PyErr_SetString (PyExc_Exception, error->message);
-        /* TODO: Convert the error to an exception. */
-        return NULL;
-    }
-
-    py_obj = PYGLIB_PyUnicode_FromString (string);
-    g_free (string);
+#if PY_VERSION_HEX < 0x03000000
+    /* On PY2 we return str as is */
+    py_obj = PyString_FromString (arg->v_string);
+#else
+#ifdef G_OS_WIN32
+    py_obj = PyUnicode_DecodeUTF8 (arg->v_string, strlen(arg->v_string),
+                                   "surrogatepass");
+#else
+    py_obj = PyUnicode_DecodeFSDefault (arg->v_string);
+#endif
+#endif
 
     return py_obj;
 }
diff --git a/gi/pyglib-python-compat.h b/gi/pyglib-python-compat.h
index 7f18452..d6f7553 100644
--- a/gi/pyglib-python-compat.h
+++ b/gi/pyglib-python-compat.h
@@ -54,6 +54,7 @@
 #define PYGLIB_PyBytes_FromStringAndSize PyString_FromStringAndSize
 #define PYGLIB_PyBytes_Resize _PyString_Resize
 #define PYGLIB_PyBytes_AsString PyString_AsString
+#define PYGLIB_PyBytes_AsStringAndSize PyString_AsStringAndSize
 #define PYGLIB_PyBytes_Size PyString_Size
 #define PYGLIB_PyBytes_Check PyString_Check
 
@@ -189,6 +190,7 @@ PyTypeObject symbol = {                                 \
 #define PYGLIB_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
 #define PYGLIB_PyBytes_Resize(o, len) _PyBytes_Resize(o, len)
 #define PYGLIB_PyBytes_AsString PyBytes_AsString
+#define PYGLIB_PyBytes_AsStringAndSize PyBytes_AsStringAndSize
 #define PYGLIB_PyBytes_Size PyBytes_Size
 #define PYGLIB_PyBytes_Check PyBytes_Check
 
diff --git a/tests/compathelper.py b/tests/compathelper.py
index e5de550..d7ad650 100644
--- a/tests/compathelper.py
+++ b/tests/compathelper.py
@@ -1,6 +1,8 @@
 import sys
 import collections
 
+PY2 = PY3 = False
+
 if sys.version_info >= (3, 0):
     '''
     for tests that need to test long values in python 2
@@ -67,6 +69,7 @@ if sys.version_info >= (3, 0):
     callable = lambda x: isinstance(x, collections.Callable)
     from io import StringIO
     StringIO
+    PY3 = True
 else:
     _long = long
     _basestring = basestring
@@ -75,3 +78,4 @@ else:
     callable = callable
     from StringIO import StringIO
     StringIO
+    PY2 = True
diff --git a/tests/gimarshallingtestsextra.c b/tests/gimarshallingtestsextra.c
index 85a9fba..eee3a14 100644
--- a/tests/gimarshallingtestsextra.c
+++ b/tests/gimarshallingtestsextra.c
@@ -17,6 +17,7 @@
  */
 
 #include "gimarshallingtestsextra.h"
+#include <string.h>
 
 void
 gi_marshalling_tests_compare_two_gerrors_in_gvalue (GValue *v, GValue *v1)
@@ -82,6 +83,19 @@ gi_marshalling_tests_filename_copy (gchar *path_in)
 }
 
 /**
+ * gi_marshalling_tests_filename_to_glib_repr:
+ * @path_in: (type filename) (nullable)
+ *
+ * Returns: (array length=len) (element-type guint8)
+ */
+gchar *
+gi_marshalling_tests_filename_to_glib_repr (gchar *path_in, gsize *len)
+{
+  *len = strlen(path_in);
+  return g_strdup (path_in);
+}
+
+/**
  * gi_marshalling_tests_filename_exists:
  * @path: (type filename)
  */
diff --git a/tests/gimarshallingtestsextra.h b/tests/gimarshallingtestsextra.h
index 51a65f2..5452688 100644
--- a/tests/gimarshallingtestsextra.h
+++ b/tests/gimarshallingtestsextra.h
@@ -35,5 +35,6 @@ GHashTable * gi_marshalling_tests_ghashtable_enum_none_return (void);
 
 gchar * gi_marshalling_tests_filename_copy (gchar *path_in);
 gboolean gi_marshalling_tests_filename_exists (gchar *path);
+gchar * gi_marshalling_tests_filename_to_glib_repr (gchar *path_in, gsize *len);
 
 #endif /* EXTRA_TESTS */
diff --git a/tests/test_gi.py b/tests/test_gi.py
index 0eaa0b7..16ed076 100644
--- a/tests/test_gi.py
+++ b/tests/test_gi.py
@@ -23,10 +23,10 @@ from gi.repository import GObject, GLib, Gio
 
 from gi.repository import GIMarshallingTests
 
-from compathelper import _bytes, _unicode
+from compathelper import _bytes, _unicode, PY2, PY3
 from helper import capture_exceptions
 
-if sys.version_info < (3, 0):
+if PY2:
     CONSTANT_UTF8 = "const \xe2\x99\xa5 utf8"
     PY2_UNICODE_UTF8 = unicode(CONSTANT_UTF8, 'UTF-8')
     CHAR_255 = '\xff'
@@ -724,8 +724,152 @@ class TestFilename(unittest.TestCase):
         self.assertTrue(os.path.isdir(dirname))
         os.rmdir(dirname)
 
-    def test_filename_type_error(self):
-        self.assertRaises(TypeError, GLib.file_get_contents, 23)
+    def test_wrong_types(self):
+        self.assertRaises(TypeError, GIMarshallingTests.filename_copy, 23)
+        self.assertRaises(TypeError, GIMarshallingTests.filename_copy, [])
+
+    def test_null(self):
+        self.assertTrue(GIMarshallingTests.filename_copy(None) is None)
+        self.assertRaises(TypeError, GIMarshallingTests.filename_exists, None)
+
+    def test_round_trip(self):
+        self.assertEqual(GIMarshallingTests.filename_copy(u"foo"), "foo")
+        self.assertEqual(GIMarshallingTests.filename_copy(b"foo"), "foo")
+
+    def test_contains_null(self):
+        self.assertRaises(
+            (ValueError, TypeError),
+            GIMarshallingTests.filename_copy, b"foo\x00")
+        self.assertRaises(
+            (ValueError, TypeError),
+            GIMarshallingTests.filename_copy, u"foo\x00")
+
+    def test_as_is_py2(self):
+        if not PY2:
+            return
+
+        values = [
+            b"foo",
+            b"\xff\xff",
+            b"\xc3\xb6\xc3\xa4\xc3\xbc",
+            b"\xed\xa0\xbd",
+            b"\xf0\x90\x80\x81",
+        ]
+
+        for v in values:
+            self.assertEqual(GIMarshallingTests.filename_copy(v), v)
+            self.assertEqual(GIMarshallingTests.filename_to_glib_repr(v), v)
+
+    def test_win32_surrogates(self):
+        if os.name != "nt":
+            return
+
+        copy = GIMarshallingTests.filename_copy
+        glib_repr = GIMarshallingTests.filename_to_glib_repr
+
+        if PY3:
+            self.assertEqual(copy(u"\ud83d"), u"\ud83d")
+            self.assertEqual(copy(u"\x61\uDC00"), u"\x61\uDC00")
+            self.assertEqual(copy(u"\uD800\uDC01"), u"\U00010001")
+            self.assertEqual(copy(u"\uD83D\x20\uDCA9"), u"\uD83D\x20\uDCA9")
+        else:
+            self.assertEqual(copy(u"\ud83d"), u"\ud83d".encode("utf-8"))
+            self.assertEqual(copy(u"\uD800\uDC01").decode("utf-8"),
+                             u"\U00010001")
+
+        self.assertEqual(glib_repr(u"\ud83d"), b"\xed\xa0\xbd")
+        self.assertEqual(glib_repr(u"\uD800\uDC01"), b"\xf0\x90\x80\x81")
+
+        self.assertEqual(
+            glib_repr(u"\uD800\uDBFF"), b"\xED\xA0\x80\xED\xAF\xBF")
+        self.assertEqual(
+            glib_repr(u"\uD800\uE000"), b"\xED\xA0\x80\xEE\x80\x80")
+        self.assertEqual(
+            glib_repr(u"\uD7FF\uDC00"), b"\xED\x9F\xBF\xED\xB0\x80")
+        self.assertEqual(glib_repr(u"\x61\uDC00"), b"\x61\xED\xB0\x80")
+        self.assertEqual(glib_repr(u"\uDC00"), b"\xED\xB0\x80")
+
+    def test_win32_bytes_py3(self):
+        if not (os.name == "nt" and PY3):
+            return
+
+        values = [
+            b"foo",
+            b"\xff\xff",
+            b"\xc3\xb6\xc3\xa4\xc3\xbc",
+            b"\xed\xa0\xbd",
+            b"\xf0\x90\x80\x81",
+        ]
+
+        for v in values:
+            try:
+                uni = v.decode(sys.getfilesystemencoding(), "surrogatepass")
+            except UnicodeDecodeError:
+                continue
+            self.assertEqual(GIMarshallingTests.filename_copy(v), uni)
+
+    def test_unix_various(self):
+        if os.name == "nt":
+            return
+
+        copy = GIMarshallingTests.filename_copy
+        glib_repr = GIMarshallingTests.filename_to_glib_repr
+
+        if PY3:
+            str_path = copy(b"\xff\xfe")
+            self.assertTrue(isinstance(str_path, str))
+            self.assertEqual(str_path, os.fsdecode(b"\xff\xfe"))
+            self.assertEqual(copy(str_path), str_path)
+            self.assertEqual(glib_repr(b"\xff\xfe"), b"\xff\xfe")
+            self.assertEqual(glib_repr(str_path), b"\xff\xfe")
+
+            # if getfilesystemencoding is ASCII, then we should fail like
+            # os.fsencode
+            try:
+                byte_path = os.fsencode(u"ä")
+            except UnicodeEncodeError:
+                self.assertRaises(UnicodeEncodeError, copy, u"ä")
+            else:
+                self.assertEqual(copy(u"ä"), u"ä")
+                self.assertEqual(glib_repr(u"ä"), byte_path)
+        else:
+            self.assertTrue(isinstance(copy(b"\xff\xfe"), bytes))
+            self.assertEqual(copy(u"foo"), b"foo")
+            self.assertTrue(isinstance(copy(u"foo"), bytes))
+            try:
+                byte_path = u"ä".encode(sys.getfilesystemencoding())
+            except UnicodeEncodeError:
+                self.assertRaises(UnicodeEncodeError, copy, u"ä")
+            else:
+                self.assertEqual(copy(u"ä"), byte_path)
+                self.assertEqual(glib_repr(u"ä"), byte_path)
+
+    @unittest.skip("glib can't handle non-unicode paths")
+    def test_win32_surrogates_exists(self):
+        if os.name != "nt":
+            return
+
+        path = os.path.join(self.workdir, u"\ud83d")
+        with open(path, "wb"):
+            self.assertTrue(os.path.exists(path))
+            self.assertTrue(GIMarshallingTests.filename_exists(path))
+        os.unlink(path)
+
+    def test_path_exists_various_types(self):
+        wd = self.workdir
+        wdb = os.fsencode(wd) if PY3 else wd
+
+        paths = [(wdb, b"foo-1"), (wd, u"foo-2"), (wd, u"öäü-3")]
+        if PY3:
+            paths.append((wd, os.fsdecode(b"\xff\xfe-4")))
+
+        if os.name != "nt":
+            paths.append((wdb, b"\xff\xfe-5"))
+
+        for (d, path) in paths:
+            path = os.path.join(d, path)
+            with open(path, "wb"):
+                self.assertTrue(GIMarshallingTests.filename_exists(path))
 
 
 class TestArray(unittest.TestCase):


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]