Re: URIs vs. half-baked URIs [glib PATCH]
- From: Alex Larsson <alexl redhat com>
- To: Darin Adler <darin bentspoon com>
- Cc: <gtk-devel-list gnome org>, <gnome-hackers gnome org>
- Subject: Re: URIs vs. half-baked URIs [glib PATCH]
- Date: Fri, 3 Aug 2001 20:57:27 -0400 (EDT)
On Thu, 2 Aug 2001, Darin Adler wrote:
> I know this kind of code is already in GNOME, but eventually we have to
> distinguish real URIs from half-baked URIs: paths with "file:" or
> "file://<hostname>" prefixes but without URI escaping. The code above
> strips the prefix (ignoring the hostname) and then uses the rest as a path.
> This won't work for files with "%" characters in their name, it won't
> work properly if the URIs have "%" escape sequences in them, and it will
> do the wrong thing for URIs with host names (other than the current host)
> in them.
Ok. How about this glib patch then? I've tested it a bit, but this kind of
stuff is easy to get wrong. I need some reviewing action.
/ Alex
Index: glib/gconvert.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.c,v
retrieving revision 1.16
diff -u -p -r1.16 gconvert.c
--- glib/gconvert.c 2001/06/23 13:55:07 1.16
+++ glib/gconvert.c 2001/08/04 00:50:36
@@ -519,11 +519,24 @@ static gchar *
strdup_len (const gchar *string,
gssize len,
gsize *bytes_written,
- gsize *bytes_read)
+ gsize *bytes_read,
+ GError **error)
{
gsize real_len;
+ if (!g_utf8_validate (string, -1, NULL))
+ {
+ if (bytes_read)
+ *bytes_read = 0;
+ if (bytes_written)
+ *bytes_written = 0;
+
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+ _("Invalid byte sequence in conversion input"));
+ return NULL;
+ }
+
if (len < 0)
real_len = strlen (string);
else
@@ -674,7 +687,7 @@ g_locale_to_utf8 (const gchar *opsysstr
const char *charset;
if (g_get_charset (&charset))
- return strdup_len (opsysstring, len, bytes_read, bytes_written);
+ return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
else
return g_convert (opsysstring, len,
"UTF-8", charset, bytes_read, bytes_written, error);
@@ -820,7 +833,7 @@ g_locale_from_utf8 (const gchar *utf8str
const gchar *charset;
if (g_get_charset (&charset))
- return strdup_len (utf8string, len, bytes_read, bytes_written);
+ return strdup_len (utf8string, len, bytes_read, bytes_written, error);
else
return g_convert (utf8string, len,
charset, "UTF-8", bytes_read, bytes_written, error);
@@ -863,12 +876,13 @@ g_filename_to_utf8 (const gchar *opsysst
bytes_read, bytes_written,
error);
#else /* !G_PLATFORM_WIN32 */
+
if (getenv ("G_BROKEN_FILENAMES"))
return g_locale_to_utf8 (opsysstring, len,
bytes_read, bytes_written,
error);
else
- return strdup_len (opsysstring, len, bytes_read, bytes_written);
+ return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
#endif /* !G_PLATFORM_WIN32 */
}
@@ -911,6 +925,362 @@ g_filename_from_utf8 (const gchar *utf8s
bytes_read, bytes_written,
error);
else
- return strdup_len (utf8string, len, bytes_read, bytes_written);
+ return strdup_len (utf8string, len, bytes_read, bytes_written, error);
#endif /* !G_PLATFORM_WIN32 */
}
+
+/* Returns TRUE when @haystack begins with @needle, comparing one
+ * UTF-8 character at a time after lower-casing both sides with
+ * g_unichar_tolower(). A NULL argument is treated as the empty string,
+ * so an empty or NULL @needle always matches. */
+static gboolean
+has_case_prefix_utf8 (const gchar *haystack, const gchar *needle)
+{
+  const gchar *hay_pos = (haystack != NULL) ? haystack : "";
+  const gchar *needle_pos = (needle != NULL) ? needle : "";
+
+  for (;;)
+    {
+      gunichar hay_char, needle_char;
+
+      /* Needle exhausted: every character so far matched. */
+      if (*needle_pos == '\0')
+        return TRUE;
+      /* Haystack ran out before the needle did. */
+      if (*hay_pos == '\0')
+        return FALSE;
+
+      hay_char = g_unichar_tolower (g_utf8_get_char (hay_pos));
+      needle_char = g_unichar_tolower (g_utf8_get_char (needle_pos));
+      if (hay_char != needle_char)
+        return FALSE;
+
+      hay_pos = g_utf8_next_char (hay_pos);
+      needle_pos = g_utf8_next_char (needle_pos);
+    }
+}
+
+/* Escaping modes for g_escape_uri_string(). Each mode is a distinct bit
+ * so that one byte per character in the acceptable[] table can record
+ * every mode in which that character may appear unescaped. */
+typedef enum {
+ UNSAFE_ALL = 0x1, /* Escape all unsafe characters */
+ UNSAFE_ALLOW_PLUS = 0x2, /* Allows '+' */
+ UNSAFE_PATH = 0x4, /* Allows '/' and '?' and '&' and '=' */
+ UNSAFE_DOS_PATH = 0x8, /* Allows '/' and '?' and '&' and '=' and ':' */
+ UNSAFE_HOST = 0x10, /* Allows '/' and ':' and '@' */
+ UNSAFE_SLASHES = 0x20 /* Allows all characters except for '/' and '%' */
+} UnsafeCharacterSet;
+
+/* Per-character bitmask of UnsafeCharacterSet modes, covering the ASCII
+ * range 0x20..0x7F and indexed by (character - 32). A character is left
+ * unescaped in a given mode when the corresponding mode bit is set in
+ * its entry (this is what the ACCEPTABLE() macro in
+ * g_escape_uri_string() tests). 0x3F means "acceptable in every mode";
+ * 0x00 (space and '%') means "always escaped". */
+static const guchar acceptable[96] = {
+ /* X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+ 0x00,0x3F,0x20,0x20,0x20,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x22,0x20,0x3F,0x3F,0x1C, /* 2X !"#$%&'()*+,-./ */
+ 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x2C, /* 3X 0123456789:;<=>? */
+ 0x30,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 4X @ABCDEFGHIJKLMNO */
+ 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, /* 5X PQRSTUVWXYZ[\]^_ */
+ 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 6X `abcdefghijklmno */
+ 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 /* 7X pqrstuvwxyz{|}~DEL */
+};
+
+/* Character that introduces a two-digit hexadecimal escape. */
+#define HEX_ESCAPE '%'
+
+/* Upper-case hex digits used when writing escapes. Exactly 16 chars:
+ * the array deliberately has no room for a terminating NUL. */
+static const gchar hex[16] = "0123456789ABCDEF";
+
+/* Returns a newly allocated copy of @string in which every byte that is
+ * not acceptable under @mask is replaced by a "%XX" escape (upper-case
+ * hex). Returns NULL when @string is NULL or @mask is not exactly one
+ * of the UnsafeCharacterSet values. The caller owns the result and
+ * frees it with g_free().
+ *
+ * Escaping works on bytes, not on UTF-8 characters: every byte outside
+ * the 0x20..0x7F range is escaped individually, so a multi-byte UTF-8
+ * character becomes one "%XX" triplet per byte and no byte is lost.
+ *
+ * With UNSAFE_HOST the host rules apply only up to and including the
+ * first byte that needs escaping or the first '/'; everything after
+ * that is escaped with the UNSAFE_PATH rules.
+ */
+static gchar *
+g_escape_uri_string (const gchar *string,
+                     UnsafeCharacterSet mask)
+{
+#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
+
+  const gchar *p;
+  gchar *q;
+  gchar *result;
+  guchar c;
+  gint unacceptable;
+  UnsafeCharacterSet use_mask;
+
+  g_return_val_if_fail (mask == UNSAFE_ALL
+                        || mask == UNSAFE_ALLOW_PLUS
+                        || mask == UNSAFE_PATH
+                        || mask == UNSAFE_DOS_PATH
+                        || mask == UNSAFE_HOST
+                        || mask == UNSAFE_SLASHES, NULL);
+
+  if (string == NULL)
+    return NULL;
+
+  /* Pass 1: count the bytes that need escaping so the output can be
+   * allocated in one go. The mask switch must mirror pass 2 exactly,
+   * otherwise the count (and therefore the buffer size) is wrong. */
+  unacceptable = 0;
+  use_mask = mask;
+  for (p = string; *p != '\0'; p++)
+    {
+      gboolean needs_escape;
+
+      c = (guchar) *p;
+      needs_escape = !ACCEPTABLE (c);
+      if (needs_escape)
+        unacceptable++;
+
+      /* When escaping a host, the first byte that needs escaping or
+       * the first path separator ends the host segment. */
+      if (use_mask == UNSAFE_HOST && (needs_escape || c == '/'))
+        use_mask = UNSAFE_PATH;
+    }
+
+  /* Each escaped byte grows from 1 to 3 bytes; +1 for the NUL. */
+  result = g_malloc (p - string + unacceptable * 2 + 1);
+
+  /* Pass 2: emit the escaped string. */
+  use_mask = mask;
+  for (q = result, p = string; *p != '\0'; p++)
+    {
+      gboolean needs_escape;
+
+      c = (guchar) *p;
+      needs_escape = !ACCEPTABLE (c);
+
+      if (needs_escape)
+        {
+          *q++ = HEX_ESCAPE; /* means hex coming */
+          *q++ = hex[c >> 4];
+          *q++ = hex[c & 15];
+        }
+      else
+        *q++ = *p;
+
+      if (use_mask == UNSAFE_HOST && (needs_escape || c == '/'))
+        use_mask = UNSAFE_PATH;
+    }
+
+  *q = '\0';
+
+  return result;
+}
+
+
+/* Returns the numeric value (0..15) of the hexadecimal digit @c, which
+ * may be upper or lower case, or -1 if @c is not a hex digit. */
+static int
+hex_to_int (gchar c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+
+  return -1;
+}
+
+/* Decodes the two hexadecimal digits at the start of @scanner into a
+ * value in the range 0..255. Returns -1 if either of the two characters
+ * is not a hex digit. The second character is only read when the first
+ * is a valid digit, so a string that ends early is never over-read. */
+static int
+unescape_character (const char *scanner)
+{
+  int high_nibble;
+  int low_nibble;
+
+  high_nibble = hex_to_int (scanner[0]);
+  if (high_nibble < 0)
+    return -1;
+
+  low_nibble = hex_to_int (scanner[1]);
+  if (low_nibble < 0)
+    return -1;
+
+  return (high_nibble << 4) | low_nibble;
+}
+
+/* Returns a newly allocated copy of @escaped with every "%XX" sequence
+ * replaced by the single byte it encodes, or NULL when:
+ *  - @escaped is NULL,
+ *  - a '%' is not followed by two hex digits,
+ *  - an escape decodes to NUL, or
+ *  - an escape decodes to a byte listed in @illegal_characters
+ *    (which may be NULL to allow everything except NUL).
+ * The caller owns the result and frees it with g_free().
+ *
+ * Decoded values are stored as raw bytes. They must not be run through
+ * g_unichar_to_utf8(): an escaped UTF-8 sequence such as "%C3%A9" has
+ * to decode to the two bytes 0xC3 0xA9, not to the UTF-8 encodings of
+ * U+00C3 and U+00A9. As a consequence the result is not guaranteed to
+ * be valid UTF-8 and the caller has to validate it if it cares.
+ */
+static gchar *
+g_unescape_uri_string (const gchar *escaped,
+                       const gchar *illegal_characters)
+{
+  const gchar *in;
+  gchar *out, *result;
+  int character;
+
+  if (escaped == NULL)
+    return NULL;
+
+  /* Unescaping never grows the string: "%XX" (3 bytes) becomes 1. */
+  result = g_malloc (strlen (escaped) + 1);
+
+  out = result;
+  for (in = escaped; *in != '\0'; in++)
+    {
+      if (*in == HEX_ESCAPE)
+        {
+          character = unescape_character (in + 1);
+
+          /* Check for an illegal character. We consider '\0' illegal
+           * here; -1 means the escape itself was malformed. */
+          if (character <= 0
+              || (illegal_characters != NULL
+                  && strchr (illegal_characters, (char) character) != NULL))
+            {
+              g_free (result);
+              return NULL;
+            }
+
+          /* Skip the two hex digits; the loop increment then moves
+           * past the last one. */
+          in += 2;
+
+          *out++ = (gchar) character;
+        }
+      else
+        *out++ = *in;
+    }
+
+  *out = '\0';
+
+  g_assert ((gsize) (out - result) <= strlen (escaped));
+
+  return result;
+}
+
+/**
+ * g_filename_from_uri:
+ * @uri: a UTF-8 encoded "file:" URI
+ * @hostname: location to store the hostname part of the URI, or %NULL
+ *            to ignore it. It is set to %NULL when the URI carries no
+ *            hostname and whenever the function fails; otherwise it
+ *            receives a newly allocated string (copied verbatim from
+ *            the URI, not unescaped) that the caller frees with
+ *            g_free().
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts a UTF-8 encoded "file:" URI to a local filename in the
+ * encoding used for filenames. The URI must start with "file:/"
+ * (compared case-insensitively), must not contain a '#', and its
+ * percent-escapes must be well formed and must not decode to '/' or
+ * NUL.
+ *
+ * Return value: a newly allocated filename, or %NULL on an error
+ *               (including the case where the URI does not specify a
+ *               local file).
+ **/
+gchar *
+g_filename_from_uri (const char *uri,
+                     char      **hostname,
+                     GError    **error)
+{
+  const char *path_part;
+  const char *host_part;
+  char *result;
+  char *filename;
+
+  /* Initialise the out parameter first so it is defined on every
+   * return path, including the precondition failure below. */
+  if (hostname)
+    *hostname = NULL;
+
+  g_return_val_if_fail (uri != NULL, NULL);
+
+  if (!g_utf8_validate (uri, -1, NULL))
+    {
+      /* The URI is deliberately not included in the message: it is
+       * not valid UTF-8, so it cannot be embedded in a UTF-8 string. */
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                   _("The URI is not valid UTF-8"));
+      return NULL;
+    }
+
+  if (!has_case_prefix_utf8 (uri, "file:/"))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_LOCAL_FILE,
+                   _("The URI `%s' does not specify a local file"),
+                   uri);
+      return NULL;
+    }
+
+  path_part = uri + strlen ("file:");
+
+  if (strchr (path_part, '#') != NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The local file URI `%s' may not include a `#'"),
+                   uri);
+      return NULL;
+    }
+
+  if (has_case_prefix_utf8 (path_part, "///"))
+    {
+      /* "file:///path": empty host. Skip two of the three slashes so
+       * the path keeps its leading '/'. */
+      path_part += 2;
+    }
+  else if (has_case_prefix_utf8 (path_part, "//"))
+    {
+      /* "file://host/path": the host runs up to the next '/'. */
+      path_part += 2;
+      host_part = path_part;
+
+      path_part = strchr (path_part, '/');
+
+      if (path_part == NULL)
+        {
+          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                       _("The URI `%s' is invalid"),
+                       uri);
+          return NULL;
+        }
+
+      if (hostname)
+        {
+          *hostname = g_malloc (path_part - host_part + 1);
+          memcpy (*hostname, host_part, path_part - host_part);
+          (*hostname)[path_part - host_part] = 0;
+        }
+    }
+  /* else "file:/path": path_part already points at the leading '/'. */
+
+  /* An escaped '/' could smuggle a path separator into a single path
+   * component, so it is rejected. */
+  filename = g_unescape_uri_string (path_part, "/");
+
+  if (filename == NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The URI `%s' contains invalidly escaped characters"),
+                   uri);
+      result = NULL;
+    }
+  else
+    {
+      result = g_filename_from_utf8 (filename, -1, NULL, NULL, error);
+      g_free (filename);
+    }
+
+  /* Do not hand back (and leak) a hostname when the call failed. */
+  if (result == NULL && hostname)
+    {
+      g_free (*hostname);
+      *hostname = NULL;
+    }
+
+  return result;
+}
+
+/**
+ * g_filename_to_uri:
+ * @filename: an absolute filename specified in the encoding
+ *            used for filenames.
+ * @hostname: A UTF-8 encoded hostname, or %NULL for none.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts an absolute filename to an escaped UTF-8 encoded "file:"
+ * URI of the form "file://hostname/path" (or "file:///path" when
+ * @hostname is %NULL). The hostname and the path are escaped
+ * separately; the "file://" scheme prefix is emitted literally.
+ *
+ * Return value: a newly allocated string holding the resulting URI
+ *               (free with g_free()), or %NULL on an error.
+ **/
+gchar *
+g_filename_to_uri (const char *filename,
+                   char       *hostname,
+                   GError    **error)
+{
+  char *escaped_hostname;
+  char *escaped_path;
+  char *uri;
+  char *utf8_filename;
+  GError *tmp_error;
+
+  g_return_val_if_fail (filename != NULL, NULL);
+
+  if (!g_path_is_absolute (filename))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
+                   _("The pathname '%s' is not an absolute path"),
+                   filename);
+      return NULL;
+    }
+
+  tmp_error = NULL;
+  utf8_filename = g_filename_to_utf8 (filename, -1, NULL, NULL, &tmp_error);
+  if (tmp_error)
+    {
+      g_propagate_error (error, tmp_error);
+      return NULL;
+    }
+
+  escaped_hostname = NULL;
+  if (hostname)
+    {
+      if (!g_utf8_validate (hostname, -1, NULL))
+        {
+          g_free (utf8_filename);
+          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                       _("Invalid byte sequence in hostname"));
+          return NULL;
+        }
+      escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
+    }
+
+  /* Escape only the path, never the scheme: under UNSAFE_PATH the ':'
+   * of "file:" is not an acceptable character and would come out as
+   * "file%3A".
+   * NOTE(review): UNSAFE_PATH leaves '?', '&' and '=' unescaped, so a
+   * filename containing '?' produces a URI with a query part - confirm
+   * whether that is intended. */
+  escaped_path = g_escape_uri_string (utf8_filename, UNSAFE_PATH);
+  g_free (utf8_filename);
+
+  uri = g_strconcat ("file://",
+                     escaped_hostname != NULL ? escaped_hostname : "",
+                     escaped_path,
+                     NULL);
+
+  g_free (escaped_hostname);
+  g_free (escaped_path);
+
+  return uri;
+}
+
Index: glib/gconvert.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.h,v
retrieving revision 1.7
diff -u -p -r1.7 gconvert.h
--- glib/gconvert.h 2001/06/26 16:01:14 1.7
+++ glib/gconvert.h 2001/08/04 00:50:36
@@ -37,7 +37,10 @@ typedef enum
G_CONVERT_ERROR_NO_CONVERSION,
G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
G_CONVERT_ERROR_FAILED,
- G_CONVERT_ERROR_PARTIAL_INPUT
+ G_CONVERT_ERROR_PARTIAL_INPUT,
+ G_CONVERT_ERROR_NOT_LOCAL_FILE,
+ G_CONVERT_ERROR_INVALID_URI,
+ G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
} GConvertError;
#define G_CONVERT_ERROR g_convert_error_quark()
@@ -100,6 +103,15 @@ gchar* g_filename_from_utf8 (const gchar
gsize *bytes_read,
gsize *bytes_written,
GError **error);
+
+gchar *g_filename_from_uri (const char *uri,
+ char **hostname,
+ GError **error);
+
+gchar *g_filename_to_uri (const char *filename,
+ char *hostname,
+ GError **error);
+
G_END_DECLS
_______________________________________________
gnome-hackers mailing list
gnome-hackers gnome org
http://mail.gnome.org/mailman/listinfo/gnome-hackers
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]