Re: URIs vs. half-baked URIs [glib PATCH]

From: Alex Larsson <alexl redhat com>
To: Darin Adler <darin bentspoon com>
Cc: <gtk-devel-list gnome org>, <gnome-hackers gnome org>
Subject: Re: URIs vs. half-baked URIs [glib PATCH]
Date: Fri, 3 Aug 2001 20:57:27 -0400 (EDT)
On Thu, 2 Aug 2001, Darin Adler wrote:

> I know this kind of code is already in GNOME, but eventually we have to 
> distinguish real URIs from half-baked URIs: paths with "file:" or 
> "file://<hostname>" prefixes but without URI escaping. The code above 
> strips the prefix (ignoring the hostname) and then uses the rest as a path.
>   This won't work for files with "%" characters in their name, it won't 
> work properly if the URIs have "%" escape sequences in them, and it will 
> do the wrong thing for URIs with host names (other than the current host) 
> in them.

Ok. How about this glib patch then? I've tested it a bit, but this kind of 
stuff is easy to get wrong. I need some reviewing action.

/ Alex

Index: glib/gconvert.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.c,v
retrieving revision 1.16
diff -u -p -r1.16 gconvert.c
--- glib/gconvert.c	2001/06/23 13:55:07	1.16
+++ glib/gconvert.c	2001/08/04 00:50:36
@@ -519,11 +519,24 @@ static gchar *
 strdup_len (const gchar *string,
 	    gssize       len,
 	    gsize       *bytes_written,
-	    gsize       *bytes_read)
+	    gsize       *bytes_read,
+	    GError      **error)
 	 
 {
   gsize real_len;
 
+  if (!g_utf8_validate (string, -1, NULL))
+    {
+      if (bytes_read)
+	*bytes_read = 0;
+      if (bytes_written)
+	*bytes_written = 0;
+
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+		   _("Invalid byte sequence in conversion input"));
+      return NULL;
+    }
+  
   if (len < 0)
     real_len = strlen (string);
   else
@@ -674,7 +687,7 @@ g_locale_to_utf8 (const gchar  *opsysstr
   const char *charset;
 
   if (g_get_charset (&charset))
-    return strdup_len (opsysstring, len, bytes_read, bytes_written);
+    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
     return g_convert (opsysstring, len, 
 		      "UTF-8", charset, bytes_read, bytes_written, error);
@@ -820,7 +833,7 @@ g_locale_from_utf8 (const gchar *utf8str
   const gchar *charset;
 
   if (g_get_charset (&charset))
-    return strdup_len (utf8string, len, bytes_read, bytes_written);
+    return strdup_len (utf8string, len, bytes_read, bytes_written, error);
   else
     return g_convert (utf8string, len,
 		      charset, "UTF-8", bytes_read, bytes_written, error);
@@ -863,12 +876,13 @@ g_filename_to_utf8 (const gchar *opsysst
 			   bytes_read, bytes_written,
 			   error);
 #else  /* !G_PLATFORM_WIN32 */
+      
   if (getenv ("G_BROKEN_FILENAMES"))
     return g_locale_to_utf8 (opsysstring, len,
 			     bytes_read, bytes_written,
 			     error);
   else
-    return strdup_len (opsysstring, len, bytes_read, bytes_written);
+    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
 #endif /* !G_PLATFORM_WIN32 */
 }
 
@@ -911,6 +925,362 @@ g_filename_from_utf8 (const gchar *utf8s
 			       bytes_read, bytes_written,
 			       error);
   else
-    return strdup_len (utf8string, len, bytes_read, bytes_written);
+    return strdup_len (utf8string, len, bytes_read, bytes_written, error);
 #endif /* !G_PLATFORM_WIN32 */
 }
+
+/* Reports whether @needle is a prefix of @haystack. Both are UTF-8 and
+ * are compared one character at a time after lower-casing each side
+ * with g_unichar_tolower(). A NULL argument is treated as the empty
+ * string. */
+static gboolean
+has_case_prefix_utf8 (const gchar *haystack, const gchar *needle)
+{
+  const gchar *hay_pos = (haystack != NULL) ? haystack : "";
+  const gchar *prefix_pos = (needle != NULL) ? needle : "";
+
+  for (;;)
+    {
+      gunichar hay_char, prefix_char;
+
+      /* Needle used up: everything matched, so it is a prefix. */
+      if (*prefix_pos == '\0')
+        return TRUE;
+      /* Haystack ran out first: it is shorter than the needle. */
+      if (*hay_pos == '\0')
+        return FALSE;
+
+      hay_char = g_unichar_tolower (g_utf8_get_char (hay_pos));
+      prefix_char = g_unichar_tolower (g_utf8_get_char (prefix_pos));
+      if (hay_char != prefix_char)
+        return FALSE;
+
+      /* Step both cursors to the next UTF-8 character. */
+      hay_pos = g_utf8_next_char (hay_pos);
+      prefix_pos = g_utf8_next_char (prefix_pos);
+    }
+}
+
+typedef enum { /* One bit per escaping mode, tested against acceptable[] */
+  UNSAFE_ALL        = 1 << 0, /* Escape all unsafe characters */
+  UNSAFE_ALLOW_PLUS = 1 << 1, /* Like UNSAFE_ALL, but allows '+' */
+  UNSAFE_PATH       = 1 << 2, /* Allows '/', '?', '&' and '=' */
+  UNSAFE_DOS_PATH   = 1 << 3, /* Allows '/', '?', '&', '=' and ':' */
+  UNSAFE_HOST       = 1 << 4, /* Allows '/', ':' and '@' */
+  UNSAFE_SLASHES    = 1 << 5  /* Allows most characters, but not '/' or '%' */
+} UnsafeCharacterSet;
+
+static const guchar acceptable[96] = { /* Indexed by (c - 32) for c in 32..127; each entry is the set of UnsafeCharacterSet bits under which c may stay unescaped */
+ /* X0   X1   X2   X3   X4   X5   X6   X7   X8   X9   XA   XB   XC   XD   XE   XF */
+  0x00,0x3F,0x20,0x20,0x20,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x22,0x20,0x3F,0x3F,0x1C, /* 2X  !"#$%&'()*+,-./   */
+  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x2C, /* 3X 0123456789:;<=>?   */
+  0x30,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 4X @ABCDEFGHIJKLMNO   */
+  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, /* 5X PQRSTUVWXYZ[\]^_   */
+  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 6X `abcdefghijklmno   */
+  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20  /* 7X pqrstuvwxyz{|}~DEL */
+};
+
+#define HEX_ESCAPE '%' /* Character that introduces a two-digit hex escape */
+/* Upper-case hex digits indexed by nibble value; exactly 16 bytes, so there is no terminating NUL. */
+static const gchar hex[16] = "0123456789ABCDEF";
+
+/* Returns a newly allocated copy of @string in which every byte that is
+ * not acceptable under @mask is written as a "%XX" escape (upper-case
+ * hex). Returns NULL if @string is NULL; a @mask that is not exactly one
+ * UnsafeCharacterSet value is rejected by g_return_val_if_fail(). Bytes
+ * below 32 or above 127 are never acceptable, so a multi-byte UTF-8
+ * character is escaped byte by byte. In UNSAFE_HOST mode the mask falls
+ * back to UNSAFE_PATH after the first '/' or the first escaped byte. */
+static gchar *
+g_escape_uri_string (const gchar *string,
+                     UnsafeCharacterSet mask)
+{
+#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
+
+  const guchar *start, *p;
+  gchar *q, *result;
+  gsize n_escaped;
+  gboolean ok;
+  UnsafeCharacterSet use_mask;
+
+  g_return_val_if_fail (mask == UNSAFE_ALL
+                        || mask == UNSAFE_ALLOW_PLUS
+                        || mask == UNSAFE_PATH
+                        || mask == UNSAFE_DOS_PATH
+                        || mask == UNSAFE_HOST
+                        || mask == UNSAFE_SLASHES, NULL);
+
+  if (string == NULL)
+    return NULL;
+
+  /* Walk bytes, not UTF-8 characters: escaping only the lead byte of a
+   * multi-byte character would drop its continuation bytes. */
+  start = (const guchar *) string;
+  /* First pass: count the bytes that need escaping. */
+  n_escaped = 0;
+  use_mask = mask;
+  for (p = start; *p != '\0'; p++)
+    {
+      ok = ACCEPTABLE (*p);
+      if (!ok)
+        n_escaped++;
+      /* The host part ends at the first '/' or the first escaped byte. */
+      if (use_mask == UNSAFE_HOST && (!ok || *p == '/'))
+        use_mask = UNSAFE_PATH;
+    }
+
+  /* Each escaped byte grows from one byte to three; +1 for the NUL. */
+  result = g_malloc ((p - start) + n_escaped * 2 + 1);
+
+  /* Second pass: emit the output, replaying the same mask transitions. */
+  use_mask = mask;
+  for (q = result, p = start; *p != '\0'; p++)
+    {
+      ok = ACCEPTABLE (*p);
+      if (ok)
+        *q++ = (gchar) *p;
+      else
+        {
+          *q++ = HEX_ESCAPE; /* means hex coming */
+          *q++ = hex[*p >> 4];
+          *q++ = hex[*p & 15];
+        }
+      if (use_mask == UNSAFE_HOST && (!ok || *p == '/'))
+        use_mask = UNSAFE_PATH;
+    }
+
+  *q = '\0';
+  return result;
+}
+
+
+static int /* Value of the hex digit @c (either case), or -1 if @c is not one */
+hex_to_int (gchar c)
+{
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  return -1;
+}
+
+/* Decodes the two hex digits at @scanner into a byte value, or returns
+ * -1 if either is not a hex digit. The second digit is only read when
+ * the first one is valid. */
+static int
+unescape_character (const char *scanner)
+{
+  int high_nibble, low_nibble;
+
+  high_nibble = hex_to_int (scanner[0]);
+  if (high_nibble < 0)
+    return -1;
+
+  low_nibble = hex_to_int (scanner[1]);
+  if (low_nibble < 0)
+    return -1;
+  return (high_nibble << 4) | low_nibble;
+}
+
+/* Returns a newly allocated copy of @escaped with every "%XX" sequence
+ * replaced by the single byte it encodes. Returns NULL if @escaped is
+ * NULL, if a '%' is not followed by two hex digits, if an escape
+ * decodes to '\0', or if it decodes to a byte listed in
+ * @illegal_characters (which may be NULL). */
+static gchar *
+g_unescape_uri_string (const gchar *escaped,
+                       const gchar *illegal_characters)
+{
+  const gchar *in;
+  gchar *out, *result;
+  int character;
+
+  if (escaped == NULL)
+    return NULL;
+
+  /* "%XX" shrinks to one byte, so the output never outgrows the input. */
+  result = g_malloc (strlen (escaped) + 1);
+  out = result;
+  for (in = escaped; *in != '\0'; in++)
+    {
+      if (*in != HEX_ESCAPE)
+        {
+          *out++ = *in;
+          continue;
+        }
+      character = unescape_character (in + 1);
+
+      /* Check for an illegal character. We consider '\0' illegal here. */
+      if (character <= 0
+          || (illegal_characters != NULL
+              && strchr (illegal_characters, (char)character) != NULL))
+        {
+          g_free (result);
+          return NULL;
+        }
+      /* Store the decoded byte as is. Escapes spell out the bytes of the
+       * UTF-8 encoding, so re-encoding values above 127 as code points
+       * would turn e.g. "%C3%A9" into four bytes instead of two. */
+      *out++ = (gchar) character;
+      in += 2; /* the hex digits; the loop itself steps over the '%' */
+    }
+
+  *out = '\0';
+  return result;
+}
+
+/**
+ * g_filename_from_uri:
+ * @uri: a UTF-8 encoded uri
+ * @hostname: If the URI specifies a hostname, a newly allocated copy of
+ *            it is placed here. Pass %NULL to ignore the hostname.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ * 
+ * Converts a UTF-8 encoded uri to a local filename in the encoding
+ * used for filenames. The uri must start with "file:/" and may contain
+ * neither a '#' nor an escaped '/'. On failure *@hostname is %NULL.
+ * 
+ * Return value: The converted string, or %NULL on an error.
+ **/
+gchar *
+g_filename_from_uri (const char *uri,
+                     char      **hostname,
+                     GError    **error)
+{
+  const char *path_part, *host_part;
+  char *result, *filename;
+
+  if (hostname)
+    *hostname = NULL;
+  g_return_val_if_fail (uri != NULL, NULL);
+
+  if (!g_utf8_validate (uri, -1, NULL))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                   _("The URI is not valid UTF-8"));
+      return NULL;
+    }
+
+  if (!has_case_prefix_utf8 (uri, "file:/"))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_LOCAL_FILE,
+                   _("The URI `%s' does not specify a local file"),
+                   uri);
+      return NULL;
+    }
+
+  path_part = uri + strlen ("file:");
+
+  if (strchr (path_part, '#') != NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The local file URI `%s' may not include a `#'"),
+                   uri);
+      return NULL;
+    }
+
+  if (has_case_prefix_utf8 (path_part, "///"))
+    path_part += 2; /* empty host: the third '/' starts the path */
+  else if (has_case_prefix_utf8 (path_part, "//"))
+    {
+      path_part += 2;
+      host_part = path_part;
+      path_part = strchr (path_part, '/');
+      if (path_part == NULL)
+        {
+          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                       _("The URI `%s' is invalid"),
+                       uri);
+          return NULL;
+        }
+      if (hostname)
+        {
+          *hostname = g_malloc (path_part - host_part + 1);
+          memcpy (*hostname, host_part, path_part - host_part);
+          (*hostname)[path_part - host_part] = 0;
+        }
+    }
+
+  filename = g_unescape_uri_string (path_part, "/");
+  if (filename == NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The URI `%s' contains invalidly escaped characters"),
+                   uri);
+      result = NULL;
+    }
+  else
+    {
+      result = g_filename_from_utf8 (filename, -1, NULL, NULL, error);
+      g_free (filename);
+    }
+  /* Never hand back a hostname together with a failure. */
+  if (result == NULL && hostname != NULL)
+    {
+      g_free (*hostname);
+      *hostname = NULL;
+    }
+  return result;
+}
+
+/**
+ * g_filename_to_uri:
+ * @filename: an absolute filename specified in the encoding
+ *            used for filenames.
+ * @hostname: A utf-8 encoded hostname, or %NULL for none.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ * 
+ * Converts an absolute filename to a "file://" uri. The filename is
+ * converted to UTF-8, the filename and @hostname are escaped
+ * separately, and the result is "file://" + hostname + path.
+ * 
+ * Return value: A newly allocated uri, or %NULL on an error.
+ **/
+gchar *
+g_filename_to_uri   (const char *filename,
+                     char       *hostname,
+                     GError    **error)
+{
+  char *escaped_host, *escaped_path, *uri;
+  char *utf8_filename;
+  GError *tmp_error;
+
+  g_return_val_if_fail (filename != NULL, NULL);
+
+  if (!g_path_is_absolute (filename))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
+                   _("The pathname '%s' is not an absolute path"),
+                   filename);
+      return NULL;
+    }
+
+  tmp_error = NULL;
+  utf8_filename = g_filename_to_utf8 (filename, -1, NULL, NULL, &tmp_error);
+  if (tmp_error)
+    {
+      g_propagate_error (error, tmp_error);
+      return NULL;
+    }
+
+  if (hostname && !g_utf8_validate (hostname, -1, NULL))
+    {
+      g_free (utf8_filename);
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                   _("Invalid byte sequence in hostname"));
+      return NULL;
+    }
+
+  /* Escape the pieces, never the "file://" prefix: UNSAFE_PATH does not
+   * allow ':', so escaping the whole uri would yield "file%3A///...". */
+  escaped_host = hostname ? g_escape_uri_string (hostname, UNSAFE_HOST) : NULL;
+  escaped_path = g_escape_uri_string (utf8_filename, UNSAFE_PATH);
+  g_free (utf8_filename);
+
+  uri = g_strconcat ("file://", escaped_host ? escaped_host : "", escaped_path, NULL);
+  g_free (escaped_host);
+  g_free (escaped_path);
+  return uri;
+}
+
Index: glib/gconvert.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.h,v
retrieving revision 1.7
diff -u -p -r1.7 gconvert.h
--- glib/gconvert.h	2001/06/26 16:01:14	1.7
+++ glib/gconvert.h	2001/08/04 00:50:36
@@ -37,7 +37,10 @@ typedef enum 
   G_CONVERT_ERROR_NO_CONVERSION,
   G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   G_CONVERT_ERROR_FAILED,
-  G_CONVERT_ERROR_PARTIAL_INPUT
+  G_CONVERT_ERROR_PARTIAL_INPUT,
+  G_CONVERT_ERROR_NOT_LOCAL_FILE,
+  G_CONVERT_ERROR_INVALID_URI,
+  G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
 } GConvertError;
 
 #define G_CONVERT_ERROR g_convert_error_quark()
@@ -100,6 +103,15 @@ gchar* g_filename_from_utf8 (const gchar
 			     gsize        *bytes_read,     
 			     gsize        *bytes_written,  
 			     GError      **error);
+
+gchar *g_filename_from_uri (const char *uri,
+			    char      **hostname,
+			    GError    **error);
+  
+gchar *g_filename_to_uri   (const char *filename,
+			    char       *hostname,
+			    GError    **error);
+
 
 G_END_DECLS
 



_______________________________________________
gnome-hackers mailing list
gnome-hackers gnome org
http://mail.gnome.org/mailman/listinfo/gnome-hackers
Follow-Ups:
- Re: URIs vs. half-baked URIs [glib PATCH]
  - From: Owen Taylor
- Re: URIs vs. half-baked URIs [glib PATCH]
  - From: Darin Adler
References:
- URIs vs. half-baked URIs (was Filesel drag and drop)
  - From: Darin Adler
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]