Re: [PATCH] Fix heavily broken relative URI resolution
- From: Christian Neumair <chris gnome-de org>
- To: gnome-vfs-list gnome org
- Subject: Re: [PATCH] Fix heavily broken relative URI resolution
- Date: Sun, 19 Mar 2006 17:06:43 +0100
Am Sonntag, den 19.03.2006, 13:09 +0100 schrieb Christian Neumair:
> The attached patch implements the relative URI resolution as suggested
> by RFC 2396 and RFC 3986. The old algorithm was quite hosed, for
> instance when the relative reference had too many ".." components, which
> overwrote the authority component of an URI.
> "ftp://server/foo", "../../../" would be resolved to some garbage like
> "ftp:///".
>
> After applying this patch, test-uri passes all URI relative resolution
> tests in RFC 3986.
>
> I'm not sure why the old code was laid out that way, but somebody
> probably didn't separate relative URI resolution and symblic link
> resolution, which are two completely different things. The latter is for
> instance implemented in my sftp method patch, and could be used by other
> methods as well. One notable difference is that trailing slashes will be
> preserved in the RFC's algorithm. I'm not sure how desireable this is
> for directories, for instance /home/foo where foo is a symlink to "..",
> this would be resolved to /home/ with the RFC algorithm, and to /home
> with the symlink algorithm, I'm not sure who should handle the
> canonicalization, i.e. ensure that get_file_info for /home/ will work
> if /home is a symlink.
>
> There is quite a similarity between _gnome_vfs_canonicalize_pathname and
> remove_dot_segments, however I couldn't figure out the details because I
> don't find _gnome_vfs_canonicalize_pathname particularly traceable.
>
> I don't know WHY many modules and applications use
> gnome_vfs_uri_resolve_relative, they usually shouldn't have to mess
> around with this and instead use a symlink resolution helper - maybe
> because the old docs claim it has something to do with relative path
> resolution, which is only right to a point where the relative URI only
> forms a path with segments that exclusively contain unreserved
> characters.
I knew that I forgot something...this one is slightly more readable and
doesn't lack two g_free calls :).
--
Christian Neumair <chris gnome-de org>
Index: libgnomevfs/gnome-vfs-uri.c
===================================================================
RCS file: /cvs/gnome/gnome-vfs/libgnomevfs/gnome-vfs-uri.c,v
retrieving revision 1.133
diff -u -p -r1.133 gnome-vfs-uri.c
--- libgnomevfs/gnome-vfs-uri.c 27 Feb 2006 10:25:39 -0000 1.133
+++ libgnomevfs/gnome-vfs-uri.c 19 Mar 2006 16:04:58 -0000
@@ -362,6 +362,7 @@ set_uri_element (GnomeVFSURI *uri,
guint len)
{
char *escaped_text;
+ char *tmp, *query;
if (text == NULL || len == 0) {
uri->text = g_strdup ("/");
@@ -404,7 +405,20 @@ set_uri_element (GnomeVFSURI *uri,
}
gnome_vfs_remove_optional_escapes (uri->text);
- _gnome_vfs_canonicalize_pathname (uri->text);
+
+ /* make sure that we don't canonicalize the query of a URI */
+ query = strchr (uri->text ? uri->text : "", '?');
+ if (query != NULL) {
+ tmp = g_strndup (uri->text, query - uri->text);
+ query = g_strdup (query);
+ g_free (uri->text);
+
+ _gnome_vfs_canonicalize_pathname (tmp);
+ uri->text = g_strconcat (tmp, query, NULL);
+ g_free (tmp);
+ g_free (query);
+ } else
+ _gnome_vfs_canonicalize_pathname (uri->text);
}
static const gchar *
@@ -672,236 +686,347 @@ is_uri_relative (const char *uri)
return !(':' == *current);
}
-
-/*
- * Remove "./" segments
- * Compact "../" segments inside the URI
- * Remove "." at the end of the URL
- * Leave any ".."'s at the beginning of the URI
-
-*/
-/*
- * FIXME this is not the simplest or most time-efficent way
- * to do this. Probably a far more clear way of doing this processing
- * is to split the path into segments, rather than doing the processing
- * in place.
- */
static void
-remove_internal_relative_components (char *uri_current)
+decompose_uri (const char *uri,
+ char **scheme,
+ char **authority,
+ char **path,
+ char **query,
+ char **fragment)
+{
+ const char *p, *q, *f;
+ char *hier_part;
+
+ *scheme = *authority = *path = *query = *fragment = NULL;
+
+ /* In accordance with RFC 2396, and RFC 3986
+ *
+ * scheme ":" hier-part [ "?" query ]
+ * or
+ *
+ * relative-part [ "?" query ] [ "#" fragment ]
+ *
+ * hier-part or relative part have identical
+ * constraints. They may be one of
+ * + an authority followed by
+ * + an absolute path
+ * + an empty path
+ * + an absolute path
+ * + a relative path
+ * + an empty path
+ * */
+
+ p = strchr (uri, ':');
+ if (p != NULL) { /* absolute URI */
+ *scheme = g_strndup (uri, p - uri);
+ /* skip ':' */
+ p++;
+ } else
+ p = uri;
+
+ /* remaining:
+ * hier-part [ "?" query ]
+ * or
+ * relative-part [ "?" query ] [ "#" fragment ]
+ * */
+ q = strchr (p, '?');
+
+ f = NULL;
+ if (*scheme == NULL) /* only relative references may have fragments! */
+ f = strchr (p, '#');
+
+ if (q != NULL) { /* query */
+ hier_part = g_strndup (p, q - p);
+
+ if (f != NULL && f > q) { /* query, followed by fragment */
+ /* skip '?' */
+ q++;
+ *query = g_strndup (q, f - q);
+
+ /* skip '#' */
+ f++;
+ *fragment = g_strdup (f);
+ } else { /* query, no fragment */
+ /* skip '?' */
+ q++;
+ *query = g_strdup (q);
+ }
+ } else if (f != NULL) { /* no query, but fragment */
+ hier_part = g_strndup (p, f - p);
+
+ /* skip '#' */
+ f++;
+ *fragment = g_strdup (f);
+ } else { /* no query, no fragment */
+ hier_part = g_strdup (p);
+ }
+
+ /* analyze hier-part */
+ p = hier_part;
+ if (p[0] == '/' && p[1] == '/') { /* authority path-absempty */
+ /* skip "//" */
+ p += 2;
+
+ q = strchr (p, '/');
+ if (q != NULL) { /* authority, absolute path */
+ *authority = g_strndup (p, q - p);
+ } else { /* empty path */
+ *authority = g_strdup (p);
+ }
+
+ p = q; /* save path */
+ }
+
+ /* path absolute, empty or rootless.
+ * URIs must have a non-NULL path! */
+ *path = g_strdup (p != NULL ? p : "");
+ g_free (hier_part);
+
+#ifdef DEBUG_URI_HANDLING
+ g_message ("decomposed %s into %s scheme, %s authority, %s path, %s query, %s fragment",
+ uri, *scheme, *authority, *path, *query, *fragment);
+#endif /* DEBUG_URI_HANDLING */
+}
+
+/* RFC 2396, section 5.3 */
+static char *
+compose_uri (char *scheme,
+ char *authority,
+ char *path,
+ char *query,
+ char *fragment)
{
- char *segment_prev, *segment_cur;
- gsize len_prev, len_cur;
+ GString *uri = g_string_new ("");
- len_prev = len_cur = 0;
- segment_prev = NULL;
+ g_assert (path != NULL);
- segment_cur = uri_current;
+ if (scheme != NULL) {
+ g_string_append (uri, scheme);
+ g_string_append_c (uri, ':');
+ }
- while (*segment_cur) {
- len_cur = strcspn (segment_cur, "/");
+ if (authority != NULL) {
+ g_string_append (uri, "//");
+ g_string_append (uri, authority);
+ }
- if (len_cur == 1 && segment_cur[0] == '.') {
- /* Remove "." 's */
- if (segment_cur[1] == '\0') {
- segment_cur[0] = '\0';
- break;
- } else {
- memmove (segment_cur, segment_cur + 2, strlen (segment_cur + 2) + 1);
- continue;
- }
- } else if (len_cur == 2 && segment_cur[0] == '.' && segment_cur[1] == '.' ) {
- /* Remove ".."'s (and the component to the left of it) that aren't at the
- * beginning or to the right of other ..'s
- */
- if (segment_prev) {
- if (! (len_prev == 2
- && segment_prev[0] == '.'
- && segment_prev[1] == '.')) {
- if (segment_cur[2] == '\0') {
- segment_prev[0] = '\0';
- break;
- } else {
- memmove (segment_prev, segment_cur + 3, strlen (segment_cur + 3) + 1);
-
- segment_cur = segment_prev;
- len_cur = len_prev;
-
- /* now we find the previous segment_prev */
- if (segment_prev == uri_current) {
- segment_prev = NULL;
- } else if (segment_prev - uri_current >= 2) {
- segment_prev -= 2;
- for ( ; segment_prev > uri_current && segment_prev[0] != '/'
- ; segment_prev-- );
- if (segment_prev[0] == '/') {
- segment_prev++;
- }
- }
- continue;
- }
- }
- }
- }
+ g_string_append (uri, path);
- /*Forward to next segment */
+ if (query != NULL) {
+ g_string_append_c (uri, '?');
+ g_string_append (uri, query);
+ }
- if (segment_cur [len_cur] == '\0') {
- break;
- }
-
- segment_prev = segment_cur;
- len_prev = len_cur;
- segment_cur += len_cur + 1;
+ if (fragment != NULL) {
+ g_string_append_c (uri, '#');
+ g_string_append (uri, fragment);
}
-
+
+#ifdef DEBUG_URI_HANDLING
+ g_message ("composed %s scheme, %s authority, %s path, %s query, %s fragment into %s",
+ scheme, authority, path, query, fragment, uri->str);
+#endif /* DEBUG_URI_HANDLING */
+
+ return g_string_free (uri, FALSE);
}
-/* If I had known this relative uri code would have ended up this long, I would
- * have done it a different way
- */
+/* RFC 2396, 5.2.3 */
static char *
-make_full_uri_from_relative (const char *base_uri, const char *uri)
+merge_paths (const char *b_authority,
+ const char *b_path,
+ const char *r_path)
+{
+ char *p;
+ char *ret;
+
+ /* paths are always defined for URIs */
+ g_assert (b_path != NULL);
+ g_assert (r_path != NULL);
+
+ if (b_authority != NULL && !strcmp (b_path, "")) {
+ ret = g_strconcat ("/", r_path, NULL);
+ } else {
+ p = strrchr (b_path, '/');
+ if (p != NULL) { /* strip everything after last "/" */
+ char *b_path_stripped;
+ /* include '/' */
+ p++;
+ b_path_stripped = g_strndup (b_path, p - b_path);
+ ret = g_strconcat (b_path_stripped, r_path, NULL);
+ g_free (b_path_stripped);
+ } else
+ ret = g_strdup (r_path);
+ }
+
+ return ret;
+}
+
+static char *
+remove_dot_segments (const char *input)
{
- char *result = NULL;
+ GString *output;
+ char **strs, **sp, **sq;
+ gboolean ends_in_slash;
- char *mutable_base_uri;
- char *mutable_uri;
-
- char *uri_current;
- gsize base_uri_length;
- char *separator;
-
- /* We may need one extra character
- * to append a "/" to uri's that have no "/"
- * (such as help:)
- */
+ g_assert (input != NULL);
- mutable_base_uri = g_malloc(strlen(base_uri)+2);
- strcpy (mutable_base_uri, base_uri);
-
- uri_current = mutable_uri = g_strdup (uri);
+ ends_in_slash = * (input + strlen (input) - 1) == '/';
- /* Chew off Fragment and Query from the base_url */
+ strs = g_strsplit (input, "/", 0);
+ for (sp = strs; *sp != NULL; sp++) {
+ if (!strcmp (*sp, ".") ||
+ !strcmp (*sp, "")) {
+ g_free (*sp);
+ *sp = NULL;
+ } else if (!strcmp (*sp, "..")) {
+ g_free (*sp);
+ *sp = NULL;
- separator = strrchr (mutable_base_uri, '#');
+ for (sq = sp; *sq == NULL && sq > strs; sq--)
+ ;
- if (separator) {
- *separator = '\0';
+ g_free (*sq);
+ *sq = NULL;
+ }
}
- separator = strrchr (mutable_base_uri, '?');
+ if (sp > strs && *(sp-1) == NULL) /* was the last segment a dot segment?
+ * also reached when the path had
+ * multiple trailing '/' characters)
+ * */
+ ends_in_slash = TRUE;
- if (separator) {
- *separator = '\0';
+ output = g_string_new ("");
+
+ for (sq = strs; sq != sp; sq++) {
+ if (*sq != NULL) {
+ g_string_append_c (output, '/');
+ g_string_append (output, *sq);
+ g_free (*sq);
+ }
}
- if ('/' == uri_current[0] && '/' == uri_current [1]) {
- /* Relative URI's beginning with the authority
- * component inherit only the scheme from their parents
- */
+ g_free (strs);
- separator = strchr (mutable_base_uri, ':');
+ if (ends_in_slash)
+ g_string_append_c (output, '/');
- if (separator) {
- separator[1] = '\0';
- }
- } else if ('/' == uri_current[0]) {
- /* Relative URI's beginning with '/' absolute-path based
- * at the root of the base uri
- */
+ return g_string_free (output, FALSE);
+}
- separator = strchr (mutable_base_uri, ':');
+/* RFC 2396, 5.2.2 */
+static char *
+make_full_uri_from_relative (const char *base_uri, const char *uri)
+{
+ char *result;
- /* g_assert (separator), really */
- if (separator) {
- /* If we start with //, skip past the authority section */
- if ('/' == separator[1] && '/' == separator[2]) {
- separator = strchr (separator + 3, '/');
- if (separator) {
- separator[0] = '\0';
- }
- } else {
- /* If there's no //, just assume the scheme is the root */
- separator[1] = '\0';
- }
- }
- } else if ('#' != uri_current[0]) {
- /* Handle the ".." convention for relative uri's */
+ char *b_scheme;
+ char *b_authority;
+ char *b_path;
+ char *b_query;
+ char *b_fragment;
+
+ char *r_scheme;
+ char *r_authority;
+ char *r_path;
+ char *r_query;
+ char *r_fragment;
+
+ char *scheme;
+ char *authority;
+ char *path;
+ char *query;
+ char *fragment;
+
+ result = NULL;
+
+ decompose_uri (base_uri, &b_scheme, &b_authority, &b_path, &b_query, &b_fragment);
+ if (b_scheme == NULL) /* invalid absolute URI */
+ goto out;
+
+ g_assert (b_fragment == NULL);
+ g_assert (b_path != NULL);
+
+ decompose_uri (uri, &r_scheme, &r_authority, &r_path, &r_query, &r_fragment);
+
+ g_assert (r_path != NULL);
+
+ /* RFC 2396, section 5.2.2 */
+ if (r_scheme != NULL) {
+ scheme = g_strdup (r_scheme);
+ authority = g_strdup (r_authority);
+ path = remove_dot_segments (r_path);
+ query = g_strdup (r_query);
+ } else {
+ scheme = g_strdup (b_scheme);
- /* If there's a trailing '/' on base_url, treat base_url
- * as a directory path.
- * Otherwise, treat it as a file path, and chop off the filename
- */
-
- base_uri_length = strlen (mutable_base_uri);
- if ('/' == mutable_base_uri[base_uri_length-1]) {
- /* Trim off '/' for the operation below */
- mutable_base_uri[base_uri_length-1] = 0;
+ if (r_authority != NULL) {
+ authority = g_strdup (r_authority);
+ path = remove_dot_segments (r_path);
+ query = g_strdup (r_query);
} else {
- separator = strrchr (mutable_base_uri, '/');
- if (separator) {
- /* Make sure we don't eat a domain part */
- char *tmp = separator - 1;
- if ((separator != mutable_base_uri) && (*tmp != '/')) {
- *separator = '\0';
- } else {
- /* Maybe there is no domain part and this is a toplevel URI's child */
- char *tmp2 = strstr (mutable_base_uri, ":///");
- if (tmp2 != NULL && tmp2 + 3 == separator) {
- *(separator + 1) = '\0';
- }
- }
- }
- }
+ authority = g_strdup (b_authority);
- remove_internal_relative_components (uri_current);
+ if (!strcmp (r_path, "")) {
+ path = g_strdup (b_path);
- /* handle the "../"'s at the beginning of the relative URI */
- while (0 == strncmp ("../", uri_current, 3)) {
- uri_current += 3;
- separator = strrchr (mutable_base_uri, '/');
- if (separator) {
- *separator = '\0';
+ if (r_query != NULL)
+ query = g_strdup (r_query);
+ else
+ query = g_strdup (b_query);
} else {
- /* <shrug> */
- break;
- }
- }
+ if (r_path[0] == '/')
+ path = remove_dot_segments (r_path);
+ else {
+ char *tmp;
+ tmp = merge_paths (b_authority, b_path, r_path);
+ path = remove_dot_segments (tmp);
+ g_free (tmp);
+ }
- /* handle a ".." at the end */
- if (uri_current[0] == '.' && uri_current[1] == '.'
- && uri_current[2] == '\0') {
-
- uri_current += 2;
- separator = strrchr (mutable_base_uri, '/');
- if (separator) {
- *separator = '\0';
+ query = g_strdup (r_query);
}
}
-
- /* Re-append the '/' */
- mutable_base_uri [strlen(mutable_base_uri)+1] = '\0';
- mutable_base_uri [strlen(mutable_base_uri)] = '/';
}
- result = g_strconcat (mutable_base_uri, uri_current, NULL);
- g_free (mutable_base_uri);
- g_free (mutable_uri);
-
+ fragment = g_strdup (r_fragment);
+
+ /* RFC 2396, section 5.3 */
+ result = compose_uri (scheme, authority, path, query, fragment);
+
+ g_free (scheme);
+ g_free (authority);
+ g_free (path);
+ g_free (query);
+ g_free (fragment);
+
+ g_free (r_scheme);
+ g_free (r_authority);
+ g_free (r_path);
+ g_free (r_query);
+ g_free (r_fragment);
+
+out:
+ g_free (b_scheme);
+ g_free (b_authority);
+ g_free (b_path);
+ g_free (b_query);
+ g_free (b_fragment);
+
return result;
}
/**
* gnome_vfs_uri_resolve_relative:
* @base: base uri.
- * @relative_reference: a string representing a possibly relative uri reference.
+ * @relative_reference: a string representing an absolute URI or relative reference.
*
* Create a new uri from @relative_reference, relative to @base. The resolution
* algorithm follows RFC 2396. For details, see section 5.2 of
* http://www.ietf.org/rfc/rfc2396.txt .
*
- * In short, if the @base uri ends in '/', @relative_reference is appended to @base,
- * otherwise it replaces the part of @base after the last '/'.
+ * This function may not be used for resolving symbolic links, because escaped
+ * characters in @relative_reference have very special meanings.
*
* Return value: The new uri.
*/
@@ -913,20 +1038,13 @@ gnome_vfs_uri_resolve_relative (const Gn
char *text_new;
GnomeVFSURI *uri;
+ g_return_val_if_fail (base != NULL, NULL);
g_return_val_if_fail (relative_reference != NULL, NULL);
- if (base == NULL) {
- text_base = g_strdup ("");
- } else {
- text_base = gnome_vfs_uri_to_string (base, 0);
- }
+ text_base = gnome_vfs_uri_to_string (base, 0);
- if (is_uri_relative (relative_reference)) {
- text_new = make_full_uri_from_relative (text_base,
- relative_reference);
- } else {
- text_new = g_strdup (relative_reference);
- }
+ text_new = make_full_uri_from_relative (text_base,
+ relative_reference);
uri = gnome_vfs_uri_new (text_new);
@@ -2068,6 +2186,10 @@ gnome_vfs_uri_make_full_from_relative (c
const char *relative_uri)
{
char *result = NULL;
+ /* TODO 2.15
+ *
+ * rename and #define this to gnome_vfs_resolve_relative,
+ * and make gnome_vfs_uri_resolve_relative a wrapper */
/* See section 5.2 in RFC 2396 */
Index: test/test-uri.c
===================================================================
RCS file: /cvs/gnome/gnome-vfs/test/test-uri.c,v
retrieving revision 1.45
diff -u -p -r1.45 test-uri.c
--- test/test-uri.c 13 Apr 2005 19:58:56 -0000 1.45
+++ test/test-uri.c 19 Mar 2006 16:05:00 -0000
@@ -95,13 +95,52 @@ test_make_canonical_path (const char *in
static const char* test_uris[][2] =
{
- { "http://www.gnome.org/", "index.html" },
- { "http://www.gnome.org/", "/index.html"},
- { "http://www.gnome.org/", "/index.html"},
- { "http://www.gnome.org", "index.html"},
- { "http://www.gnome.org", "/index.html"},
- { "http://www.gnome.org", "./index.html"},
- {NULL, NULL}
+ /* RFC 2396, section 5.4.1 */
+/* { "g:h" , "g:h" }, skip this, no "g" method available */
+ { "g" , "http://a/b/c/g" },
+ { "./g" , "http://a/b/c/g" },
+ { "g/" , "http://a/b/c/g/" },
+ { "/g" , "http://a/g" },
+ { "//g" , "http://g" },
+ { "?y" , "http://a/b/c/d;p?y" },
+ { "g?y" , "http://a/b/c/g?y" },
+ { "#s" , "http://a/b/c/d;p?q#s" },
+ { "g#s" , "http://a/b/c/g#s" },
+ { "g?y#s" , "http://a/b/c/g?y#s" },
+ { ";x" , "http://a/b/c/;x" },
+ { "g;x" , "http://a/b/c/g;x" },
+ { "g;x?y#s", "http://a/b/c/g;x?y#s" },
+ { "" , "http://a/b/c/d;p?q" },
+ { "." , "http://a/b/c/" },
+ { "./" , "http://a/b/c/" },
+ { ".." , "http://a/b/" },
+ { "../" , "http://a/b/" },
+ { "../g" , "http://a/b/g" },
+ { "../.." , "http://a/" },
+ { "../../" , "http://a/" },
+ { "../../g", "http://a/g" },
+
+ /* RFC 2396, section 5.4.2 */
+ { "../../../g" , "http://a/g" },
+ { "../../../../g", "http://a/g" },
+ { "/./g" , "http://a/g" },
+ { "/../g" , "http://a/g" },
+ { "g." , "http://a/b/c/g." },
+ { ".g" , "http://a/b/c/.g" },
+ { "g.." , "http://a/b/c/g.." },
+ { "..g" , "http://a/b/c/..g" },
+
+ { "./../g" , "http://a/b/g" },
+ { "./g/." , "http://a/b/c/g/" },
+ { "g/./h" , "http://a/b/c/g/h" },
+ { "g/../h" , "http://a/b/c/h" },
+ { "g;x=1/./y" , "http://a/b/c/g;x=1/y" },
+ { "g;x=1/../y", "http://a/b/c/y" },
+
+ { "g?y/./x", "http://a/b/c/g?y/./x" },
+ { "g?y/../x", "http://a/b/c/g?y/../x" },
+ { "g#s/./x" , "http://a/b/c/g#s/./x" },
+ { "g#s/../x", "http://a/b/c/g#s/../x" }
};
@@ -731,11 +770,11 @@ main (int argc, char **argv)
VERIFY_STRING_RESULT_NULL (gnome_vfs_get_local_path_from_uri ("http://my/document.html"));
/* Testing gnome_vfs_uri_make_full_from_relative */
- /* (not an extensive testing, but a regression test */
i = 0;
while (test_uris[i][0] != NULL) {
- test_make_full_from_relative (test_uris[i][0], test_uris[i][1],
- "http://www.gnome.org/index.html");
+ test_make_full_from_relative ("http://a/b/c/d;p?q",
+ test_uris[i][0],
+ test_uris[i][1]);
i++;
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]