From 2d210f10ebc98d13dc35b172636efecb91f1b92a Mon Sep 17 00:00:00 2001
From: Simo Sorce
Date: Tue, 11 Jul 2017 07:46:06 -0400
Subject: [PATCH] Add 3986 compatible URI Escape function

RFC 3986 changed the set of (un)reserved characters, so we provide a function
that uses that RFC to generate an escaped URI.
The original function is preserved for backwards compatibility.

Signed-off-by: Simo Sorce
---
 include/libxml/uri.h |   2 +
 python/setup.py      |   2 +-
 uri.c                | 239 ++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 229 insertions(+), 14 deletions(-)

diff --git a/include/libxml/uri.h b/include/libxml/uri.h
index db48262..c9321a5 100644
--- a/include/libxml/uri.h
+++ b/include/libxml/uri.h
@@ -81,6 +81,8 @@ XMLPUBFUN int XMLCALL
 		xmlNormalizeURIPath	(char *path);
 XMLPUBFUN xmlChar * XMLCALL
 		xmlURIEscape		(const xmlChar *str);
+XMLPUBFUN xmlChar * XMLCALL
+		xml3986URIEscape	(const xmlChar *str);
 XMLPUBFUN void XMLCALL
 		xmlFreeURI		(xmlURIPtr uri);
 XMLPUBFUN xmlChar* XMLCALL
diff --git a/python/setup.py b/python/setup.py
index c44269a..1ac4eac 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -8,7 +8,7 @@ from distutils.core import setup, Extension
 # Below ROOT, we expect to find include, include/libxml2, lib and bin.
 # On *nix, it is not needed (but should not harm),
 # on Windows, it is set by configure.js.
-ROOT = r'/usr'
+ROOT = r'/usr/local'
 
 # Thread-enabled libxml2
 with_threads = 1
diff --git a/uri.c b/uri.c
index 3b627e8..16fec64 100644
--- a/uri.c
+++ b/uri.c
@@ -1656,18 +1656,9 @@ xmlURIUnescapeString(const char *str, int len, char *target) {
     return(ret);
 }
 
-/**
- * xmlURIEscapeStr:
- * @str:  string to escape
- * @list: exception list string of chars not to escape
- *
- * This routine escapes a string to hex, ignoring reserved characters (a-z)
- * and the characters in the exception list.
- *
- * Returns a new escaped string or NULL in case of error.
- */
-xmlChar *
-xmlURIEscapeStr(const xmlChar *str, const xmlChar *list) {
+static xmlChar *
+xmlURIEscapeStrExt(const xmlChar *str, const xmlChar *list,
+                   int func(xmlChar, const xmlChar *)) {
     xmlChar *ret, ch;
     xmlChar *temp;
     const xmlChar *in;
@@ -1701,7 +1692,7 @@ xmlURIEscapeStr(const xmlChar *str, const xmlChar *list) {
 
 	ch = *in;
 
-	if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!xmlStrchr(list, ch))) {
+	if (func(ch, list)) {
 	    unsigned char val;
 	    ret[out++] = '%';
 	    val = ch >> 4;
@@ -1724,6 +1715,29 @@ xmlURIEscapeStr(const xmlChar *str, const xmlChar *list) {
     return(ret);
 }
 
+static int
+xmlURIEscapeChar(xmlChar ch, const xmlChar *list) {
+    if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!xmlStrchr(list, ch))) {
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * xmlURIEscapeStr:
+ * @str:  string to escape
+ * @list: exception list string of chars not to escape
+ *
+ * This routine escapes a string to hex, ignoring '@', unreserved characters
+ * and the characters in the exception list.
+ *
+ * Returns a new escaped string or NULL in case of error.
+ */
+xmlChar *
+xmlURIEscapeStr(const xmlChar *str, const xmlChar *list) {
+    return xmlURIEscapeStrExt(str, list, xmlURIEscapeChar);
+}
+
 /**
  * xmlURIEscape:
  * @str:  the string of the URI to escape
@@ -1857,6 +1871,205 @@ xmlURIEscape(const xmlChar * str)
     return (ret);
 }
 
+
+static int
+xml3986SchemeEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if ((!ISA_ALPHA(&ch)) && (!ISA_DIGIT(&ch)) &&
+        (!xmlStrchr(BAD_CAST "+-.", ch))) {
+        return 1;
+    }
+    return 0;
+}
+
+static int
+xml3986AuthorityEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if ((!ISA_UNRESERVED(&ch)) && (!ISA_SUB_DELIM(&ch)) &&
+        (!xmlStrchr(BAD_CAST ":@[]", ch))) {
+        return 1;
+    }
+    return 0;
+}
+
+static int
+xml3986UserEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if ((!ISA_UNRESERVED(&ch)) && (!ISA_SUB_DELIM(&ch)) && (ch != ':')) {
+        return 1;
+    }
+    return 0;
+}
+
+static int
+xml3986HostEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if ((!ISA_UNRESERVED(&ch)) && (!ISA_SUB_DELIM(&ch)) &&
+        (!xmlStrchr(BAD_CAST "[]:.", ch))) {
+        return 1;
+    }
+    return 0;
+}
+/* NOTE(review): confirm ISA_PCHAR(&ch) cannot read past the one-byte ch. */
+static int
+xml3986PathEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if ((!ISA_PCHAR(&ch)) && (ch != '/')) {
+        return 1;
+    }
+    return 0;
+}
+
+static int
+xml3986OpaqueEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if (!ISA_UNRESERVED(&ch)) {
+        return 1;
+    }
+    return 0;
+}
+
+static int
+xml3986QueryEscapeChar(xmlChar ch, const xmlChar *unused ATTRIBUTE_UNUSED) {
+    if ((!ISA_PCHAR(&ch)) && (!xmlStrchr(BAD_CAST "/?", ch))) {
+        return 1;
+    }
+    return 0;
+}
+
+#define xml3986FragmentEscapeChar xml3986QueryEscapeChar
+
+/**
+ * xml3986URIEscape:
+ * @str:  the string of the URI to escape
+ *
+ * Escaping routine, does not do validity checks !
+ * It will try to escape the chars needing this, but this is heuristic
+ * based; it's impossible to be sure.
+ *
+ * Returns an escaped copy of the string, or NULL if @str is NULL, cannot be
+ * parsed as a URI reference, or an allocation fails.
+ * Each URI component is escaped with its own RFC 3986 character set.
+ */
+xmlChar *
+xml3986URIEscape(const xmlChar * str)
+{
+    xmlChar *ret = NULL, *segment = NULL;
+    xmlURIPtr uri;
+    int ret2;
+
+    /* Bail out on allocation failure, releasing uri and the partial result. */
+#define NULLCHK(p) if (!(p)) { \
+         xmlURIErrMemory("escaping URI value\n"); \
+         xmlFreeURI(uri); \
+         if (ret != NULL) xmlFree(ret); \
+         return NULL; }
+
+    if (str == NULL)
+        return (NULL);
+
+    uri = xmlCreateURI();
+    if (uri != NULL) {
+        /*
+         * Allow escaping errors in the unescaped form
+         */
+        uri->cleanup = 1;
+        ret2 = xmlParseURIReference(uri, (const char *)str);
+        if (ret2) {
+            xmlFreeURI(uri);
+            return (NULL);
+        }
+    }
+
+    if (!uri)
+        return NULL;
+
+    if (uri->scheme) {
+        segment = xmlURIEscapeStrExt(BAD_CAST uri->scheme,
+                                     NULL, xml3986SchemeEscapeChar);
+        NULLCHK(segment)
+        ret = xmlStrcat(ret, segment);
+        ret = xmlStrcat(ret, BAD_CAST ":");
+        xmlFree(segment);
+    }
+
+    if (uri->authority) {
+        segment = xmlURIEscapeStrExt(BAD_CAST uri->authority,
+                                     NULL, xml3986AuthorityEscapeChar);
+        NULLCHK(segment)
+        ret = xmlStrcat(ret, BAD_CAST "//");
+        ret = xmlStrcat(ret, segment);
+        xmlFree(segment);
+    }
+    else {
+        /* No authority string: build it from user, server and port. */
+        if (uri->user) {
+            segment = xmlURIEscapeStrExt(BAD_CAST uri->user,
+                                         NULL, xml3986UserEscapeChar);
+            NULLCHK(segment)
+            ret = xmlStrcat(ret, BAD_CAST "//");
+            ret = xmlStrcat(ret, segment);
+            ret = xmlStrcat(ret, BAD_CAST "@");
+            xmlFree(segment);
+        }
+
+        if (uri->server) {
+            segment = xmlURIEscapeStrExt(BAD_CAST uri->server,
+                                         NULL, xml3986HostEscapeChar);
+            NULLCHK(segment)
+            if (uri->user == NULL)
+                ret = xmlStrcat(ret, BAD_CAST "//");
+            ret = xmlStrcat(ret, segment);
+            xmlFree(segment);
+        }
+
+        if (uri->port) {
+            xmlChar port[10];
+
+            snprintf((char *) port, sizeof(port), "%d", uri->port);
+            ret = xmlStrcat(ret, BAD_CAST ":");
+            ret = xmlStrcat(ret, port);
+        }
+    }
+
+    if (uri->path) {
+        segment = xmlURIEscapeStrExt(BAD_CAST uri->path,
+                                     NULL, xml3986PathEscapeChar);
+        NULLCHK(segment)
+        ret = xmlStrcat(ret, segment);
+        xmlFree(segment);
+    }
+
+    if (uri->query_raw) { /* appended as-is, not escaped here */
+        ret = xmlStrcat(ret, BAD_CAST "?");
+        ret = xmlStrcat(ret, BAD_CAST uri->query_raw);
+    }
+    else if (uri->query) {
+        segment = xmlURIEscapeStrExt(BAD_CAST uri->query,
+                                     NULL, xml3986QueryEscapeChar);
+        NULLCHK(segment)
+        ret = xmlStrcat(ret, BAD_CAST "?");
+        ret = xmlStrcat(ret, segment);
+        xmlFree(segment);
+    }
+
+    if (uri->opaque) {
+        segment = xmlURIEscapeStrExt(BAD_CAST uri->opaque,
+                                     NULL, xml3986OpaqueEscapeChar);
+        NULLCHK(segment)
+        ret = xmlStrcat(ret, segment);
+        xmlFree(segment);
+    }
+
+    if (uri->fragment) {
+        segment = xmlURIEscapeStrExt(BAD_CAST uri->fragment,
+                                     NULL, xml3986FragmentEscapeChar);
+        NULLCHK(segment)
+        ret = xmlStrcat(ret, BAD_CAST "#");
+        ret = xmlStrcat(ret, segment);
+        xmlFree(segment);
+    }
+
+    xmlFreeURI(uri);
+#undef NULLCHK
+
+    return (ret);
+}
+
 /************************************************************************
  *									*
  *			Public functions				*
-- 
2.9.4