[xslt] URIs and Their Hacks



Okay, this seems to work, at least halfway.

This is about the recent discussion regarding duplicate parsing and not
finding catalogs on Windows.

I have completely removed the function xmlNormalizeWindowsPath. I really
doubt that anyone in the world has ever called it, so there won't be a
problem with binary compatibility. The only call outside libxml is in
xsltproc, but that one I shall patch accordingly. If removing this function
is a problem, that's easy, I'll put back one that calls the new facility
internally.

The new facility is called xmlURIFromPath and is implemented in uri.c. It
gives the functionality of the retired xmlNormalizeWindowsPath and does a
better job using the provided URI functions. This should cover bug
105994.

The main point about the URIs and Windows paths is the URI manipulation in
uri.c. The xmlIO does not need URIs and can work with native paths as well.
The place where conflicts occur is when, for example, xmlBuildURI tries to
build an absolute URI from a relative URI and an absolute Windows path. That
doesn't work :-)

If you now run xmllint --debug, you'll see that all URIs are heavily
escaped. For example:

  C:\home\igor>xmllint --debug ..\..\test.xml
  DOCUMENT
  version=1.0
  URL=..%5C..%5Ctest.xml
  standalone=true
    ELEMENT=doc

or perhaps

  C:\home\igor>xmllint --debug c:\test.xml
  DOCUMENT
  version=1.0
  URL=file:///c%3A/test.xml
  standalone=true
    ELEMENT=doc

and that seems to be perfectly okay. At the xmlIO level, all paths are equal
to the user's input, be it the command-line, or some entry in the XML file.
Every file is being found. In xmlDoc.URL, however, all paths are URIs, with
proper escaping, and the URI functions in uri.c seem to handle those best.

That's the Windows side. The Unix side didn't suffer any changes. Regression
tests on Linux pass, save for a few points regarding unimplemented blocks in
relaxng :-)

What remains is to see what happens on the serialisation side, do those URIs
get unescaped or not, and if not, how do other parsers handle them. I'll
check this tomorrow.

The patch is lengthy but simple and is attached to this message. Please risk
a glance. I would like to hear a comment or two before I commit this.

Ciao,
Igor
Index: DOCBparser.c
===================================================================
RCS file: /cvs/gnome/gnome-xml/DOCBparser.c,v
retrieving revision 1.31
diff -c -r1.31 DOCBparser.c
*** DOCBparser.c	11 Dec 2002 14:23:46 -0000	1.31
--- DOCBparser.c	17 Feb 2003 23:02:54 -0000
***************
*** 6031,6037 ****
      memset(inputStream, 0, sizeof(docbParserInput));
  
      inputStream->filename = (char *)
! 	xmlNormalizeWindowsPath((const xmlChar *)filename);
      inputStream->line = 1;
      inputStream->col = 1;
      inputStream->buf = buf;
--- 6031,6037 ----
      memset(inputStream, 0, sizeof(docbParserInput));
  
      inputStream->filename = (char *)
! 	xmlURIFromPath((const xmlChar *)filename);
      inputStream->line = 1;
      inputStream->col = 1;
      inputStream->buf = buf;
Index: HTMLparser.c
===================================================================
RCS file: /cvs/gnome/gnome-xml/HTMLparser.c,v
retrieving revision 1.135
diff -c -r1.135 HTMLparser.c
*** HTMLparser.c	10 Feb 2003 14:28:42 -0000	1.135
--- HTMLparser.c	17 Feb 2003 23:02:56 -0000
***************
*** 42,47 ****
--- 42,48 ----
  #include <libxml/valid.h>
  #include <libxml/xmlIO.h>
  #include <libxml/globals.h>
+ #include <libxml/uri.h>
  
  #define HTML_MAX_NAMELEN 1000
  #define HTML_PARSER_BIG_BUFFER_SIZE 1000
***************
*** 5346,5352 ****
      memset(inputStream, 0, sizeof(htmlParserInput));
  
      inputStream->filename = (char *)
! 	xmlNormalizeWindowsPath((xmlChar *)filename);
      inputStream->line = 1;
      inputStream->col = 1;
      inputStream->buf = buf;
--- 5347,5353 ----
      memset(inputStream, 0, sizeof(htmlParserInput));
  
      inputStream->filename = (char *)
! 	xmlURIFromPath((xmlChar *)filename);
      inputStream->line = 1;
      inputStream->col = 1;
      inputStream->buf = buf;
Index: SAX.c
===================================================================
RCS file: /cvs/gnome/gnome-xml/SAX.c,v
retrieving revision 1.106
diff -c -r1.106 SAX.c
*** SAX.c	5 Feb 2003 10:45:26 -0000	1.106
--- SAX.c	17 Feb 2003 23:02:57 -0000
***************
*** 762,768 ****
      }
      if ((ctxt->myDoc != NULL) && (ctxt->myDoc->URL == NULL) &&
  	(ctxt->input != NULL) && (ctxt->input->filename != NULL)) {
!         ctxt->myDoc->URL = xmlStrdup((const xmlChar *) ctxt->input->filename);
      }
  }
  
--- 762,770 ----
      }
      if ((ctxt->myDoc != NULL) && (ctxt->myDoc->URL == NULL) &&
  	(ctxt->input != NULL) && (ctxt->input->filename != NULL)) {
! 	ctxt->myDoc->URL = xmlURIFromPath((const xmlChar *) ctxt->input->filename);
! 	if (ctxt->myDoc->URL == NULL)
! 	    ctxt->myDoc->URL = xmlStrdup((const xmlChar *) ctxt->input->filename);
      }
  }
  
Index: parser.c
===================================================================
RCS file: /cvs/gnome/gnome-xml/parser.c,v
retrieving revision 1.244
diff -c -r1.244 parser.c
*** parser.c	4 Feb 2003 15:07:21 -0000	1.244
--- parser.c	17 Feb 2003 23:03:01 -0000
***************
*** 9154,9160 ****
  	inputStream->filename = NULL;
      else
  	inputStream->filename = (char *)
! 	    xmlNormalizeWindowsPath((const xmlChar *) filename);
      inputStream->buf = buf;
      inputStream->base = inputStream->buf->buffer->content;
      inputStream->cur = inputStream->buf->buffer->content;
--- 9154,9160 ----
  	inputStream->filename = NULL;
      else
  	inputStream->filename = (char *)
! 	    xmlURIFromPath((const xmlChar *) filename);
      inputStream->buf = buf;
      inputStream->base = inputStream->buf->buffer->content;
      inputStream->cur = inputStream->buf->buffer->content;
***************
*** 10330,10336 ****
      xmlParserCtxtPtr ctxt;
      xmlParserInputPtr inputStream;
      char *directory = NULL;
-     xmlChar *normalized;
  
      ctxt = xmlNewParserCtxt();
      if (ctxt == NULL) {
--- 10330,10335 ----
***************
*** 10340,10364 ****
  	return(NULL);
      }
  
!     normalized = xmlNormalizeWindowsPath((const xmlChar *) filename);
!     if (normalized == NULL) {
! 	xmlFreeParserCtxt(ctxt);
! 	return(NULL);
!     }
!     inputStream = xmlLoadExternalEntity((char *) normalized, NULL, ctxt);
      if (inputStream == NULL) {
  	xmlFreeParserCtxt(ctxt);
- 	xmlFree(normalized);
  	return(NULL);
      }
  
      inputPush(ctxt, inputStream);
      if ((ctxt->directory == NULL) && (directory == NULL))
!         directory = xmlParserGetDirectory((char *) normalized);
      if ((ctxt->directory == NULL) && (directory != NULL))
          ctxt->directory = directory;
- 
-     xmlFree(normalized);
  
      return(ctxt);
  }
--- 10339,10355 ----
  	return(NULL);
      }
  
!     inputStream = xmlLoadExternalEntity(filename, NULL, ctxt);
      if (inputStream == NULL) {
  	xmlFreeParserCtxt(ctxt);
  	return(NULL);
      }
  
      inputPush(ctxt, inputStream);
      if ((ctxt->directory == NULL) && (directory == NULL))
!         directory = xmlParserGetDirectory(filename);
      if ((ctxt->directory == NULL) && (directory != NULL))
          ctxt->directory = directory;
  
      return(ctxt);
  }
Index: uri.c
===================================================================
RCS file: /cvs/gnome/gnome-xml/uri.c,v
retrieving revision 1.52
diff -c -r1.52 uri.c
*** uri.c	14 Feb 2003 16:54:09 -0000	1.52
--- uri.c	17 Feb 2003 23:03:02 -0000
***************
*** 1966,1969 ****
--- 1966,2022 ----
      return(val);
  }
  
+ /**
+  * xmlURIFromPath:
+  * @path:  the resource locator in a filesystem notation
+  *
+  * Constructs an URI from the specified path. Mainly used
+  * for construction of xmlDoc.URL. 
+  *
+  * Returns a new URI, or a duplicate of the path parameter if the URI cannot
+  * be constructed. The caller is responsible for freeing the memory occupied
+  * by the returned string. If there is insufficient memory available, or the 
+  * argument is NULL, the function returns NULL.
+  */
+ #define IS_WINDOWS_PATH(p) 					\
+ 	((p != NULL) &&						\
+ 	 (((p[0] >= 'a') && (p[0] <= 'z')) ||			\
+ 	  ((p[0] >= 'A') && (p[0] <= 'Z'))) &&			\
+ 	 (p[1] == ':') && ((p[2] == '/') || (p[2] == '\\')))
+ xmlChar*
+ xmlURIFromPath(const xmlChar *path)
+ {
+     int len, i = 0;
+     xmlChar *ret;
+     xmlURIPtr uri;
+ 
+     if (path == NULL)
+ 	return(NULL);
+     if ((uri = xmlParseURI(path)) != NULL) {
+ 	xmlFreeURI(uri);
+ 	return xmlStrdup(path);
+     }
+ 
+     uri = xmlCreateURI();
+ 
+     len = xmlStrlen(path);
+     if ((len > 2) && IS_WINDOWS_PATH(path)) {
+ 	uri->scheme = xmlStrdup(BAD_CAST "file");
+ 	uri->path = xmlMalloc(len + 1);
+ 	uri->path[0] = '/';
+ 	while (i < len) {
+ 	    if (path[i] == '\\')
+ 		uri->path[i+1] = '/';
+ 	    else
+ 		uri->path[i+1] = path[i];
+ 	    i++;
+ 	}
+     } else {
+ 	uri->path = xmlStrdup(path);
+     }
+     
+     ret = xmlSaveUri(uri);
+     xmlFreeURI(uri);
+     return(ret);
+ }
  
Index: xmlIO.c
===================================================================
RCS file: /cvs/gnome/gnome-xml/xmlIO.c,v
retrieving revision 1.100
diff -c -r1.100 xmlIO.c
*** xmlIO.c	10 Feb 2003 15:43:53 -0000	1.100
--- xmlIO.c	17 Feb 2003 23:03:03 -0000
***************
*** 121,192 ****
  static int xmlOutputCallbackNr = 0;
  static int xmlOutputCallbackInitialized = 0;
  
- /************************************************************************
-  *									*
-  *		Handling of Windows file paths				*
-  *									*
-  ************************************************************************/
- 
- #define IS_WINDOWS_PATH(p) 					\
- 	((p != NULL) &&						\
- 	 (((p[0] >= 'a') && (p[0] <= 'z')) ||			\
- 	  ((p[0] >= 'A') && (p[0] <= 'Z'))) &&			\
- 	 (p[1] == ':') && ((p[2] == '/') || (p[2] == '\\')))
- 
- 
- /**
-  * xmlNormalizeWindowsPath:
-  * @path:  a windows path like "C:/foo/bar"
-  *
-  * Normalize a Windows path to make an URL from it
-  *
-  * Returns a new URI which must be freed by the caller or NULL
-  *   in case of error
-  */
- xmlChar *
- xmlNormalizeWindowsPath(const xmlChar *path)
- {
-     int len, i = 0, j;
-     xmlChar *ret;
- 
-     if (path == NULL)
- 	return(NULL);
- 
-     len = xmlStrlen(path);
-     if (!IS_WINDOWS_PATH(path)) {
- 	ret = xmlStrdup(path);
- 	if (ret == NULL)
- 	    return(NULL);
- 	j = 0;
-     } else {
- 	ret = xmlMalloc(len + 10);
- 	if (ret == NULL)
- 	    return(NULL);
- 	ret[0] = 'f';
- 	ret[1] = 'i';
- 	ret[2] = 'l';
- 	ret[3] = 'e';
- 	ret[4] = ':';
- 	ret[5] = '/';
- 	ret[6] = '/';
- 	ret[7] = '/';
- 	j = 8;
-     }
- 
-     while (i < len) {
- 	/* TODO: UTF8 conversion + URI escaping ??? */
- 	if (path[i] == '\\')
- 	    ret[j] = '/';
- 	else
- 	    ret[j] = path[i];
- 	i++;
- 	j++;
-     }
-     ret[j] = 0;
- 
-     return(ret);
- }
- 
  /**
   * xmlCleanupInputCallbacks:
   *
--- 121,126 ----
***************
*** 1768,1781 ****
      xmlParserInputBufferPtr ret;
      int i = 0;
      void *context = NULL;
-     char *normalized;
  
      if (xmlInputCallbackInitialized == 0)
  	xmlRegisterDefaultInputCallbacks();
  
      if (URI == NULL) return(NULL);
-     normalized = (char *) xmlNormalizeWindowsPath((const xmlChar *)URI);
-     if (normalized == NULL) return(NULL);
  
  #ifdef LIBXML_CATALOG_ENABLED
  #endif
--- 1702,1712 ----
***************
*** 1788,1800 ****
  	for (i = xmlInputCallbackNr - 1;i >= 0;i--) {
  	    if ((xmlInputCallbackTable[i].matchcallback != NULL) &&
  		(xmlInputCallbackTable[i].matchcallback(URI) != 0)) {
! 		context = xmlInputCallbackTable[i].opencallback(normalized);
  		if (context != NULL)
  		    break;
  	    }
  	}
      }
-     xmlFree(normalized);
      if (context == NULL) {
  	return(NULL);
      }
--- 1719,1730 ----
  	for (i = xmlInputCallbackNr - 1;i >= 0;i--) {
  	    if ((xmlInputCallbackTable[i].matchcallback != NULL) &&
  		(xmlInputCallbackTable[i].matchcallback(URI) != 0)) {
! 		context = xmlInputCallbackTable[i].opencallback(URI);
  		if (context != NULL)
  		    break;
  	    }
  	}
      }
      if (context == NULL) {
  	return(NULL);
      }
***************
*** 1834,1840 ****
      int i = 0;
      void *context = NULL;
      char *unescaped;
-     char *normalized;
  
      int is_http_uri = 0;	/*   Can't change if HTTP disabled  */
  
--- 1764,1769 ----
***************
*** 1842,1854 ****
  	xmlRegisterDefaultOutputCallbacks();
  
      if (URI == NULL) return(NULL);
-     normalized = (char *) xmlNormalizeWindowsPath((const xmlChar *)URI);
-     if (normalized == NULL) return(NULL);
  
  #ifdef LIBXML_HTTP_ENABLED
      /*  Need to prevent HTTP URI's from falling into zlib short circuit  */
  
!     is_http_uri = xmlIOHTTPMatch( normalized );
  #endif
  
  
--- 1771,1781 ----
  	xmlRegisterDefaultOutputCallbacks();
  
      if (URI == NULL) return(NULL);
  
  #ifdef LIBXML_HTTP_ENABLED
      /*  Need to prevent HTTP URI's from falling into zlib short circuit  */
  
!     is_http_uri = xmlIOHTTPMatch( URI );
  #endif
  
  
***************
*** 1857,1863 ****
       * Go in reverse to give precedence to user defined handlers.
       * try with an unescaped version of the URI
       */
!     unescaped = xmlURIUnescapeString(normalized, 0, NULL);
      if (unescaped != NULL) {
  #ifdef HAVE_ZLIB_H
  	if ((compression > 0) && (compression <= 9) && (is_http_uri == 0)) {
--- 1784,1790 ----
       * Go in reverse to give precedence to user defined handlers.
       * try with an unescaped version of the URI
       */
!     unescaped = xmlURIUnescapeString(URI, 0, NULL);
      if (unescaped != NULL) {
  #ifdef HAVE_ZLIB_H
  	if ((compression > 0) && (compression <= 9) && (is_http_uri == 0)) {
***************
*** 1870,1876 ****
  		    ret->closecallback = xmlGzfileClose;
  		}
  		xmlFree(unescaped);
- 		xmlFree(normalized);
  		return(ret);
  	    }
  	}
--- 1797,1802 ----
***************
*** 1899,1905 ****
      if (context == NULL) {
  #ifdef HAVE_ZLIB_H
  	if ((compression > 0) && (compression <= 9) && (is_http_uri == 0)) {
! 	    context = xmlGzfileOpenW(normalized, compression);
  	    if (context != NULL) {
  		ret = xmlAllocOutputBuffer(encoder);
  		if (ret != NULL) {
--- 1825,1831 ----
      if (context == NULL) {
  #ifdef HAVE_ZLIB_H
  	if ((compression > 0) && (compression <= 9) && (is_http_uri == 0)) {
! 	    context = xmlGzfileOpenW(URI, compression);
  	    if (context != NULL) {
  		ret = xmlAllocOutputBuffer(encoder);
  		if (ret != NULL) {
***************
*** 1907,1920 ****
  		    ret->writecallback = xmlGzfileWrite;
  		    ret->closecallback = xmlGzfileClose;
  		}
- 		xmlFree(normalized);
  		return(ret);
  	    }
  	}
  #endif
  	for (i = xmlOutputCallbackNr - 1;i >= 0;i--) {
  	    if ((xmlOutputCallbackTable[i].matchcallback != NULL) &&
! 		(xmlOutputCallbackTable[i].matchcallback(normalized) != 0)) {
  #if defined(LIBXML_HTTP_ENABLED) && defined(HAVE_ZLIB_H)
  		/*  Need to pass compression parameter into HTTP open calls  */
  		if (xmlOutputCallbackTable[i].matchcallback == xmlIOHTTPMatch)
--- 1833,1845 ----
  		    ret->writecallback = xmlGzfileWrite;
  		    ret->closecallback = xmlGzfileClose;
  		}
  		return(ret);
  	    }
  	}
  #endif
  	for (i = xmlOutputCallbackNr - 1;i >= 0;i--) {
  	    if ((xmlOutputCallbackTable[i].matchcallback != NULL) &&
! 		(xmlOutputCallbackTable[i].matchcallback(URI) != 0)) {
  #if defined(LIBXML_HTTP_ENABLED) && defined(HAVE_ZLIB_H)
  		/*  Need to pass compression parameter into HTTP open calls  */
  		if (xmlOutputCallbackTable[i].matchcallback == xmlIOHTTPMatch)
***************
*** 1927,1933 ****
  	    }
  	}
      }
-     xmlFree(normalized);
  
      if (context == NULL) {
  	return(NULL);
--- 1852,1857 ----
Index: include/libxml/uri.h
===================================================================
RCS file: /cvs/gnome/gnome-xml/include/libxml/uri.h,v
retrieving revision 1.14
diff -c -r1.14 uri.h
*** include/libxml/uri.h	12 Mar 2002 18:46:39 -0000	1.14
--- include/libxml/uri.h	17 Feb 2003 23:03:06 -0000
***************
*** 60,65 ****
--- 60,66 ----
  int		xmlNormalizeURIPath	(char *path);
  xmlChar *	xmlURIEscape		(const xmlChar *str);
  void		xmlFreeURI		(xmlURIPtr uri);
+ xmlChar*	xmlURIFromPath		(const xmlChar *path);
  
  #ifdef __cplusplus
  }
Index: include/libxml/xmlIO.h
===================================================================
RCS file: /cvs/gnome/gnome-xml/include/libxml/xmlIO.h,v
retrieving revision 1.40
diff -c -r1.40 xmlIO.h
*** include/libxml/xmlIO.h	8 Nov 2002 17:16:02 -0000	1.40
--- include/libxml/xmlIO.h	17 Feb 2003 23:03:06 -0000
***************
*** 238,246 ****
  xmlParserInputPtr xmlNoNetExternalEntityLoader(const char *URL,
  					 const char *ID,
  					 xmlParserCtxtPtr ctxt);
- 
- xmlChar *xmlNormalizeWindowsPath	(const xmlChar *path);
- 
  int	xmlCheckFilename		(const char *path);
  /**
   * Default 'file://' protocol callbacks 
--- 238,243 ----
Index: win32/libxml2.def.src
===================================================================
RCS file: /cvs/gnome/gnome-xml/win32/libxml2.def.src,v
retrieving revision 1.21
diff -c -r1.21 libxml2.def.src
*** win32/libxml2.def.src	8 Feb 2003 17:54:08 -0000	1.21
--- win32/libxml2.def.src	17 Feb 2003 23:03:08 -0000
***************
*** 714,719 ****
--- 714,720 ----
  	xmlRelaxNGParse
  	xmlRelaxNGFree
  	xmlRelaxNGDump
+ 	xmlRelaxNGDumpTree
  	
  	/* Interfaces for validating */
  	xmlRelaxNGSetValidErrors
***************
*** 957,962 ****
--- 958,964 ----
  	xmlNormalizeURIPath
  	xmlURIEscape
  	xmlFreeURI
+ 	xmlURIFromPath
  
  
  /* valid.h
***************
*** 1136,1142 ****
  	#endif /* LIBXML_HTTP_ENABLED */
  	xmlNoNetExternalEntityLoader
  
- 	xmlNormalizeWindowsPath
  	xmlCheckFilename
  
  	/**
--- 1138,1143 ----


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]