[libxml2] 566012 autodetected encoding and encoding conflict

From: Daniel Veillard <veillard src gnome org>
To: svn-commits-list gnome org
Cc:
Subject: [libxml2] 566012 autodetected encoding and encoding conflict
Date: Wed, 26 Aug 2009 09:41:04 +0000 (UTC)
commit 7e385bd4e28a0cc12b6b26ed178c620e3c3ab8d8
Author: Daniel Veillard <veillard redhat com>
Date:   Wed Aug 26 11:38:49 2009 +0200

    566012 autodetected encoding and encoding conflict
    
    * encoding.c parser.c parserInternals.c: when we autodetect an encoding
      but it's actually not completely compatible with the one declared
      great care must be taken to not convert more than just the first line.
      Led to some refactoring, more private functions and a bit of cleanup.

 encoding.c        |   53 +++++++++++++++++++++------
 parser.c          |    3 +-
 parserInternals.c |  104 +++++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 133 insertions(+), 27 deletions(-)
---
diff --git a/encoding.c b/encoding.c
index 5b73fe6..ef54156 100644
--- a/encoding.c
+++ b/encoding.c
@@ -1737,24 +1737,28 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
  *		The real API used by libxml for on-the-fly conversion	*
  *									*
  ************************************************************************/
+int
+xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                       xmlBufferPtr in, int len);
 
 /**
- * xmlCharEncFirstLine:
+ * xmlCharEncFirstLineInt:
  * @handler:	char enconding transformation data structure
  * @out:  an xmlBuffer for the output.
  * @in:  an xmlBuffer for the input
- *     
+ * @len:  number of bytes to convert for the first line, or -1
+ *
  * Front-end for the encoding handler input function, but handle only
  * the very first line, i.e. limit itself to 45 chars.
- *     
- * Returns the number of byte written if success, or 
+ *
+ * Returns the number of byte written if success, or
  *     -1 general error
  *     -2 if the transcoding fails (for *in is not valid utf8 string or
  *        the result of transformation can't fit into the encoding we want), or
  */
 int
-xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
-                 xmlBufferPtr in) {
+xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                       xmlBufferPtr in, int len) {
     int ret = -2;
     int written;
     int toconv;
@@ -1771,9 +1775,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
      * 45 chars should be sufficient to reach the end of the encoding
      * declaration without going too far inside the document content.
      * on UTF-16 this means 90bytes, on UCS4 this means 180
+     * The actual value depending on guessed encoding is passed as @len
+     * if provided
      */
-    if (toconv > 180)
-	toconv  = 180;
+    if (len >= 0) {
+        if (toconv > len)
+            toconv = len;
+    } else {
+        if (toconv > 180)
+            toconv = 180;
+    }
     if (toconv * 2 >= written) {
         xmlBufferGrow(out, toconv);
 	written = out->size - out->use - 1;
@@ -1828,14 +1839,34 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 }
 
 /**
+ * xmlCharEncFirstLine:
+ * @handler:	char enconding transformation data structure
+ * @out:  an xmlBuffer for the output.
+ * @in:  an xmlBuffer for the input
+ *
+ * Front-end for the encoding handler input function, but handle only
+ * the very first line, i.e. limit itself to 45 chars.
+ *
+ * Returns the number of byte written if success, or
+ *     -1 general error
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                 xmlBufferPtr in) {
+    return(xmlCharEncFirstLineInt(handler, out, in, -1));
+}
+
+/**
  * xmlCharEncInFunc:
  * @handler:	char encoding transformation data structure
  * @out:  an xmlBuffer for the output.
  * @in:  an xmlBuffer for the input
- *     
+ *
  * Generic front-end for the encoding handler input function
- *     
- * Returns the number of byte written if success, or 
+ *
+ * Returns the number of byte written if success, or
  *     -1 general error
  *     -2 if the transcoding fails (for *in is not valid utf8 string or
  *        the result of transformation can't fit into the encoding we want), or
diff --git a/parser.c b/parser.c
index 0a10b0f..0d856b7 100644
--- a/parser.c
+++ b/parser.c
@@ -10109,8 +10109,9 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
 
     /*
      * Check for the XMLDecl in the Prolog.
+     * do not GROW here to avoid the detected encoder to decode more
+     * than just the first line
      */
-    GROW;
     if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) {
 
 	/*
diff --git a/parserInternals.c b/parserInternals.c
index a42e5ed..ff20435 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -945,6 +945,17 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
  *									*
  ************************************************************************/
 
+/* defined in encoding.c, not public */
+int
+xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                       xmlBufferPtr in, int len);
+
+static int
+xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
+                       xmlCharEncodingHandlerPtr handler, int len);
+static int
+xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
+                          xmlCharEncodingHandlerPtr handler, int len);
 /**
  * xmlSwitchEncoding:
  * @ctxt:  the parser context
@@ -959,6 +970,7 @@ int
 xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 {
     xmlCharEncodingHandlerPtr handler;
+    int len = -1;
 
     if (ctxt == NULL) return(-1);
     switch (enc) {
@@ -1002,9 +1014,33 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
             (ctxt->input->cur[2] == 0xBF)) {
             ctxt->input->cur += 3;
         }
-	break ;
-	default:
-	    break;
+        len = 90;
+	break;
+    case XML_CHAR_ENCODING_UCS2:
+        len = 90;
+	break;
+    case XML_CHAR_ENCODING_UCS4BE:
+    case XML_CHAR_ENCODING_UCS4LE:
+    case XML_CHAR_ENCODING_UCS4_2143:
+    case XML_CHAR_ENCODING_UCS4_3412:
+        len = 180;
+	break;
+    case XML_CHAR_ENCODING_EBCDIC:
+    case XML_CHAR_ENCODING_8859_1:
+    case XML_CHAR_ENCODING_8859_2:
+    case XML_CHAR_ENCODING_8859_3:
+    case XML_CHAR_ENCODING_8859_4:
+    case XML_CHAR_ENCODING_8859_5:
+    case XML_CHAR_ENCODING_8859_6:
+    case XML_CHAR_ENCODING_8859_7:
+    case XML_CHAR_ENCODING_8859_8:
+    case XML_CHAR_ENCODING_8859_9:
+    case XML_CHAR_ENCODING_ASCII:
+    case XML_CHAR_ENCODING_2022_JP:
+    case XML_CHAR_ENCODING_SHIFT_JIS:
+    case XML_CHAR_ENCODING_EUC_JP:
+        len = 45;
+	break;
     }
     handler = xmlGetCharEncodingHandler(enc);
     if (handler == NULL) {
@@ -1095,7 +1131,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
     if (handler == NULL)
 	return(-1);
     ctxt->charset = XML_CHAR_ENCODING_UTF8;
-    return(xmlSwitchToEncoding(ctxt, handler));
+    return(xmlSwitchToEncodingInt(ctxt, handler, len));
 }
 
 /**
@@ -1103,15 +1139,16 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
  * @ctxt:  the parser context
  * @input:  the input stream
  * @handler:  the encoding handler
+ * @len:  the number of bytes to convert for the first line or -1
  *
  * change the input functions when discovering the character encoding
  * of a given entity.
  *
  * Returns 0 in case of success, -1 otherwise
  */
-int
-xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
-                       xmlCharEncodingHandlerPtr handler)
+static int
+xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
+                          xmlCharEncodingHandlerPtr handler, int len)
 {
     int nbchars;
 
@@ -1208,9 +1245,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
                  * parsed with the autodetected encoding
                  * into the parser reading buffer.
                  */
-                nbchars = xmlCharEncFirstLine(input->buf->encoder,
-                                              input->buf->buffer,
-                                              input->buf->raw);
+                nbchars = xmlCharEncFirstLineInt(input->buf->encoder,
+                                                 input->buf->buffer,
+                                                 input->buf->raw,
+                                                 len);
             }
             if (nbchars < 0) {
                 xmlErrInternal(ctxt,
@@ -1236,8 +1274,9 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
 }
 
 /**
- * xmlSwitchToEncoding:
+ * xmlSwitchInputEncoding:
  * @ctxt:  the parser context
+ * @input:  the input stream
  * @handler:  the encoding handler
  *
  * change the input functions when discovering the character encoding
@@ -1246,13 +1285,32 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  * Returns 0 in case of success, -1 otherwise
  */
 int
-xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) 
-{
+xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
+                          xmlCharEncodingHandlerPtr handler) {
+    return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
+}
+
+/**
+ * xmlSwitchToEncodingInt:
+ * @ctxt:  the parser context
+ * @handler:  the encoding handler
+ * @len: the lenght to convert or -1
+ *
+ * change the input functions when discovering the character encoding
+ * of a given entity, and convert only @len bytes of the output, this
+ * is needed on auto detect to allows any declared encoding later to
+ * convert the actual content after the xmlDecl
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+static int
+xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
+                       xmlCharEncodingHandlerPtr handler, int len) {
     int ret = 0;
 
     if (handler != NULL) {
         if (ctxt->input != NULL) {
-	    ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
+	    ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
 	} else {
 	    xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
 	                   NULL);
@@ -1262,11 +1320,27 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
 	 * The parsing is now done in UTF8 natively
 	 */
 	ctxt->charset = XML_CHAR_ENCODING_UTF8;
-    } else 
+    } else
 	return(-1);
     return(ret);
 }
 
+/**
+ * xmlSwitchToEncoding:
+ * @ctxt:  the parser context
+ * @handler:  the encoding handler
+ *
+ * change the input functions when discovering the character encoding
+ * of a given entity.
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+int
+xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) 
+{
+    return (xmlSwitchToEncodingInt(ctxt, handler, -1));
+}
+
 /************************************************************************
  *									*
  *	Commodity functions to handle entities processing		*
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]