[libxml2] Adding new encoding function to deal with the new structures



commit 18d0db25037978a6a274f55a2c058ad82331965a
Author: Daniel Veillard <veillard redhat com>
Date:   Fri Jul 13 19:51:15 2012 +0800

    Adding new encoding function to deal with the new structures
    
    * encoding.c: adds xmlCharEncFirstLineInput, xmlCharEncInput and
      xmlCharEncOutput
    * enc.h: the functions are not made public but added to this new header

 enc.h      |   29 ++++
 encoding.c |  483 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 508 insertions(+), 4 deletions(-)
---
diff --git a/enc.h b/enc.h
new file mode 100644
index 0000000..8b4fcd0
--- /dev/null
+++ b/enc.h
@@ -0,0 +1,29 @@
+/*
+ * enc.h: Internal Interfaces for encoding in libxml2
+ *
+ * See Copyright for the status of this software.
+ *
+ * daniel veillard com
+ */
+
+#ifndef __XML_ENC_H__
+#define __XML_ENC_H__
+
+#include <libxml/tree.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                           xmlBufferPtr in, int len);
+int xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len);
+int xmlCharEncInput(xmlParserInputBufferPtr input);
+int xmlCharEncOutput(xmlOutputBufferPtr output, int init);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __XML_ENC_H__ */
+
+
diff --git a/encoding.c b/encoding.c
index d486dd6..e7f563f 100644
--- a/encoding.c
+++ b/encoding.c
@@ -24,6 +24,7 @@
 #include "libxml.h"
 
 #include <string.h>
+#include <limits.h>
 
 #ifdef HAVE_CTYPE_H
 #include <ctype.h>
@@ -44,6 +45,9 @@
 #include <libxml/globals.h>
 #include <libxml/xmlerror.h>
 
+#include "buf.h"
+#include "enc.h"
+
 static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
 static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
 
@@ -1897,9 +1901,6 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
  *		The real API used by libxml for on-the-fly conversion	*
  *									*
  ************************************************************************/
-int
-xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
-                       xmlBufferPtr in, int len);
 
 /**
  * xmlCharEncFirstLineInt:
@@ -1946,7 +1947,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
             toconv = 180;
     }
     if (toconv * 2 >= written) {
-        xmlBufferGrow(out, toconv);
+        xmlBufferGrow(out, toconv * 2);
 	written = out->size - out->use - 1;
     }
 
@@ -2029,6 +2030,251 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 }
 
 /**
+ * xmlCharEncInput:
+ * @input: a parser input buffer
+ * @len:  number of bytes to convert for the first line, or -1
+ *
+ * Front-end for the encoding handler input function, but handle only
+ * the very first line. Point is that this is based on autodetection
+ * of the encoding and once that first line is converted we may find
+ * out that a different decoder is needed to process the input.
+ *
+ * Returns the number of byte written if success, or
+ *     -1 general error
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
+{
+    int ret = -2;
+    size_t written;
+    size_t toconv;
+    int c_in;
+    int c_out;
+    xmlBufPtr in;
+    xmlBufPtr out;
+
+    if ((input == NULL) || (input->encoder == NULL) ||
+        (input->buffer == NULL) || (input->raw == NULL))
+        return (-1);
+    out = input->buffer;
+    in = input->raw;
+
+    toconv = xmlBufUse(in);
+    if (toconv == 0)
+        return (0);
+    written = xmlBufAvail(out) - 1; /* count '\0' */
+    /*
+     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
+     * 45 chars should be sufficient to reach the end of the encoding
+     * declaration without going too far inside the document content.
+     * on UTF-16 this means 90bytes, on UCS4 this means 180
+     * The actual value depending on guessed encoding is passed as @len
+     * if provided
+     */
+    if (len >= 0) {
+        if (toconv > (unsigned int) len)
+            toconv = len;
+    } else {
+        if (toconv > 180)
+            toconv = 180;
+    }
+    if (toconv * 2 >= written) {
+        xmlBufGrow(out, toconv * 2);
+        written = xmlBufAvail(out) - 1;
+    }
+    if (written > 360)
+        written = 360;
+
+    c_in = toconv;
+    c_out = written;
+    if (input->encoder->input != NULL) {
+        ret = input->encoder->input(xmlBufEnd(out), &c_out,
+                                    xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (input->encoder->iconv_in != NULL) {
+        ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (input->encoder->uconv_in != NULL) {
+        ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
+    switch (ret) {
+        case 0:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                            "converted %d bytes to %d bytes of input\n",
+                            c_in, c_out);
+#endif
+            break;
+        case -1:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                         "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -3:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                        "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -2: {
+            char buf[50];
+            const xmlChar *content = xmlBufContent(in);
+
+	    snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
+		     content[0], content[1],
+		     content[2], content[3]);
+	    buf[49] = 0;
+	    xmlEncodingErr(XML_I18N_CONV_FAILED,
+		    "input conversion failed due to input error, bytes %s\n",
+		           buf);
+        }
+    }
+    /*
+     * Ignore when input buffer is not on a boundary
+     */
+    if (ret == -3) ret = 0;
+    if (ret == -1) ret = 0;
+    return(ret);
+}
+
+/**
+ * xmlCharEncInput:
+ * @input: a parser input buffer
+ *
+ * Generic front-end for the encoding handler on parser input, converting
+ * up to 64KB of @input->raw into @input->buffer per call.
+ *
+ * Returns the number of byte written if any (even when an error followed),
+ *     otherwise 0 if nothing was converted, -1 general error, or
+ *     -2 if the transcoding fails (reported) or no input function is set
+ */
+int
+xmlCharEncInput(xmlParserInputBufferPtr input)
+{
+    int ret = -2;
+    size_t written;
+    size_t toconv;
+    int c_in;
+    int c_out;
+    xmlBufPtr in;
+    xmlBufPtr out;
+
+    if ((input == NULL) || (input->encoder == NULL) ||
+        (input->buffer == NULL) || (input->raw == NULL))
+        return (-1);
+    out = input->buffer;
+    in = input->raw;
+
+    toconv = xmlBufUse(in);
+    if (toconv == 0)
+        return (0);
+    if (toconv > 64 * 1024)
+        toconv = 64 * 1024;
+    written = xmlBufAvail(out);
+    if (written > 0)
+        written--; /* count '\0' */
+    if (toconv * 2 >= written) {
+        xmlBufGrow(out, toconv * 2);
+        written = xmlBufAvail(out);
+        if (written > 0)
+            written--; /* count '\0' */
+    }
+    if (written > 128 * 1024)
+        written = 128 * 1024;
+
+    c_in = toconv;
+    c_out = written;
+    if (input->encoder->input != NULL) {
+        ret = input->encoder->input(xmlBufEnd(out), &c_out,
+                                    xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (input->encoder->iconv_in != NULL) {
+        ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (input->encoder->uconv_in != NULL) {
+        ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
+    else {
+        /* no usable input function: nothing was consumed nor produced */
+        c_in = 0;
+        c_out = 0;
+    }
+    switch (ret) {
+        case 0:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                            "converted %d bytes to %d bytes of input\n",
+                            c_in, c_out);
+#endif
+            break;
+        case -1:
+        case -3:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                         "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -2: {
+            char buf[50];
+            const xmlChar *content = xmlBufContent(in);
+
+	    snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
+		     content[0], content[1],
+		     content[2], content[3]);
+	    buf[49] = 0;
+	    xmlEncodingErr(XML_I18N_CONV_FAILED,
+		    "input conversion failed due to input error, bytes %s\n",
+		           buf);
+            break;
+        }
+    }
+    /*
+     * Ignore when input buffer is not on a boundary
+     */
+    if (ret == -3)
+        ret = 0;
+    return (c_out? c_out : ret);
+}
+
+/**
  * xmlCharEncInFunc:
  * @handler:	char encoding transformation data structure
  * @out:  an xmlBuffer for the output.
@@ -2136,6 +2382,235 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
 }
 
 /**
+ * xmlCharEncOutput:
+ * @input: a parser input buffer
+ * @init: is this an initialization call without data
+ *
+ * Generic front-end for the encoding handler on parser output
+ * a first call with @init == 1 has to be made first to initiate the
+ * output in case of non-stateless encoding needing to initiate their
+ * state or the output (like the BOM in UTF16).
+ * In case of UTF8 sequence conversion errors for the given encoder,
+ * the content will be automatically remapped to a CharRef sequence.
+ *
+ * Returns the number of byte written if success, or
+ *     -1 general error
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncOutput(xmlOutputBufferPtr output, int init)
+{
+    int ret = -2;
+    size_t written;
+    size_t writtentot = 0;
+    size_t toconv;
+    int c_in;
+    int c_out;
+    xmlBufPtr in;
+    xmlBufPtr out;
+    int charref_len = 0;
+
+    if ((output == NULL) || (output->encoder == NULL) ||
+        (output->buffer == NULL) || (output->conv == NULL))
+        return (-1);
+    out = output->conv;
+    in = output->buffer;
+
+retry:
+
+    written = xmlBufAvail(out);
+    if (written > 0)
+        written--; /* count '\0' */
+
+    /*
+     * First specific handling of the initialization call
+     */
+    if (init) {
+        c_in = 0;
+        c_out = written;
+        if (output->encoder->output != NULL) {
+            ret = output->encoder->output(xmlBufEnd(out), &c_out,
+                                          NULL, &c_in);
+            if (ret > 0) /* Gennady: check return value */
+                xmlBufAddLen(out, c_out);
+        }
+#ifdef LIBXML_ICONV_ENABLED
+        else if (output->encoder->iconv_out != NULL) {
+            ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out),
+                                  &c_out, NULL, &c_in);
+            xmlBufAddLen(out, c_out);
+        }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+        else if (output->encoder->uconv_out != NULL) {
+            ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out),
+                                  &c_out, NULL, &c_in);
+            xmlBufAddLen(out, c_out);
+        }
+#endif /* LIBXML_ICU_ENABLED */
+#ifdef DEBUG_ENCODING
+	xmlGenericError(xmlGenericErrorContext,
+		"initialized encoder\n");
+#endif
+        return(0);
+    }
+
+    /*
+     * Conversion itself.
+     */
+    toconv = xmlBufUse(in);
+    if (toconv == 0)
+        return (0);
+    if (toconv > 64 * 1024)
+        toconv = 64 * 1024;
+    if (toconv * 4 >= written) {
+        xmlBufGrow(out, toconv * 4);
+        written = xmlBufAvail(out) - 1;
+    }
+    if (written > 256 * 1024)
+        written = 256 * 1024;
+
+    c_in = toconv;
+    c_out = written;
+    if (output->encoder->output != NULL) {
+        ret = output->encoder->output(xmlBufEnd(out), &c_out,
+                                      xmlBufContent(in), &c_in);
+        if (c_out > 0) {
+            xmlBufShrink(in, c_in);
+            xmlBufAddLen(out, c_out);
+            writtentot += c_out;
+        }
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (output->encoder->iconv_out != NULL) {
+        ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        writtentot += c_out;
+        if (ret == -1) {
+            if (c_out > 0) {
+                /*
+                 * Can be a limitation of iconv
+                 */
+                charref_len = 0;
+                goto retry;
+            }
+            ret = -3;
+        }
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (output->encoder->uconv_out != NULL) {
+        ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        writtentot += c_out;
+        if (ret == -1) {
+            if (c_out > 0) {
+                /*
+                 * Can be a limitation of uconv
+                 */
+                charref_len = 0;
+                goto retry;
+            }
+            ret = -3;
+        }
+    }
+#endif /* LIBXML_ICU_ENABLED */
+    else {
+        xmlEncodingErr(XML_I18N_NO_OUTPUT,
+                       "xmlCharEncOutFunc: no output function !\n", NULL);
+        return(-1);
+    }
+
+    if (ret >= 0) output += ret;
+
+    /*
+     * Attempt to handle error cases
+     */
+    switch (ret) {
+        case 0:
+#ifdef DEBUG_ENCODING
+	    xmlGenericError(xmlGenericErrorContext,
+		    "converted %d bytes to %d bytes of output\n",
+	            c_in, c_out);
+#endif
+	    break;
+        case -1:
+#ifdef DEBUG_ENCODING
+	    xmlGenericError(xmlGenericErrorContext,
+		    "output conversion failed by lack of space\n");
+#endif
+	    break;
+        case -3:
+#ifdef DEBUG_ENCODING
+	    xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
+	            c_in, c_out, (int) xmlBufUse(in));
+#endif
+	    break;
+        case -2: {
+	    int len = (int) xmlBufUse(in);
+            xmlChar *content = xmlBufContent(in);
+	    int cur;
+
+	    cur = xmlGetUTF8Char(content, &len);
+	    if ((charref_len != 0) && (c_out < charref_len)) {
+		/*
+		 * We attempted to insert a character reference and failed.
+		 * Undo what was written and skip the remaining charref.
+		 */
+                xmlBufErase(out, c_out);
+		writtentot -= c_out;
+		xmlBufShrink(in, charref_len - c_out);
+		charref_len = 0;
+
+		ret = -1;
+                break;
+	    } else if (cur > 0) {
+		xmlChar charref[20];
+
+#ifdef DEBUG_ENCODING
+		xmlGenericError(xmlGenericErrorContext,
+			"handling output conversion error\n");
+		xmlGenericError(xmlGenericErrorContext,
+			"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+			content[0], content[1],
+			content[2], content[3]);
+#endif
+		/*
+		 * Removes the UTF8 sequence, and replace it by a charref
+		 * and continue the transcoding phase, hoping the error
+		 * did not mangle the encoder state.
+		 */
+		charref_len = snprintf((char *) &charref[0], sizeof(charref),
+				 "&#%d;", cur);
+		xmlBufShrink(in, len);
+		xmlBufAddHead(in, charref, -1);
+
+		goto retry;
+	    } else {
+		char buf[50];
+
+		snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
+			 content[0], content[1],
+			 content[2], content[3]);
+		buf[49] = 0;
+		xmlEncodingErr(XML_I18N_CONV_FAILED,
+		    "output conversion failed due to conv error, bytes %s\n",
+			       buf);
+		if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE)
+		    content[0] = ' ';
+	    }
+	    break;
+	}
+    }
+    return(ret);
+}
+
+/**
  * xmlCharEncOutFunc:
  * @handler:	char enconding transformation data structure
  * @out:  an xmlBuffer for the output.



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]