[libxml2] Clean up encoding switching code



commit aab584dc3127edf383c3d1c6acda2b83353a7150
Author: Nick Wellnhofer <wellnhofer aevum de>
Date:   Sun Mar 6 23:23:43 2022 +0100

    Clean up encoding switching code
    
    - Remove xmlSwitchToEncodingInt which was basically just a wrapper
      around xmlSwitchInputEncodingInt.
    - Simplify xmlSwitchEncoding.
    - Improve error handling in xmlSwitchInputEncodingInt.
    - Deprecate xmlSwitchInputEncoding.

 include/libxml/parserInternals.h |   1 +
 parserInternals.c                | 150 ++++++---------------------------------
 2 files changed, 24 insertions(+), 127 deletions(-)
---
diff --git a/include/libxml/parserInternals.h b/include/libxml/parserInternals.h
index 0615f084..656ee462 100644
--- a/include/libxml/parserInternals.h
+++ b/include/libxml/parserInternals.h
@@ -339,6 +339,7 @@ XMLPUBFUN int XMLCALL
 XMLPUBFUN int XMLCALL
                        xmlSwitchToEncoding     (xmlParserCtxtPtr ctxt,
                                         xmlCharEncodingHandlerPtr handler);
+XML_DEPRECATED
 XMLPUBFUN int XMLCALL
                        xmlSwitchInputEncoding  (xmlParserCtxtPtr ctxt,
                                                 xmlParserInputPtr input,
diff --git a/parserInternals.c b/parserInternals.c
index a68a32fa..31d94680 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -866,9 +866,6 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
  ************************************************************************/
 
 static int
-xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
-                       xmlCharEncodingHandlerPtr handler, int len);
-static int
 xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
                           xmlCharEncodingHandlerPtr handler, int len);
 /**
@@ -968,55 +965,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
                /* default encoding, no conversion should be needed */
                ctxt->charset = XML_CHAR_ENCODING_UTF8;
                return(0);
-           case XML_CHAR_ENCODING_UTF16LE:
-               break;
-           case XML_CHAR_ENCODING_UTF16BE:
-               break;
-           case XML_CHAR_ENCODING_UCS4LE:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "USC4 little endian", NULL);
-               break;
-           case XML_CHAR_ENCODING_UCS4BE:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "USC4 big endian", NULL);
-               break;
-           case XML_CHAR_ENCODING_EBCDIC:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "EBCDIC", NULL);
-               break;
-           case XML_CHAR_ENCODING_UCS4_2143:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "UCS4 2143", NULL);
-               break;
-           case XML_CHAR_ENCODING_UCS4_3412:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "UCS4 3412", NULL);
-               break;
-           case XML_CHAR_ENCODING_UCS2:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "UCS2", NULL);
-               break;
            case XML_CHAR_ENCODING_8859_1:
-           case XML_CHAR_ENCODING_8859_2:
-           case XML_CHAR_ENCODING_8859_3:
-           case XML_CHAR_ENCODING_8859_4:
-           case XML_CHAR_ENCODING_8859_5:
-           case XML_CHAR_ENCODING_8859_6:
-           case XML_CHAR_ENCODING_8859_7:
-           case XML_CHAR_ENCODING_8859_8:
-           case XML_CHAR_ENCODING_8859_9:
-               /*
-                * We used to keep the internal content in the
-                * document encoding however this turns being unmaintainable
-                * So xmlGetCharEncodingHandler() will return non-null
-                * values for this now.
-                */
                if ((ctxt->inputNr == 1) &&
                    (ctxt->encoding == NULL) &&
                    (ctxt->input != NULL) &&
@@ -1025,36 +974,20 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
                }
                ctxt->charset = enc;
                return(0);
-           case XML_CHAR_ENCODING_2022_JP:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "ISO-2022-JP", NULL);
-               break;
-           case XML_CHAR_ENCODING_SHIFT_JIS:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "Shift_JIS", NULL);
-               break;
-           case XML_CHAR_ENCODING_EUC_JP:
-               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
-                              "encoding not supported %s\n",
-                              BAD_CAST "EUC-JP", NULL);
-               break;
            default:
-               break;
-       }
-    }
-    /*
-     * TODO: We could recover from errors in external entities if we
-     * didn't stop the parser. But most callers of this function don't
-     * check the return value.
-     */
-    if (handler == NULL) {
-        xmlStopParser(ctxt);
-       return(-1);
+               __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
+                        "encoding not supported: %s\n",
+                       BAD_CAST xmlGetCharEncodingName(enc), NULL);
+                /*
+                 * TODO: We could recover from errors in external entities
+                 * if we didn't stop the parser. But most callers of this
+                 * function don't check the return value.
+                 */
+                xmlStopParser(ctxt);
+                return(-1);
+        }
     }
-    ctxt->charset = XML_CHAR_ENCODING_UTF8;
-    ret = xmlSwitchToEncodingInt(ctxt, handler, len);
+    ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
     if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
         /*
         * on encoding conversion errors, stop the parser
@@ -1066,7 +999,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 }
 
 /**
- * xmlSwitchInputEncoding:
+ * xmlSwitchInputEncodingInt:
  * @ctxt:  the parser context
  * @input:  the input stream
  * @handler:  the encoding handler
@@ -1088,6 +1021,8 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
     if (input == NULL)
         return (-1);
     if (input->buf != NULL) {
+       ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
         if (input->buf->encoder != NULL) {
             /*
              * Check in case the auto encoding detection triggered
@@ -1191,12 +1126,9 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
            input->buf->rawconsumed += use - xmlBufUse(input->buf->raw);
         }
         return (0);
-    } else if (input->length == 0) {
-       /*
-        * When parsing a static memory array one must know the
-        * size to be able to convert the buffer.
-        */
-       xmlErrInternal(ctxt, "switching encoding : no input\n", NULL);
+    } else {
+       xmlErrInternal(ctxt,
+                "static memory buffer doesn't support encoding\n", NULL);
         /*
          * Callers assume that the input buffer takes ownership of the
          * encoding handler. xmlCharEncCloseFunc frees unregistered
@@ -1205,11 +1137,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
         xmlCharEncCloseFunc(handler);
        return (-1);
     }
-    /*
-     * We should actually raise an error here, see issue #34.
-     */
-    xmlCharEncCloseFunc(handler);
-    return (0);
 }
 
 /**
@@ -1218,6 +1145,8 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  * @input:  the input stream
  * @handler:  the encoding handler
  *
+ * DEPRECATED: Use xmlSwitchToEncoding
+ *
  * change the input functions when discovering the character encoding
  * of a given entity.
  *
@@ -1229,41 +1158,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
     return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
 }
 
-/**
- * xmlSwitchToEncodingInt:
- * @ctxt:  the parser context
- * @handler:  the encoding handler
- * @len: the length to convert or -1
- *
- * change the input functions when discovering the character encoding
- * of a given entity, and convert only @len bytes of the output, this
- * is needed on auto detect to allows any declared encoding later to
- * convert the actual content after the xmlDecl
- *
- * Returns 0 in case of success, -1 otherwise
- */
-static int
-xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
-                       xmlCharEncodingHandlerPtr handler, int len) {
-    int ret = 0;
-
-    if (handler != NULL) {
-        if (ctxt->input != NULL) {
-           ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
-       } else {
-           xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
-                          NULL);
-           return(-1);
-       }
-       /*
-        * The parsing is now done in UTF8 natively
-        */
-       ctxt->charset = XML_CHAR_ENCODING_UTF8;
-    } else
-       return(-1);
-    return(ret);
-}
-
 /**
  * xmlSwitchToEncoding:
  * @ctxt:  the parser context
@@ -1277,7 +1171,9 @@ xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
 int
 xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
 {
-    return (xmlSwitchToEncodingInt(ctxt, handler, -1));
+    if (ctxt == NULL)
+        return(-1);
+    return(xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, -1));
 }
 
 /************************************************************************


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]