commit 0ad5bfe98c8b76d52db905a847d48187d75d15d1 Author: Olli Pottonen Date: Tue Jul 7 20:29:50 2015 +1000 Detect fatal encoding errors. diff --git a/HTMLparser.c b/HTMLparser.c index d329d3b..8717d0b 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -3507,7 +3507,6 @@ htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { return; if (encoding != NULL) { - xmlCharEncoding enc; xmlCharEncodingHandlerPtr handler; while ((*encoding == ' ') || (*encoding == '\t')) encoding++; @@ -3516,37 +3515,16 @@ htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { xmlFree((xmlChar *) ctxt->input->encoding); ctxt->input->encoding = xmlStrdup(encoding); - enc = xmlParseCharEncoding((const char *) encoding); - /* - * registered set of known encodings - */ - if (enc != XML_CHAR_ENCODING_ERROR) { - if (((enc == XML_CHAR_ENCODING_UTF16LE) || - (enc == XML_CHAR_ENCODING_UTF16BE) || - (enc == XML_CHAR_ENCODING_UCS4LE) || - (enc == XML_CHAR_ENCODING_UCS4BE)) && - (ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) { - htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, - "htmlCheckEncoding: wrong encoding meta\n", - NULL, NULL); - } else { - xmlSwitchEncoding(ctxt, enc); - } - ctxt->charset = XML_CHAR_ENCODING_UTF8; + handler = xmlFindCharEncodingHandler((const char *) encoding); + + if (handler == NULL || !xmlEncHandlerAsciiCompatible(handler)) { + htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, + "htmlCheckEncoding: wrong encoding meta %s\n", + encoding, NULL); } else { - /* - * fallback for unknown encodings - */ - handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - xmlSwitchToEncoding(ctxt, handler); - ctxt->charset = XML_CHAR_ENCODING_UTF8; - } else { - htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "htmlCheckEncoding: unknown encoding %s\n", - encoding, NULL); - } + xmlSwitchToEncoding(ctxt, handler); + xmlFree((xmlChar *) ctxt->encoding); + ctxt->encoding = xmlStrdup(encoding); } if 
((ctxt->input->buf != NULL) && diff --git a/encoding.c b/encoding.c index 3f89fd2..3f19d71 100644 --- a/encoding.c +++ b/encoding.c @@ -949,7 +949,8 @@ xmlDetectCharEncoding(const unsigned char* in, int len) return(XML_CHAR_ENCODING_EBCDIC); if ((in[0] == 0x3C) && (in[1] == 0x3F) && (in[2] == 0x78) && (in[3] == 0x6D)) - return(XML_CHAR_ENCODING_UTF8); + /* Something ascii compatible, do not know what. */ + return(XML_CHAR_ENCODING_NONE); /* * Although not part of the recommendation, we also * attempt an "auto-recognition" of UTF-16LE and @@ -1275,6 +1276,178 @@ xmlGetCharEncodingName(xmlCharEncoding enc) { return(NULL); } +/** + * xmlCompatibleEncodings: + * @fname: name of encoding as determined by xmlDetectCharEncoding() + * and xmlSwitchEncoding() + * @sname: name of encoding indicated in the XML declaration + * + * Helper function for xmlParseEncodingDecl() for determining + * whether the document really is in the declared encoding. + */ +xmlEncodingCompatibility +xmlCompatibleEncodings(const xmlChar *fname, const xmlChar *sname) +{ + xmlChar upper[20]; + int i; + for (i = 0;i < 19;i++) { + upper[i] = toupper(sname[i]); + if (upper[i] == 0) break; + } + upper[i] = 0; + + if (fname == NULL) { + /* fname == NULL indicates some yet unknown ascii-compatible encoding */ + if (!xmlStrcmp(upper, BAD_CAST "UTF-8") || + !xmlStrcmp(upper, BAD_CAST "UTF8") || + !xmlStrcmp(upper, BAD_CAST "CSUTF8")) + /* The default, which is used when fname == NULL, + * is UTF-8, same as sname. */ + return(XML_ENC_COMP_OK); + else + /* Don't know, let xmlParseEncodingDecl() figure it out. */ + return(XML_ENC_COMP_UNKNOWN); + } + + /* + * If xmlDetectCharEncoding() said it is UTF-8, that is not a + * preliminary guess, but a certain conclusion based on presence + * of a BOM. Then only valid declaration is UTF-8. 
+ */ + if (!xmlStrcmp(fname, BAD_CAST "UTF-8")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-8") || + !xmlStrcmp(upper, BAD_CAST "UTF8") || + !xmlStrcmp(upper, BAD_CAST "CSUTF8")) + return(XML_ENC_COMP_OK); + else + return(XML_ENC_COMP_ERR); + } + + else if (!xmlStrcmp(fname, BAD_CAST "UTF-16BE")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-16BE") || + !xmlStrcmp(upper, BAD_CAST "UTF16BE") || + !xmlStrcmp(upper, BAD_CAST "CSUTF16BE")) + return(XML_ENC_COMP_OK); + else if (!xmlStrcmp(upper, BAD_CAST "UTF-16") || + !xmlStrcmp(upper, BAD_CAST "UTF16") || + !xmlStrcmp(upper, BAD_CAST "CSUTF16")) + /* UTF-16BE is equivalent to the variant of UTF-16 with no BOM. + * In XML missing BOM is an error, but not fatal. */ + return XML_ENC_COMP_BOM_MISSING; + else if (!xmlStrcmp(upper, BAD_CAST "UCS-2BE") || + !xmlStrcmp(upper, BAD_CAST "UCS-2") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-2") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-2BE") || + !xmlStrcmp(upper, BAD_CAST "UCS2BE") || + !xmlStrcmp(upper, BAD_CAST "UCS2") || + !xmlStrcmp(upper, BAD_CAST "CSUNICODE")) + /* Ok, compatible, UTF-16 and UCS-2 are almost the same. + * Not exactly the same however, this requires special handling. */ + return(XML_ENC_COMP_UCS2); + else + return(XML_ENC_COMP_ERR); + } + + else if (!xmlStrcmp(fname, BAD_CAST "UTF-16LE")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-16LE") || + !xmlStrcmp(upper, BAD_CAST "UTF16LE") || + !xmlStrcmp(upper, BAD_CAST "CSUTF16LE")) + return(XML_ENC_COMP_OK); + else if (!xmlStrcmp(upper, BAD_CAST "UCS-2LE") || + !xmlStrcmp(upper, BAD_CAST "UCS-2") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-2") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-2LE") || + !xmlStrcmp(upper, BAD_CAST "UCS2LE") || + !xmlStrcmp(upper, BAD_CAST "UCS2") || + !xmlStrcmp(upper, BAD_CAST "CSUNICODE")) + /* Ok, compatible, but requires special handling. 
*/ + return(XML_ENC_COMP_UCS2); + else + return(XML_ENC_COMP_ERR); + } + + /* + * If xmlDetectCharEncoding() said it is UTF-16, there must be a + * BOM. Then UTF-16LE, UTF-16BE which have no BOM are not compatible. + */ + else if (!xmlStrcmp(fname, BAD_CAST "UTF-16")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-16") || + !xmlStrcmp(upper, BAD_CAST "UTF16") || + !xmlStrcmp(upper, BAD_CAST "CSUTF16")) + return(XML_ENC_COMP_OK); + else if (!xmlStrcmp(upper, BAD_CAST "UCS-2") || + !xmlStrcmp(upper, BAD_CAST "UCS2") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-2")) + return(XML_ENC_COMP_UCS2); + else + return(XML_ENC_COMP_ERR); + } + + /* UTF-32 a.k.a. UCS-4 is handled almost the same as UTF-16. */ + else if (!xmlStrcmp(fname, BAD_CAST "ISO-10646-UCS-4BE")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-32BE") || + !xmlStrcmp(upper, BAD_CAST "UTF32BE") || + !xmlStrcmp(upper, BAD_CAST "UTF-32") || + !xmlStrcmp(upper, BAD_CAST "UTF32BE") || + !xmlStrcmp(upper, BAD_CAST "UTF32") || + !xmlStrcmp(upper, BAD_CAST "UCS-4BE") || + !xmlStrcmp(upper, BAD_CAST "UCS-4") || + !xmlStrcmp(upper, BAD_CAST "UCS4BE") || + !xmlStrcmp(upper, BAD_CAST "UCS4") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-4") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-4BE") || + !xmlStrcmp(upper, BAD_CAST "CSUTF32BE") || + !xmlStrcmp(upper, BAD_CAST "CSUTF32")) + return(XML_ENC_COMP_OK); + else + return(XML_ENC_COMP_ERR); + } + + else if (!xmlStrcmp(fname, BAD_CAST "ISO-10646-UCS-4LE")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-32LE") || + !xmlStrcmp(upper, BAD_CAST "UTF32LE") || + !xmlStrcmp(upper, BAD_CAST "UCS-4LE") || + !xmlStrcmp(upper, BAD_CAST "UCS4LE") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-4LE") || + !xmlStrcmp(upper, BAD_CAST "CSUTF32LE")) + return(XML_ENC_COMP_OK); + else + return(XML_ENC_COMP_ERR); + } + + /* + * If xmlDetectCharEncoding() said it is UTF-32, there must be a + * BOM. Then only valid declaration is UTF-32. 
+ */ + else if (!xmlStrcmp(fname, BAD_CAST "ISO-10646-UCS-4")) { + if (!xmlStrcmp(upper, BAD_CAST "UTF-32") || + !xmlStrcmp(upper, BAD_CAST "UTF32") || + !xmlStrcmp(upper, BAD_CAST "UCS-4") || + !xmlStrcmp(upper, BAD_CAST "UCS4") || + !xmlStrcmp(upper, BAD_CAST "ISO-10646-UCS-4") || + !xmlStrcmp(upper, BAD_CAST "CSUTF32")) + return(XML_ENC_COMP_OK); + else + return(XML_ENC_COMP_ERR); + } + + /* + * TODO: this is incomplete, there are several EBCDIC variants. + */ + else if (!xmlStrcmp(fname, BAD_CAST "EBCDIC")) { + if (!xmlStrcmp(upper, BAD_CAST "EBCDIC")) + return(XML_ENC_COMP_OK); + else + return(XML_ENC_COMP_ERR); + } + + else { + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, + "Unexpected encoding %s\n", (const char*) fname); + return(XML_ENC_COMP_ERR); + } +} + /************************************************************************ * * * Char encoding handlers * @@ -2931,6 +3104,39 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { } /** + * xmlEncHandlerAsciiCompatible: + * @handler: an XML character encoding handler + * + * This function finds out whether the handler is for an ASCII + * compatible encoding (e.g. UTF-8, ISO-8859-X, ASCII) or non-compatible + * (e.g. UTF-16, UTF-32, EBCDIC). 
+ */ +int +xmlEncHandlerAsciiCompatible(xmlCharEncodingHandler *handler) { + unsigned char test_out[11], test_in[] = "input != NULL) { + if (handler->input(test_out, &outlen, test_in, &inlen) < 0) + return(0); + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_in != NULL) { + if (xmlIconvWrapper(handler->iconv_in, test_out, + &outlen, test_in, &inlen) < 0) + return(0); + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_in != NULL) { + if (xmlUconvWrapper(handler->uconv_in, 1, test_out, + &outlen, test_in, &inlen) < 0) + return(0); + } +#endif /* LIBXML_ICU_ENABLED */ + return(outlen == 5 && inlen == 5 && !xmlStrncmp(test_in, test_out, 5)); +} + +/** * xmlByteConsumed: * @ctxt: an XML parser context * diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index cd25b2f..9121a00 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -156,6 +156,15 @@ struct _xmlCharEncodingHandler { #endif /* LIBXML_ICU_ENABLED */ }; +/* Return values of xmlCompatibleEncodings */ +typedef enum { + XML_ENC_COMP_UNKNOWN= 0, + XML_ENC_COMP_OK= 1, /* Ok, compatible */ + XML_ENC_COMP_ERR= 2, /* Fatal error, incompatible */ + XML_ENC_COMP_BOM_MISSING= 3, /* BOM missing, non-fatal error */ + XML_ENC_COMP_UCS2= 4, /* Ok, switch UTF-16 -> UCS2 */ +} xmlEncodingCompatibility; + #ifdef __cplusplus } #endif @@ -199,6 +208,10 @@ XMLPUBFUN xmlCharEncoding XMLCALL XMLPUBFUN const char * XMLCALL xmlGetCharEncodingName (xmlCharEncoding enc); +XMLPUBFUN xmlEncodingCompatibility + xmlCompatibleEncodings (const xmlChar *fname, + const xmlChar *sname); + /* * Interfaces directly used by the parsers. 
*/ @@ -222,6 +235,9 @@ XMLPUBFUN int XMLCALL XMLPUBFUN int XMLCALL xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); +XMLPUBFUN int XMLCALL + xmlEncHandlerAsciiCompatible (xmlCharEncodingHandler *handler); + /* * Export a few useful functions */ diff --git a/parser.c b/parser.c index 93f8d35..b6da7c4 100644 --- a/parser.c +++ b/parser.c @@ -10426,68 +10426,133 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); } - /* - * Non standard parsing, allowing the user to ignore encoding - */ - if (ctxt->options & XML_PARSE_IGNORE_ENC) { - xmlFree((xmlChar *) encoding); - return(NULL); - } + if (encoding == NULL) + return(NULL); /* - * UTF-16 encoding stwich has already taken place at this stage, - * more over the little-endian/big-endian selection is already done - */ - if ((encoding != NULL) && - ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-16")) || - (!xmlStrcasecmp(encoding, BAD_CAST "UTF16")))) { - /* - * If no encoding was passed to the parser, that we are - * using UTF-16 and no decoder is present i.e. the - * document is apparently UTF-8 compatible, then raise an - * encoding mismatch fatal error - */ - if ((ctxt->encoding == NULL) && - (ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) { - xmlFatalErrMsg(ctxt, XML_ERR_INVALID_ENCODING, - "Document labelled UTF-16 but has UTF-8 content\n"); - } - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = encoding; - } - /* - * UTF-8 encoding is handled natively + * XML REC Section 4.3.3: "In the absence of information + * provided by an external transport protocol (e.g. HTTP or + * MIME), it is a fatal error for an entity including an + * encoding declaration to be presented to the XML processor + * in an encoding other than that named in the declaration" + * + * The presence of information by an external protocol is + * indicated by XML_PARSE_IGNORE_ENC in ctxt->options. 
+ * + * In absence of information by external protocol, we may have + * sniffed that the content is UTF-16 or UTF-32, or UTF-8. + * This initial guess can not be completely incorrect, for we have + * successfully parsed the document so far, but it may not be perfectly + * precise, e.g. it is not always possible to distinguish between + * UTF-8 and ASCII. + * + * xmlCompatibleEncodings() tells us whether the initial + * guess and declared encoding are compatible. */ - else if ((encoding != NULL) && - ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) || - (!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) { - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = encoding; + if (ctxt->options & XML_PARSE_IGNORE_ENC) { + xmlFree((xmlChar *) encoding); + return(NULL); } - else if (encoding != NULL) { - xmlCharEncodingHandlerPtr handler; - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = encoding; + xmlCharEncodingHandlerPtr handler = NULL; + xmlEncodingCompatibility compatible = \ + xmlCompatibleEncodings(ctxt->encoding, encoding); - handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - xmlSwitchToEncoding(ctxt, handler); - } else { - xmlFatalErrMsgStr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "Unsupported encoding %s\n", encoding); + if (compatible == XML_ENC_COMP_ERR) { + xmlFatalErrMsgStr(ctxt, XML_ERR_INVALID_ENCODING, + "Incorrect encoding declaration %s\n", encoding); + xmlFree((xmlChar *) encoding); + return(NULL); + } else if (compatible == XML_ENC_COMP_BOM_MISSING) { + xmlWarningMsg(ctxt, XML_ERR_INVALID_ENCODING, + "Missing Byte Order Mark\n", NULL, NULL); + xmlFree((xmlChar *) encoding); + return(NULL); + } else if (compatible == XML_ENC_COMP_OK) { + /* Correct decoder already in use, no need to + * xmlSwitchToEncoding() + */ + xmlFree((xmlChar *) ctxt->encoding); + ctxt->encoding = encoding; + return(NULL); + } else if (compatible == XML_ENC_COMP_UCS2) 
{ + /* Switch from UTF-16 to UCS-2. Keep current UTF-16 byte + * order (big endian/little endian). + */ + if(ctxt->input->buf->encoder == NULL) { + xmlFatalErr(ctxt, XML_ERR_INTERNAL_ERROR, + "no encoder"); + xmlFree((xmlChar *) encoding); return(NULL); } + + if (!strcmp(ctxt->input->buf->encoder->name, "UTF-16LE")) { + handler = xmlFindCharEncodingHandler("UCS-2LE"); + } else if (!strcmp(ctxt->input->buf->encoder->name, "UTF-16BE")) { + handler = xmlFindCharEncodingHandler("UCS-2BE"); + } else { + xmlFatalErr(ctxt, XML_ERR_INTERNAL_ERROR, + "unexpected encoder"); + } + } else { /* compatible == XML_ENC_COMP_UNKNOWN. Ascii-like. */ + handler = xmlFindCharEncodingHandler((const char *) encoding); + } + + if (handler == NULL) { + xmlFatalErrMsgStr(ctxt, XML_ERR_INVALID_ENCODING, + "Unsupported encoding %s\n", encoding); + xmlFree((xmlChar *) encoding); + return(NULL); } + if (compatible == XML_ENC_COMP_UNKNOWN && + !xmlEncHandlerAsciiCompatible(handler)) { + xmlFatalErrMsgStr(ctxt, XML_ERR_INVALID_ENCODING, + "Document starts with ASCII but declares " + "incompatible encoding %s\n", encoding); + } + + xmlSwitchToEncoding(ctxt, handler); + xmlFree((void*) ctxt->encoding); + ctxt->encoding = encoding; } return(encoding); } /** + * checkNoEncodingDecl + * @ctxt: an XML parser context + * + * Necessary checks if there is no encoding declaration in XML + * declaration or text declaration. XML REC section 4.3.3: "Unless an + * encoding is determined by a higher-level protocol, it is also a + * fatal error if an XML entity contains no encoding declaration and + * its content is not legal UTF-8 or UTF-16." + * + * Note that UTF-16 only refers to UTF-16 (with BOM), + * UTF-16LE or UTF-16BE (without BOM) won't do: + * + * "In the absence of information provided by an external + * transport protocol (e.g. HTTP or MIME), it is a fatal error + * ... for an entity which begins with neither a Byte Order Mark + * nor an encoding declaration to use an encoding other than + * UTF-8." 
+ * + * For UTF-8, ctxt->encoding is either NULL or "UTF-8". + */ +static int checkNoEncodingDecl(xmlParserCtxtPtr ctxt) { + if ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0 && + ctxt->encoding != NULL && + xmlStrcmp(ctxt->encoding, BAD_CAST "UTF-8") && + xmlStrcmp(ctxt->encoding, BAD_CAST "UTF-16")) { + xmlFatalErrMsgStr(ctxt, XML_ERR_INVALID_ENCODING, + "Encoding declaration missing (not UTF-8 nor UTF-16 with BOM)", + NULL); + return(-1); + } + return(0); +} + +/** * xmlParseSDDecl: * @ctxt: an XML parser context * @@ -10639,6 +10704,7 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { */ if (!IS_BLANK_CH(RAW)) { if ((RAW == '?') && (NXT(1) == '>')) { + checkNoEncodingDecl(ctxt); SKIP(2); return; } @@ -10658,6 +10724,9 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { * We may have the standalone status. */ int hasEncodingDecl = (ctxt->input->cur != preEncodingCur); + if (!hasEncodingDecl && checkNoEncodingDecl(ctxt) < 0) { + return; + } if (hasEncodingDecl && (!IS_BLANK_CH(RAW))) { if ((RAW == '?') && (NXT(1) == '>')) { SKIP(2); @@ -10798,8 +10867,11 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { ctxt->standalone = ctxt->input->standalone; SKIP_BLANKS; } else { + if (checkNoEncodingDecl(ctxt) < 0) + return(-1); ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION); } + if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) ctxt->sax->startDocument(ctxt->userData); if (ctxt->instate == XML_PARSER_EOF) @@ -10977,6 +11049,8 @@ xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { } SKIP_BLANKS; } else { + if (checkNoEncodingDecl(ctxt) < 0) + return(-1); ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION); } if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) @@ -11392,6 +11466,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { ctxt->sax->endDocument(ctxt->userData); goto done; } + int hasXmlDecl = 0; if ((cur == '<') && (next == '?')) { /* PI or XML decl */ if (avail < 5) return(ret); @@ -11410,6 +11485,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr 
ctxt, int terminate) { xmlGenericError(xmlGenericErrorContext, "PP: Parsing XML Decl\n"); #endif + hasXmlDecl = 1; xmlParseXMLDecl(ctxt); if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { /* @@ -11460,6 +11536,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { "PP: entering MISC\n"); #endif } + if (!hasXmlDecl && checkNoEncodingDecl(ctxt) < 0) + return(0); + break; case XML_PARSER_START_TAG: { const xmlChar *name; @@ -13090,6 +13169,8 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL, xmlFatalErrMsg(ctxt, XML_ERR_VERSION_MISMATCH, "Version mismatch between document and entity\n"); } + } else { + checkNoEncodingDecl(ctxt); } /* @@ -13310,6 +13391,8 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, */ if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) { xmlParseTextDecl(ctxt); + } else { + checkNoEncodingDecl(ctxt); } ctxt->instate = XML_PARSER_CONTENT; @@ -15104,6 +15187,7 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk, } if (encoding != NULL) { + ctxt->options |= XML_PARSE_IGNORE_ENC; xmlCharEncodingHandlerPtr hdlr; if (ctxt->encoding != NULL) @@ -15304,6 +15388,7 @@ xmlDoRead(xmlParserCtxtPtr ctxt, const char *URL, const char *encoding, xmlCtxtUseOptionsInternal(ctxt, options, encoding); if (encoding != NULL) { xmlCharEncodingHandlerPtr hdlr; + ctxt->options |= XML_PARSE_IGNORE_ENC; hdlr = xmlFindCharEncodingHandler(encoding); if (hdlr != NULL) diff --git a/parserInternals.c b/parserInternals.c index e01a252..957df04 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -940,6 +940,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) if (ctxt == NULL) return(-1); int length = 0; + const char *encodingName = NULL; switch (enc) { case XML_CHAR_ENCODING_ERROR: __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, @@ -952,6 +953,8 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion 
should be needed */ ctxt->charset = XML_CHAR_ENCODING_UTF8; + xmlFree((xmlChar *) ctxt->encoding); + ctxt->encoding = xmlStrdup(BAD_CAST "UTF-8"); /* * Errata on XML-1.0 June 20 2001 @@ -987,10 +990,12 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) if ((ctxt->input->cur != NULL) && (length >= 2) && (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE)) { ctxt->input->cur += 2; + encodingName = "UTF-16"; } else if ((ctxt->input->cur != NULL) && (length >= 2) && (ctxt->input->cur[0] == 0xFE) && (ctxt->input->cur[1] == 0xFF)) { ctxt->input->cur += 2; + encodingName = "UTF-16"; } len = 90; break; @@ -1003,6 +1008,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) (ctxt->input->cur[0] == 0x00) && (ctxt->input->cur[1] == 0x00) && (ctxt->input->cur[2] == 0xFE) && (ctxt->input->cur[3] == 0xFF)) { ctxt->input->cur += 4; + encodingName = "ISO-10646-UCS-4"; } len = 180; break; @@ -1012,6 +1018,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE) && (ctxt->input->cur[2] == 0x00) && (ctxt->input->cur[3] == 0x00)) { ctxt->input->cur += 4; + encodingName = "ISO-10646-UCS-4"; } len = 180; break; @@ -1037,6 +1044,8 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) break; } handler = xmlGetCharEncodingHandler(enc); + if (encodingName == NULL) + encodingName = xmlGetCharEncodingName(enc); if (handler == NULL) { /* * Default handlers. @@ -1125,7 +1134,12 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) if (handler == NULL) return(-1); ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(xmlSwitchToEncodingInt(ctxt, handler, len)); + int res = xmlSwitchToEncodingInt(ctxt, handler, len); + if (res == 0 && encodingName != NULL) { + xmlFree((xmlChar *)ctxt->encoding); + ctxt->encoding = xmlStrdup(BAD_CAST encodingName); + } + return(res); } /**