commit 7733865c3e942f6ec171b43d7202ed763e689a09 Author: Olli Pottonen Date: Tue Jul 7 23:26:59 2015 +1000 Properly distinguish between UTF-16, UTF-16LE, UTF-16BE. UTF-16, UTF-16LE and UTF-16BE are three different schemes. UTF-16LE and UTF-16BE use little endian and big endian byte order, respectively, and do not have a byte order mark (BOM). UTF-16 is either little endian or big endian, as indicated by BOM, which must be present. Fix mixup over UTF-16 and UTF-16LE. Same goes for UTF-32 as well. See http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf#page=43. diff --git a/encoding.c b/encoding.c index 574e1ae..6807306 100644 --- a/encoding.c +++ b/encoding.c @@ -50,6 +50,7 @@ static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; +static xmlCharEncodingHandlerPtr xmlUTF16Handler = NULL; typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias; typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr; @@ -911,7 +912,7 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen, /** * xmlDetectCharEncoding: * @in: a pointer to the first bytes of the XML entity, must be at least - * 2 bytes long (at least 4 if encoding is UTF4 variant). + * 2 bytes long (at least 4 if encoding is UTF32 variant). * @len: pointer to the length of the buffer * * Guess the encoding of the entity using the first bytes of the entity content @@ -964,12 +965,15 @@ xmlDetectCharEncoding(const unsigned char* in, int len) (in[2] == 0xBF)) return(XML_CHAR_ENCODING_UTF8); } - /* For UTF-16 we can recognize by the BOM */ + /* For UTF-16 we can recognize by the BOM. For backwards + * compatibility, return the wrong value; if there is BOM, the + * encoding is UTF-16, not UTF-16LE nor UTF-16BE. 
+ */ if (len >= 2) { - if ((in[0] == 0xFE) && (in[1] == 0xFF)) - return(XML_CHAR_ENCODING_UTF16BE); - if ((in[0] == 0xFF) && (in[1] == 0xFE)) - return(XML_CHAR_ENCODING_UTF16LE); + if ((in[0] == 0xFE) && (in[1] == 0xFF)) + return(XML_CHAR_ENCODING_UTF16BE); + if ((in[0] == 0xFF) && (in[1] == 0xFE)) + return(XML_CHAR_ENCODING_UTF16LE); } return(XML_CHAR_ENCODING_NONE); } @@ -1164,26 +1168,15 @@ xmlParseCharEncoding(const char* name) if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8); if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8); - /* - * NOTE: if we were able to parse this, the endianness of UTF16 is - * already found and in use - */ - if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE); - if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE); + if (!strcmp(upper, "UTF-16LE")) return(XML_CHAR_ENCODING_UTF16LE); + if (!strcmp(upper, "UTF16LE")) return(XML_CHAR_ENCODING_UTF16LE); + if (!strcmp(upper, "UTF-16BE")) return(XML_CHAR_ENCODING_UTF16BE); + if (!strcmp(upper, "UTF16BE")) return(XML_CHAR_ENCODING_UTF16BE); if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2); if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2); if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2); - /* - * NOTE: if we were able to parse this, the endianness of UCS4 is - * already found and in use - */ - if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE); - if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE); - if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE); - - if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1); if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1); if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1); @@ -1231,19 +1224,19 @@ xmlGetCharEncodingName(xmlCharEncoding enc) { case XML_CHAR_ENCODING_UTF8: return("UTF-8"); case XML_CHAR_ENCODING_UTF16LE: - return("UTF-16"); + return("UTF-16LE"); case 
XML_CHAR_ENCODING_UTF16BE: - return("UTF-16"); + return("UTF-16BE"); case XML_CHAR_ENCODING_EBCDIC: return("EBCDIC"); case XML_CHAR_ENCODING_UCS4LE: - return("ISO-10646-UCS-4"); + return("ISO-10646-UCS-4LE"); case XML_CHAR_ENCODING_UCS4BE: - return("ISO-10646-UCS-4"); + return("ISO-10646-UCS-4BE"); case XML_CHAR_ENCODING_UCS4_2143: - return("ISO-10646-UCS-4"); + return("ISO-10646-UCS-4-2143"); case XML_CHAR_ENCODING_UCS4_3412: - return("ISO-10646-UCS-4"); + return("ISO-10646-UCS-4-3412"); case XML_CHAR_ENCODING_UCS2: return("ISO-10646-UCS-2"); case XML_CHAR_ENCODING_8859_1: @@ -1411,7 +1404,10 @@ xmlInitCharEncodingHandlers(void) { xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE); xmlUTF16BEHandler = xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE); - xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, UTF8ToUTF16); + // There is no decoder for UTF-16; either UTF16BEToUTF8 or + // UTF16LEToUTF8, is used, depending on Byte Order Mark. + xmlUTF16Handler = + xmlNewCharEncodingHandler("UTF-16", NULL, UTF8ToUTF16); xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1); xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii); xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii); @@ -1423,7 +1419,8 @@ xmlInitCharEncodingHandlers(void) { xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, NULL); xmlUTF16BEHandler = xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, NULL); - xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, NULL); + xmlUTF16Handler = + xmlNewCharEncodingHandler("UTF-16", NULL, UTF8ToUTF16); xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, NULL); xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); @@ -1434,6 +1431,10 @@ xmlInitCharEncodingHandlers(void) { #endif #endif + xmlAddEncodingAlias("UTF-16", "UTF16"); + xmlAddEncodingAlias("UTF-32", "UTF32"); + xmlAddEncodingAlias("UCS-4", "ISO-10646-UCS-4"); + 
xmlAddEncodingAlias("UCS4", "ISO-10646-UCS-4"); } /** @@ -1520,20 +1521,25 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { handler = xmlFindCharEncodingHandler("IBM-037"); if (handler != NULL) return(handler); break; - case XML_CHAR_ENCODING_UCS4BE: - handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); + + case XML_CHAR_ENCODING_UCS4LE: + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4LE"); if (handler != NULL) return(handler); - handler = xmlFindCharEncodingHandler("UCS-4"); + handler = xmlFindCharEncodingHandler("UCS-4LE"); if (handler != NULL) return(handler); - handler = xmlFindCharEncodingHandler("UCS4"); + handler = xmlFindCharEncodingHandler("UCS4LE"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UTF-32LE"); if (handler != NULL) return(handler); break; - case XML_CHAR_ENCODING_UCS4LE: - handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4BE: + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4BE"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-4BE"); if (handler != NULL) return(handler); - handler = xmlFindCharEncodingHandler("UCS-4"); + handler = xmlFindCharEncodingHandler("UCS4BE"); if (handler != NULL) return(handler); - handler = xmlFindCharEncodingHandler("UCS4"); + handler = xmlFindCharEncodingHandler("UTF-32BE"); if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4_2143: diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index 7967cc6..cd25b2f 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -39,7 +39,7 @@ extern "C" { * * Predefined values for some standard encodings. * Libxml does not do beforehand translation on UTF8 and ISOLatinX. - * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default. + * It also supports ASCII, ISO-8859-1, and all variants of UTF16 by default. 
* * Anything else would have to be translated to UTF8 before being * given to the parser itself. The BOM for UTF16 and the encoding @@ -52,8 +52,12 @@ extern "C" { * to be sure to enable iconv and to provide iconv libs for the encoding * support needed. * - * Note that the generic "UTF-16" is not a predefined value. Instead, only - * the specific UTF-16LE and UTF-16BE are present. + * Note that UTF-16, UTF-16LE and UTF-16BE are three different things. + * UTF-16 must have byte order marker, UTF-16LE and UTF-16BE must not. + * + * Similarly UTF-32, UTF-32LE and UTF-32BE are three different things. + * However UTF-32 is also known as UCS-4 (and, in addition to little endian + * and big endian, there are two unusual byte orders.) */ typedef enum { XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ diff --git a/parser.c b/parser.c index d8c3ee3..d7457fc 100644 --- a/parser.c +++ b/parser.c @@ -15039,7 +15039,7 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk, if ((encoding == NULL) && (chunk != NULL) && (size >= 4)) enc = xmlDetectCharEncoding((const xmlChar *) chunk, size); - buf = xmlAllocParserInputBuffer(enc); + buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE); if (buf == NULL) return(1); diff --git a/parserInternals.c b/parserInternals.c index df204fd..e01a252 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -939,6 +939,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) int len = -1; if (ctxt == NULL) return(-1); + int length = 0; switch (enc) { case XML_CHAR_ENCODING_ERROR: __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, @@ -981,12 +982,39 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) ctxt->input->cur += 3; } len = 90; + + length = ctxt->input->end - ctxt->input->cur; + if ((ctxt->input->cur != NULL) && (length >= 2) && + (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE)) { + ctxt->input->cur += 2; + } + else if ((ctxt->input->cur != NULL) && (length >= 2) && + 
(ctxt->input->cur[0] == 0xFE) && (ctxt->input->cur[1] == 0xFF)) { + ctxt->input->cur += 2; + } + len = 90; break; case XML_CHAR_ENCODING_UCS2: len = 90; break; case XML_CHAR_ENCODING_UCS4BE: + length = ctxt->input->end - ctxt->input->cur; + if ((ctxt->input->cur != NULL) && (length >= 4) && + (ctxt->input->cur[0] == 0x00) && (ctxt->input->cur[1] == 0x00) && + (ctxt->input->cur[2] == 0xFE) && (ctxt->input->cur[3] == 0xFF)) { + ctxt->input->cur += 4; + } + len = 180; + break; case XML_CHAR_ENCODING_UCS4LE: + length = ctxt->input->end - ctxt->input->cur; + if ((ctxt->input->cur != NULL) && (length >= 4) && + (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE) && + (ctxt->input->cur[2] == 0x00) && (ctxt->input->cur[3] == 0x00)) { + ctxt->input->cur += 4; + } + len = 180; + break; case XML_CHAR_ENCODING_UCS4_2143: case XML_CHAR_ENCODING_UCS4_3412: len = 180; @@ -1025,12 +1053,12 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) case XML_CHAR_ENCODING_UCS4LE: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", - BAD_CAST "USC4 little endian", NULL); + BAD_CAST "UCS4 little endian", NULL); break; case XML_CHAR_ENCODING_UCS4BE: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", - BAD_CAST "USC4 big endian", NULL); + BAD_CAST "UCS4 big endian", NULL); break; case XML_CHAR_ENCODING_EBCDIC: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, @@ -1132,16 +1160,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, return (0); /* - * "UTF-16" can be used for both LE and BE - if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name, - BAD_CAST "UTF-16", 6)) && - (!xmlStrncmp(BAD_CAST handler->name, - BAD_CAST "UTF-16", 6))) { - return(0); - } - */ - - /* * Note: this is a bit dangerous, but that's what it * takes to use nearly compatible signature for different * encodings. 
@@ -1160,21 +1178,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, unsigned int use; /* - * Specific handling of the Byte Order Mark for - * UTF-16 - */ - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-16LE") || - !strcmp(handler->name, "UTF-16")) && - (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) { - input->cur += 2; - } - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-16BE")) && - (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) { - input->cur += 2; - } - /* * Errata on XML-1.0 June 20 2001 * Specific handling of the Byte Order Mark for * UTF-8 @@ -1266,6 +1269,68 @@ static int xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler, int len) { int ret = 0; + const char *newEncoding = NULL; + + if (handler != NULL && handler->name != NULL && + !strcmp(handler->name, "UTF-16")) { + /* + * "UTF-16" means "either little endian or big endian as indicated + * by byte order mark". So let's check the mark. + */ + const xmlChar *in = ctxt->input->cur; + int length = ctxt->input->end - ctxt->input->cur; + if (length == 0) { + /* No input. This should not happen. */ + __xmlRaiseError(NULL, NULL, NULL, + ctxt, NULL, XML_FROM_PARSER, + XML_ERR_INVALID_ENCODING, XML_ERR_WARNING, + NULL, 0, NULL, NULL, + NULL, 0, 0, "Empty UTF-16 string (no byte mark)", + NULL, NULL); + return (0); + } + if (length == 1) { + __xmlErrEncoding(ctxt, XML_IO_ENCODER, + "Decoding error for UTF-16: only one byte", NULL, NULL); + return (-1); + } + if ((in[0] == 0xFF) && (in[1] == 0xFE)) { + newEncoding = "UTF-16LE"; + ctxt->input->cur += 2; + } else if ((in[0] == 0xFE) && (in[1] == 0xFF)) { + newEncoding = "UTF-16BE"; + ctxt->input->cur += 2; + } else { + /* Error. XML REC says UTF-16 must have BOM. Not fatal however. 
*/ + __xmlRaiseError(NULL, NULL, NULL, + ctxt, NULL, XML_FROM_PARSER, + XML_ERR_INVALID_ENCODING, XML_ERR_WARNING, + NULL, 0, NULL, NULL, + NULL, 0, 0, "No byte order mark for UTF-16", + NULL, NULL); + if ((in[0] == 0x3C) && (in[1] == 0x00)) { + newEncoding = "UTF-16LE"; + } else if ((in[0] == 0x00) && (in[1] == 0x3C)) { + newEncoding = "UTF-16BE"; + } else { + /* Unicode standard says BE should be the default, but + * for backwards compatibility we use LE. XML standard + * allows anything: after an error (missing BOM) + * results are undefined, and we can recover as best + * we can. */ + newEncoding = "UTF-16LE"; + } + } + } + + if (newEncoding != NULL) { + handler = xmlFindCharEncodingHandler(newEncoding); + if (handler == NULL) { + __xmlErrEncoding(ctxt, XML_IO_ENCODER, + "Encoding %s not supported", BAD_CAST newEncoding, NULL); + return (-1); + } + } if (handler != NULL) { if (ctxt->input != NULL) {