commit 7733865c3e942f6ec171b43d7202ed763e689a09
Author: Olli Pottonen <olli pottonen iki fi>
Date:   Tue Jul 7 23:26:59 2015 +1000

    Properly distinguish between UTF-16, UTF-16LE, UTF-16BE.
    
    UTF-16, UTF-16LE and UTF-16BE are three different schemes. UTF-16LE
    and UTF-16BE use little endian and big endian byte order,
    respectively, and do not have a byte order mark (BOM). UTF-16 is
    either little endian or big endian, as indicated by the BOM, which
    must be present. Fix the mix-up between UTF-16 and UTF-16LE.
    
    Same goes for UTF-32 as well.
    
    See http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf#page=43.

diff --git a/encoding.c b/encoding.c
index 574e1ae..6807306 100644
--- a/encoding.c
+++ b/encoding.c
@@ -50,6 +50,7 @@
 
 static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
 static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
+static xmlCharEncodingHandlerPtr xmlUTF16Handler = NULL;
 
 typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
 typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
@@ -911,7 +912,7 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
 /**
  * xmlDetectCharEncoding:
  * @in:  a pointer to the first bytes of the XML entity, must be at least
- *       2 bytes long (at least 4 if encoding is UTF4 variant).
+ *       2 bytes long (at least 4 if encoding is UTF32 variant).
  * @len:  pointer to the length of the buffer
  *
  * Guess the encoding of the entity using the first bytes of the entity content
@@ -964,12 +965,15 @@ xmlDetectCharEncoding(const unsigned char* in, int len)
 	    (in[2] == 0xBF))
 	    return(XML_CHAR_ENCODING_UTF8);
     }
-    /* For UTF-16 we can recognize by the BOM */
+    /* UTF-16 can be recognized by its BOM. For backwards compatibility
+     * the endian-specific value is returned, although strictly a stream
+     * that carries a BOM is UTF-16, not UTF-16LE nor UTF-16BE.
+     */
     if (len >= 2) {
-	if ((in[0] == 0xFE) && (in[1] == 0xFF))
-	    return(XML_CHAR_ENCODING_UTF16BE);
-	if ((in[0] == 0xFF) && (in[1] == 0xFE))
-	    return(XML_CHAR_ENCODING_UTF16LE);
+       if ((in[0] == 0xFE) && (in[1] == 0xFF))
+          return(XML_CHAR_ENCODING_UTF16BE);
+       if ((in[0] == 0xFF) && (in[1] == 0xFE))
+          return(XML_CHAR_ENCODING_UTF16LE);
     }
     return(XML_CHAR_ENCODING_NONE);
 }
@@ -1164,26 +1168,15 @@ xmlParseCharEncoding(const char* name)
     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
 
-    /*
-     * NOTE: if we were able to parse this, the endianness of UTF16 is
-     *       already found and in use
-     */
-    if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
-    if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
+    if (!strcmp(upper, "UTF-16LE")) return(XML_CHAR_ENCODING_UTF16LE);
+    if (!strcmp(upper, "UTF16LE")) return(XML_CHAR_ENCODING_UTF16LE);
+    if (!strcmp(upper, "UTF-16BE")) return(XML_CHAR_ENCODING_UTF16BE);
+    if (!strcmp(upper, "UTF16BE")) return(XML_CHAR_ENCODING_UTF16BE);
 
     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
 
-    /*
-     * NOTE: if we were able to parse this, the endianness of UCS4 is
-     *       already found and in use
-     */
-    if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
-    if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
-    if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
-
-
     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
@@ -1231,19 +1224,19 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
         case XML_CHAR_ENCODING_UTF8:
 	    return("UTF-8");
         case XML_CHAR_ENCODING_UTF16LE:
-	    return("UTF-16");
+	    return("UTF-16LE");
         case XML_CHAR_ENCODING_UTF16BE:
-	    return("UTF-16");
+	    return("UTF-16BE");
         case XML_CHAR_ENCODING_EBCDIC:
             return("EBCDIC");
         case XML_CHAR_ENCODING_UCS4LE:
-            return("ISO-10646-UCS-4");
+            return("ISO-10646-UCS-4LE");
         case XML_CHAR_ENCODING_UCS4BE:
-            return("ISO-10646-UCS-4");
+            return("ISO-10646-UCS-4BE");
         case XML_CHAR_ENCODING_UCS4_2143:
-            return("ISO-10646-UCS-4");
+            return("ISO-10646-UCS-4-2143");
         case XML_CHAR_ENCODING_UCS4_3412:
-            return("ISO-10646-UCS-4");
+            return("ISO-10646-UCS-4-3412");
         case XML_CHAR_ENCODING_UCS2:
             return("ISO-10646-UCS-2");
         case XML_CHAR_ENCODING_8859_1:
@@ -1411,7 +1404,10 @@ xmlInitCharEncodingHandlers(void) {
           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
     xmlUTF16BEHandler =
           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
-    xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, UTF8ToUTF16);
+    // There is no decoder for UTF-16; either UTF16BEToUTF8 or
+    // UTF16LEToUTF8 is used, depending on the byte order mark.
+    xmlUTF16Handler =
+          xmlNewCharEncodingHandler("UTF-16", NULL, UTF8ToUTF16);
     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
@@ -1423,7 +1419,8 @@ xmlInitCharEncodingHandlers(void) {
           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, NULL);
     xmlUTF16BEHandler =
           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, NULL);
-    xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, NULL);
+    xmlUTF16Handler =
+          xmlNewCharEncodingHandler("UTF-16", NULL, UTF8ToUTF16);
     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, NULL);
     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
@@ -1434,6 +1431,10 @@ xmlInitCharEncodingHandlers(void) {
 #endif
 #endif
 
+    xmlAddEncodingAlias("UTF-16", "UTF16");
+    xmlAddEncodingAlias("UTF-32", "UTF32");
+    xmlAddEncodingAlias("UCS-4", "ISO-10646-UCS-4");
+    xmlAddEncodingAlias("UCS4", "ISO-10646-UCS-4");
 }
 
 /**
@@ -1520,20 +1521,25 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
             handler = xmlFindCharEncodingHandler("IBM-037");
             if (handler != NULL) return(handler);
 	    break;
-        case XML_CHAR_ENCODING_UCS4BE:
-            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
+
+        case XML_CHAR_ENCODING_UCS4LE:
+            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4LE");
             if (handler != NULL) return(handler);
-            handler = xmlFindCharEncodingHandler("UCS-4");
+            handler = xmlFindCharEncodingHandler("UCS-4LE");
             if (handler != NULL) return(handler);
-            handler = xmlFindCharEncodingHandler("UCS4");
+            handler = xmlFindCharEncodingHandler("UCS4LE");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UTF-32LE");
             if (handler != NULL) return(handler);
 	    break;
-        case XML_CHAR_ENCODING_UCS4LE:
-            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
+        case XML_CHAR_ENCODING_UCS4BE:
+            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4BE");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS-4BE");
             if (handler != NULL) return(handler);
-            handler = xmlFindCharEncodingHandler("UCS-4");
+            handler = xmlFindCharEncodingHandler("UCS4BE");
             if (handler != NULL) return(handler);
-            handler = xmlFindCharEncodingHandler("UCS4");
+            handler = xmlFindCharEncodingHandler("UTF-32BE");
             if (handler != NULL) return(handler);
 	    break;
         case XML_CHAR_ENCODING_UCS4_2143:
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 7967cc6..cd25b2f 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -39,7 +39,7 @@ extern "C" {
  *
  * Predefined values for some standard encodings.
  * Libxml does not do beforehand translation on UTF8 and ISOLatinX.
- * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default.
+ * It also supports ASCII, ISO-8859-1, and all variants of UTF16 by default.
  *
  * Anything else would have to be translated to UTF8 before being
  * given to the parser itself. The BOM for UTF16 and the encoding
@@ -52,8 +52,12 @@ extern "C" {
  * to be sure to enable iconv and to provide iconv libs for the encoding
  * support needed.
  *
- * Note that the generic "UTF-16" is not a predefined value.  Instead, only
- * the specific UTF-16LE and UTF-16BE are present.
+ * Note that UTF-16, UTF-16LE and UTF-16BE are three different things.
+ * UTF-16 must have a byte order mark; UTF-16LE and UTF-16BE must not.
+ *
+ * Similarly UTF-32, UTF-32LE and UTF-32BE are three different things.
+ * However UTF-32 is also known as UCS-4 (and, in addition to little endian
+ * and big endian, there are two unusual byte orders.)
  */
 typedef enum {
     XML_CHAR_ENCODING_ERROR=   -1, /* No char encoding detected */
diff --git a/parser.c b/parser.c
index d8c3ee3..d7457fc 100644
--- a/parser.c
+++ b/parser.c
@@ -15039,7 +15039,7 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk,
     if ((encoding == NULL) && (chunk != NULL) && (size >= 4))
         enc = xmlDetectCharEncoding((const xmlChar *) chunk, size);
 
-    buf = xmlAllocParserInputBuffer(enc);
+    buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
     if (buf == NULL)
         return(1);
 
diff --git a/parserInternals.c b/parserInternals.c
index df204fd..e01a252 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -939,6 +939,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
     int len = -1;
 
     if (ctxt == NULL) return(-1);
+    int length = 0;
     switch (enc) {
 	case XML_CHAR_ENCODING_ERROR:
 	    __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
@@ -981,12 +982,39 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
             ctxt->input->cur += 3;
         }
         len = 90;
+
+        length = ctxt->input->end - ctxt->input->cur;
+        if ((ctxt->input->cur != NULL) && (length >= 2) &&
+            (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE)) {
+            ctxt->input->cur += 2;
+        }
+        else if ((ctxt->input->cur != NULL) && (length >= 2) &&
+            (ctxt->input->cur[0] == 0xFE) && (ctxt->input->cur[1] == 0xFF)) {
+            ctxt->input->cur += 2;
+        }
+	len = 90;
 	break;
     case XML_CHAR_ENCODING_UCS2:
         len = 90;
 	break;
     case XML_CHAR_ENCODING_UCS4BE:
+        length = ctxt->input->end - ctxt->input->cur;
+	if ((ctxt->input->cur != NULL) && (length >= 4) &&
+	    (ctxt->input->cur[0] == 0x00) && (ctxt->input->cur[1] == 0x00) &&
+	    (ctxt->input->cur[2] == 0xFE) && (ctxt->input->cur[3] == 0xFF)) {
+	    ctxt->input->cur += 4;
+	}
+	len = 180;
+	break;
     case XML_CHAR_ENCODING_UCS4LE:
+        length = ctxt->input->end - ctxt->input->cur;
+	if ((ctxt->input->cur != NULL) && (length >= 4) &&
+	    (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE) &&
+	    (ctxt->input->cur[2] == 0x00) && (ctxt->input->cur[3] == 0x00)) {
+	    ctxt->input->cur += 4;
+	}
+	len = 180;
+	break;
     case XML_CHAR_ENCODING_UCS4_2143:
     case XML_CHAR_ENCODING_UCS4_3412:
         len = 180;
@@ -1025,12 +1053,12 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 	    case XML_CHAR_ENCODING_UCS4LE:
 		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
 			       "encoding not supported %s\n",
-			       BAD_CAST "USC4 little endian", NULL);
+			       BAD_CAST "UCS4 little endian", NULL);
 		break;
 	    case XML_CHAR_ENCODING_UCS4BE:
 		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
 			       "encoding not supported %s\n",
-			       BAD_CAST "USC4 big endian", NULL);
+			       BAD_CAST "UCS4 big endian", NULL);
 		break;
 	    case XML_CHAR_ENCODING_EBCDIC:
 		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
@@ -1132,16 +1160,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
                 return (0);
 
             /*
-             * "UTF-16" can be used for both LE and BE
-             if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name,
-             BAD_CAST "UTF-16", 6)) &&
-             (!xmlStrncmp(BAD_CAST handler->name,
-             BAD_CAST "UTF-16", 6))) {
-             return(0);
-             }
-             */
-
-            /*
              * Note: this is a bit dangerous, but that's what it
              * takes to use nearly compatible signature for different
              * encodings.
@@ -1160,21 +1178,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
 	    unsigned int use;
 
             /*
-             * Specific handling of the Byte Order Mark for
-             * UTF-16
-             */
-            if ((handler->name != NULL) &&
-                (!strcmp(handler->name, "UTF-16LE") ||
-                 !strcmp(handler->name, "UTF-16")) &&
-                (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
-                input->cur += 2;
-            }
-            if ((handler->name != NULL) &&
-                (!strcmp(handler->name, "UTF-16BE")) &&
-                (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
-                input->cur += 2;
-            }
-            /*
              * Errata on XML-1.0 June 20 2001
              * Specific handling of the Byte Order Mark for
              * UTF-8
@@ -1266,6 +1269,68 @@ static int
 xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
                        xmlCharEncodingHandlerPtr handler, int len) {
     int ret = 0;
+    const char *newEncoding = NULL;
+
+    if (handler != NULL && handler->name != NULL &&
+	!strcmp(handler->name, "UTF-16")) {
+        /*
+	 * "UTF-16" means "either little endian or big endian as indicated
+	 * by byte order mark". So let's check the mark.
+	 */
+	const xmlChar *in = ctxt->input->cur;
+	int length = ctxt->input->end - ctxt->input->cur;
+	if (length == 0) {
+	  /* No input. This should not happen. */
+	  __xmlRaiseError(NULL, NULL, NULL,
+			  ctxt, NULL, XML_FROM_PARSER,
+			  XML_ERR_INVALID_ENCODING, XML_ERR_WARNING,
+			  NULL, 0, NULL, NULL,
+			  NULL, 0, 0, "Empty UTF-16 string (no byte mark)",
+			  NULL, NULL);
+	    return (0);
+	}
+        if (length == 1) {
+	    __xmlErrEncoding(ctxt, XML_IO_ENCODER,
+		"Decoding error for UTF-16: only one byte", NULL, NULL);
+	    return (-1);
+	}
+        if ((in[0] == 0xFF) && (in[1] == 0xFE)) {
+	    newEncoding = "UTF-16LE";
+	    ctxt->input->cur += 2;
+        } else if ((in[0] == 0xFE) && (in[1] == 0xFF)) {
+	    newEncoding = "UTF-16BE";
+	    ctxt->input->cur += 2;
+	} else {
+	    /* Error. XML REC says UTF-16 must have BOM. Not fatal however. */
+	    __xmlRaiseError(NULL, NULL, NULL,
+			    ctxt, NULL, XML_FROM_PARSER,
+			    XML_ERR_INVALID_ENCODING, XML_ERR_WARNING,
+			    NULL, 0, NULL, NULL,
+			    NULL, 0, 0, "No byte order mark for UTF-16",
+			    NULL, NULL);
+	    if ((in[0] == 0x3C) && (in[1] == 0x00)) {
+	        newEncoding = "UTF-16LE";
+	    } else if ((in[0] == 0x00) && (in[1] == 0x3C)) {
+	        newEncoding = "UTF-16BE";
+	    } else {
+	        /* Unicode standard says BE should be the default, but
+		 * for backwards compatibility we use LE. XML standard
+		 * allows anything: after an error (missing BOM)
+		 * results are undefined, and we can recover as best
+		 * we can. */
+	        newEncoding = "UTF-16LE";
+	    }
+	}
+    }
+
+    if (newEncoding != NULL) {
+	handler = xmlFindCharEncodingHandler(newEncoding);
+	if (handler == NULL) {
+	    __xmlErrEncoding(ctxt, XML_IO_ENCODER,
+	        "Encoding %s not supported", BAD_CAST newEncoding, NULL);
+	    return (-1);
+	}
+    }
 
     if (handler != NULL) {
         if (ctxt->input != NULL) {