[libxml2] Validate UTF8 in xmlEncodeEntities



commit bf22713507fe1fc3a2c4b525cf0a88c2dc87a3a2
Author: Joel Hockey <joel hockey gmail com>
Date:   Sun Aug 16 17:19:35 2020 -0700

    Validate UTF8 in xmlEncodeEntities
    
    The code currently assumes its input is UTF-8 without validating it.
    Truncated UTF-8 input can therefore cause out-of-bounds array access.
    
    This adds further checks to the partial fix in 50f06b3e.
    
    Fixes #178

 entities.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
---
diff --git a/entities.c b/entities.c
index 37b99a56..1a8f86f0 100644
--- a/entities.c
+++ b/entities.c
@@ -704,11 +704,25 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
            } else {
                /*
                 * We assume we have UTF-8 input.
+                * It must match either:
+                *   110xxxxx 10xxxxxx
+                *   1110xxxx 10xxxxxx 10xxxxxx
+                *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                * That is:
+                *   cur[0] is 11xxxxxx
+                *   cur[1] is 10xxxxxx
+                *   cur[2] is 10xxxxxx if cur[0] is 111xxxxx
+                *   cur[3] is 10xxxxxx if cur[0] is 1111xxxx
+                *   cur[0] is not 11111xxx
                 */
                char buf[11], *ptr;
                int val = 0, l = 1;
 
-               if (*cur < 0xC0) {
+               if (((cur[0] & 0xC0) != 0xC0) ||
+                   ((cur[1] & 0xC0) != 0x80) ||
+                   (((cur[0] & 0xE0) == 0xE0) && ((cur[2] & 0xC0) != 0x80)) ||
+                   (((cur[0] & 0xF0) == 0xF0) && ((cur[3] & 0xC0) != 0x80)) ||
+                   (((cur[0] & 0xF8) == 0xF8))) {
                    xmlEntitiesErr(XML_CHECK_NOT_UTF8,
                            "xmlEncodeEntities: input not UTF-8");
                    if (doc != NULL)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]