[libxml2] Rework control flow in htmlCurrentChar



commit dfd4e330489c383c0ae58d5fb1393558d6567bc6
Author: Nick Wellnhofer <wellnhofer aevum de>
Date:   Wed Jul 15 14:22:08 2020 +0200

    Rework control flow in htmlCurrentChar
    
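    Don't call xmlCurrentChar after switching encodings. Instead, move the
    encoding detection and switch ahead of the UTF-8 decoder, so that once
    the input has been switched to UTF-8, control falls through to the
    normal UTF-8 handling within htmlCurrentChar itself.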

 HTMLparser.c | 197 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 98 insertions(+), 99 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index d31e2ec9..ec88eed0 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 
 static int
 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+    const unsigned char *cur;
+    unsigned char c;
+    unsigned int val;
+
     if (ctxt->instate == XML_PARSER_EOF)
        return(0);
 
@@ -421,107 +425,23 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
        *len = 0;
        return(ctxt->token);
     }
-    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
-       /*
-        * We are supposed to handle UTF8, check it's valid
-        * From rfc2044: encoding of the Unicode values on UTF-8:
-        *
-        * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-        * 0000 0000-0000 007F   0xxxxxxx
-        * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-        * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
-        *
-        * Check for the 0x110000 limit too
-        */
-       const unsigned char *cur = ctxt->input->cur;
-       unsigned char c;
-       unsigned int val;
-
-       c = *cur;
-       if (c & 0x80) {
-           if ((c & 0x40) == 0)
-               goto encoding_error;
-           if (cur[1] == 0) {
-               xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                cur = ctxt->input->cur;
-            }
-           if ((cur[1] & 0xc0) != 0x80)
-               goto encoding_error;
-           if ((c & 0xe0) == 0xe0) {
-
-               if (cur[2] == 0) {
-                   xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                    cur = ctxt->input->cur;
-                }
-               if ((cur[2] & 0xc0) != 0x80)
-                   goto encoding_error;
-               if ((c & 0xf0) == 0xf0) {
-                   if (cur[3] == 0) {
-                       xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                        cur = ctxt->input->cur;
-                    }
-                   if (((c & 0xf8) != 0xf0) ||
-                       ((cur[3] & 0xc0) != 0x80))
-                       goto encoding_error;
-                   /* 4-byte code */
-                   *len = 4;
-                   val = (cur[0] & 0x7) << 18;
-                   val |= (cur[1] & 0x3f) << 12;
-                   val |= (cur[2] & 0x3f) << 6;
-                   val |= cur[3] & 0x3f;
-                   if (val < 0x10000)
-                       goto encoding_error;
-               } else {
-                 /* 3-byte code */
-                   *len = 3;
-                   val = (cur[0] & 0xf) << 12;
-                   val |= (cur[1] & 0x3f) << 6;
-                   val |= cur[2] & 0x3f;
-                   if (val < 0x800)
-                       goto encoding_error;
-               }
-           } else {
-             /* 2-byte code */
-               *len = 2;
-               val = (cur[0] & 0x1f) << 6;
-               val |= cur[1] & 0x3f;
-               if (val < 0x80)
-                   goto encoding_error;
-           }
-           if (!IS_CHAR(val)) {
-               htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
-                               "Char 0x%X out of allowed range\n", val);
-           }
-           return(val);
-       } else {
-            if ((*ctxt->input->cur == 0) &&
-                (ctxt->input->cur < ctxt->input->end)) {
-                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
-                               "Char 0x%X out of allowed range\n", 0);
-                *len = 1;
-                return(' ');
-            }
-           /* 1-byte code */
-           *len = 1;
-           return((int) *ctxt->input->cur);
-       }
-    }
-    /*
-     * Assume it's a fixed length encoding (1) with
-     * a compatible encoding for the ASCII set, since
-     * XML constructs only use < 128 chars
-     */
-    *len = 1;
-    if ((int) *ctxt->input->cur < 0x80)
-       return((int) *ctxt->input->cur);
-
-    /*
-     * Humm this is bad, do an automatic flow conversion
-     */
-    {
+    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
         xmlChar * guess;
         xmlCharEncodingHandlerPtr handler;
 
+        /*
+         * Assume it's a fixed length encoding (1) with
+         * a compatible encoding for the ASCII set, since
+         * HTML constructs only use < 128 chars
+         */
+        if ((int) *ctxt->input->cur < 0x80) {
+            *len = 1;
+            return((int) *ctxt->input->cur);
+        }
+
+        /*
+         * Humm this is bad, do an automatic flow conversion
+         */
         guess = htmlFindEncoding(ctxt);
         if (guess == NULL) {
             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
@@ -540,7 +460,86 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
         ctxt->charset = XML_CHAR_ENCODING_UTF8;
     }
 
-    return(xmlCurrentChar(ctxt, len));
+    /*
+     * We are supposed to handle UTF8, check it's valid
+     * From rfc2044: encoding of the Unicode values on UTF-8:
+     *
+     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
+     * 0000 0000-0000 007F   0xxxxxxx
+     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
+     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
+     *
+     * Check for the 0x110000 limit too
+     */
+    cur = ctxt->input->cur;
+    c = *cur;
+    if (c & 0x80) {
+        if ((c & 0x40) == 0)
+            goto encoding_error;
+        if (cur[1] == 0) {
+            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+            cur = ctxt->input->cur;
+        }
+        if ((cur[1] & 0xc0) != 0x80)
+            goto encoding_error;
+        if ((c & 0xe0) == 0xe0) {
+
+            if (cur[2] == 0) {
+                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                cur = ctxt->input->cur;
+            }
+            if ((cur[2] & 0xc0) != 0x80)
+                goto encoding_error;
+            if ((c & 0xf0) == 0xf0) {
+                if (cur[3] == 0) {
+                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                    cur = ctxt->input->cur;
+                }
+                if (((c & 0xf8) != 0xf0) ||
+                    ((cur[3] & 0xc0) != 0x80))
+                    goto encoding_error;
+                /* 4-byte code */
+                *len = 4;
+                val = (cur[0] & 0x7) << 18;
+                val |= (cur[1] & 0x3f) << 12;
+                val |= (cur[2] & 0x3f) << 6;
+                val |= cur[3] & 0x3f;
+                if (val < 0x10000)
+                    goto encoding_error;
+            } else {
+              /* 3-byte code */
+                *len = 3;
+                val = (cur[0] & 0xf) << 12;
+                val |= (cur[1] & 0x3f) << 6;
+                val |= cur[2] & 0x3f;
+                if (val < 0x800)
+                    goto encoding_error;
+            }
+        } else {
+          /* 2-byte code */
+            *len = 2;
+            val = (cur[0] & 0x1f) << 6;
+            val |= cur[1] & 0x3f;
+            if (val < 0x80)
+                goto encoding_error;
+        }
+        if (!IS_CHAR(val)) {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Char 0x%X out of allowed range\n", val);
+        }
+        return(val);
+    } else {
+        if ((*ctxt->input->cur == 0) &&
+            (ctxt->input->cur < ctxt->input->end)) {
+            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Char 0x%X out of allowed range\n", 0);
+            *len = 1;
+            return(' ');
+        }
+        /* 1-byte code */
+        *len = 1;
+        return((int) *ctxt->input->cur);
+    }
 
 encoding_error:
     /*


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]