[libxml2] Rework control flow in htmlCurrentChar
- From: Nick Wellnhofer <nwellnhof src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Rework control flow in htmlCurrentChar
- Date: Wed, 15 Jul 2020 14:41:59 +0000 (UTC)
commit dfd4e330489c383c0ae58d5fb1393558d6567bc6
Author: Nick Wellnhofer <wellnhofer aevum de>
Date: Wed Jul 15 14:22:08 2020 +0200
Rework control flow in htmlCurrentChar
Don't call xmlCurrentChar after switching encodings. Rearrange code
blocks and fall through to normal UTF-8 handling.
HTMLparser.c | 197 +++++++++++++++++++++++++++++------------------------------
1 file changed, 98 insertions(+), 99 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index d31e2ec9..ec88eed0 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+ const unsigned char *cur;
+ unsigned char c;
+ unsigned int val;
+
if (ctxt->instate == XML_PARSER_EOF)
return(0);
@@ -421,107 +425,23 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
*len = 0;
return(ctxt->token);
}
- if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
- /*
- * We are supposed to handle UTF8, check it's valid
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F 0xxxxxxx
- * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * Check for the 0x110000 limit too
- */
- const unsigned char *cur = ctxt->input->cur;
- unsigned char c;
- unsigned int val;
-
- c = *cur;
- if (c & 0x80) {
- if ((c & 0x40) == 0)
- goto encoding_error;
- if (cur[1] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if ((cur[1] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xe0) == 0xe0) {
-
- if (cur[2] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if ((cur[2] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xf0) == 0xf0) {
- if (cur[3] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if (((c & 0xf8) != 0xf0) ||
- ((cur[3] & 0xc0) != 0x80))
- goto encoding_error;
- /* 4-byte code */
- *len = 4;
- val = (cur[0] & 0x7) << 18;
- val |= (cur[1] & 0x3f) << 12;
- val |= (cur[2] & 0x3f) << 6;
- val |= cur[3] & 0x3f;
- if (val < 0x10000)
- goto encoding_error;
- } else {
- /* 3-byte code */
- *len = 3;
- val = (cur[0] & 0xf) << 12;
- val |= (cur[1] & 0x3f) << 6;
- val |= cur[2] & 0x3f;
- if (val < 0x800)
- goto encoding_error;
- }
- } else {
- /* 2-byte code */
- *len = 2;
- val = (cur[0] & 0x1f) << 6;
- val |= cur[1] & 0x3f;
- if (val < 0x80)
- goto encoding_error;
- }
- if (!IS_CHAR(val)) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Char 0x%X out of allowed range\n", val);
- }
- return(val);
- } else {
- if ((*ctxt->input->cur == 0) &&
- (ctxt->input->cur < ctxt->input->end)) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Char 0x%X out of allowed range\n", 0);
- *len = 1;
- return(' ');
- }
- /* 1-byte code */
- *len = 1;
- return((int) *ctxt->input->cur);
- }
- }
- /*
- * Assume it's a fixed length encoding (1) with
- * a compatible encoding for the ASCII set, since
- * XML constructs only use < 128 chars
- */
- *len = 1;
- if ((int) *ctxt->input->cur < 0x80)
- return((int) *ctxt->input->cur);
-
- /*
- * Humm this is bad, do an automatic flow conversion
- */
- {
+ if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
xmlChar * guess;
xmlCharEncodingHandlerPtr handler;
+ /*
+ * Assume it's a fixed length encoding (1) with
+ * a compatible encoding for the ASCII set, since
+ * HTML constructs only use < 128 chars
+ */
+ if ((int) *ctxt->input->cur < 0x80) {
+ *len = 1;
+ return((int) *ctxt->input->cur);
+ }
+
+ /*
+ * Humm this is bad, do an automatic flow conversion
+ */
guess = htmlFindEncoding(ctxt);
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
@@ -540,7 +460,86 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
ctxt->charset = XML_CHAR_ENCODING_UTF8;
}
- return(xmlCurrentChar(ctxt, len));
+ /*
+ * We are supposed to handle UTF8, check it's valid
+ * From rfc2044: encoding of the Unicode values on UTF-8:
+ *
+ * UCS-4 range (hex.) UTF-8 octet sequence (binary)
+ * 0000 0000-0000 007F 0xxxxxxx
+ * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
+ *
+ * Check for the 0x110000 limit too
+ */
+ cur = ctxt->input->cur;
+ c = *cur;
+ if (c & 0x80) {
+ if ((c & 0x40) == 0)
+ goto encoding_error;
+ if (cur[1] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if ((cur[1] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xe0) == 0xe0) {
+
+ if (cur[2] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if ((cur[2] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xf0) == 0xf0) {
+ if (cur[3] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if (((c & 0xf8) != 0xf0) ||
+ ((cur[3] & 0xc0) != 0x80))
+ goto encoding_error;
+ /* 4-byte code */
+ *len = 4;
+ val = (cur[0] & 0x7) << 18;
+ val |= (cur[1] & 0x3f) << 12;
+ val |= (cur[2] & 0x3f) << 6;
+ val |= cur[3] & 0x3f;
+ if (val < 0x10000)
+ goto encoding_error;
+ } else {
+ /* 3-byte code */
+ *len = 3;
+ val = (cur[0] & 0xf) << 12;
+ val |= (cur[1] & 0x3f) << 6;
+ val |= cur[2] & 0x3f;
+ if (val < 0x800)
+ goto encoding_error;
+ }
+ } else {
+ /* 2-byte code */
+ *len = 2;
+ val = (cur[0] & 0x1f) << 6;
+ val |= cur[1] & 0x3f;
+ if (val < 0x80)
+ goto encoding_error;
+ }
+ if (!IS_CHAR(val)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", val);
+ }
+ return(val);
+ } else {
+ if ((*ctxt->input->cur == 0) &&
+ (ctxt->input->cur < ctxt->input->end)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", 0);
+ *len = 1;
+ return(' ');
+ }
+ /* 1-byte code */
+ *len = 1;
+ return((int) *ctxt->input->cur);
+ }
encoding_error:
/*
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]