[libxml2] Recover unescaped less-than character in HTML recovery parsing
- From: Daniel Veillard <veillard src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Recover unescaped less-than character in HTML recovery parsing
- Date: Tue, 30 Jun 2015 03:42:39 +0000 (UTC)
commit 140c251e8e5653572edcca91b9d675f871735cb4
Author: Daniel Veillard <veillard redhat com>
Date: Tue Jun 30 11:36:28 2015 +0800
Recover unescaped less-than character in HTML recovery parsing
As pointed by Christian Schoenebeck <schoenebeck crudebyte com>
on the list and based on some of his early patches, this preserve
content when unescaped opening angle brackets are not escaped in
textual content like:
<p> a < b </p>
<p> a <0 </p>
<p> a <=0 </p>
while still reporting the error.
HTMLparser.c | 33 ++++++++++++++++++++++++++++++---
1 files changed, 30 insertions(+), 3 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index d329d3b..19c10c3 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2948,8 +2948,9 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
/**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
* @ctxt: an HTML parser context
+ * @readahead: optional read ahead character in ascii range
*
* parse a CharData section.
* if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2959,15 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
*/
static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
- xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
+ xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
int nbchar = 0;
int cur, l;
int chunk = 0;
+ if (readahead)
+ buf[nbchar++] = readahead;
+
SHRINK;
cur = CUR_CHAR(l);
while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3047,21 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
}
/**
+ * htmlParseCharData:
+ * @ctxt: an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+ htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
* htmlParseExternalID:
* @ctxt: an HTML parser context
* @publicID: a xmlChar** receiving PubidLiteral
@@ -3690,6 +3709,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
+ /* if recover preserve text on classic misconstructs */
+ if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
+ (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
+ htmlParseCharDataInternal(ctxt, '<');
+ return(-1);
+ }
+
+
/* Dump the bogus tag like browsers do */
while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
(ctxt->instate != XML_PARSER_EOF))
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]