diff -r -u libxml2-2.9.1+dfsg1.orig/HTMLparser.c libxml2-2.9.1+dfsg1/HTMLparser.c
--- libxml2-2.9.1+dfsg1.orig/HTMLparser.c 2015-04-14 13:05:01.000000000 +0200
+++ libxml2-2.9.1+dfsg1/HTMLparser.c 2015-04-26 03:18:31.088399169 +0200
@@ -59,6 +59,7 @@
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
+static int htmlParseElementRecursive(htmlParserCtxtPtr ctxt);
/************************************************************************
* *
@@ -2946,10 +2947,11 @@
}
}
-
/**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
* @ctxt: an HTML parser context
+ * @prep: optional character to be prepended to text, 0 if no character
+ * shall be prepended
*
* parse a CharData section.
* if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2960,15 @@
*/
static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
- xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, char prep) {
+ xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
int nbchar = 0;
int cur, l;
int chunk = 0;
+ if (prep)
+ buf[nbchar++] = prep;
+
SHRINK;
cur = CUR_CHAR(l);
while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3048,21 @@
}
/**
+ * htmlParseCharData:
+ * @ctxt: an HTML parser context
+ *
+ * Parse a CharData section; within a CDATA section ']]>' marks its end.
+ * Wrapper that calls htmlParseCharDataInternal() with @prep set to 0.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+ htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
* htmlParseExternalID:
* @ctxt: an HTML parser context
* @publicID: a xmlChar** receiving PubidLiteral
@@ -4051,7 +4071,7 @@
* htmlParseContent:
* @ctxt: an HTML parser context
*
- * Parse a content: comment, sub-element, reference or text.
+ * Parse a content recursively: comment, sub-element, reference or text.
* Kept for compatibility with old code
*/
@@ -4060,6 +4080,7 @@
xmlChar *currentNode;
int depth;
const xmlChar *name;
+ int res;
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
@@ -4157,10 +4178,19 @@
}
/*
- * Third case : a sub-element.
+ * Third case : a sub-element (recursively).
*/
else if (CUR == '<') {
- htmlParseElement(ctxt);
+ res = htmlParseElementRecursive(ctxt);
+
+ /*
+ * If the element tag was invalid and the parser position was
+ * rewound to the tag start, then consume the next input as text.
+ */
+ if (res == -2) {
+ NEXT;
+ htmlParseCharDataInternal(ctxt, '<');
+ }
}
/*
@@ -4201,19 +4231,24 @@
}
/**
- * htmlParseElement:
+ * htmlParseElementRecursive:
* @ctxt: an HTML parser context
*
* parse an HTML element, this is highly recursive
* this is kept for compatibility with previous code versions
+ * regarding htmlParseElement()
*
* [39] element ::= EmptyElemTag | STag content ETag
*
* [41] Attribute ::= Name Eq AttValue
+ *
+ * Returns 0: success,
+ * -1: error - tag input was consumed (the invalid tag is dropped),
+ * -2: error - input was rewound to the tag start - handle next char as text
*/
-void
-htmlParseElement(htmlParserCtxtPtr ctxt) {
+static int
+htmlParseElementRecursive(htmlParserCtxtPtr ctxt) {
const xmlChar *name;
xmlChar *currentNode = NULL;
const htmlElemDesc * info;
@@ -4221,15 +4256,16 @@
int failed;
int depth;
const xmlChar *oldptr;
+ xmlParserInputPtr startPos = NULL;
if ((ctxt == NULL) || (ctxt->input == NULL)) {
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
- "htmlParseElement: context error\n", NULL, NULL);
- return;
+ "htmlParseElementRecursive: context error\n", NULL, NULL);
+ return -1;
}
if (ctxt->instate == XML_PARSER_EOF)
- return;
+ return -1;
/* Capture start position */
if (ctxt->record_info) {
@@ -4237,13 +4273,18 @@
(CUR_PTR - ctxt->input->base);
node_info.begin_line = ctxt->input->line;
}
+ startPos = xmlNewInputStream(ctxt);
+ if (startPos != NULL)
+ *startPos = *ctxt->input;
+ else
+ return -1;
failed = htmlParseStartTag(ctxt);
name = ctxt->name;
if ((failed == -1) || (name == NULL)) {
if (CUR == '>')
NEXT;
- return;
+ goto errorDoRewind;
}
/*
@@ -4263,7 +4304,7 @@
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
htmlnamePop(ctxt);
- return;
+ goto errorDoConsume;
}
if (CUR == '>') {
@@ -4290,7 +4331,7 @@
node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info);
}
- return;
+ goto errorDoConsume;
}
/*
@@ -4300,7 +4341,7 @@
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
htmlnamePop(ctxt);
- return;
+ goto errorDoConsume;
}
/*
@@ -4331,6 +4372,52 @@
if (currentNode != NULL)
xmlFree(currentNode);
+
+ /* Success */
+ failed = 0;
+ goto done;
+
+ /* Error occurred, leave input consumed. */
+errorDoConsume:
+ failed = -1;
+ goto done;
+
+ /*
+ * Error occurred, rewind parser to tag start position.
+ * (but only if HTML_PARSE_RECOVER is set, otherwise leave input consumed)
+ */
+errorDoRewind:
+ if (ctxt->recovery) {
+ failed = -2;
+ if (startPos)
+ *ctxt->input = *startPos;
+ } else {
+ failed = -1;
+ }
+ goto done;
+
+done:
+ if (startPos != NULL)
+ xmlFree(startPos);
+
+ return failed;
+}
+
+/**
+ * htmlParseElement:
+ * @ctxt: an HTML parser context
+ *
+ * Parse an HTML element (highly recursive) via htmlParseElementRecursive(),
+ * ignoring its result; kept for compatibility with previous code versions.
+ *
+ * [39] element ::= EmptyElemTag | STag content ETag
+ *
+ * [41] Attribute ::= Name Eq AttValue
+ */
+
+void
+htmlParseElement(htmlParserCtxtPtr ctxt) {
+ htmlParseElementRecursive(ctxt);
}
static void
@@ -4352,7 +4439,7 @@
}
/**
- * htmlParseElementInternal:
+ * htmlParseElementNonRecursive:
* @ctxt: an HTML parser context
*
* parse an HTML element, new version, non recursive
@@ -4360,23 +4447,28 @@
* [39] element ::= EmptyElemTag | STag content ETag
*
* [41] Attribute ::= Name Eq AttValue
+ *
+ * Returns 0: success,
+ * -1: error - tag input was consumed (the invalid tag is dropped),
+ * -2: error - input was rewound to the tag start - handle next char as text
*/
-static void
-htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
+static int
+htmlParseElementNonRecursive(htmlParserCtxtPtr ctxt) {
const xmlChar *name;
const htmlElemDesc * info;
htmlParserNodeInfo node_info = { 0, };
int failed;
+ xmlParserInputPtr startPos = NULL;
if ((ctxt == NULL) || (ctxt->input == NULL)) {
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
- "htmlParseElementInternal: context error\n", NULL, NULL);
- return;
+ "htmlParseElementNonRecursive: context error\n", NULL, NULL);
+ return -1;
}
if (ctxt->instate == XML_PARSER_EOF)
- return;
+ return -1;
/* Capture start position */
if (ctxt->record_info) {
@@ -4384,13 +4476,18 @@
(CUR_PTR - ctxt->input->base);
node_info.begin_line = ctxt->input->line;
}
+ startPos = xmlNewInputStream(ctxt);
+ if (startPos != NULL)
+ *startPos = *ctxt->input;
+ else
+ return -1;
failed = htmlParseStartTag(ctxt);
name = ctxt->name;
if ((failed == -1) || (name == NULL)) {
if (CUR == '>')
NEXT;
- return;
+ goto errorDoRewind;
}
/*
@@ -4410,7 +4507,7 @@
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
htmlnamePop(ctxt);
- return;
+ goto errorDoConsume;
}
if (CUR == '>') {
@@ -4430,7 +4527,7 @@
if (ctxt->record_info)
htmlNodeInfoPush(ctxt, &node_info);
htmlParserFinishElementParsing(ctxt);
- return;
+ goto errorDoConsume;
}
/*
@@ -4440,26 +4537,56 @@
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
htmlnamePop(ctxt);
- return;
+ goto errorDoConsume;
}
if (ctxt->record_info)
htmlNodeInfoPush(ctxt, &node_info);
+
+ /* Success */
+ failed = 0;
+ goto done;
+
+ /* Error occurred, leave input consumed. */
+errorDoConsume:
+ failed = -1;
+ goto done;
+
+ /*
+ * Error occurred, rewind parser to tag start position.
+ * (but only if HTML_PARSE_RECOVER is set, otherwise leave input consumed)
+ */
+errorDoRewind:
+ if (ctxt->recovery) {
+ failed = -2;
+ if (startPos)
+ *ctxt->input = *startPos;
+ } else {
+ failed = -1;
+ }
+ goto done;
+
+done:
+ if (startPos != NULL)
+ xmlFree(startPos);
+
+ return failed;
}
/**
- * htmlParseContentInternal:
+ * htmlParseContentNonRecursive:
* @ctxt: an HTML parser context
*
* Parse a content: comment, sub-element, reference or text.
- * New version for non recursive htmlParseElementInternal
+ * New version for non recursive htmlParseElementNonRecursive
*/
static void
-htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
+htmlParseContentNonRecursive(htmlParserCtxtPtr ctxt) {
xmlChar *currentNode;
int depth;
const xmlChar *name;
+ int res;
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
@@ -4570,11 +4697,20 @@
* Third case : a sub-element.
*/
else if (CUR == '<') {
- htmlParseElementInternal(ctxt);
+ res = htmlParseElementNonRecursive(ctxt);
if (currentNode != NULL) xmlFree(currentNode);
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
+
+ /*
+ * If the element tag was invalid and the parser position was
+ * rewound to the tag start, then consume the next input as text.
+ */
+ if (res == -2) {
+ NEXT;
+ htmlParseCharDataInternal(ctxt, '<');
+ }
}
/*
@@ -4618,14 +4754,14 @@
* htmlParseContent:
* @ctxt: an HTML parser context
*
- * Parse a content: comment, sub-element, reference or text.
+ * Parse a content non-recursively: comment, sub-element, reference or text.
* This is the entry point when called from parser.c
*/
void
__htmlParseContent(void *ctxt) {
if (ctxt != NULL)
- htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
+ htmlParseContentNonRecursive((htmlParserCtxtPtr) ctxt);
}
/**
@@ -4732,7 +4868,7 @@
/*
* Time to start parsing the tree itself
*/
- htmlParseContentInternal(ctxt);
+ htmlParseContentNonRecursive(ctxt);
/*
* autoclose