diff -r -u libxml2-2.9.1+dfsg1.orig/HTMLparser.c libxml2-2.9.1+dfsg1/HTMLparser.c --- libxml2-2.9.1+dfsg1.orig/HTMLparser.c 2015-04-14 13:05:01.000000000 +0200 +++ libxml2-2.9.1+dfsg1/HTMLparser.c 2015-04-26 03:18:31.088399169 +0200 @@ -59,6 +59,7 @@ xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, xmlChar end, xmlChar end2, xmlChar end3); static void htmlParseComment(htmlParserCtxtPtr ctxt); +static int htmlParseElementRecursive(htmlParserCtxtPtr ctxt); /************************************************************************ * * @@ -2946,10 +2947,11 @@ } } - /** - * htmlParseCharData: + * htmlParseCharDataInternal: * @ctxt: an HTML parser context + * @prep: optional character to be prepended to text, 0 if no character + * shall be prepended * * parse a CharData section. * if we are within a CDATA section ']]>' marks an end of section. @@ -2958,12 +2960,15 @@ */ static void -htmlParseCharData(htmlParserCtxtPtr ctxt) { - xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; +htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, char prep) { + xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; int nbchar = 0; int cur, l; int chunk = 0; + if (prep) + buf[nbchar++] = prep; + SHRINK; cur = CUR_CHAR(l); while (((cur != '<') || (ctxt->token == '<')) && @@ -3043,6 +3048,21 @@ } /** + * htmlParseCharData: + * @ctxt: an HTML parser context + * + * parse a CharData section. + * if we are within a CDATA section ']]>' marks an end of section. + * + * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) + */ + +static void +htmlParseCharData(htmlParserCtxtPtr ctxt) { + htmlParseCharDataInternal(ctxt, 0); +} + +/** * htmlParseExternalID: * @ctxt: an HTML parser context * @publicID: a xmlChar** receiving PubidLiteral @@ -4051,7 +4071,7 @@ * htmlParseContent: * @ctxt: an HTML parser context * - * Parse a content: comment, sub-element, reference or text. + * Parse a content recursively: comment, sub-element, reference or text. 
* Kept for compatibility with old code */ @@ -4060,6 +4080,7 @@ xmlChar *currentNode; int depth; const xmlChar *name; + int res; currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; @@ -4157,10 +4178,19 @@ } /* - * Third case : a sub-element. + * Third case : a sub-element (recursively). */ else if (CUR == '<') { - htmlParseElement(ctxt); + res = htmlParseElementRecursive(ctxt); + + /* + * If it was an invalid element tag and parser position was + * rewound to tag start, then consume next input as text. + */ + if (res == -2) { + NEXT; + htmlParseCharDataInternal(ctxt, '<'); + } } /* @@ -4201,19 +4231,24 @@ } /** - * htmlParseElement: + * htmlParseElementRecursive: * @ctxt: an HTML parser context * * parse an HTML element, this is highly recursive * this is kept for compatibility with previous code versions + * regarding htmlParseElement() * * [39] element ::= EmptyElemTag | STag content ETag * * [41] Attribute ::= Name Eq AttValue + * + * Returns 0: success, + * -1: error - consumed tag input (that is dumped invalid tag), + * -2: error - rewound input to tag start - handle next char as text */ -void -htmlParseElement(htmlParserCtxtPtr ctxt) { +static int +htmlParseElementRecursive(htmlParserCtxtPtr ctxt) { const xmlChar *name; xmlChar *currentNode = NULL; const htmlElemDesc * info; @@ -4221,15 +4256,16 @@ int failed; int depth; const xmlChar *oldptr; + xmlParserInputPtr startPos = NULL; if ((ctxt == NULL) || (ctxt->input == NULL)) { htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "htmlParseElement: context error\n", NULL, NULL); - return; + "htmlParseElementRecursive: context error\n", NULL, NULL); + return -1; } if (ctxt->instate == XML_PARSER_EOF) - return; + return -1; /* Capture start position */ if (ctxt->record_info) { @@ -4237,13 +4273,18 @@ (CUR_PTR - ctxt->input->base); node_info.begin_line = ctxt->input->line; } + startPos = xmlNewInputStream(ctxt); + if (startPos != NULL) + *startPos = *ctxt->input; + else + return -1; failed = 
htmlParseStartTag(ctxt); name = ctxt->name; if ((failed == -1) || (name == NULL)) { if (CUR == '>') NEXT; - return; + goto errorDoRewind; } /* @@ -4263,7 +4304,7 @@ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, name); htmlnamePop(ctxt); - return; + goto errorDoConsume; } if (CUR == '>') { @@ -4290,7 +4331,7 @@ node_info.node = ctxt->node; xmlParserAddNodeInfo(ctxt, &node_info); } - return; + goto errorDoConsume; } /* @@ -4300,7 +4341,7 @@ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, name); htmlnamePop(ctxt); - return; + goto errorDoConsume; } /* @@ -4331,6 +4372,52 @@ if (currentNode != NULL) xmlFree(currentNode); + + /* Success */ + failed = 0; + goto done; + + /* Error occurred, leave input consumed. */ +errorDoConsume: + failed = -1; + goto done; + + /* + * Error occurred, rewind parser to tag start position. + * (but only if HTML_PARSE_RECOVER is set, otherwise leave input consumed) + */ +errorDoRewind: + if (ctxt->recovery) { + failed = -2; + if (startPos) + *ctxt->input = *startPos; + } else { + failed = -1; + } + goto done; + +done: + if (startPos != NULL) + xmlFree(startPos); + + return failed; +} + +/** + * htmlParseElement: + * @ctxt: an HTML parser context + * + * parse an HTML element, this is highly recursive + * this is kept for compatibility with previous code versions + * + * [39] element ::= EmptyElemTag | STag content ETag + * + * [41] Attribute ::= Name Eq AttValue + */ + +void +htmlParseElement(htmlParserCtxtPtr ctxt) { + htmlParseElementRecursive(ctxt); } static void @@ -4352,7 +4439,7 @@ } /** - * htmlParseElementInternal: + * htmlParseElementNonRecursive: * @ctxt: an HTML parser context * * parse an HTML element, new version, non recursive @@ -4360,23 +4447,28 @@ * [39] element ::= EmptyElemTag | STag content ETag * * [41] Attribute ::= Name Eq AttValue + * + * Returns 0: success, + * -1: error - consumed tag input (that is dumped invalid 
tag), + * -2: error - rewound input to tag start - handle next char as text */ -static void -htmlParseElementInternal(htmlParserCtxtPtr ctxt) { +static int +htmlParseElementNonRecursive(htmlParserCtxtPtr ctxt) { const xmlChar *name; const htmlElemDesc * info; htmlParserNodeInfo node_info = { 0, }; int failed; + xmlParserInputPtr startPos = NULL; if ((ctxt == NULL) || (ctxt->input == NULL)) { htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, - "htmlParseElementInternal: context error\n", NULL, NULL); - return; + "htmlParseElementNonRecursive: context error\n", NULL, NULL); + return -1; } if (ctxt->instate == XML_PARSER_EOF) - return; + return -1; /* Capture start position */ if (ctxt->record_info) { @@ -4384,13 +4476,18 @@ (CUR_PTR - ctxt->input->base); node_info.begin_line = ctxt->input->line; } + startPos = xmlNewInputStream(ctxt); + if (startPos != NULL) + *startPos = *ctxt->input; + else + return -1; failed = htmlParseStartTag(ctxt); name = ctxt->name; if ((failed == -1) || (name == NULL)) { if (CUR == '>') NEXT; - return; + goto errorDoRewind; } /* @@ -4410,7 +4507,7 @@ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, name); htmlnamePop(ctxt); - return; + goto errorDoConsume; } if (CUR == '>') { @@ -4430,7 +4527,7 @@ if (ctxt->record_info) htmlNodeInfoPush(ctxt, &node_info); htmlParserFinishElementParsing(ctxt); - return; + goto errorDoConsume; } /* @@ -4440,26 +4537,56 @@ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, name); htmlnamePop(ctxt); - return; + goto errorDoConsume; } if (ctxt->record_info) htmlNodeInfoPush(ctxt, &node_info); + + /* Success */ + failed = 0; + goto done; + + /* Error occurred, leave input consumed. */ +errorDoConsume: + failed = -1; + goto done; + + /* + * Error occurred, rewind parser to tag start position. 
+ * (but only if HTML_PARSE_RECOVER is set, otherwise leave input consumed) + */ +errorDoRewind: + if (ctxt->recovery) { + failed = -2; + if (startPos) + *ctxt->input = *startPos; + } else { + failed = -1; + } + goto done; + +done: + if (startPos != NULL) + xmlFree(startPos); + + return failed; } /** - * htmlParseContentInternal: + * htmlParseContentNonRecursive: * @ctxt: an HTML parser context * * Parse a content: comment, sub-element, reference or text. - * New version for non recursive htmlParseElementInternal + * New version for non recursive htmlParseElementNonRecursive */ static void -htmlParseContentInternal(htmlParserCtxtPtr ctxt) { +htmlParseContentNonRecursive(htmlParserCtxtPtr ctxt) { xmlChar *currentNode; int depth; const xmlChar *name; + int res; currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; @@ -4570,11 +4697,20 @@ * Third case : a sub-element. */ else if (CUR == '<') { - htmlParseElementInternal(ctxt); + res = htmlParseElementNonRecursive(ctxt); if (currentNode != NULL) xmlFree(currentNode); currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; + + /* + * If it was an invalid element tag and parser position was + * rewound to tag start, then consume next input as text. + */ + if (res == -2) { + NEXT; + htmlParseCharDataInternal(ctxt, '<'); + } } /* @@ -4618,14 +4754,14 @@ * htmlParseContent: * @ctxt: an HTML parser context * - * Parse a content: comment, sub-element, reference or text. + * Parse a content non-recursive: comment, sub-element, reference or text. * This is the entry point when called from parser.c */ void __htmlParseContent(void *ctxt) { if (ctxt != NULL) - htmlParseContentInternal((htmlParserCtxtPtr) ctxt); + htmlParseContentNonRecursive((htmlParserCtxtPtr) ctxt); } /** @@ -4732,7 +4868,7 @@ /* * Time to start parsing the tree itself */ - htmlParseContentInternal(ctxt); + htmlParseContentNonRecursive(ctxt); /* * autoclose