commit 7a7620af4b806d9a67ac883a0f84479e3a9fda6b Author: Olli Pottonen Date: Sun Jun 28 11:38:26 2015 +1000 Improve encoding detection. Call xmlDetectCharEncoding() even if there are fewer than 4 bytes of input; 2 or 3 bytes may be enough. Avoid unnecessary copying of data to a local array. diff --git a/HTMLparser.c b/HTMLparser.c index 8717d0b..9c4ec04 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -4619,7 +4619,6 @@ __htmlParseContent(void *ctxt) { int htmlParseDocument(htmlParserCtxtPtr ctxt) { - xmlChar start[4]; xmlCharEncoding enc; xmlDtdPtr dtd; @@ -4641,18 +4640,12 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); - if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { + if (ctxt->encoding == NULL) { /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE * plug some encoding conversion routines. 
*/ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(&start[0], 4); + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); if (enc != XML_CHAR_ENCODING_NONE) { xmlSwitchEncoding(ctxt, enc); } diff --git a/parser.c b/parser.c index 0e73d47..af637ca 100644 --- a/parser.c +++ b/parser.c @@ -2623,9 +2623,7 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) { } else { if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) || (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) { - xmlChar start[4]; xmlCharEncoding enc; - /* * Note: external parameter entities will not be loaded, it * is not required for a non-validating parser, unless the @@ -2664,16 +2662,11 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) { GROW if (ctxt->instate == XML_PARSER_EOF) return; - if ((ctxt->input->end - ctxt->input->cur)>=4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + int len = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, len); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); + } if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) && (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l' )) && @@ -7080,18 +7073,12 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID, xmlDetectSAX2(ctxt); GROW; - if ((ctxt->encoding == NULL) && - (ctxt->input->end - ctxt->input->cur >= 4)) { - xmlChar start[4]; - xmlCharEncoding enc; - - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) + if (ctxt->encoding == NULL) { + int length = ctxt->input->end - ctxt->input->cur; + xmlCharEncoding enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != 
XML_CHAR_ENCODING_NONE) { xmlSwitchEncoding(ctxt, enc); + } } if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) { @@ -10803,7 +10790,6 @@ xmlParseMisc(xmlParserCtxtPtr ctxt) { int xmlParseDocument(xmlParserCtxtPtr ctxt) { - xmlChar start[4]; xmlCharEncoding enc; xmlInitParser(); @@ -10826,18 +10812,9 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { if (ctxt->instate == XML_PARSER_EOF) return(-1); - if ((ctxt->encoding == NULL) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(&start[0], 4); + if (ctxt->encoding == NULL) { + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); if (enc != XML_CHAR_ENCODING_NONE) { xmlSwitchEncoding(ctxt, enc); } @@ -10997,9 +10974,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { int xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { - xmlChar start[4]; xmlCharEncoding enc; - if ((ctxt == NULL) || (ctxt->input == NULL)) return(-1); @@ -11016,19 +10991,12 @@ xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE * plug some encoding conversion routines. 
*/ - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); } @@ -11428,28 +11396,19 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { goto done; case XML_PARSER_START: if (ctxt->charset == XML_CHAR_ENCODING_NONE) { - xmlChar start[4]; xmlCharEncoding enc; - /* * Very first chars read from the document flow. */ if (avail < 4) goto done; - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines, - * else xmlSwitchEncoding will set to (default) - * UTF8. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - xmlSwitchEncoding(ctxt, enc); + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); + } + break; } @@ -12583,13 +12542,10 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data, xmlParserInputBufferPtr buf; xmlCharEncoding enc = XML_CHAR_ENCODING_NONE; - /* - * plug some encoding conversion routines - */ - if ((chunk != NULL) && (size >= 4)) - enc = xmlDetectCharEncoding((const xmlChar *) chunk, size); + if (chunk != NULL) + enc = xmlDetectCharEncoding((const xmlChar *) chunk, size); - buf = xmlAllocParserInputBuffer(enc); + buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE); if (buf == NULL) return(NULL); ctxt = xmlNewParserCtxt(); @@ -12791,7 +12747,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, xmlDtdPtr ret = NULL; xmlParserCtxtPtr ctxt; xmlParserInputPtr pinput = NULL; - 
xmlChar start[4]; if (input == NULL) return(NULL); @@ -12838,6 +12793,12 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, } if (enc != XML_CHAR_ENCODING_NONE) { xmlSwitchEncoding(ctxt, enc); + } else { + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); + } } pinput->filename = NULL; @@ -12860,23 +12821,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none", BAD_CAST "none", BAD_CAST "none"); - if ((enc == XML_CHAR_ENCODING_NONE) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } - xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none"); if (ctxt->myDoc != NULL) { @@ -12979,9 +12923,10 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID, xmlFree(systemIdCanonic); return(NULL); } - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - enc = xmlDetectCharEncoding(ctxt->input->cur, 4); - xmlSwitchEncoding(ctxt, enc); + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); } if (input->filename == NULL) @@ -13084,7 +13029,6 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL, xmlNodePtr newRoot; xmlSAXHandlerPtr oldsax = NULL; int ret = 0; - xmlChar start[4]; xmlCharEncoding enc; if (ctx == NULL) return(-1); @@ -13150,15 +13094,11 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL, * plug some encoding conversion routines. 
*/ GROW - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } + + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); } /* @@ -13293,7 +13233,6 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, xmlNodePtr newRoot; xmlSAXHandlerPtr oldsax = NULL; xmlParserErrors ret = XML_ERR_OK; - xmlChar start[4]; xmlCharEncoding enc; if (((depth > 40) && @@ -13375,20 +13314,13 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, newRoot->doc = doc; /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE * plug some encoding conversion routines. */ GROW; - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } + int length = ctxt->input->end - ctxt->input->cur; + enc = xmlDetectCharEncoding(ctxt->input->cur, length); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); } /* @@ -15132,8 +15064,9 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk, if (ctxt == NULL) return(1); - if ((encoding == NULL) && (chunk != NULL) && (size >= 4)) - enc = xmlDetectCharEncoding((const xmlChar *) chunk, size); + if ((encoding == NULL) && (chunk != NULL)) { + enc = xmlDetectCharEncoding(BAD_CAST chunk, size); + } buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE); if (buf == NULL)