[libxml2] Detect change of encoding when parsing HTML names



commit beca86e8c86984b967a6efa05a9653470253edda
Author: Hugh Davenport <hugh davenport net nz>
Date:   Wed May 4 11:23:49 2016 +0800

    Detect change of encoding when parsing HTML names
    
    From https://bugzilla.gnome.org/show_bug.cgi?id=758518
    
    This happens when a name is being parsed in a file that has no valid
    encoding set, so libxml has to guess what the encoding is. This patch
    detects when the input buffer location changes and, if it does, restarts
    the parsing of the name.
    
    This slightly changes the output of a couple of regression tests

 HTMLparser.c                |    8 ++++++++
 result/HTML/758605.html     |    2 +-
 result/HTML/758605.html.err |    2 +-
 result/HTML/758605.html.sax |    3 ++-
 4 files changed, 12 insertions(+), 3 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 1c112cc..c6fcbc9 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2492,6 +2492,7 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
     int len = 0, l;
     int c;
     int count = 0;
+    const xmlChar *base = ctxt->input->base;
 
     /*
      * Handler for more complex cases
@@ -2517,6 +2518,13 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
        len += l;
        NEXTL(l);
        c = CUR_CHAR(l);
+       if (ctxt->input->base != base) {
+           /*
+            * We changed encoding from an unknown encoding
+            * Input buffer changed location, so we better start again
+            */
+           return(htmlParseNameComplex(ctxt));
+       }
     }
 
     if (ctxt->input->base > ctxt->input->cur - len)
diff --git a/result/HTML/758605.html b/result/HTML/758605.html
index a085cce..60b01d3 100644
--- a/result/HTML/758605.html
+++ b/result/HTML/758605.html
@@ -1,3 +1,3 @@
 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
-<html><body><p>&amp;
+<html><body><p>&amp;&ecirc;
 </p></body></html>
diff --git a/result/HTML/758605.html.err b/result/HTML/758605.html.err
index 2b82be6..2086f96 100644
--- a/result/HTML/758605.html.err
+++ b/result/HTML/758605.html.err
@@ -1,3 +1,3 @@
-./test/HTML/758605.html:1: HTML parser error : htmlParseEntityRef: no name
+./test/HTML/758605.html:1: HTML parser error : htmlParseEntityRef: expecting ';'
 ê
   ^
diff --git a/result/HTML/758605.html.sax b/result/HTML/758605.html.sax
index 1f5cd32..c6e0986 100644
--- a/result/HTML/758605.html.sax
+++ b/result/HTML/758605.html.sax
@@ -1,10 +1,11 @@
 SAX.setDocumentLocator()
 SAX.startDocument()
-SAX.error: htmlParseEntityRef: no name
+SAX.error: htmlParseEntityRef: expecting ';'
 SAX.startElement(html)
 SAX.startElement(body)
 SAX.startElement(p)
 SAX.characters(&amp;, 1)
+SAX.characters(&ecirc;, 2)
 SAX.ignorableWhitespace(
 , 1)
 SAX.endElement(p)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]