[libxml2] Add HTML parser support for HTML5 meta charset encoding declaration



commit 868d92da8915fc5dc5e329d93cc7882370a28475
Author: Denis Pauk <pauk denis gmail com>
Date:   Thu May 10 15:34:57 2012 +0800

    Add HTML parser support for HTML5 meta charset encoding declaration
    
    For https://bugzilla.gnome.org/show_bug.cgi?id=655218
    
    http://www.w3.org/TR/2011/WD-html5-20110525/semantics.html#the-meta-element
    
    """
    The charset attribute specifies the character encoding used by the document.
    This is a character encoding declaration. If the attribute is present in an XML
    document, its value must be an ASCII case-insensitive match for the string
    "UTF-8" (and the document is therefore forced to use UTF-8 as its
    encoding).
    """
    
    However, while <meta http-equiv="Content-Type" content="text/html;
    charset=utf8"> works, <meta charset="utf8"> does not.
    
    While the libxml2 HTML parser is not tuned for HTML5, this is a simple
    addition.
    
    Also added a test case.

 .gitignore                     |    3 --
 HTMLparser.c                   |   53 +++++++++++++++++++++++++++++----------
 result/HTML/html5_enc.html     |    7 +++++
 result/HTML/html5_enc.html.sax |   30 ++++++++++++++++++++++
 test/HTML/html5_enc.html       |    8 ++++++
 5 files changed, 84 insertions(+), 17 deletions(-)
---
diff --git a/.gitignore b/.gitignore
index e360fff..55bd73b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,6 @@
 *.o
 *.lo
-*.xml
 *.log
-*.rng
-*.html
 *.patch
 .deps
 .libs
diff --git a/HTMLparser.c b/HTMLparser.c
index 5580b18..2eb3fb4 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -727,7 +727,7 @@ static const char* const map_contents[] = { BLOCK, "area", NULL } ;
 static const char* const name_attr[] = { "name", NULL } ;
 static const char* const action_attr[] = { "action", NULL } ;
 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
-static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
+static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
 static const char* const content_attr[] = { "content", NULL } ;
 static const char* const type_attr[] = { "type", NULL } ;
 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
@@ -3435,20 +3435,19 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
 }
 
 /**
- * htmlCheckEncoding:
+ * htmlCheckEncodingDirect:
  * @ctxt:  an HTML parser context
  * @attvalue: the attribute value
  *
- * Checks an http-equiv attribute from a Meta tag to detect
+ * Checks an attribute value to detect
  * the encoding
  * If a new encoding is detected the parser is switched to decode
  * it and pass UTF8
  */
 static void
-htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
-    const xmlChar *encoding;
+htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
 
-    if ((ctxt == NULL) || (attvalue == NULL) ||
+    if ((ctxt == NULL) || (encoding == NULL) ||
         (ctxt->options & HTML_PARSE_IGNORE_ENC))
 	return;
 
@@ -3456,14 +3455,6 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
     if (ctxt->input->encoding != NULL)
         return;
 
-    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
-    if (encoding != NULL) {
-	encoding += 8;
-    } else {
-	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
-	if (encoding != NULL)
-	    encoding += 9;
-    }
     if (encoding != NULL) {
 	xmlCharEncoding enc;
 	xmlCharEncodingHandlerPtr handler;
@@ -3536,6 +3527,38 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
 }
 
 /**
+ * htmlCheckEncoding:
+ * @ctxt:  an HTML parser context
+ * @attvalue: the attribute value
+ *
+ * Checks an http-equiv attribute from a Meta tag to detect
+ * the encoding
+ * If a new encoding is detected the parser is switched to decode
+ * it and pass UTF8
+ */
+static void
+htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
+    const xmlChar *encoding;
+
+    if (!attvalue)
+	return;
+
+    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
+    if (encoding != NULL) {
+	encoding += 7;
+    }
+    /*
+     * skip blank
+     */
+    if (encoding && IS_BLANK_CH(*encoding))
+	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
+    if (encoding && *encoding == '=') {
+	encoding ++;
+	htmlCheckEncodingDirect(ctxt, encoding);
+    }
+}
+
+/**
  * htmlCheckMeta:
  * @ctxt:  an HTML parser context
  * @atts:  the attributes values
@@ -3559,6 +3582,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 	    http = 1;
+	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
+	    htmlCheckEncodingDirect(ctxt, value);
 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
 	    content = value;
 	att = atts[i++];
diff --git a/result/HTML/html5_enc.html b/result/HTML/html5_enc.html
new file mode 100644
index 0000000..596d54d
--- /dev/null
+++ b/result/HTML/html5_enc.html
@@ -0,0 +1,7 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+<html>
+<head><meta charset="iso-8859-1"></head>
+<body>
+  <p>tr&egrave;s</p>
+</body>
+</html>
diff --git a/result/HTML/html5_enc.html.err b/result/HTML/html5_enc.html.err
new file mode 100644
index 0000000..e69de29
diff --git a/result/HTML/html5_enc.html.sax b/result/HTML/html5_enc.html.sax
new file mode 100644
index 0000000..292be57
--- /dev/null
+++ b/result/HTML/html5_enc.html.sax
@@ -0,0 +1,30 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElement(html)
+SAX.ignorableWhitespace(
+, 1)
+SAX.startElement(head)
+SAX.ignorableWhitespace(
+, 1)
+SAX.startElement(meta, charset='iso-8859-1')
+SAX.endElement(meta)
+SAX.ignorableWhitespace(
+, 1)
+SAX.endElement(head)
+SAX.ignorableWhitespace(
+, 1)
+SAX.startElement(body)
+SAX.characters(
+  , 3)
+SAX.startElement(p)
+SAX.characters(tr&egrave;s, 5)
+SAX.endElement(p)
+SAX.characters(
+, 1)
+SAX.endElement(body)
+SAX.ignorableWhitespace(
+, 1)
+SAX.endElement(html)
+SAX.ignorableWhitespace(
+, 1)
+SAX.endDocument()
diff --git a/test/HTML/html5_enc.html b/test/HTML/html5_enc.html
new file mode 100644
index 0000000..3ebf491
--- /dev/null
+++ b/test/HTML/html5_enc.html
@@ -0,0 +1,8 @@
+<html>
+<head>
+<meta charset="iso-8859-1"/>
+</head>
+<body>
+  <p>très</p>
+</body>
+</html>



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]