[libxml2] 579317 Try to find the HTML encoding information
- From: Daniel Veillard <veillard src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [libxml2] 579317 Try to find the HTML encoding information
- Date: Wed, 12 Aug 2009 21:01:01 +0000 (UTC)
commit 533ec0e073488039972fafa57e360e4c47d68dd4
Author: Daniel Veillard <veillard redhat com>
Date: Wed Aug 12 20:13:38 2009 +0200
579317 Try to find the HTML encoding information
* HTMLparser.c: if we hit an encoding error before parsing a potential
<meta> with the info, look in the input buffer to see if we can find
it instead of forcing a blind switch to ISO-8859-1
HTMLparser.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 74 insertions(+), 2 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index da17efe..3a03a3e 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -288,6 +288,58 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
else i += xmlCopyChar(l,&b[i],v)
/**
+ * htmlFindEncoding:
+ * @ctxt: the HTML parser context
+ *
+ * Try to find an encoding in the current data available in the input
+ * buffer; this is needed to try to switch to the proper encoding when
+ * one faces a character error.
+ * That's a heuristic: since it's operating outside of parsing, it could
+ * try to use a meta which had been commented out; that's the reason it
+ * should only be used in case of error, not as a default.
+ *
+ * Returns an encoding string or NULL if not found; the string needs to
+ * be freed
+ */
+static xmlChar *
+htmlFindEncoding(xmlParserCtxtPtr ctxt) {
+ const xmlChar *start, *cur, *end;
+
+ if ((ctxt == NULL) || (ctxt->input == NULL) ||
+ (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
+ (ctxt->input->buf->encoder != NULL))
+ return(NULL);
+ if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
+ return(NULL);
+
+ start = ctxt->input->cur;
+ end = ctxt->input->end;
+ /* we also expect the input buffer to be zero terminated */
+ if (*end != 0)
+ return(NULL);
+
+ cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
+ if (cur == NULL)
+ return(NULL);
+ cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
+ if (cur == NULL)
+ return(NULL);
+ cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
+ if (cur == NULL)
+ return(NULL);
+ cur += 8;
+ start = cur;
+ while (((*cur >= 'A') && (*cur <= 'Z')) ||
+ ((*cur >= 'a') && (*cur <= 'z')) ||
+ ((*cur >= '0') && (*cur <= '9')) ||
+ (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
+ cur++;
+ if (cur == start)
+ return(NULL);
+ return(xmlStrndup(start, cur - start));
+}
+
+/**
* htmlCurrentChar:
* @ctxt: the HTML parser context
* @len: pointer to the length of the char read
@@ -386,8 +438,28 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
/*
* Humm this is bad, do an automatic flow conversion
*/
- xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
+ {
+ xmlChar * guess;
+ xmlCharEncodingHandlerPtr handler;
+
+ guess = htmlFindEncoding(ctxt);
+ if (guess == NULL) {
+ xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
+ } else {
+ if (ctxt->input->encoding != NULL)
+ xmlFree((xmlChar *) ctxt->input->encoding);
+ ctxt->input->encoding = guess;
+ handler = xmlFindCharEncodingHandler((const char *) guess);
+ if (handler != NULL) {
+ xmlSwitchToEncoding(ctxt, handler);
+ } else {
+ htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
+ "Unsupported encoding %s", guess, NULL);
+ }
+ }
+ ctxt->charset = XML_CHAR_ENCODING_UTF8;
+ }
+
return(xmlCurrentChar(ctxt, len));
encoding_error:
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]