Hi Daniel, All,

(A) Find attached patches for parser.c and tree.c, which address the problem that iconv-less libxml2 tries to handle ISO-8859-2 through ISO-8859-9 in an outdated mode (storing in-memory strings using the document encoding). After applying the patch, iconv-less libxml2 will only handle UTF-8, UTF-16 and ISO-8859-1. Use the diff -cb output to see more easily what was done, and the diff -c output for actual patching.

(B) The 8859x-tests.tar.gz archive holds sample XML files demonstrating the character repertoire of ISO-8859-1 through ISO-8859-16, both in native encoding and in UTF-8. You can check the transcoding with tests like:

xmllint --encode UTF-8 /tests/8859-15.xml | cmp - /tests/utf8-15.xml
xmllint --encode ISO-8859-15 /tests/utf8-15.xml | cmp - /tests/8859-15.xml

A slightly different test is:

xmllint --debug /tests/8859-15.xml >8859-15.debug
xmllint --debug /tests/utf8-15.xml >utf8-15.debug
diff 8859-15.debug utf8-15.debug

(C) The 8859x-lint.tar.gz archive holds reasonably efficient ISO-8859-* transcoders which can be registered with xmlNewCharEncodingHandler. As provided, the code is to be linked with the app, not into the libxml2 DLL, but this is a matter of switching some lines. A patch for xmllint.c is included, which can be used to link the transcoders into xmllint and test them there. The additional xmllint parameter --iso8859x will switch them on.

I'm fine with having them on the app side, and I suspect not many users are running without iconv, so I don't know whether to put them into libxml2.

Regards,
Peter Jacobi
*** ../../base/libxml2-2.5.8/tree.c Wed Jun 04 17:39:09 2003
--- tree.c Sat Jul 19 13:07:23 2003
***************
*** 961,966 ****
--- 961,971 ----
cur->standalone = -1;
cur->compression = -1; /* not initialized */
cur->doc = cur;
+ /*
+ * The in memory encoding is always UTF8
+ * This field will never change and would
+ * be obsolete if not for binary compatibility.
+ */
cur->charset = XML_CHAR_ENCODING_UTF8;
if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
***************
*** 7824,7830 ****
int format) {
int dummy = 0;
- xmlCharEncoding doc_charset;
xmlOutputBufferPtr out_buff = NULL;
xmlCharEncodingHandlerPtr conv_hdlr = NULL;
--- 7829,7834 ----
***************
*** 7857,7872 ****
if (txt_encoding == NULL)
txt_encoding = (const char *) out_doc->encoding;
if (txt_encoding != NULL) {
- doc_charset = xmlParseCharEncoding(txt_encoding);
-
- if (out_doc->charset != XML_CHAR_ENCODING_UTF8) {
- xmlGenericError(xmlGenericErrorContext,
- "xmlDocDumpFormatMemoryEnc: Source document not in UTF8\n");
- return;
-
- } else if (doc_charset != XML_CHAR_ENCODING_UTF8) {
- conv_hdlr = xmlGetCharEncodingHandler(doc_charset);
- if (conv_hdlr == NULL)
conv_hdlr = xmlFindCharEncodingHandler(txt_encoding);
if ( conv_hdlr == NULL ) {
xmlGenericError(xmlGenericErrorContext,
--- 7861,7866 ----
***************
*** 7878,7884 ****
return;
}
}
- }
if ((out_buff = xmlAllocOutputBuffer(conv_hdlr)) == NULL ) {
xmlGenericError(xmlGenericErrorContext,
--- 7872,7877 ----
***************
*** 8042,8066 ****
encoding = (const char *) cur->encoding;
if (encoding != NULL) {
- xmlCharEncoding enc;
-
- enc = xmlParseCharEncoding(encoding);
-
- if (cur->charset != XML_CHAR_ENCODING_UTF8) {
- xmlGenericError(xmlGenericErrorContext,
- "xmlDocDump: document not in UTF8\n");
- return(-1);
- }
- if (enc != XML_CHAR_ENCODING_UTF8) {
- handler = xmlGetCharEncodingHandler(enc);
- if (handler == NULL)
handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL) {
xmlFree((char *) cur->encoding);
cur->encoding = NULL;
}
}
- }
buf = xmlOutputBufferCreateFile(f, handler);
if (buf == NULL) return(-1);
xmlDocContentDumpOutput(buf, cur, NULL, format);
--- 8035,8046 ----
***************
*** 8144,8150 ****
const char * encoding, int format ) {
xmlOutputBufferPtr buf;
xmlCharEncodingHandlerPtr handler = NULL;
- xmlCharEncoding enc;
int ret;
if (cur == NULL)
--- 8124,8129 ----
***************
*** 8155,8171 ****
if (encoding != NULL) {
- enc = xmlParseCharEncoding(encoding);
- if (cur->charset != XML_CHAR_ENCODING_UTF8) {
- xmlGenericError(xmlGenericErrorContext,
- "xmlSaveFormatFileEnc: document not in UTF8\n");
- return(-1);
- }
- if (enc != XML_CHAR_ENCODING_UTF8) {
handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL)
return(-1);
- }
}
#ifdef HAVE_ZLIB_H
--- 8134,8142 ----
*** ../../base/libxml2-2.5.8/tree.c Wed Jun 04 17:39:09 2003
--- tree.c Sat Jul 19 13:07:23 2003
***************
*** 961,966 ****
--- 961,971 ----
cur->standalone = -1;
cur->compression = -1; /* not initialized */
cur->doc = cur;
+ /*
+ * The in memory encoding is always UTF8
+ * This field will never change and would
+ * be obsolete if not for binary compatibility.
+ */
cur->charset = XML_CHAR_ENCODING_UTF8;
if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
***************
*** 7824,7830 ****
int format) {
int dummy = 0;
- xmlCharEncoding doc_charset;
xmlOutputBufferPtr out_buff = NULL;
xmlCharEncodingHandlerPtr conv_hdlr = NULL;
--- 7829,7834 ----
***************
*** 7857,7882 ****
if (txt_encoding == NULL)
txt_encoding = (const char *) out_doc->encoding;
if (txt_encoding != NULL) {
! doc_charset = xmlParseCharEncoding(txt_encoding);
!
! if (out_doc->charset != XML_CHAR_ENCODING_UTF8) {
xmlGenericError(xmlGenericErrorContext,
! "xmlDocDumpFormatMemoryEnc: Source document not in UTF8\n");
return;
-
- } else if (doc_charset != XML_CHAR_ENCODING_UTF8) {
- conv_hdlr = xmlGetCharEncodingHandler(doc_charset);
- if (conv_hdlr == NULL)
- conv_hdlr = xmlFindCharEncodingHandler(txt_encoding);
- if ( conv_hdlr == NULL ) {
- xmlGenericError(xmlGenericErrorContext,
- "%s: %s %s '%s'\n",
- "xmlDocDumpFormatMemoryEnc",
- "Failed to identify encoding handler for",
- "character set",
- txt_encoding);
- return;
- }
}
}
--- 7861,7875 ----
if (txt_encoding == NULL)
txt_encoding = (const char *) out_doc->encoding;
if (txt_encoding != NULL) {
! conv_hdlr = xmlFindCharEncodingHandler(txt_encoding);
! if ( conv_hdlr == NULL ) {
xmlGenericError(xmlGenericErrorContext,
! "%s: %s %s '%s'\n",
! "xmlDocDumpFormatMemoryEnc",
! "Failed to identify encoding handler for",
! "character set",
! txt_encoding);
return;
}
}
***************
*** 8042,8064 ****
encoding = (const char *) cur->encoding;
if (encoding != NULL) {
! xmlCharEncoding enc;
!
! enc = xmlParseCharEncoding(encoding);
!
! if (cur->charset != XML_CHAR_ENCODING_UTF8) {
! xmlGenericError(xmlGenericErrorContext,
! "xmlDocDump: document not in UTF8\n");
! return(-1);
! }
! if (enc != XML_CHAR_ENCODING_UTF8) {
! handler = xmlGetCharEncodingHandler(enc);
! if (handler == NULL)
! handler = xmlFindCharEncodingHandler(encoding);
! if (handler == NULL) {
! xmlFree((char *) cur->encoding);
! cur->encoding = NULL;
! }
}
}
buf = xmlOutputBufferCreateFile(f, handler);
--- 8035,8044 ----
encoding = (const char *) cur->encoding;
if (encoding != NULL) {
! handler = xmlFindCharEncodingHandler(encoding);
! if (handler == NULL) {
! xmlFree((char *) cur->encoding);
! cur->encoding = NULL;
}
}
buf = xmlOutputBufferCreateFile(f, handler);
***************
*** 8144,8150 ****
const char * encoding, int format ) {
xmlOutputBufferPtr buf;
xmlCharEncodingHandlerPtr handler = NULL;
- xmlCharEncoding enc;
int ret;
if (cur == NULL)
--- 8124,8129 ----
***************
*** 8155,8171 ****
if (encoding != NULL) {
! enc = xmlParseCharEncoding(encoding);
! if (cur->charset != XML_CHAR_ENCODING_UTF8) {
! xmlGenericError(xmlGenericErrorContext,
! "xmlSaveFormatFileEnc: document not in UTF8\n");
return(-1);
- }
- if (enc != XML_CHAR_ENCODING_UTF8) {
- handler = xmlFindCharEncodingHandler(encoding);
- if (handler == NULL)
- return(-1);
- }
}
#ifdef HAVE_ZLIB_H
--- 8134,8142 ----
if (encoding != NULL) {
! handler = xmlFindCharEncodingHandler(encoding);
! if (handler == NULL)
return(-1);
}
#ifdef HAVE_ZLIB_H
*** ../../base/libxml2-2.5.8/parser.c Tue Jul 01 14:26:07 2003
--- parser.c Sat Jul 19 12:37:27 2003
***************
*** 7568,7605 ****
if (ctxt->recovery == 0) ctxt->disableSAX = 1;
}
if (encoding != NULL) {
- xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = encoding;
! enc = xmlParseCharEncoding((const char *) encoding);
! /*
! * registered set of known encodings
! */
! if (enc != XML_CHAR_ENCODING_ERROR) {
! xmlSwitchEncoding(ctxt, enc);
! if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
! ctxt->input->encoding = NULL;
! xmlFree(encoding);
! return(NULL);
! }
} else {
! /*
! * fallback for unknown encodings
! */
! handler = xmlFindCharEncodingHandler((const char *) encoding);
! if (handler != NULL) {
! xmlSwitchToEncoding(ctxt, handler);
! } else {
! ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "Unsupported encoding %s\n", encoding);
! return(NULL);
! }
}
}
}
--- 7568,7588 ----
if (ctxt->recovery == 0) ctxt->disableSAX = 1;
}
if (encoding != NULL) {
xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = encoding;
! handler = xmlFindCharEncodingHandler((const char *) encoding);
! if (handler != NULL) {
! xmlSwitchToEncoding(ctxt, handler);
} else {
! ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "Unsupported encoding %s\n", encoding);
! return(NULL);
}
}
}
*** ../../base/libxml2-2.5.8/parser.c Tue Jul 01 14:26:07 2003
--- parser.c Sat Jul 19 12:37:27 2003
***************
*** 7568,7595 ****
if (ctxt->recovery == 0) ctxt->disableSAX = 1;
}
if (encoding != NULL) {
- xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = encoding;
- enc = xmlParseCharEncoding((const char *) encoding);
- /*
- * registered set of known encodings
- */
- if (enc != XML_CHAR_ENCODING_ERROR) {
- xmlSwitchEncoding(ctxt, enc);
- if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
- ctxt->input->encoding = NULL;
- xmlFree(encoding);
- return(NULL);
- }
- } else {
- /*
- * fallback for unknown encodings
- */
handler = xmlFindCharEncodingHandler((const char *) encoding);
if (handler != NULL) {
xmlSwitchToEncoding(ctxt, handler);
--- 7568,7579 ----
***************
*** 7599,7605 ****
ctxt->sax->error(ctxt->userData,
"Unsupported encoding %s\n", encoding);
return(NULL);
- }
}
}
}
--- 7583,7588 ----
Attachment:
8859x-tests.tar.gz
Description: Binary data
Attachment:
8859x-lint.tar.gz
Description: Binary data