Hi Daniel, All, (A) Please find attached patches for parser.c and tree.c, which address the problem that iconv-less libxml2 tries to handle ISO-8859-2 ... ISO-8859-9 in an outdated mode (storing in-memory strings using the document encoding). After applying the patch, iconv-less libxml2 will only handle UTF-8, UTF-16 and ISO-8859-1. Use the diff -cb output to see more easily what was done, and the diff -c output for actual patching. (B) The 8859x-tests.tar.gz archive holds sample XML files demonstrating the character repertoire of ISO-8859-1 ... ISO-8859-16, both in native encoding and in UTF-8. You can check the transcoding with tests like: xmllint --encode UTF-8 /tests/8859-15.xml | cmp - /tests/utf8-15.xml xmllint --encode ISO-8859-15 /tests/utf8-15.xml | cmp - /tests/8859-15.xml A slightly different test is: xmllint --debug /tests/8859-15.xml >8859-15.debug xmllint --debug /tests/utf8-15.xml >utf8-15.debug diff 8859-15.debug utf8-15.debug (C) The 8859x-lint.tar.gz archive holds reasonably efficient ISO-8859-* transcoders which can be registered with xmlNewCharEncodingHandler. As provided, the code is to be linked with the app, not into the libxml2 DLL, but this is a matter of switching some lines. A patch for xmllint.c is included, which can be used to link the transcoders into xmllint and test them there. The additional xmllint parameter --iso8859x will switch them on. I'm fine with having them on the app side, and I suspect not many users are running without iconv, so I don't know whether to put them into libxml2. Regards, Peter Jacobi
*** ../../base/libxml2-2.5.8/tree.c Wed Jun 04 17:39:09 2003 --- tree.c Sat Jul 19 13:07:23 2003 *************** *** 961,966 **** --- 961,971 ---- cur->standalone = -1; cur->compression = -1; /* not initialized */ cur->doc = cur; + /* + * The in memory encoding is always UTF8 + * This field will never change and would + * be obsolete if not for binary compatibility. + */ cur->charset = XML_CHAR_ENCODING_UTF8; if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue)) *************** *** 7824,7830 **** int format) { int dummy = 0; - xmlCharEncoding doc_charset; xmlOutputBufferPtr out_buff = NULL; xmlCharEncodingHandlerPtr conv_hdlr = NULL; --- 7829,7834 ---- *************** *** 7857,7872 **** if (txt_encoding == NULL) txt_encoding = (const char *) out_doc->encoding; if (txt_encoding != NULL) { - doc_charset = xmlParseCharEncoding(txt_encoding); - - if (out_doc->charset != XML_CHAR_ENCODING_UTF8) { - xmlGenericError(xmlGenericErrorContext, - "xmlDocDumpFormatMemoryEnc: Source document not in UTF8\n"); - return; - - } else if (doc_charset != XML_CHAR_ENCODING_UTF8) { - conv_hdlr = xmlGetCharEncodingHandler(doc_charset); - if (conv_hdlr == NULL) conv_hdlr = xmlFindCharEncodingHandler(txt_encoding); if ( conv_hdlr == NULL ) { xmlGenericError(xmlGenericErrorContext, --- 7861,7866 ---- *************** *** 7878,7884 **** return; } } - } if ((out_buff = xmlAllocOutputBuffer(conv_hdlr)) == NULL ) { xmlGenericError(xmlGenericErrorContext, --- 7872,7877 ---- *************** *** 8042,8066 **** encoding = (const char *) cur->encoding; if (encoding != NULL) { - xmlCharEncoding enc; - - enc = xmlParseCharEncoding(encoding); - - if (cur->charset != XML_CHAR_ENCODING_UTF8) { - xmlGenericError(xmlGenericErrorContext, - "xmlDocDump: document not in UTF8\n"); - return(-1); - } - if (enc != XML_CHAR_ENCODING_UTF8) { - handler = xmlGetCharEncodingHandler(enc); - if (handler == NULL) handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) { xmlFree((char *) 
cur->encoding); cur->encoding = NULL; } } - } buf = xmlOutputBufferCreateFile(f, handler); if (buf == NULL) return(-1); xmlDocContentDumpOutput(buf, cur, NULL, format); --- 8035,8046 ---- *************** *** 8144,8150 **** const char * encoding, int format ) { xmlOutputBufferPtr buf; xmlCharEncodingHandlerPtr handler = NULL; - xmlCharEncoding enc; int ret; if (cur == NULL) --- 8124,8129 ---- *************** *** 8155,8171 **** if (encoding != NULL) { - enc = xmlParseCharEncoding(encoding); - if (cur->charset != XML_CHAR_ENCODING_UTF8) { - xmlGenericError(xmlGenericErrorContext, - "xmlSaveFormatFileEnc: document not in UTF8\n"); - return(-1); - } - if (enc != XML_CHAR_ENCODING_UTF8) { handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) return(-1); - } } #ifdef HAVE_ZLIB_H --- 8134,8142 ----
*** ../../base/libxml2-2.5.8/tree.c Wed Jun 04 17:39:09 2003 --- tree.c Sat Jul 19 13:07:23 2003 *************** *** 961,966 **** --- 961,971 ---- cur->standalone = -1; cur->compression = -1; /* not initialized */ cur->doc = cur; + /* + * The in memory encoding is always UTF8 + * This field will never change and would + * be obsolete if not for binary compatibility. + */ cur->charset = XML_CHAR_ENCODING_UTF8; if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue)) *************** *** 7824,7830 **** int format) { int dummy = 0; - xmlCharEncoding doc_charset; xmlOutputBufferPtr out_buff = NULL; xmlCharEncodingHandlerPtr conv_hdlr = NULL; --- 7829,7834 ---- *************** *** 7857,7882 **** if (txt_encoding == NULL) txt_encoding = (const char *) out_doc->encoding; if (txt_encoding != NULL) { ! doc_charset = xmlParseCharEncoding(txt_encoding); ! ! if (out_doc->charset != XML_CHAR_ENCODING_UTF8) { xmlGenericError(xmlGenericErrorContext, ! "xmlDocDumpFormatMemoryEnc: Source document not in UTF8\n"); return; - - } else if (doc_charset != XML_CHAR_ENCODING_UTF8) { - conv_hdlr = xmlGetCharEncodingHandler(doc_charset); - if (conv_hdlr == NULL) - conv_hdlr = xmlFindCharEncodingHandler(txt_encoding); - if ( conv_hdlr == NULL ) { - xmlGenericError(xmlGenericErrorContext, - "%s: %s %s '%s'\n", - "xmlDocDumpFormatMemoryEnc", - "Failed to identify encoding handler for", - "character set", - txt_encoding); - return; - } } } --- 7861,7875 ---- if (txt_encoding == NULL) txt_encoding = (const char *) out_doc->encoding; if (txt_encoding != NULL) { ! conv_hdlr = xmlFindCharEncodingHandler(txt_encoding); ! if ( conv_hdlr == NULL ) { xmlGenericError(xmlGenericErrorContext, ! "%s: %s %s '%s'\n", ! "xmlDocDumpFormatMemoryEnc", ! "Failed to identify encoding handler for", ! "character set", ! txt_encoding); return; } } *************** *** 8042,8064 **** encoding = (const char *) cur->encoding; if (encoding != NULL) { ! xmlCharEncoding enc; ! ! enc = xmlParseCharEncoding(encoding); ! 
! if (cur->charset != XML_CHAR_ENCODING_UTF8) { ! xmlGenericError(xmlGenericErrorContext, ! "xmlDocDump: document not in UTF8\n"); ! return(-1); ! } ! if (enc != XML_CHAR_ENCODING_UTF8) { ! handler = xmlGetCharEncodingHandler(enc); ! if (handler == NULL) ! handler = xmlFindCharEncodingHandler(encoding); ! if (handler == NULL) { ! xmlFree((char *) cur->encoding); ! cur->encoding = NULL; ! } } } buf = xmlOutputBufferCreateFile(f, handler); --- 8035,8044 ---- encoding = (const char *) cur->encoding; if (encoding != NULL) { ! handler = xmlFindCharEncodingHandler(encoding); ! if (handler == NULL) { ! xmlFree((char *) cur->encoding); ! cur->encoding = NULL; } } buf = xmlOutputBufferCreateFile(f, handler); *************** *** 8144,8150 **** const char * encoding, int format ) { xmlOutputBufferPtr buf; xmlCharEncodingHandlerPtr handler = NULL; - xmlCharEncoding enc; int ret; if (cur == NULL) --- 8124,8129 ---- *************** *** 8155,8171 **** if (encoding != NULL) { ! enc = xmlParseCharEncoding(encoding); ! if (cur->charset != XML_CHAR_ENCODING_UTF8) { ! xmlGenericError(xmlGenericErrorContext, ! "xmlSaveFormatFileEnc: document not in UTF8\n"); return(-1); - } - if (enc != XML_CHAR_ENCODING_UTF8) { - handler = xmlFindCharEncodingHandler(encoding); - if (handler == NULL) - return(-1); - } } #ifdef HAVE_ZLIB_H --- 8134,8142 ---- if (encoding != NULL) { ! handler = xmlFindCharEncodingHandler(encoding); ! if (handler == NULL) return(-1); } #ifdef HAVE_ZLIB_H
*** ../../base/libxml2-2.5.8/parser.c Tue Jul 01 14:26:07 2003 --- parser.c Sat Jul 19 12:37:27 2003 *************** *** 7568,7605 **** if (ctxt->recovery == 0) ctxt->disableSAX = 1; } if (encoding != NULL) { - xmlCharEncoding enc; xmlCharEncodingHandlerPtr handler; if (ctxt->input->encoding != NULL) xmlFree((xmlChar *) ctxt->input->encoding); ctxt->input->encoding = encoding; ! enc = xmlParseCharEncoding((const char *) encoding); ! /* ! * registered set of known encodings ! */ ! if (enc != XML_CHAR_ENCODING_ERROR) { ! xmlSwitchEncoding(ctxt, enc); ! if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { ! ctxt->input->encoding = NULL; ! xmlFree(encoding); ! return(NULL); ! } } else { ! /* ! * fallback for unknown encodings ! */ ! handler = xmlFindCharEncodingHandler((const char *) encoding); ! if (handler != NULL) { ! xmlSwitchToEncoding(ctxt, handler); ! } else { ! ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; ! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ! ctxt->sax->error(ctxt->userData, ! "Unsupported encoding %s\n", encoding); ! return(NULL); ! } } } } --- 7568,7588 ---- if (ctxt->recovery == 0) ctxt->disableSAX = 1; } if (encoding != NULL) { xmlCharEncodingHandlerPtr handler; if (ctxt->input->encoding != NULL) xmlFree((xmlChar *) ctxt->input->encoding); ctxt->input->encoding = encoding; ! handler = xmlFindCharEncodingHandler((const char *) encoding); ! if (handler != NULL) { ! xmlSwitchToEncoding(ctxt, handler); } else { ! ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; ! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ! ctxt->sax->error(ctxt->userData, ! "Unsupported encoding %s\n", encoding); ! return(NULL); } } }
*** ../../base/libxml2-2.5.8/parser.c Tue Jul 01 14:26:07 2003 --- parser.c Sat Jul 19 12:37:27 2003 *************** *** 7568,7595 **** if (ctxt->recovery == 0) ctxt->disableSAX = 1; } if (encoding != NULL) { - xmlCharEncoding enc; xmlCharEncodingHandlerPtr handler; if (ctxt->input->encoding != NULL) xmlFree((xmlChar *) ctxt->input->encoding); ctxt->input->encoding = encoding; - enc = xmlParseCharEncoding((const char *) encoding); - /* - * registered set of known encodings - */ - if (enc != XML_CHAR_ENCODING_ERROR) { - xmlSwitchEncoding(ctxt, enc); - if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { - ctxt->input->encoding = NULL; - xmlFree(encoding); - return(NULL); - } - } else { - /* - * fallback for unknown encodings - */ handler = xmlFindCharEncodingHandler((const char *) encoding); if (handler != NULL) { xmlSwitchToEncoding(ctxt, handler); --- 7568,7579 ---- *************** *** 7599,7605 **** ctxt->sax->error(ctxt->userData, "Unsupported encoding %s\n", encoding); return(NULL); - } } } } --- 7583,7588 ----
Attachment:
8859x-tests.tar.gz
Description: Binary data
Attachment:
8859x-lint.tar.gz
Description: Binary data