[xml] [PATCH] Character encoding cleanup



Hi Daniel, All,

(A) Find attached patches for parser.c and tree.c, which address
the problem that iconv-less libxml2 tries to handle ISO-8859-2...
ISO-8859-9 in an outdated mode (of storing in-memory strings
using the document encoding).

After applying the patch, iconv-less libxml2 will only handle
UTF-8, UTF-16 and ISO-8859-1.

Use the diff -cb output to see more easily what was done, and the
diff -c output for actual patching.

(B) The 8859x-tests.tar.gz archive holds sample XML files
demonstrating the character repertoire of ISO-8859-1 ...
ISO-8859-16, both in native encoding and UTF-8.

You can check the transcoding with tests like:

xmllint --encode UTF-8 /tests/8859-15.xml | cmp - /tests/utf8-15.xml
xmllint --encode ISO-8859-15 /tests/utf8-15.xml | cmp - /tests/8859-15.xml

A slightly different test is:

xmllint --debug /tests/8859-15.xml >8859-15.debug
xmllint --debug /tests/utf8-15.xml >utf8-15.debug
diff 8859-15.debug utf8-15.debug

(C) The 8859x-lint.tar.gz archive holds reasonably efficient ISO-8859-*
transcoders which can be registered with xmlNewCharEncodingHandler.
As provided, the code is to be linked with the app, not into the libxml2
DLL, but this is a matter of switching some lines. A patch for xmllint.c is
included, which can be used to link the transcoders into xmllint and test
them there. The additional xmllint parameter --iso8859x
will switch them on.

I'm fine with having them on the app side and I suspect not many
users are running without iconv, so I don't know whether to put them
into libxml2.

Regards,
Peter Jacobi




*** ../../base/libxml2-2.5.8/tree.c     Wed Jun 04 17:39:09 2003
--- tree.c      Sat Jul 19 13:07:23 2003
***************
*** 961,966 ****
--- 961,971 ----
      cur->standalone = -1;
      cur->compression = -1; /* not initialized */
      cur->doc = cur;
+     /*
+      * The in memory encoding is always UTF8
+      * This field will never change and would
+      * be obsolete if not for binary compatibility.
+      */
      cur->charset = XML_CHAR_ENCODING_UTF8;
  
      if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
***************
*** 7824,7830 ****
                int format) {
      int                         dummy = 0;
  
-     xmlCharEncoding             doc_charset;
      xmlOutputBufferPtr          out_buff = NULL;
      xmlCharEncodingHandlerPtr   conv_hdlr = NULL;
  
--- 7829,7834 ----
***************
*** 7857,7872 ****
      if (txt_encoding == NULL)
        txt_encoding = (const char *) out_doc->encoding;
      if (txt_encoding != NULL) {
-         doc_charset = xmlParseCharEncoding(txt_encoding);
- 
-         if (out_doc->charset != XML_CHAR_ENCODING_UTF8) {
-             xmlGenericError(xmlGenericErrorContext,
-                     "xmlDocDumpFormatMemoryEnc: Source document not in UTF8\n");
-             return;
- 
-         } else if (doc_charset != XML_CHAR_ENCODING_UTF8) {
-           conv_hdlr = xmlGetCharEncodingHandler(doc_charset);
-           if (conv_hdlr == NULL)
                conv_hdlr = xmlFindCharEncodingHandler(txt_encoding);
              if ( conv_hdlr == NULL ) {
                  xmlGenericError(xmlGenericErrorContext,
--- 7861,7866 ----
***************
*** 7878,7884 ****
                  return;
              }
          }
-     }
  
      if ((out_buff = xmlAllocOutputBuffer(conv_hdlr)) == NULL ) {
          xmlGenericError(xmlGenericErrorContext,
--- 7872,7877 ----
***************
*** 8042,8066 ****
      encoding = (const char *) cur->encoding;
  
      if (encoding != NULL) {
-       xmlCharEncoding enc;
- 
-       enc = xmlParseCharEncoding(encoding);
- 
-       if (cur->charset != XML_CHAR_ENCODING_UTF8) {
-           xmlGenericError(xmlGenericErrorContext,
-                   "xmlDocDump: document not in UTF8\n");
-           return(-1);
-       }
-       if (enc != XML_CHAR_ENCODING_UTF8) {
-           handler = xmlGetCharEncodingHandler(enc);
-           if (handler == NULL)
                handler = xmlFindCharEncodingHandler(encoding);
            if (handler == NULL) {
                xmlFree((char *) cur->encoding);
                cur->encoding = NULL;
            }
        }
-     }
      buf = xmlOutputBufferCreateFile(f, handler);
      if (buf == NULL) return(-1);
      xmlDocContentDumpOutput(buf, cur, NULL, format);
--- 8035,8046 ----
***************
*** 8144,8150 ****
                        const char * encoding, int format ) {
      xmlOutputBufferPtr buf;
      xmlCharEncodingHandlerPtr handler = NULL;
-     xmlCharEncoding enc;
      int ret;
  
      if (cur == NULL)
--- 8124,8129 ----
***************
*** 8155,8171 ****
  
      if (encoding != NULL) {
  
-       enc = xmlParseCharEncoding(encoding);
-       if (cur->charset != XML_CHAR_ENCODING_UTF8) {
-           xmlGenericError(xmlGenericErrorContext,
-                   "xmlSaveFormatFileEnc: document not in UTF8\n");
-           return(-1);
-       }
-       if (enc != XML_CHAR_ENCODING_UTF8) {
            handler = xmlFindCharEncodingHandler(encoding);
            if (handler == NULL)
                return(-1);
-       }
      }
  
  #ifdef HAVE_ZLIB_H
--- 8134,8142 ----
*** ../../base/libxml2-2.5.8/tree.c     Wed Jun 04 17:39:09 2003
--- tree.c      Sat Jul 19 13:07:23 2003
***************
*** 961,966 ****
--- 961,971 ----
      cur->standalone = -1;
      cur->compression = -1; /* not initialized */
      cur->doc = cur;
+     /*
+      * The in memory encoding is always UTF8
+      * This field will never change and would
+      * be obsolete if not for binary compatibility.
+      */
      cur->charset = XML_CHAR_ENCODING_UTF8;
  
      if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
***************
*** 7824,7830 ****
                int format) {
      int                         dummy = 0;
  
-     xmlCharEncoding             doc_charset;
      xmlOutputBufferPtr          out_buff = NULL;
      xmlCharEncodingHandlerPtr   conv_hdlr = NULL;
  
--- 7829,7834 ----
***************
*** 7857,7882 ****
      if (txt_encoding == NULL)
        txt_encoding = (const char *) out_doc->encoding;
      if (txt_encoding != NULL) {
!         doc_charset = xmlParseCharEncoding(txt_encoding);
! 
!         if (out_doc->charset != XML_CHAR_ENCODING_UTF8) {
              xmlGenericError(xmlGenericErrorContext,
!                     "xmlDocDumpFormatMemoryEnc: Source document not in UTF8\n");
              return;
- 
-         } else if (doc_charset != XML_CHAR_ENCODING_UTF8) {
-           conv_hdlr = xmlGetCharEncodingHandler(doc_charset);
-           if (conv_hdlr == NULL)
-               conv_hdlr = xmlFindCharEncodingHandler(txt_encoding);
-             if ( conv_hdlr == NULL ) {
-                 xmlGenericError(xmlGenericErrorContext,
-                                 "%s:  %s %s '%s'\n",
-                                 "xmlDocDumpFormatMemoryEnc",
-                                 "Failed to identify encoding handler for",
-                                 "character set",
-                                 txt_encoding);
-                 return;
-             }
          }
      }
  
--- 7861,7875 ----
      if (txt_encoding == NULL)
        txt_encoding = (const char *) out_doc->encoding;
      if (txt_encoding != NULL) {
!       conv_hdlr = xmlFindCharEncodingHandler(txt_encoding);
!         if ( conv_hdlr == NULL ) {
              xmlGenericError(xmlGenericErrorContext,
!                 "%s:  %s %s '%s'\n",
!                 "xmlDocDumpFormatMemoryEnc",
!                 "Failed to identify encoding handler for",
!                 "character set",
!                 txt_encoding);
              return;
          }
      }
  
***************
*** 8042,8064 ****
      encoding = (const char *) cur->encoding;
  
      if (encoding != NULL) {
!       xmlCharEncoding enc;
! 
!       enc = xmlParseCharEncoding(encoding);
! 
!       if (cur->charset != XML_CHAR_ENCODING_UTF8) {
!           xmlGenericError(xmlGenericErrorContext,
!                   "xmlDocDump: document not in UTF8\n");
!           return(-1);
!       }
!       if (enc != XML_CHAR_ENCODING_UTF8) {
!           handler = xmlGetCharEncodingHandler(enc);
!           if (handler == NULL)
!               handler = xmlFindCharEncodingHandler(encoding);
!           if (handler == NULL) {
!               xmlFree((char *) cur->encoding);
!               cur->encoding = NULL;
!           }
        }
      }
      buf = xmlOutputBufferCreateFile(f, handler);
--- 8035,8044 ----
      encoding = (const char *) cur->encoding;
  
      if (encoding != NULL) {
!       handler = xmlFindCharEncodingHandler(encoding);
!       if (handler == NULL) {
!           xmlFree((char *) cur->encoding);
!           cur->encoding = NULL;
        }
      }
      buf = xmlOutputBufferCreateFile(f, handler);
***************
*** 8144,8150 ****
                        const char * encoding, int format ) {
      xmlOutputBufferPtr buf;
      xmlCharEncodingHandlerPtr handler = NULL;
-     xmlCharEncoding enc;
      int ret;
  
      if (cur == NULL)
--- 8124,8129 ----
***************
*** 8155,8171 ****
  
      if (encoding != NULL) {
  
!       enc = xmlParseCharEncoding(encoding);
!       if (cur->charset != XML_CHAR_ENCODING_UTF8) {
!           xmlGenericError(xmlGenericErrorContext,
!                   "xmlSaveFormatFileEnc: document not in UTF8\n");
            return(-1);
-       }
-       if (enc != XML_CHAR_ENCODING_UTF8) {
-           handler = xmlFindCharEncodingHandler(encoding);
-           if (handler == NULL)
-               return(-1);
-       }
      }
  
  #ifdef HAVE_ZLIB_H
--- 8134,8142 ----
  
      if (encoding != NULL) {
  
!       handler = xmlFindCharEncodingHandler(encoding);
!       if (handler == NULL)
            return(-1);
      }
  
  #ifdef HAVE_ZLIB_H
*** ../../base/libxml2-2.5.8/parser.c   Tue Jul 01 14:26:07 2003
--- parser.c    Sat Jul 19 12:37:27 2003
***************
*** 7568,7605 ****
            if (ctxt->recovery == 0) ctxt->disableSAX = 1;
        }
        if (encoding != NULL) {
-           xmlCharEncoding enc;
            xmlCharEncodingHandlerPtr handler;
  
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = encoding;
  
!           enc = xmlParseCharEncoding((const char *) encoding);
!           /*
!            * registered set of known encodings
!            */
!           if (enc != XML_CHAR_ENCODING_ERROR) {
!               xmlSwitchEncoding(ctxt, enc);
!               if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
!                   ctxt->input->encoding = NULL;
!                   xmlFree(encoding);
!                   return(NULL);
!               }
            } else {
!               /*
!                * fallback for unknown encodings
!                */
!                 handler = xmlFindCharEncodingHandler((const char *) encoding);
!               if (handler != NULL) {
!                   xmlSwitchToEncoding(ctxt, handler);
!               } else {
!                   ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
!                   if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
!                       ctxt->sax->error(ctxt->userData,
!                            "Unsupported encoding %s\n", encoding);
!                   return(NULL);
!               }
            }
        }
      }
--- 7568,7588 ----
            if (ctxt->recovery == 0) ctxt->disableSAX = 1;
        }
        if (encoding != NULL) {
            xmlCharEncodingHandlerPtr handler;
  
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = encoding;
  
!             handler = xmlFindCharEncodingHandler((const char *) encoding);
!           if (handler != NULL) {
!               xmlSwitchToEncoding(ctxt, handler);
            } else {
!               ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
!               if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
!                   ctxt->sax->error(ctxt->userData,
!                       "Unsupported encoding %s\n", encoding);
!               return(NULL);
            }
        }
      }
*** ../../base/libxml2-2.5.8/parser.c   Tue Jul 01 14:26:07 2003
--- parser.c    Sat Jul 19 12:37:27 2003
***************
*** 7568,7595 ****
            if (ctxt->recovery == 0) ctxt->disableSAX = 1;
        }
        if (encoding != NULL) {
-           xmlCharEncoding enc;
            xmlCharEncodingHandlerPtr handler;
  
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = encoding;
  
-           enc = xmlParseCharEncoding((const char *) encoding);
-           /*
-            * registered set of known encodings
-            */
-           if (enc != XML_CHAR_ENCODING_ERROR) {
-               xmlSwitchEncoding(ctxt, enc);
-               if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
-                   ctxt->input->encoding = NULL;
-                   xmlFree(encoding);
-                   return(NULL);
-               }
-           } else {
-               /*
-                * fallback for unknown encodings
-                */
                  handler = xmlFindCharEncodingHandler((const char *) encoding);
                if (handler != NULL) {
                    xmlSwitchToEncoding(ctxt, handler);
--- 7568,7579 ----
***************
*** 7599,7605 ****
                        ctxt->sax->error(ctxt->userData,
                             "Unsupported encoding %s\n", encoding);
                    return(NULL);
-               }
            }
        }
      }
--- 7583,7588 ----

Attachment: 8859x-tests.tar.gz
Description: Binary data

Attachment: 8859x-lint.tar.gz
Description: Binary data



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]