[xml] xmlSaveSetAttrEscape() ignored?



I'm using a custom 'xmlCharEncodingOutputFunc' named 'xmlEscapeMinimalEntities', which is intended to only escape the minimum required characters, thus leaving the majority of UTF characters unescaped and human-readable.

I pass it to xmlSaveSetEscape(), and am thus able to save files with node contents that are human-readable. I also call xmlSaveSetAttrEscape() at the same time; however, I've recently realized that attribute values are still being escaped. Apparently the encoding output function isn't being used for attributes. Sample code is provided below.

1. Is there an easier/better way to do this? :(
It's a real disappointment that libxml output of math, scientific symbols, foreign language, etc. becomes unreadable because it's a mess of escaped numeric values. XML has a character set definition for a reason, I want to use it! Also, I only know how to do this via xmlSaveToBuffer(), which provides the xmlSaveCtxtPtr to then make the *SetEscape() calls on. And worse, versions earlier than 2.6.23 don't even have xmlSaveToBuffer implemented, that means this solution isn't even portable (Mac OS X 10.5 *still* ships with 2.6.16 :()

2. Why isn't xmlSaveSetAttrEscape() working?
BTW, I realize I'll need to make a different xmlCharEncodingOutputFunc to use with attributes, in order to escape quote characters... I wonder, do I no longer need to escape < and > within attribute values? (you can't nest tags there, so are they still special?)

Thanks,
  -Ethan

----- Sample output: -----
<?xml version="1.0"?>
<testing>
  <contentTest>degree ° theta θ less &lt; quote "</contentTest>
  <attrTest prop="degree &#xB0; theta &#x3B8; less &lt; quote &quot;"/>
</testing>

----- Sample code: -----
#include <iostream>
#include <string>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlsave.h>
#include <libxml/xmlversion.h>
#include <libxml/tree.h>
#include <errno.h>

//! if true, the saved document will use automatic indenting and formatting
//! (saveFile() turns it into XML_SAVE_FORMAT, or passes it as the format flag of xmlSaveFormatFile())
bool autoFormat=true;

//! escape callback handed to xmlSaveSetEscape()/xmlSaveSetAttrEscape(): rewrites '<', '>' and '&' as entities and copies other accepted bytes unchanged; returns 0, or -1 on bad input
int xmlEscapeMinimalEntities(unsigned char* out, int *outlen, const xmlChar* in, int *inlen);
//! builds the test document and writes it to @a filename; returns the size reported for the saved document, or 0 on failure
unsigned int saveFile(const char* filename);
//! renames @a node to "testing" and adds the "contentTest" and "attrTest" children used by the test
void saveXML(xmlNode* node);

//! Test driver: initializes libxml, writes the sample document to "testout.xml" and cleans up.
//! @return 0 when saveFile() reports a non-zero size, 1 when it reports failure (0)
int main() {
   xmlInitParser();
   // saveFile() signals every failure by returning 0; propagate that as the exit status
   // instead of silently discarding it
   const unsigned int written = saveFile("testout.xml");
   xmlCleanupParser();
   return (written == 0) ? 1 : 0;
}

//! Fills @a node with the content used to exercise the escape callbacks.
//! The node is renamed to "testing" and receives two children: "contentTest", which carries
//! the sample text as element content, and "attrTest", which carries the same text in its
//! "prop" attribute.
//! @param node element to populate
void saveXML(xmlNode* node) {
   // The non-ASCII characters are written as explicit UTF-8 byte escapes so the literal does
   // not depend on how this source file is encoded:
   //   \xC2\xB0 is U+00B0 (degree sign), \xCE\xB8 is U+03B8 (Greek small theta)
   const xmlChar* const sample=(const xmlChar*)"degree \xC2\xB0 theta \xCE\xB8 less < quote \"";

   xmlNodeSetName(node,(const xmlChar*)"testing");
   xmlNewChild(node,NULL,(const xmlChar*)"contentTest",sample);
   xmlNode* attrTest = xmlNewChild(node,NULL,(const xmlChar*)"attrTest",NULL);
   xmlSetProp(attrTest,(const xmlChar*)"prop",sample);
}

unsigned int saveFile(const char* filename) {
   xmlDoc* xmldocument=NULL;
   xmldocument=xmlNewDoc((const xmlChar*)"1.0");
   xmlNode* cur=xmlNewNode(NULL,(const xmlChar*)"");
   saveXML(cur);
   xmlDocSetRootElement(xmldocument,cur);
#if LIBXML_VERSION < 20623
   // versions prior to 2.6.23 don't have saveToBuffer implemented!
// could use xmlSaveToFilename and fake the return size, but I'd rather be correct and
   // give up on un-escaping fancy unicode characters
   int size=xmlSaveFormatFile (filename, xmldocument, autoFormat);
   if(size==-1)
cerr << "Error: XMLLoadSave::saveFile: xmlSaveFormatFile(\"" << filename << "\",...) returned -1" << endl;
   return size==-1?0:size;
#else
   FILE* f = fopen(filename,"w");
   if(f==NULL) {
std::cerr << "*** WARNING XMLLoadSave::saveFile: could not open file for saving \"" << filename << "\"" << std::endl;
      return 0;
   }
// xmlSaveDoc doesn't properly return written size, so use buffers instead of xmlSaveToFilename:
   xmlBufferPtr xmlbuf = xmlBufferCreate();
xmlSaveCtxtPtr ctxt = xmlSaveToBuffer(xmlbuf, NULL, (autoFormat ? XML_SAVE_FORMAT : 0)); //xmlSaveCtxtPtr ctxt = xmlSaveToFilename(filename, NULL, (autoFormat ? XML_SAVE_FORMAT : 0));
   xmlSaveSetEscape(ctxt,xmlEscapeMinimalEntities);
   xmlSaveSetAttrEscape(ctxt,xmlEscapeMinimalEntities);
   size_t size = xmlSaveDoc(ctxt,xmldocument);
   xmlSaveClose(ctxt);
   ctxt=NULL;
   if(size==(size_t)-1) {
std::cerr << "Error: XMLLoadSave::saveFile: xmlSaveDoc(\"" << filename << "\",...) returned -1" << std::endl;
      fclose(f);
      return 0;
   }
   size=xmlBufferLength(xmlbuf);
size_t wrote=fwrite(xmlBufferContent(xmlbuf), 1,xmlBufferLength(xmlbuf),f);
   if(wrote!=size)
std::cerr << "*** WARNING XMLLoadSave::saveFile: short write (wrote " << wrote << ", expected " << size << ")" << std::endl;
   int err=fclose(f);
   if(err!=0) {
std::cerr << "*** WARNING XMLLoadSave::saveFile: error '" << strerror(errno) << "' while closing " << filename << std::endl;
      return 0;
   }
   xmlBufferFree(xmlbuf);
   xmlbuf=NULL;
   return size;
#endif
}

int xmlEscapeMinimalEntities(unsigned char* out, int *outlen, const xmlChar* in, int *inlen) {
   unsigned char* outstart = out;
   const unsigned char* base = in;
   unsigned char* outend = out + *outlen;
   const unsigned char* inend;
   int val;

   inend = in + (*inlen);

   while ((in < inend) && (out < outend)) {
      if (*in == '<') {
         if (outend - out < 4) break;
         *out++ = '&';
         *out++ = 'l';
         *out++ = 't';
         *out++ = ';';
         in++;
         continue;
      } else if (*in == '>') {
         if (outend - out < 4) break;
         *out++ = '&';
         *out++ = 'g';
         *out++ = 't';
         *out++ = ';';
         in++;
         continue;
      } else if (*in == '&') {
         if (outend - out < 5) break;
         *out++ = '&';
         *out++ = 'a';
         *out++ = 'm';
         *out++ = 'p';
         *out++ = ';';
         in++;
         continue;
      } else if (((*in >= 0x20) && (*in < 0x80)) ||
               (*in == '\n') || (*in == '\t')) {
         /*
          * default case, just copy !
          */
         *out++ = *in++;
         continue;
      } else if (*in >= 0x80) {
         /*
          * We assume we have UTF-8 input.
          */
         if (outend - out < 10) break;

         if (*in < 0xC0) {
std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities encountered non-UTF8 data: " << *in << std::endl;
            in++;
            goto error;
         } else if (*in < 0xE0) {
            if (inend - in < 2) break;
            val = (in[0]) & 0x1F;
            val <<= 6;
            val |= (in[1]) & 0x3F;
            *out++ = *in++;
            *out++ = *in++;
         } else if (*in < 0xF0) {
            if (inend - in < 3) break;
            val = (in[0]) & 0x0F;
            val <<= 6;
            val |= (in[1]) & 0x3F;
            val <<= 6;
            val |= (in[2]) & 0x3F;
            *out++ = *in++;
            *out++ = *in++;
            *out++ = *in++;
         } else if (*in < 0xF8) {
            if (inend - in < 4) break;
            val = (in[0]) & 0x07;
            val <<= 6;
            val |= (in[1]) & 0x3F;
            val <<= 6;
            val |= (in[2]) & 0x3F;
            val <<= 6;
            val |= (in[3]) & 0x3F;
            *out++ = *in++;
            *out++ = *in++;
            *out++ = *in++;
            *out++ = *in++;
         } else {
std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities encountered invalid UTF8 data " << *in << std::endl;
            in++;
            goto error;
         }
         if (!IS_CHAR(val)) {
std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities encountered unknown UTF8 data " << *in << std::endl;
            goto error;
         }

      } else if (IS_BYTE_CHAR(*in)) {
         if (outend - out < 6) break;
         *out++ = *in++;
      } else {
xmlGenericError(xmlGenericErrorContext,"xmlEscapeEntities : char out of range\n");
         in++;
         goto error;
      }
   }
   *outlen = out - outstart;
   *inlen = in - base;
   return(0);
error:
   *outlen = out - outstart;
   *inlen = in - base;
   return(-1);
}




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]