[xml] xmlSaveSetAttrEscape() ignored?
- From: Ethan Tira-Thompson <ejt andrew cmu edu>
- To: xml gnome org
- Subject: [xml] xmlSaveSetAttrEscape() ignored?
- Date: Fri, 30 May 2008 18:00:06 -0400
I'm using a custom 'xmlCharEncodingOutputFunc' named
'xmlEscapeMinimalEntities', which is intended to only escape the
minimum required characters, thus leaving the majority of UTF
characters unescaped and human-readable.
I pass it to xmlSaveSetEscape(), and am thus able to save files with node
contents that are human-readable. I also call xmlSaveSetAttrEscape()
at the same time, however I've recently realized that attribute values
are still being escaped. Apparently the encoding output function
isn't being used. Sample code is provided below.
1. Is there an easier/better way to do this? :(
It's a real disappointment that libxml output of math, scientific
symbols, foreign language, etc. becomes unreadable because it's a mess
of escaped numeric values. XML has a character set definition for a
reason, I want to use it!
Also, I only know how to do this via xmlSaveToBuffer(), which provides
the xmlSaveCtxtPtr to then make the *SetEscape() calls on. And worse,
versions earlier than 2.6.23 don't even have xmlSaveToBuffer
implemented, that means this solution isn't even portable (Mac OS X
10.5 *still* ships with 2.6.16 :()
2. Why isn't xmlSaveSetAttrEscape() working?
BTW, I realize I'll need to make a different xmlCharEncodingOutputFunc to
use with attributes, in order to escape quote characters... I wonder,
do I no longer need to escape < and > within attribute values? (You
can't nest tags there, so are they still special?)
Thanks,
-Ethan
----- Sample output: -----
<?xml version="1.0"?>
<testing>
<contentTest>degree ° theta θ less &lt; quote "</contentTest>
<attrTest prop="degree &#xB0; theta &#x3B8; less &lt; quote &quot;"/>
</testing>
----- Sample code: -----
#include <errno.h>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlsave.h>
#include <libxml/xmlversion.h>
#include <libxml/tree.h>
//! if true, the saved document will use automatic indenting and formatting
bool autoFormat=true;

//! Escape callback installed on the save context by saveFile(): rewrites only
//! '<', '>' and '&' as entities and copies UTF-8 text through unchanged.
int xmlEscapeMinimalEntities(unsigned char* out, int *outlen, const xmlChar* in, int *inlen);

//! Builds the sample document and writes it to @a filename; returns 0 on failure.
unsigned int saveFile(const char* filename);

//! Fills @a node with the sample element/attribute content.
void saveXML(xmlNode* node);
//! Program entry point: initializes libxml2, writes the sample document to
//! "testout.xml", and releases libxml2's global state again.
//!
//! @return 0 when saveFile() reported a non-zero size, 1 when it reported failure
int main() {
	xmlInitParser();
	const unsigned int written = saveFile("testout.xml");
	xmlCleanupParser();
	// saveFile() signals every failure path by returning 0, so surface that
	// through the exit status instead of silently discarding it
	return (written == 0) ? 1 : 0;
}
//! Populates @a node with the sample data used to compare content escaping
//! against attribute escaping.
//!
//! The node is renamed to "testing" and receives two children: "contentTest",
//! carrying the sample text as element content, and "attrTest", carrying the
//! same text in its "prop" attribute.
//!
//! @param node element to rename and fill
void saveXML(xmlNode* node) {
	// The non-ASCII characters are written as explicit UTF-8 byte escapes so the
	// literal no longer depends on the encoding of this source file:
	// \xC2\xB0 is U+00B0 (degree sign), \xCE\xB8 is U+03B8 (Greek small theta).
	// NOTE(review): the archived literal was mis-encoded; these two characters
	// are reconstructed from the attribute line of the sample output -- confirm.
	static const char* const sampleText = "degree \xC2\xB0 theta \xCE\xB8 less < quote \"";
	const xmlChar* const text = reinterpret_cast<const xmlChar*>(sampleText);

	xmlNodeSetName(node,(const xmlChar*)"testing");
	xmlNewChild(node,NULL,(const xmlChar*)"contentTest",text);
	xmlNode* attrTest = xmlNewChild(node,NULL,(const xmlChar*)"attrTest",NULL);
	// only attach the attribute if the child element was actually created
	if(attrTest!=NULL)
		xmlSetProp(attrTest,(const xmlChar*)"prop",text);
}
unsigned int saveFile(const char* filename) {
xmlDoc* xmldocument=NULL;
xmldocument=xmlNewDoc((const xmlChar*)"1.0");
xmlNode* cur=xmlNewNode(NULL,(const xmlChar*)"");
saveXML(cur);
xmlDocSetRootElement(xmldocument,cur);
#if LIBXML_VERSION < 20623
// versions prior to 2.6.23 don't have saveToBuffer implemented!
// could use xmlSaveToFilename and fake the return size, but I'd
rather be correct and
// give up on un-escaping fancy unicode characters
int size=xmlSaveFormatFile (filename, xmldocument, autoFormat);
if(size==-1)
cerr << "Error: XMLLoadSave::saveFile: xmlSaveFormatFile(\"" <<
filename << "\",...) returned -1" << endl;
return size==-1?0:size;
#else
FILE* f = fopen(filename,"w");
if(f==NULL) {
std::cerr << "*** WARNING XMLLoadSave::saveFile: could not open
file for saving \"" << filename << "\"" << std::endl;
return 0;
}
// xmlSaveDoc doesn't properly return written size, so use buffers
instead of xmlSaveToFilename:
xmlBufferPtr xmlbuf = xmlBufferCreate();
xmlSaveCtxtPtr ctxt = xmlSaveToBuffer(xmlbuf, NULL, (autoFormat ?
XML_SAVE_FORMAT : 0));
//xmlSaveCtxtPtr ctxt = xmlSaveToFilename(filename, NULL,
(autoFormat ? XML_SAVE_FORMAT : 0));
xmlSaveSetEscape(ctxt,xmlEscapeMinimalEntities);
xmlSaveSetAttrEscape(ctxt,xmlEscapeMinimalEntities);
size_t size = xmlSaveDoc(ctxt,xmldocument);
xmlSaveClose(ctxt);
ctxt=NULL;
if(size==(size_t)-1) {
std::cerr << "Error: XMLLoadSave::saveFile: xmlSaveDoc(\"" <<
filename << "\",...) returned -1" << std::endl;
fclose(f);
return 0;
}
size=xmlBufferLength(xmlbuf);
size_t wrote=fwrite(xmlBufferContent(xmlbuf),
1,xmlBufferLength(xmlbuf),f);
if(wrote!=size)
std::cerr << "*** WARNING XMLLoadSave::saveFile: short write
(wrote " << wrote << ", expected " << size << ")" << std::endl;
int err=fclose(f);
if(err!=0) {
std::cerr << "*** WARNING XMLLoadSave::saveFile: error '" <<
strerror(errno) << "' while closing " << filename << std::endl;
return 0;
}
xmlBufferFree(xmlbuf);
xmlbuf=NULL;
return size;
#endif
}
int xmlEscapeMinimalEntities(unsigned char* out, int *outlen, const
xmlChar* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend;
int val;
inend = in + (*inlen);
while ((in < inend) && (out < outend)) {
if (*in == '<') {
if (outend - out < 4) break;
*out++ = '&';
*out++ = 'l';
*out++ = 't';
*out++ = ';';
in++;
continue;
} else if (*in == '>') {
if (outend - out < 4) break;
*out++ = '&';
*out++ = 'g';
*out++ = 't';
*out++ = ';';
in++;
continue;
} else if (*in == '&') {
if (outend - out < 5) break;
*out++ = '&';
*out++ = 'a';
*out++ = 'm';
*out++ = 'p';
*out++ = ';';
in++;
continue;
} else if (((*in >= 0x20) && (*in < 0x80)) ||
(*in == '\n') || (*in == '\t')) {
/*
* default case, just copy !
*/
*out++ = *in++;
continue;
} else if (*in >= 0x80) {
/*
* We assume we have UTF-8 input.
*/
if (outend - out < 10) break;
if (*in < 0xC0) {
std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities
encountered non-UTF8 data: " << *in << std::endl;
in++;
goto error;
} else if (*in < 0xE0) {
if (inend - in < 2) break;
val = (in[0]) & 0x1F;
val <<= 6;
val |= (in[1]) & 0x3F;
*out++ = *in++;
*out++ = *in++;
} else if (*in < 0xF0) {
if (inend - in < 3) break;
val = (in[0]) & 0x0F;
val <<= 6;
val |= (in[1]) & 0x3F;
val <<= 6;
val |= (in[2]) & 0x3F;
*out++ = *in++;
*out++ = *in++;
*out++ = *in++;
} else if (*in < 0xF8) {
if (inend - in < 4) break;
val = (in[0]) & 0x07;
val <<= 6;
val |= (in[1]) & 0x3F;
val <<= 6;
val |= (in[2]) & 0x3F;
val <<= 6;
val |= (in[3]) & 0x3F;
*out++ = *in++;
*out++ = *in++;
*out++ = *in++;
*out++ = *in++;
} else {
std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities
encountered invalid UTF8 data " << *in << std::endl;
in++;
goto error;
}
if (!IS_CHAR(val)) {
std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities
encountered unknown UTF8 data " << *in << std::endl;
goto error;
}
} else if (IS_BYTE_CHAR(*in)) {
if (outend - out < 6) break;
*out++ = *in++;
} else {
xmlGenericError(xmlGenericErrorContext,"xmlEscapeEntities :
char out of range\n");
in++;
goto error;
}
}
*outlen = out - outstart;
*inlen = in - base;
return(0);
error:
*outlen = out - outstart;
*inlen = in - base;
return(-1);
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]