Hi Daniel, A couple of weeks ago I sent the email below, but I haven't seen any feedback from you or anyone else. I guess you lost sight of it while you were away - or have I done something to upset you? :-) Regards, Mark
From: "Mark.Itzcovitz" <mark itzcovitz vistacomp com> Date: 2003/08/11 Mon AM 11:22:34 GMT To: mark itzcovitz ntlworld com Subject: FW: Another encoding problem when not using iconv -----Original Message----- From: Mark Itzcovitz [mailto:mark itzcovitz ntlworld com] Sent: 30 July 2003 15:01 To: xml gnome org Subject: Another encoding problem when not using iconv There is a problem in the UTF8ToUTF16xx conversion routines in encoding.c. The last parameter is a pointer to the length of the input, which xmlCharEncOutFunc expects to be modified to be the number of characters in the input that have been processed. Other routines such as UTF8Toisolat1 do this, but the UTF8ToUTF16xx routines seem to set it to 0, because they subtract the already-advanced `in` pointer from `processed` rather than the pointer to the start of the input. This means that xmlCharEncOutFunc doesn't remove the processed input from the input buffer. The problem can be verified by running the attached XML document (converted to UTF-16 from slashdot.xml in the test suite) through a version of xmllint built without iconv - it never terminates! A patch is attached containing changes for UTF8ToUTF16LE and UTF8ToUTF16BE. The information in this message is intended solely for the addressee and should be considered confidential. This message has been scanned for viruses using the most current and reliable technology available. VISTA excludes all liability related to any viruses that might exist in any attachment or which may have been acquired in transit.
----------------------------------------- Email provided by http://www.ntlhome.com/
Attachment:
slashdot16.xml
Description: Binary data
# diff -c encoding_orig.c encoding.c *** encoding_orig.c Wed Jul 30 14:45:20 2003 --- encoding.c Wed Jul 30 14:47:14 2003 *************** *** 824,829 **** --- 824,830 ---- { unsigned short* out = (unsigned short*) outb; const unsigned char* processed = in; + const unsigned char *const instart = in; unsigned short* outstart= out; unsigned short* outend; const unsigned char* inend= in+*inlen; *************** *** 858,864 **** else if (d < 0xC0) { /* trailing byte in leading position */ *outlen = (out - outstart) * 2; ! *inlen = processed - in; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } --- 859,865 ---- else if (d < 0xC0) { /* trailing byte in leading position */ *outlen = (out - outstart) * 2; ! *inlen = processed - instart; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } *************** *** 866,872 **** else { /* no chance for this in UTF-16 */ *outlen = (out - outstart) * 2; ! *inlen = processed - in; return(-2); } --- 867,873 ---- else { /* no chance for this in UTF-16 */ *outlen = (out - outstart) * 2; ! *inlen = processed - instart; return(-2); } *************** *** 920,926 **** processed = in; } *outlen = (out - outstart) * 2; ! *inlen = processed - in; return(0); } --- 921,927 ---- processed = in; } *outlen = (out - outstart) * 2; ! *inlen = processed - instart; return(0); } *************** *** 1035,1040 **** --- 1036,1042 ---- { unsigned short* out = (unsigned short*) outb; const unsigned char* processed = in; + const unsigned char *const instart = in; unsigned short* outstart= out; unsigned short* outend; const unsigned char* inend= in+*inlen; *************** *** 1069,1075 **** else if (d < 0xC0) { /* trailing byte in leading position */ *outlen = out - outstart; ! 
*inlen = processed - in; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } --- 1071,1077 ---- else if (d < 0xC0) { /* trailing byte in leading position */ *outlen = out - outstart; ! *inlen = processed - instart; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } *************** *** 1077,1083 **** else { /* no chance for this in UTF-16 */ *outlen = out - outstart; ! *inlen = processed - in; return(-2); } --- 1079,1085 ---- else { /* no chance for this in UTF-16 */ *outlen = out - outstart; ! *inlen = processed - instart; return(-2); } *************** *** 1128,1134 **** processed = in; } *outlen = (out - outstart) * 2; ! *inlen = processed - in; return(0); } --- 1130,1136 ---- processed = in; } *outlen = (out - outstart) * 2; ! *inlen = processed - instart; return(0); } #