diff --git a/encoding.c b/encoding.c index cd019c51..617e1ed7 100644 --- a/encoding.c +++ b/encoding.c @@ -110,6 +110,9 @@ openIcuConverter(const char* name, int toUnicode) if (conv == NULL) return NULL; + conv->pivot_source = conv->pivot_buf; + conv->pivot_target = conv->pivot_buf; + conv->uconv = ucnv_open(name, &status); if (U_FAILURE(status)) goto error; @@ -1850,6 +1853,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, * @outlen: the length of @out * @in: a pointer to an array of ISO Latin 1 chars * @inlen: the length of @in + * @flush: if true, indicates end of input * * Returns 0 if success, or * -1 by lack of space, or @@ -1863,7 +1867,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, */ static int xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, - const unsigned char *in, int *inlen) { + const unsigned char *in, int *inlen, int flush) { const char *ucv_in = (const char *) in; char *ucv_out = (char *) out; UErrorCode err = U_ZERO_ERROR; @@ -1873,33 +1877,30 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, return(-1); } - /* - * TODO(jungshik) - * 1. is ucnv_convert(To|From)Algorithmic better? - * 2. had we better use an explicit pivot buffer? - * 3. error returned comes from 'fromUnicode' only even - * when toUnicode is true ! - */ if (toUnicode) { /* encoding => UTF-16 => UTF-8 */ ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, - &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, - 0, TRUE, &err); + &ucv_in, ucv_in + *inlen, cd->pivot_buf, + &cd->pivot_source, &cd->pivot_target, + cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err); } else { /* UTF-8 => UTF-16 => encoding */ ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, - &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, - 0, TRUE, &err); + &ucv_in, ucv_in + *inlen, cd->pivot_buf, + &cd->pivot_source, &cd->pivot_target, + cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err); } *inlen = ucv_in - (const char*) in; *outlen = ucv_out - (char *) out; + /* reset pivot buffer if this is the last call for input (flush==TRUE) */ + if (flush) + cd->pivot_source = cd->pivot_target = cd->pivot_buf; if (U_SUCCESS(err)) return 0; if (err == U_BUFFER_OVERFLOW_ERROR) return -1; if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) return -2; - /* if (err == U_TRUNCATED_CHAR_FOUND) */ return -3; } #endif /* LIBXML_ICU_ENABLED */ @@ -1912,7 +1913,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, static int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, - int *outlen, const unsigned char *in, int *inlen) { + int *outlen, const unsigned char *in, int *inlen, int flush) { int ret; if (handler->input != NULL) { @@ -1925,7 +1926,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, #endif /* LIBXML_ICONV_ENABLED */ #ifdef LIBXML_ICU_ENABLED else if (handler->uconv_in != NULL) { - ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen); + ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen, + flush); } #endif /* LIBXML_ICU_ENABLED */ else { @@ -1953,7 +1955,8 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, #endif /* LIBXML_ICONV_ENABLED */ #ifdef LIBXML_ICU_ENABLED else if (handler->uconv_out != NULL) { - ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen); + ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen, + TRUE); } #endif /* LIBXML_ICU_ENABLED */ else { @@ -2015,7 +2018,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, } ret = xmlEncInputChunk(handler, &out->content[out->use], &written, - in->content, &toconv); + in->content, &toconv, 0); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; @@ -2133,7 +2136,7 @@ xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) c_in = toconv; c_out = written; ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in); + xmlBufContent(in), &c_in, 0); xmlBufShrink(in, c_in); xmlBufAddLen(out, c_out); if (ret == -1) @@ -2231,7 +2234,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush) c_in = toconv; c_out = written; ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in); + xmlBufContent(in), &c_in, flush); xmlBufShrink(in, c_in); xmlBufAddLen(out, c_out); if (ret == -1) @@ -2285,6 +2288,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush) * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input + * @flush: if true, indicates end of input. * * Generic front-end for the encoding handler input function * @@ -2295,7 +2299,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush) */ int xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, - xmlBufferPtr in) + xmlBufferPtr in, int flush) { int ret; int written; @@ -2317,7 +2321,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, written = out->size - out->use - 1; } ret = xmlEncInputChunk(handler, &out->content[out->use], &written, - in->content, &toconv); + in->content, &toconv, flush); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index 7967cc66..e3ad1950 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -129,9 +129,14 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, * If iconv is supported, there are two extra fields. */ #ifdef LIBXML_ICU_ENABLED +/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */ +#define ICU_PIVOT_BUF_SIZE 1024 struct _uconv_t { UConverter *uconv; /* for conversion between an encoding and UTF-16 */ UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ + UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; + UChar *pivot_source; + UChar *pivot_target; }; typedef struct _uconv_t uconv_t; #endif @@ -210,7 +215,8 @@ XMLPUBFUN int XMLCALL XMLPUBFUN int XMLCALL xmlCharEncInFunc (xmlCharEncodingHandler *handler, xmlBufferPtr out, - xmlBufferPtr in); + xmlBufferPtr in, + int flush); XMLPUBFUN int XMLCALL xmlCharEncFirstLine (xmlCharEncodingHandler *handler, xmlBufferPtr out, diff --git a/result/icu_parse_test.xml b/result/icu_parse_test.xml new file mode 100644 index 00000000..e7163232 --- /dev/null +++ b/result/icu_parse_test.xml @@ -0,0 +1,13 @@ + + +Text with UTF8 chars at position 214 (0xd6) +______ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______駪槗___ +_ diff --git a/result/icu_parse_test.xml.rde b/result/icu_parse_test.xml.rde new file mode 100644 index 00000000..6af31937 --- /dev/null +++ b/result/icu_parse_test.xml.rde @@ -0,0 +1,14 @@ +0 1 foo 0 0 +1 3 #text 0 1 +Text with UTF8 chars at position 214 (0xd6) +______ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______駪槗___ +_ +0 15 foo 0 0 diff --git a/result/icu_parse_test.xml.rdr b/result/icu_parse_test.xml.rdr new file mode 100644 index 00000000..6af31937 --- /dev/null +++ b/result/icu_parse_test.xml.rdr @@ -0,0 +1,14 @@ +0 1 foo 0 0 +1 3 #text 0 1 +Text with UTF8 chars at position 214 (0xd6) +______ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______駪槗___ +_ +0 15 foo 0 0 diff --git a/result/icu_parse_test.xml.sax b/result/icu_parse_test.xml.sax new file mode 100644 index 00000000..1ac3a072 --- /dev/null +++ b/result/icu_parse_test.xml.sax @@ -0,0 +1,9 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(foo) +SAX.characters( +Text with UTF8 chars at posit, 171) +SAX.characters(駪槗___ +_, 9) +SAX.endElement(foo) +SAX.endDocument() diff --git a/result/icu_parse_test.xml.sax2 b/result/icu_parse_test.xml.sax2 new file mode 100644 index 00000000..97294a1b --- /dev/null +++ b/result/icu_parse_test.xml.sax2 @@ -0,0 +1,9 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElementNs(foo, NULL, NULL, 0, 0, 0) +SAX.characters( +Text with UTF8 chars at posit, 171) +SAX.characters(駪槗___ +_, 9) +SAX.endElementNs(foo, NULL, NULL) +SAX.endDocument() diff --git a/result/noent/icu_parse_test.xml b/result/noent/icu_parse_test.xml new file mode 100644 index 00000000..e7163232 --- /dev/null +++ b/result/noent/icu_parse_test.xml @@ -0,0 +1,13 @@ + + +Text with UTF8 chars at position 214 (0xd6) +______ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______駪槗___ +_ diff --git a/result/noent/icu_parse_test.xml.sax2 b/result/noent/icu_parse_test.xml.sax2 new file mode 100644 index 00000000..97294a1b --- /dev/null +++ b/result/noent/icu_parse_test.xml.sax2 @@ -0,0 +1,9 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElementNs(foo, NULL, NULL, 0, 0, 0) +SAX.characters( +Text with UTF8 chars at posit, 171) +SAX.characters(駪槗___ +_, 9) +SAX.endElementNs(foo, NULL, NULL) +SAX.endDocument() diff --git a/test/icu_parse_test.xml b/test/icu_parse_test.xml new file mode 100644 index 00000000..e7163232 --- /dev/null +++ b/test/icu_parse_test.xml @@ -0,0 +1,13 @@ + + +Text with UTF8 chars at position 214 (0xd6) +______ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______________ +_______駪槗___ +_ diff --git a/testapi.c b/testapi.c index 168ceb67..ef3f4733 100644 --- a/testapi.c +++ b/testapi.c @@ -8785,7 +8785,7 @@ test_xmlCharEncInFunc(void) { out = gen_xmlBufferPtr(n_out, 1); in = gen_xmlBufferPtr(n_in, 2); - ret_val = xmlCharEncInFunc(handler, out, in); + ret_val = xmlCharEncInFunc(handler, out, in, 1); desret_int(ret_val); call_tests++; des_xmlCharEncodingHandler_ptr(n_handler, handler, 0);