diff --git a/encoding.c b/encoding.c
index cd019c51..617e1ed7 100644
--- a/encoding.c
+++ b/encoding.c
@@ -110,6 +110,9 @@ openIcuConverter(const char* name, int toUnicode)
if (conv == NULL)
return NULL;
+ conv->pivot_source = conv->pivot_buf;
+ conv->pivot_target = conv->pivot_buf;
+
conv->uconv = ucnv_open(name, &status);
if (U_FAILURE(status))
goto error;
@@ -1850,6 +1853,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
* @outlen: the length of @out
* @in: a pointer to an array of ISO Latin 1 chars
* @inlen: the length of @in
+ * @flush: if true, indicates end of input
*
* Returns 0 if success, or
* -1 by lack of space, or
@@ -1863,7 +1867,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
*/
static int
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen) {
+ const unsigned char *in, int *inlen, int flush) {
const char *ucv_in = (const char *) in;
char *ucv_out = (char *) out;
UErrorCode err = U_ZERO_ERROR;
@@ -1873,33 +1877,30 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
return(-1);
}
- /*
- * TODO(jungshik)
- * 1. is ucnv_convert(To|From)Algorithmic better?
- * 2. had we better use an explicit pivot buffer?
- * 3. error returned comes from 'fromUnicode' only even
- * when toUnicode is true !
- */
if (toUnicode) {
/* encoding => UTF-16 => UTF-8 */
ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
- &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
- 0, TRUE, &err);
+ &ucv_in, ucv_in + *inlen, cd->pivot_buf,
+ &cd->pivot_source, &cd->pivot_target,
+ cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
} else {
/* UTF-8 => UTF-16 => encoding */
ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
- &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
- 0, TRUE, &err);
+ &ucv_in, ucv_in + *inlen, cd->pivot_buf,
+ &cd->pivot_source, &cd->pivot_target,
+ cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
}
*inlen = ucv_in - (const char*) in;
*outlen = ucv_out - (char *) out;
+ /* rewind the pivot only after a successful final (flush) call; after an error such as overflow it may still hold pending UTF-16 */
+ if (flush && U_SUCCESS(err))
+ cd->pivot_source = cd->pivot_target = cd->pivot_buf;
if (U_SUCCESS(err))
return 0;
if (err == U_BUFFER_OVERFLOW_ERROR)
return -1;
if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
return -2;
- /* if (err == U_TRUNCATED_CHAR_FOUND) */
return -3;
}
#endif /* LIBXML_ICU_ENABLED */
@@ -1912,7 +1913,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
static int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
- int *outlen, const unsigned char *in, int *inlen) {
+ int *outlen, const unsigned char *in, int *inlen, int flush) {
int ret;
if (handler->input != NULL) {
@@ -1925,7 +1926,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
else if (handler->uconv_in != NULL) {
- ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
+ ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
+ flush);
}
#endif /* LIBXML_ICU_ENABLED */
else {
@@ -1953,7 +1955,8 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
else if (handler->uconv_out != NULL) {
- ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
+ ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
+ TRUE);
}
#endif /* LIBXML_ICU_ENABLED */
else {
@@ -2015,7 +2018,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
}
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
- in->content, &toconv);
+ in->content, &toconv, 0);
xmlBufferShrink(in, toconv);
out->use += written;
out->content[out->use] = 0;
@@ -2133,7 +2136,7 @@ xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
c_in = toconv;
c_out = written;
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
- xmlBufContent(in), &c_in);
+ xmlBufContent(in), &c_in, 0);
xmlBufShrink(in, c_in);
xmlBufAddLen(out, c_out);
if (ret == -1)
@@ -2231,7 +2234,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
c_in = toconv;
c_out = written;
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
- xmlBufContent(in), &c_in);
+ xmlBufContent(in), &c_in, flush);
xmlBufShrink(in, c_in);
xmlBufAddLen(out, c_out);
if (ret == -1)
@@ -2285,6 +2288,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
* @handler: char encoding transformation data structure
* @out: an xmlBuffer for the output.
* @in: an xmlBuffer for the input
+ * @flush: if true, indicates end of input.
*
* Generic front-end for the encoding handler input function
*
@@ -2295,7 +2299,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
*/
int
xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
- xmlBufferPtr in)
+ xmlBufferPtr in, int flush)
{
int ret;
int written;
@@ -2317,7 +2321,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
written = out->size - out->use - 1;
}
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
- in->content, &toconv);
+ in->content, &toconv, flush);
xmlBufferShrink(in, toconv);
out->use += written;
out->content[out->use] = 0;
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 7967cc66..e3ad1950 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -129,9 +129,14 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
* If iconv is supported, there are two extra fields.
*/
#ifdef LIBXML_ICU_ENABLED
+/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
+#define ICU_PIVOT_BUF_SIZE 1024
struct _uconv_t {
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+ UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; /* UTF-16 pivot storage handed to ucnv_convertEx */
+ UChar *pivot_source; /* cursor into pivot_buf, passed by address to ucnv_convertEx */
+ UChar *pivot_target; /* cursor into pivot_buf, passed by address to ucnv_convertEx */
};
typedef struct _uconv_t uconv_t;
#endif
@@ -210,7 +215,8 @@ XMLPUBFUN int XMLCALL
XMLPUBFUN int XMLCALL
xmlCharEncInFunc (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
- xmlBufferPtr in);
+ xmlBufferPtr in,
+ int flush);
XMLPUBFUN int XMLCALL
xmlCharEncFirstLine (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
diff --git a/result/icu_parse_test.xml b/result/icu_parse_test.xml
new file mode 100644
index 00000000..e7163232
--- /dev/null
+++ b/result/icu_parse_test.xml
@@ -0,0 +1,13 @@
+
+
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
diff --git a/result/icu_parse_test.xml.rde b/result/icu_parse_test.xml.rde
new file mode 100644
index 00000000..6af31937
--- /dev/null
+++ b/result/icu_parse_test.xml.rde
@@ -0,0 +1,14 @@
+0 1 foo 0 0
+1 3 #text 0 1
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
+0 15 foo 0 0
diff --git a/result/icu_parse_test.xml.rdr b/result/icu_parse_test.xml.rdr
new file mode 100644
index 00000000..6af31937
--- /dev/null
+++ b/result/icu_parse_test.xml.rdr
@@ -0,0 +1,14 @@
+0 1 foo 0 0
+1 3 #text 0 1
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
+0 15 foo 0 0
diff --git a/result/icu_parse_test.xml.sax b/result/icu_parse_test.xml.sax
new file mode 100644
index 00000000..1ac3a072
--- /dev/null
+++ b/result/icu_parse_test.xml.sax
@@ -0,0 +1,9 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElement(foo)
+SAX.characters(
+Text with UTF8 chars at posit, 171)
+SAX.characters(駪槗___
+_, 9)
+SAX.endElement(foo)
+SAX.endDocument()
diff --git a/result/icu_parse_test.xml.sax2 b/result/icu_parse_test.xml.sax2
new file mode 100644
index 00000000..97294a1b
--- /dev/null
+++ b/result/icu_parse_test.xml.sax2
@@ -0,0 +1,9 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElementNs(foo, NULL, NULL, 0, 0, 0)
+SAX.characters(
+Text with UTF8 chars at posit, 171)
+SAX.characters(駪槗___
+_, 9)
+SAX.endElementNs(foo, NULL, NULL)
+SAX.endDocument()
diff --git a/result/noent/icu_parse_test.xml b/result/noent/icu_parse_test.xml
new file mode 100644
index 00000000..e7163232
--- /dev/null
+++ b/result/noent/icu_parse_test.xml
@@ -0,0 +1,13 @@
+
+
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
diff --git a/result/noent/icu_parse_test.xml.sax2 b/result/noent/icu_parse_test.xml.sax2
new file mode 100644
index 00000000..97294a1b
--- /dev/null
+++ b/result/noent/icu_parse_test.xml.sax2
@@ -0,0 +1,9 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElementNs(foo, NULL, NULL, 0, 0, 0)
+SAX.characters(
+Text with UTF8 chars at posit, 171)
+SAX.characters(駪槗___
+_, 9)
+SAX.endElementNs(foo, NULL, NULL)
+SAX.endDocument()
diff --git a/test/icu_parse_test.xml b/test/icu_parse_test.xml
new file mode 100644
index 00000000..e7163232
--- /dev/null
+++ b/test/icu_parse_test.xml
@@ -0,0 +1,13 @@
+
+
+Text with UTF8 chars at position 214 (0xd6)
+______
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______________
+_______駪槗___
+_
diff --git a/testapi.c b/testapi.c
index 168ceb67..ef3f4733 100644
--- a/testapi.c
+++ b/testapi.c
@@ -8785,7 +8785,7 @@ test_xmlCharEncInFunc(void) {
out = gen_xmlBufferPtr(n_out, 1);
in = gen_xmlBufferPtr(n_in, 2);
- ret_val = xmlCharEncInFunc(handler, out, in);
+ ret_val = xmlCharEncInFunc(handler, out, in, 1);
desret_int(ret_val);
call_tests++;
des_xmlCharEncodingHandler_ptr(n_handler, handler, 0);