[libxml2] Print error messages for truncated UTF-8 sequences



commit 79c8a6b10593c709fd3ceea9deef35c1a3da70f0
Author: Nick Wellnhofer <wellnhofer aevum de>
Date:   Sat Jun 10 17:01:27 2017 +0200

    Print error messages for truncated UTF-8 sequences
    
    Previously, a truncated UTF-8 sequence at the end of a file was silently
    treated as EOF. Now an encoding error is reported, and its message lists
    the offending bytes followed by "EOF".
    
    Also fix xmlStringCurrentChar, whose error message printed bytes from
    the parser's input stream instead of the string being decoded.

 parserInternals.c                    |   55 +++++++++++++++++----------------
 result/errors/partial_utf8_1.xml.err |    7 ++++
 result/errors/partial_utf8_1.xml.str |    4 ++
 result/errors/partial_utf8_2.xml.err |    7 ++++
 result/errors/partial_utf8_2.xml.str |    5 +++
 result/errors/partial_utf8_3.xml.err |    7 ++++
 result/errors/partial_utf8_3.xml.str |    5 +++
 test/errors/partial_utf8_1.xml       |    1 +
 test/errors/partial_utf8_2.xml       |    1 +
 test/errors/partial_utf8_3.xml       |    1 +
 10 files changed, 66 insertions(+), 27 deletions(-)
---
diff --git a/parserInternals.c b/parserInternals.c
index f09142b..0270f06 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -710,16 +710,6 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
     return((int) *ctxt->input->cur);
 encoding_error:
     /*
-     * An encoding problem may arise from a truncated input buffer
-     * splitting a character in the middle. In that case do not raise
-     * an error but return 0 to endicate an end of stream problem
-     */
-    if (ctxt->input->end - ctxt->input->cur < 4) {
-       *len = 0;
-       return(0);
-    }
-
-    /*
      * If we detect an UTF8 error that probably mean that the
      * input encoding didn't get properly advertised in the
      * declaration header. Report the error and switch the encoding
@@ -729,9 +719,21 @@ encoding_error:
     {
         char buffer[150];
 
-       snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
-                       ctxt->input->cur[0], ctxt->input->cur[1],
-                       ctxt->input->cur[2], ctxt->input->cur[3]);
+        if (ctxt->input->cur[1] == 0) {
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X EOF\n",
+                     ctxt->input->cur[0]);
+        } else if (ctxt->input->cur[2] == 0) {
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X EOF\n",
+                     ctxt->input->cur[0], ctxt->input->cur[1]);
+        } else if (ctxt->input->cur[3] == 0) {
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X EOF\n",
+                     ctxt->input->cur[0], ctxt->input->cur[1],
+                     ctxt->input->cur[2]);
+        } else {
+           snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                    ctxt->input->cur[0], ctxt->input->cur[1],
+                    ctxt->input->cur[2], ctxt->input->cur[3]);
+        }
        __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
                     "Input is not proper UTF-8, indicate encoding !\n%s",
                     BAD_CAST buffer, NULL);
@@ -821,17 +823,6 @@ xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
     *len = 1;
     return ((int) *cur);
 encoding_error:
-
-    /*
-     * An encoding problem may arise from a truncated input buffer
-     * splitting a character in the middle. In that case do not raise
-     * an error but return 0 to endicate an end of stream problem
-     */
-    if ((ctxt == NULL) || (ctxt->input == NULL) ||
-        (ctxt->input->end - ctxt->input->cur < 4)) {
-       *len = 0;
-       return(0);
-    }
     /*
      * If we detect an UTF8 error that probably mean that the
      * input encoding didn't get properly advertised in the
@@ -842,9 +833,19 @@ encoding_error:
     {
         char buffer[150];
 
-       snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
-                       ctxt->input->cur[0], ctxt->input->cur[1],
-                       ctxt->input->cur[2], ctxt->input->cur[3]);
+        if (cur[1] == 0) {
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X EOF\n",
+                     cur[0]);
+        } else if (cur[2] == 0) {
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X EOF\n",
+                     cur[0], cur[1]);
+        } else if (cur[3] == 0) {
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X EOF\n",
+                     cur[0], cur[1], cur[2]);
+        } else {
+           snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                    cur[0], cur[1], cur[2], cur[3]);
+        }
        __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
                     "Input is not proper UTF-8, indicate encoding !\n%s",
                     BAD_CAST buffer, NULL);
diff --git a/result/errors/partial_utf8_1.xml b/result/errors/partial_utf8_1.xml
new file mode 100644
index 0000000..e69de29
diff --git a/result/errors/partial_utf8_1.xml.err b/result/errors/partial_utf8_1.xml.err
new file mode 100644
index 0000000..544594f
--- /dev/null
+++ b/result/errors/partial_utf8_1.xml.err
@@ -0,0 +1,7 @@
+./test/errors/partial_utf8_1.xml:1: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0xC2 EOF
+<a>�
+   ^
+./test/errors/partial_utf8_1.xml:1: parser error : Premature end of data in tag a line 1
+<a>�
+    ^
diff --git a/result/errors/partial_utf8_1.xml.str b/result/errors/partial_utf8_1.xml.str
new file mode 100644
index 0000000..8b0cb57
--- /dev/null
+++ b/result/errors/partial_utf8_1.xml.str
@@ -0,0 +1,4 @@
+./test/errors/partial_utf8_1.xml:1: parser error : Extra content at the end of the document
+<a>�
+   ^
+./test/errors/partial_utf8_1.xml : failed to parse
diff --git a/result/errors/partial_utf8_2.xml b/result/errors/partial_utf8_2.xml
new file mode 100644
index 0000000..e69de29
diff --git a/result/errors/partial_utf8_2.xml.err b/result/errors/partial_utf8_2.xml.err
new file mode 100644
index 0000000..b205821
--- /dev/null
+++ b/result/errors/partial_utf8_2.xml.err
@@ -0,0 +1,7 @@
+./test/errors/partial_utf8_2.xml:1: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0xE3 0xA0 EOF
+<a>��
+   ^
+./test/errors/partial_utf8_2.xml:1: parser error : Premature end of data in tag a line 1
+<a>��
+     ^
diff --git a/result/errors/partial_utf8_2.xml.str b/result/errors/partial_utf8_2.xml.str
new file mode 100644
index 0000000..a91e904
--- /dev/null
+++ b/result/errors/partial_utf8_2.xml.str
@@ -0,0 +1,5 @@
+./test/errors/partial_utf8_2.xml:1: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0xE3 0xA0 EOF
+<a>��
+   ^
+./test/errors/partial_utf8_2.xml : failed to parse
diff --git a/result/errors/partial_utf8_3.xml b/result/errors/partial_utf8_3.xml
new file mode 100644
index 0000000..e69de29
diff --git a/result/errors/partial_utf8_3.xml.err b/result/errors/partial_utf8_3.xml.err
new file mode 100644
index 0000000..111ac11
--- /dev/null
+++ b/result/errors/partial_utf8_3.xml.err
@@ -0,0 +1,7 @@
+./test/errors/partial_utf8_3.xml:1: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0xF2 0xA0 0xA0 EOF
+<a>���
+   ^
+./test/errors/partial_utf8_3.xml:1: parser error : Premature end of data in tag a line 1
+<a>���
+      ^
diff --git a/result/errors/partial_utf8_3.xml.str b/result/errors/partial_utf8_3.xml.str
new file mode 100644
index 0000000..2060852
--- /dev/null
+++ b/result/errors/partial_utf8_3.xml.str
@@ -0,0 +1,5 @@
+./test/errors/partial_utf8_3.xml:1: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0xF2 0xA0 0xA0 EOF
+<a>���
+   ^
+./test/errors/partial_utf8_3.xml : failed to parse
diff --git a/test/errors/partial_utf8_1.xml b/test/errors/partial_utf8_1.xml
new file mode 100644
index 0000000..f859620
--- /dev/null
+++ b/test/errors/partial_utf8_1.xml
@@ -0,0 +1 @@
+<a>�
\ No newline at end of file
diff --git a/test/errors/partial_utf8_2.xml b/test/errors/partial_utf8_2.xml
new file mode 100644
index 0000000..dbc2d81
--- /dev/null
+++ b/test/errors/partial_utf8_2.xml
@@ -0,0 +1 @@
+<a>��
\ No newline at end of file
diff --git a/test/errors/partial_utf8_3.xml b/test/errors/partial_utf8_3.xml
new file mode 100644
index 0000000..0abd9de
--- /dev/null
+++ b/test/errors/partial_utf8_3.xml
@@ -0,0 +1 @@
+<a>���
\ No newline at end of file


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]