[gnumeric] html: improve BOM handling.



commit 2105232e1935e771356a215fe11672bf3548ab2d
Author: Morten Welinder <terra gnome org>
Date:   Thu Oct 31 19:06:09 2019 -0400

    html: improve BOM handling.

 NEWS                     |  3 +++
 plugins/html/ChangeLog   |  4 ++++
 plugins/html/html_read.c | 22 +++++++++++++++++-----
 3 files changed, 24 insertions(+), 5 deletions(-)
---
diff --git a/NEWS b/NEWS
index dd1c0d867..abb9ce11d 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,9 @@ Morten:
        * Fix SUMIF (etc) problem with blank criteria.  [#423]
        * Improve editing of percentages.  [#413]
 
+Thomas Kuehne:
+       * Improve html import.  [#392]
+
 --------------------------------------------------------------------------
 Gnumeric 1.12.45
 
diff --git a/plugins/html/ChangeLog b/plugins/html/ChangeLog
index fc37c6951..7cd07a17b 100644
--- a/plugins/html/ChangeLog
+++ b/plugins/html/ChangeLog
@@ -1,3 +1,7 @@
+2019-10-31  Morten Welinder  <terra gnome org>
+
+       * html_read.c (html_file_open): Improve BOM handling.  See #392.
+
 2019-05-20  Morten Welinder <terra gnome org>
 
        * Release 1.12.45
diff --git a/plugins/html/html_read.c b/plugins/html/html_read.c
index c4dd90039..dfd14c2fe 100644
--- a/plugins/html/html_read.c
+++ b/plugins/html/html_read.c
@@ -506,26 +506,38 @@ html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
                buf = gsf_input_read (input, 4, NULL);
                if (buf != NULL) {
                        enc = xmlDetectCharEncoding(buf, 4);
-                       switch (enc) {  /* Skip byte order mark */
+                       switch (enc) {
+#if LIBXML_VERSION < 20702
+                       /* Skip byte order mark */
                        case XML_CHAR_ENCODING_UCS4BE:
                        case XML_CHAR_ENCODING_UCS4LE:
                        case XML_CHAR_ENCODING_UCS4_2143:
                        case XML_CHAR_ENCODING_UCS4_3412:
+                               if (buf[0] == 0xFE || buf[1] == 0xFE || buf[2] == 0xFE || buf[3] == 0xFE)
+                                       bomlen = 4;
+                               else
+                                       bomlen = 0;
+                               break;
                        case XML_CHAR_ENCODING_EBCDIC:
-                               bomlen = 4;
+                               if (buf[0] == 0xDD)
+                                       bomlen = 4;
+                               else
+                                       bomlen = 0;
                                break;
                        case XML_CHAR_ENCODING_UTF16BE:
                        case XML_CHAR_ENCODING_UTF16LE:
-                               bomlen = 2;
+                               if (buf[0] == 0xFE || buf[1] == 0xFE)
+                                       bomlen = 2;
+                               else
+                                       bomlen = 0;
                                break;
                        case XML_CHAR_ENCODING_UTF8:
                                if (buf[0] == 0xef)
                                        bomlen = 3;
-                               else if (buf[0] == 0x3c)
-                                       bomlen = 4;
                                else
                                        bomlen = 0;
                                break;
+#endif
                        case XML_CHAR_ENCODING_NONE:
                                bomlen = 0;
                                /* Try to detect unmarked UTF16LE


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]