[gnumeric] html: improve space handling.
- From: Morten Welinder <mortenw src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnumeric] html: improve space handling.
- Date: Sun, 25 Sep 2022 22:58:15 +0000 (UTC)
commit 577646d0e3168825b47bd69a92e5a0c50f694455
Author: Morten Welinder <terra gnome org>
Date: Sun Sep 25 18:58:01 2022 -0400
html: improve space handling.
NEWS | 4 ++++
plugins/html/ChangeLog | 5 +++++
plugins/html/html_read.c | 53 ++++++++++++++++++++++++++++++++++++++----------
3 files changed, 51 insertions(+), 11 deletions(-)
---
diff --git a/NEWS b/NEWS
index 33419a9a3..3d211e184 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,9 @@
Gnumeric 1.12.54
+John Denker:
+ * Improve whitespace handling in html import. [#671]
+
+
--------------------------------------------------------------------------
Gnumeric 1.12.53
diff --git a/plugins/html/ChangeLog b/plugins/html/ChangeLog
index 74cfbbb48..17282ed51 100644
--- a/plugins/html/ChangeLog
+++ b/plugins/html/ChangeLog
@@ -1,3 +1,8 @@
+2022-09-25 Morten Welinder <terra gnome org>
+
+ * html_read.c: Improve whitespace handling. Based on patch from
+ John Denker. [#671]
+
2022-09-17 Morten Welinder <terra gnome org>
* Release 1.12.53
diff --git a/plugins/html/html_read.c b/plugins/html/html_read.c
index 50daf18c7..d78a3eb8a 100644
--- a/plugins/html/html_read.c
+++ b/plugins/html/html_read.c
@@ -81,27 +81,58 @@ html_get_sheet (char const *name, Workbook *wb)
return sheet;
}
+
+/* deletes any initial whitespace */
+/* thereafter, including at the end, */
+/* collapses any run of whitespace to a single space. */
+/* (This may or may not be what you want, e.g. <pre>...</pre>) */
+/* It's up to the caller to deal with the possible final trailing space. */
static void
-html_append_text (GString *buf, const xmlChar *text)
+html_append_trim_text (GString *buf, const xmlChar *text)
{
const xmlChar *p;
+ const xmlChar *last_sp;
while (*text) {
- while (g_unichar_isspace (g_utf8_get_char (text)))
- text = g_utf8_next_char (text);
+ // collect a run of spaces, if any
+ for (last_sp = p = text;
+ *p && g_unichar_isspace (g_utf8_get_char (p));
+ p = g_utf8_next_char (p)) {
+ last_sp = p;
+ }
+ if (buf->len == 0 ||
+ g_unichar_isspace (g_utf8_get_char (g_utf8_prev_char (buf->str + buf->len)))) {
+ text = p; /* skip all the spaces */
+ } else {
+ text = last_sp; /* keep the last space */
+ }
if (*text) {
- for (p = text;
+ // collect a run of non-spaces, if any
+ for (/* keep p */;
*p && !g_unichar_isspace (g_utf8_get_char (p));
- p = g_utf8_next_char (p))
- ;
- if (buf->len > 0)
- g_string_append_c (buf, ' ');
+ p = g_utf8_next_char (p)) {
+ }
+ // here p points to either a space or EoS
+ if (*p) p = g_utf8_next_char (p);
+ // copy the non-spaces and one trailing space if any
g_string_append_len (buf, text, p - text);
- text = p;
}
+ text = p;
}
}
+/* remove one trailing space, if it exists */
+static void
+html_rtrim (GString *buf)
+{
+ if (buf->len == 0)
+ return;
+
+ gchar* last = g_utf8_prev_char (buf->str + buf->len);
+ if (g_unichar_isspace (g_utf8_get_char (last)))
+ g_string_truncate(buf, last - buf->str);
+}
+
static void
html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
xmlBufferPtr a_buf, GSList **hrefs, gboolean first,
@@ -112,7 +143,7 @@ html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
if (ptr->type == XML_TEXT_NODE) {
if (g_utf8_validate (ptr->content, -1, NULL))
- html_append_text (buf, ptr->content);
+ html_append_trim_text (buf, ptr->content);
else
g_string_append (buf, _("[Warning: Invalid text string has been removed.]"));
} else if (ptr->type == XML_ELEMENT_NODE) {
@@ -218,7 +249,7 @@ html_read_row (htmlNodePtr cur, htmlDocPtr doc, GnmHtmlTableCtxt *tc)
html_read_content (ptr, buf, mstyle, a_buf,
&hrefs, TRUE, doc, tc);
-
+ html_rtrim(buf);
if (g_slist_length (hrefs) >= 1 &&
buf->len > 0) {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]