[gnumeric] Applix: fix encoding and character escape problems.
- From: Morten Welinder <mortenw src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnumeric] Applix: fix encoding and character escape problems.
- Date: Sat, 3 Nov 2018 20:37:12 +0000 (UTC)
commit 1c67e3e8c11a4a6af2dc9f6f0ea9efa3ebc8350a
Author: Morten Welinder <terra gnome org>
Date: Sat Nov 3 16:13:14 2018 -0400
Applix: fix encoding and character escape problems.
Escaping and continuation lines didn't play well together.
Decoding was absolutely bogus even for the 8-bit days for which it was
written. Assume ISO-8859-1.
There is some kind of longer encoding that we still don't handle.
http://www.vistasource.com/doc/applixware/wordstechref.pdf
NEWS | 1 +
plugins/applix/ChangeLog | 6 +++
plugins/applix/applix-read.c | 107 +++++++++++++++++++++++++++----------------
3 files changed, 74 insertions(+), 40 deletions(-)
---
diff --git a/NEWS b/NEWS
index 29d6b80cd..807776e83 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,7 @@ Morten:
* New NT_RADICAL function.
* Fix conditional style crash.
* Fix applix locale problem. [#362]
+ * Fix applix encoding and escape problems. [#363]
--------------------------------------------------------------------------
Gnumeric 1.12.43
diff --git a/plugins/applix/ChangeLog b/plugins/applix/ChangeLog
index 0167c9319..c555e7233 100644
--- a/plugins/applix/ChangeLog
+++ b/plugins/applix/ChangeLog
@@ -1,3 +1,9 @@
+2018-11-03 Morten Welinder <terra gnome org>
+
+ * applix-read.c (applix_get_line): Properly handle continuation
+ lines with escaped characters. Handle non-ASCII characters on the
+ assumption that they are ISO-8859-1. Fixes #363.
+
2018-11-01 Morten Welinder <terra gnome org>
* applix-read.c (applix_conventions_new): Set up separators so we
diff --git a/plugins/applix/applix-read.c b/plugins/applix/applix-read.c
index 9254c8110..6303ccd49 100644
--- a/plugins/applix/applix-read.c
+++ b/plugins/applix/applix-read.c
@@ -78,6 +78,7 @@ typedef struct {
GSList *std_names, *real_names;
GnmConventions *convs;
+ GIConv converter;
} ApplixReadState;
/* #define NO_DEBUG_APPLIX */
@@ -296,57 +297,81 @@ static unsigned char *
applix_get_line (ApplixReadState *state)
{
unsigned char *ptr, *end, *buf;
- size_t len, skip = 0, offset = 0;
+ GString *line = g_string_new (NULL);
+ gboolean first = TRUE;
+ // Read line and continuation lines.
while (NULL != (ptr = gsf_input_textline_ascii_gets (state->input))) {
- len = strlen (ptr);
+ size_t len = strlen (ptr);
+ // Clip at the state line length
+ size_t uselen = MIN (len, state->line_len);
+
+ if (first) {
+ first = FALSE;
+ g_string_append_len (line, ptr, uselen);
+ } else if (uselen > 0) {
+ // Drop initial space from continuation line
+ g_string_append_len (line, ptr + 1, uselen - 1);
+ }
- /* Clip at the state line length */
- if (len > state->line_len)
- len = state->line_len;
+ if (len < state->line_len)
+ break;
+ }
- if ((offset + len) > state->buffer_size) {
- state->buffer_size += state->line_len;
- state->buffer = g_realloc (state->buffer, state->buffer_size + 1);
- }
+ if (line->len > state->buffer_size) {
+ state->buffer_size = line->len;
+ state->buffer = g_realloc (state->buffer, state->buffer_size + 1);
+ }
- end = ptr + len;
- ptr += skip;
- buf = state->buffer + offset;
- while (ptr < end) {
- if (*ptr == '^') {
- if (ptr [1] != '^') {
- if (ptr [1] == '\0' || ptr [2] == '\0') {
- applix_parse_error (state, _("Missing characters for character encoding"));
- *(buf++) = *(ptr++);
- } else if (ptr [1] < 'a' || ptr [1] > 'p' ||
- ptr [2] < 'a' || ptr [2] > 'p') {
- applix_parse_error (state, _("Invalid characters for encoding '%c%c'"),
- ptr[1], ptr[2]);
- *(buf++) = *(ptr++);
- } else {
- *(buf++) = ((ptr[1] - 'a') << 8) | (ptr[2] - 'a');
- ptr += 3;
- }
- } else /* an encoded carat */
- *(buf++) = '^', ptr += 2;
- } else
- *(buf++) = *(ptr++);
+ ptr = line->str;
+ end = ptr + line->len;
+ buf = state->buffer;
+
+ // g_printerr ("Pre [%s]\n", ptr);
+
+ while (ptr < end) {
+ if (*ptr != '^') {
+ *(buf++) = *(ptr++);
+ continue;
}
- offset = buf - state->buffer;
+ if (ptr[1] == '^') {
+ // An encoded carat
+ *(buf++) = '^', ptr += 2;
+ continue;
+ }
- if (len >= state->line_len)
- skip = 1; /* skip the leading space for next line */
- else
- break;
+ if (ptr[1] == '\0' || ptr[2] == '\0') {
+ applix_parse_error (state, _("Missing characters for character encoding"));
+ *(buf++) = *(ptr++);
+ } else if (ptr[1] < 'a' || ptr[1] > 'p' ||
+ ptr[2] < 'a' || ptr[2] > 'p') {
+ applix_parse_error (state, _("Invalid characters for encoding '%c%c'"),
+ ptr[1], ptr[2]);
+ *(buf++) = *(ptr++);
+ } else {
+ guchar uc = ((ptr[1] - 'a') << 4) | (ptr[2] - 'a');
+ gsize utf8_len;
+ char *utf8buf = g_convert_with_iconv (&uc, 1, state->converter, NULL,
+ &utf8_len, NULL);
+ memcpy (buf, utf8buf, utf8_len);
+ buf += utf8_len;
+ g_free (utf8buf);
+ ptr += 3;
+ }
}
- if (offset == 0 && ptr == NULL)
+ if (line->len == 0) {
+ g_string_free (line, TRUE);
return NULL;
+ }
+
+ if (buf)
+ *buf = 0;
+
+ g_string_free (line, TRUE);
- if (state->buffer != NULL)
- state->buffer [offset] = '\0';
+ //g_printerr ("Post: [%s]\n", state->buffer);
return state->buffer;
}
@@ -885,7 +910,7 @@ applix_read_attributes (ApplixReadState *state)
if (!a_strncmp (ptr, "Attr Table End"))
return FALSE;
- if (ptr [0] != '<')
+ if (ptr[0] != '<')
return applix_parse_error (state, "Invalid attribute");
/* TODO : The first style seems to be a different format */
@@ -1659,6 +1684,7 @@ applix_read (GOIOContext *io_context, WorkbookView *wb_view, GsfInput *src)
state.std_names = NULL;
state.real_names = NULL;
state.convs = applix_conventions_new ();
+ state.converter = g_iconv_open ("UTF-8", "ISO-8859-1");
/* Actually read the workbook */
res = applix_read_impl (&state);
@@ -1708,4 +1734,5 @@ applix_read (GOIOContext *io_context, WorkbookView *wb_view, GsfInput *src)
go_io_error_info_set (io_context, state.parse_error);
gnm_conventions_unref (state.convs);
+ gsf_iconv_close (state.converter);
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]