[gnumeric] Fix import of non-UTF8 csv files. [#658916]
- From: Andreas J. Guelzow <guelzow src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnumeric] Fix import of non-UTF8 csv files. [#658916]
- Date: Tue, 13 Sep 2011 18:42:14 +0000 (UTC)
commit 14b59c0c202adadae49ec14adf0193269c66739d
Author: Andreas J Guelzow <aguelzow pyrshep ca>
Date: Tue Sep 13 12:40:45 2011 -0600
Fix import of non-UTF8 csv files. [#658916]
2011-09-13 Andreas J. Guelzow <aguelzow pyrshep ca>
* src/stf.c (stf_open_and_read): do not check for NUL here
(clear_stray_NULs): new
(stf_read_workbook_auto_csvtab): adjust call to go_guess_encoding and
call clear_stray_NULs
(csv_tsv_probe): adjust call to go_guess_encoding
2011-09-13 Andreas J. Guelzow <aguelzow pyrshep ca>
* dialog-stf-main-page.c (stf_dialog_main_page_init): adjust call to
go_guess_encoding
2011-09-13 Andreas J. Guelzow <aguelzow pyrshep ca>
* html_read.c (html_file_probe): adjust call to go_guess_encoding
ChangeLog | 8 +++
NEWS | 1 +
plugins/html/ChangeLog | 4 ++
plugins/html/html_read.c | 24 +++------
src/dialogs/ChangeLog | 5 ++
src/dialogs/dialog-stf-main-page.c | 2 +-
src/stf.c | 97 ++++++++++++++++++------------------
src/xml-sax-read.c | 20 ++++---
8 files changed, 87 insertions(+), 74 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 7b2ae00..50ff8ea 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2011-09-13 Andreas J. Guelzow <aguelzow pyrshep ca>
+
+ * src/stf.c (stf_open_and_read): do not check for NUL here
+ (clear_stray_NULs): new
+ (stf_read_workbook_auto_csvtab): adjust call to go_guess_encoding and
+ call clear_stray_NULs
+ (csv_tsv_probe): adjust call to go_guess_encoding
+
2011-09-08 Morten Welinder <terra gnome org>
* configure.in: Switch to AM_MAINTAINER_MODE([enable]).
diff --git a/NEWS b/NEWS
index 8b132ed..a8fedc9 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,7 @@ Andreas:
* Some more number formatting improvements to ODF import/export.
* Read and write tick spacing from/to ODF files.
* Fix graph editor crash. [#658223]
+ * Fix import of non-UTF8 csv files. [#658916]
Jean:
* Make things build against gtk+-3.0.
diff --git a/plugins/html/ChangeLog b/plugins/html/ChangeLog
index ac693bf..68858ed 100644
--- a/plugins/html/ChangeLog
+++ b/plugins/html/ChangeLog
@@ -1,3 +1,7 @@
+2011-09-13 Andreas J. Guelzow <aguelzow pyrshep ca>
+
+ * html_read.c (html_file_probe): adjust call to go_guess_encoding
+
2011-07-31 Morten Welinder <terra gnome org>
* Release 1.10.17
diff --git a/plugins/html/html_read.c b/plugins/html/html_read.c
index 0d45bf6..4743d68 100644
--- a/plugins/html/html_read.c
+++ b/plugins/html/html_read.c
@@ -482,7 +482,7 @@ html_search_for_tables (htmlNodePtr cur, htmlDocPtr doc,
}
void
-html_file_open (GOFileOpener const *fo, GOIOContext *io_context,
+html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
WorkbookView *wb_view, GsfInput *input)
{
guint8 const *buf;
@@ -571,13 +571,14 @@ html_file_open (GOFileOpener const *fo, GOIOContext *io_context,
/* Quick and dirty html probe. */
gboolean
-html_file_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
+html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input,
+ G_GNUC_UNUSED GOFileProbeLevel pl)
{
gsf_off_t size = 200;
guint8 const* buf = gsf_input_read (input, size, NULL);
gchar *ulstr = NULL;
+ GString *ustr;
gboolean res = FALSE;
- int try;
/* Avoid seeking in large streams - try to read, fall back if
* stream is too short. (Actually, currently _size does not
@@ -589,20 +590,11 @@ html_file_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
return res;
}
- /*
- * It is conceivable that encoding guessing could fail
- * if our truncated buffer had partial characters. We
- * really need go_guess_encoding_truncated, but for now
- * let's just try cutting a byte away at a time.
- */
- for (try = 0; try < MIN (size, 6); try++) {
- char *ustr;
- if (go_guess_encoding (buf, size - try, NULL, &ustr)) {
- ulstr = g_utf8_strdown (ustr, -1);
- g_free (ustr);
- break;
- }
+ if (go_guess_encoding (buf, size, NULL, &ustr, NULL)) {
+ ulstr = g_utf8_strdown (ustr->str, -1);
+ g_string_free (ustr, TRUE);
}
+
if (!ulstr)
return res;
diff --git a/src/dialogs/ChangeLog b/src/dialogs/ChangeLog
index 0a680d2..ef2283e 100644
--- a/src/dialogs/ChangeLog
+++ b/src/dialogs/ChangeLog
@@ -1,3 +1,8 @@
+2011-09-13 Andreas J. Guelzow <aguelzow pyrshep ca>
+
+ * dialog-stf-main-page.c (stf_dialog_main_page_init): adjust call to
+ go_guess_encoding
+
2011-09-12 Jean Brefort <jean brefort normalesup org>
* autofilter-top10.ui: fixed radio buttons.
diff --git a/src/dialogs/dialog-stf-main-page.c b/src/dialogs/dialog-stf-main-page.c
index eb6d4ee..44d1560 100644
--- a/src/dialogs/dialog-stf-main-page.c
+++ b/src/dialogs/dialog-stf-main-page.c
@@ -345,7 +345,7 @@ stf_dialog_main_page_init (GtkBuilder *gui, StfDialogData *pagedata)
encoding_guess = go_guess_encoding (pagedata->raw_data, pagedata->raw_data_len,
"ASCII",
- NULL);
+ NULL, NULL);
pagedata->main.main_separated = GTK_RADIO_BUTTON (go_gtk_builder_get_widget (gui, "main_separated"));
pagedata->main.main_fixed = GTK_RADIO_BUTTON (go_gtk_builder_get_widget (gui, "main_fixed"));
diff --git a/src/stf.c b/src/stf.c
index ba0e65f..6c23906 100644
--- a/src/stf.c
+++ b/src/stf.c
@@ -93,13 +93,11 @@ stf_warning (GOIOContext *context, char const *msg)
* returns : a buffer containing the file contents
**/
static char *
-stf_open_and_read (GOIOContext *context, GsfInput *input, size_t *readsize)
+stf_open_and_read (G_GNUC_UNUSED GOIOContext *context, GsfInput *input, size_t *readsize)
{
gpointer result;
gulong allocsize;
gsf_off_t size = gsf_input_size (input);
- char *cpointer;
- int null_chars = 0;
if (gsf_input_seek (input, 0, G_SEEK_SET))
return NULL;
@@ -122,28 +120,6 @@ stf_open_and_read (GOIOContext *context, GsfInput *input, size_t *readsize)
g_free (result);
result = NULL;
}
-
- cpointer = (char *)result;
- while (*cpointer != 0)
- cpointer++;
- while (cpointer != ((char *)result + *readsize)) {
- null_chars++;
- *cpointer = ' ';
- while (*cpointer != 0)
- cpointer++;
- }
- if (null_chars > 0) {
- gchar const *format;
- gchar *msg;
- format = ngettext ("The file contains %d NULL character. "
- "It has been changed to a space.",
- "The file contains %d NULL characters. "
- "They have been changed to spaces.",
- null_chars);
- msg = g_strdup_printf (format, null_chars);
- stf_warning (context, msg);
- g_free (msg);
- }
return result;
}
@@ -238,7 +214,7 @@ resize_columns (Sheet *sheet)
* Main routine, handles importing a file including all dialog mumbo-jumbo
**/
static void
-stf_read_workbook (GOFileOpener const *fo, gchar const *enc,
+stf_read_workbook (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
GOIOContext *context, gpointer wbv, GsfInput *input)
{
DialogStfResult_t *dialogresult = NULL;
@@ -397,6 +373,36 @@ stf_text_to_columns (WorkbookControl *wbc, GOCmdContext *cc)
g_object_unref (G_OBJECT (buf));
}
+static void
+clear_stray_NULs (GOIOContext *context, GString *utf8data)
+{
+ char *cpointer, *endpointer;
+ int null_chars = 0;
+
+ cpointer = utf8data->str;
+ endpointer = utf8data->str + utf8data->len;
+ while (*cpointer != 0)
+ cpointer++;
+ while (cpointer != endpointer) {
+ null_chars++;
+ *cpointer = ' ';
+ while (*cpointer != 0)
+ cpointer++;
+ }
+ if (null_chars > 0) {
+ gchar const *format;
+ gchar *msg;
+ format = ngettext ("The file contains %d NULL character. "
+ "It has been changed to a space.",
+ "The file contains %d NULL characters. "
+ "They have been changed to spaces.",
+ null_chars);
+ msg = g_strdup_printf (format, null_chars);
+ stf_warning (context, msg);
+ g_free (msg);
+ }
+}
+
/**
* stf_read_workbook_auto_csvtab
* @fo : file opener
@@ -408,14 +414,15 @@ stf_text_to_columns (WorkbookControl *wbc, GOCmdContext *cc)
* Attempt to auto-detect CSV or tab-delimited file
**/
static void
-stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
+stf_read_workbook_auto_csvtab (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
GOIOContext *context,
gpointer wbv, GsfInput *input)
{
Sheet *sheet, *old_sheet;
Workbook *book;
char *name;
- char *data, *utf8data;
+ char *data;
+ GString *utf8data;
size_t data_len;
StfParseOptions_t *po;
const char *gsfname;
@@ -433,7 +440,7 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
if (!data)
return;
- enc = go_guess_encoding (data, data_len, enc, &utf8data);
+ enc = go_guess_encoding (data, data_len, enc, &utf8data, NULL);
g_free (data);
if (!enc) {
@@ -442,6 +449,8 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
return;
}
+ clear_stray_NULs (context, utf8data);
+
/*
* Try to get the filename we're reading from. This is not a
* great way.
@@ -452,14 +461,14 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
const char *ext = gsf_extension_pointer (gsfname);
gboolean iscsv = ext && strcasecmp (ext, "csv") == 0;
if (iscsv)
- po = stf_parse_options_guess_csv (utf8data);
+ po = stf_parse_options_guess_csv (utf8data->str);
else
- po = stf_parse_options_guess (utf8data);
+ po = stf_parse_options_guess (utf8data->str);
}
lines_chunk = g_string_chunk_new (100 * 1024);
lines = stf_parse_general (po, lines_chunk,
- utf8data, utf8data + strlen (utf8data));
+ utf8data->str, utf8data->str + utf8data->len);
rows = lines->len;
cols = 0;
for (i = 0; i < rows; i++) {
@@ -475,7 +484,7 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
g_free (name);
workbook_sheet_attach (book, sheet);
- if (stf_parse_sheet (po, utf8data, NULL, sheet, 0, 0)) {
+ if (stf_parse_sheet (po, utf8data->str, NULL, sheet, 0, 0)) {
workbook_recalc_all (book);
resize_columns (sheet);
if (po->cols_exceeded || po->rows_exceeded) {
@@ -496,13 +505,13 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
stf_parse_options_free (po);
- g_free (utf8data);
+ g_string_free (utf8data, TRUE);
}
/***********************************************************************************/
static void
-stf_write_csv (GOFileSaver const *fs, GOIOContext *context,
+stf_write_csv (G_GNUC_UNUSED GOFileSaver const *fs, GOIOContext *context,
gconstpointer wbv, GsfOutput *output)
{
Sheet *sheet;
@@ -541,9 +550,8 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
guint8 const *header;
gsf_off_t i;
char const *enc = NULL;
- char *header_utf8;
+ GString *header_utf8;
char const *p;
- int try;
gboolean ok = TRUE;
if (gsf_input_seek (input, 0, G_SEEK_SET))
@@ -559,18 +567,11 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
if (NULL == (header = gsf_input_read (input, i, NULL)))
return FALSE;
- /*
- * It is conceivable that encoding guessing could fail
- * if our truncated buffer had partial characters. We
- * really need go_guess_encoding_truncated, but for now
- * let's just try cutting a byte away at a time.
- */
- for (try = 0; !enc && try < MIN (i, 6); try++)
- enc = go_guess_encoding (header, i - try, NULL, &header_utf8);
+ enc = go_guess_encoding (header, i, NULL, &header_utf8, NULL);
if (!enc)
return FALSE;
- for (p = header_utf8; *p; p = g_utf8_next_char (p)) {
+ for (p = header_utf8->str; *p; p = g_utf8_next_char (p)) {
gunichar uc = g_utf8_get_char (p);
/* isprint might not be true for these: */
if (uc == '\n' || uc == '\t' || uc == '\r')
@@ -580,7 +581,7 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
* http://en.wikipedia.org/wiki/Byte_Order_Mark for
* background.
*/
- if (p == header_utf8 && uc == 0x0000FEFF) {
+ if (p == header_utf8->str && uc == 0x0000FEFF) {
continue;
}
if (!g_unichar_isprint (uc)) {
@@ -589,7 +590,7 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
}
}
- g_free (header_utf8);
+ g_string_free (header_utf8, TRUE);
return ok;
} else {
char const *name = gsf_input_name (input);
diff --git a/src/xml-sax-read.c b/src/xml-sax-read.c
index a609eff..1249ee9 100644
--- a/src/xml-sax-read.c
+++ b/src/xml-sax-read.c
@@ -462,7 +462,7 @@ xml_sax_wb (GsfXMLIn *xin, xmlChar const **attrs)
{ "http://www.gnome.org/gnumeric/v3", GNM_XML_V3 },
{ "http://www.gnome.org/gnumeric/v2", GNM_XML_V2 },
{ "http://www.gnome.org/gnumeric/", GNM_XML_V1 },
- { NULL }
+ { NULL, 0}
};
int i;
for (i = 0 ; GnumericVersions [i].id != NULL ; ++i )
@@ -483,7 +483,7 @@ xml_sax_wb (GsfXMLIn *xin, xmlChar const **attrs)
}
static void
-xml_sax_document_meta (GsfXMLIn *xin, xmlChar const **attrs)
+xml_sax_document_meta (GsfXMLIn *xin, G_GNUC_UNUSED xmlChar const **attrs)
{
XMLSaxParseState *state = (XMLSaxParseState *)xin->user_state;
@@ -934,7 +934,7 @@ xml_sax_page_break (GsfXMLIn *xin, xmlChar const **attrs)
}
static void
-xml_sax_page_breaks_begin (GsfXMLIn *xin, xmlChar const **attrs)
+xml_sax_page_breaks_begin (GsfXMLIn *xin, G_GNUC_UNUSED xmlChar const **attrs)
{
XMLSaxParseState *state = (XMLSaxParseState *)xin->user_state;
xml_sax_must_have_sheet (state);
@@ -3307,7 +3307,7 @@ maybe_convert (GsfInput *input, gboolean quiet)
gsf_off_t input_size;
GString the_buffer, *buffer = &the_buffer;
guint ui;
- char *converted;
+ GString *converted = NULL;
char const *encoding;
gboolean ok;
gboolean any_numbered = FALSE;
@@ -3358,19 +3358,21 @@ maybe_convert (GsfInput *input, gboolean quiet)
}
}
- encoding = go_guess_encoding (buffer->str, buffer->len, NULL, &converted);
+ encoding = go_guess_encoding (buffer->str, buffer->len, NULL, &converted, NULL);
if (encoding && !any_numbered &&
- converted && strcmp (buffer->str, converted) == 0)
+ converted && buffer->len == converted->len &&
+ strcmp (buffer->str, converted->str) == 0)
quiet = TRUE;
g_free (buffer->str);
if (encoding) {
+ gsize len = converted->len;
g_object_unref (input);
if (!quiet)
g_warning ("Converted xml document with no explicit encoding from transliterated %s to UTF-8.",
encoding);
- return gsf_input_memory_new ((void *)converted, strlen (converted), TRUE);
+ return gsf_input_memory_new ((void *)g_string_free (converted, FALSE), len, TRUE);
} else {
if (!quiet)
g_warning ("Failed to convert xml document with no explicit encoding to UTF-8.");
@@ -3379,7 +3381,7 @@ maybe_convert (GsfInput *input, gboolean quiet)
}
static void
-gnm_xml_file_open (GOFileOpener const *fo, GOIOContext *io_context,
+gnm_xml_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
gpointer wb_view, GsfInput *input)
{
XMLSaxParseState state;
@@ -3494,7 +3496,7 @@ gnm_xml_probe_element (const xmlChar *name,
}
static gboolean
-xml_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
+xml_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
{
if (pl == GO_FILE_PROBE_FILE_NAME) {
char const *name = gsf_input_name (input);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]