[gnumeric] Fix import of non-UTF8 csv files. [#658916]



commit 14b59c0c202adadae49ec14adf0193269c66739d
Author: Andreas J Guelzow <aguelzow pyrshep ca>
Date:   Tue Sep 13 12:40:45 2011 -0600

    Fix import of non-UTF8 csv files. [#658916]
    
    2011-09-13  Andreas J. Guelzow <aguelzow pyrshep ca>
    
    	* src/stf.c (stf_open_and_read): do not check for NUL here
    	(clear_stray_NULs): new
    	(stf_read_workbook_auto_csvtab): adjust call to go_guess_encoding and
    	call clear_stray_NULs
    	(csv_tsv_probe): adjust call to go_guess_encoding
    
    2011-09-13  Andreas J. Guelzow <aguelzow pyrshep ca>
    
    	* dialog-stf-main-page.c (stf_dialog_main_page_init): adjust call to
    	go_guess_encoding
    
    2011-09-13  Andreas J. Guelzow <aguelzow pyrshep ca>
    
    	* html_read.c (html_file_probe): adjust call to go_guess_encoding

 ChangeLog                          |    8 +++
 NEWS                               |    1 +
 plugins/html/ChangeLog             |    4 ++
 plugins/html/html_read.c           |   24 +++------
 src/dialogs/ChangeLog              |    5 ++
 src/dialogs/dialog-stf-main-page.c |    2 +-
 src/stf.c                          |   97 ++++++++++++++++++------------------
 src/xml-sax-read.c                 |   20 ++++---
 8 files changed, 87 insertions(+), 74 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 7b2ae00..50ff8ea 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2011-09-13  Andreas J. Guelzow <aguelzow pyrshep ca>
+
+	* src/stf.c (stf_open_and_read): do not check for NUL here
+	(clear_stray_NULs): new
+	(stf_read_workbook_auto_csvtab): adjust call to go_guess_encoding and
+	call clear_stray_NULs
+	(csv_tsv_probe): adjust call to go_guess_encoding
+
 2011-09-08  Morten Welinder  <terra gnome org>
 
 	* configure.in: Switch to AM_MAINTAINER_MODE([enable]).
diff --git a/NEWS b/NEWS
index 8b132ed..a8fedc9 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,7 @@ Andreas:
 	* Some more number formatting improvements to ODF import/export.
 	* Read and write tick spacing from/to ODF files.
 	* Fix graph editor crash. [#658223]
+	* Fix import of non-UTF8 csv files. [#658916]
 
 Jean:
 	* Make things build against gtk+-3.0.
diff --git a/plugins/html/ChangeLog b/plugins/html/ChangeLog
index ac693bf..68858ed 100644
--- a/plugins/html/ChangeLog
+++ b/plugins/html/ChangeLog
@@ -1,3 +1,7 @@
+2011-09-13  Andreas J. Guelzow <aguelzow pyrshep ca>
+
+	* html_read.c (html_file_probe): adjust call to go_guess_encoding
+
 2011-07-31  Morten Welinder <terra gnome org>
 
 	* Release 1.10.17
diff --git a/plugins/html/html_read.c b/plugins/html/html_read.c
index 0d45bf6..4743d68 100644
--- a/plugins/html/html_read.c
+++ b/plugins/html/html_read.c
@@ -482,7 +482,7 @@ html_search_for_tables (htmlNodePtr cur, htmlDocPtr doc,
 }
 
 void
-html_file_open (GOFileOpener const *fo, GOIOContext *io_context,
+html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
 		WorkbookView *wb_view, GsfInput *input)
 {
 	guint8 const *buf;
@@ -571,13 +571,14 @@ html_file_open (GOFileOpener const *fo, GOIOContext *io_context,
 
 /* Quick and dirty html probe. */
 gboolean
-html_file_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
+html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input, 
+		 G_GNUC_UNUSED GOFileProbeLevel pl)
 {
 	gsf_off_t size = 200;
 	guint8 const* buf = gsf_input_read (input, size, NULL);
 	gchar *ulstr = NULL;
+	GString *ustr;
 	gboolean res = FALSE;
-	int try;
 
 	/* Avoid seeking in large streams - try to read, fall back if
 	 * stream is too short.  (Actually, currently _size does not
@@ -589,20 +590,11 @@ html_file_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
 			return res;
 	}
 
-	/*
-	 * It is conceivable that encoding guessing could fail
-	 * if our truncated buffer had partial characters.  We
-	 * really need go_guess_encoding_truncated, but for now
-	 * let's just try cutting a byte away at a time.
-	 */
-	for (try = 0; try < MIN (size, 6); try++) {
-		char *ustr;
-		if (go_guess_encoding (buf, size - try, NULL, &ustr)) {
-			ulstr = g_utf8_strdown (ustr, -1);
-			g_free (ustr);
-			break;
-		}
+	if (go_guess_encoding (buf, size, NULL, &ustr, NULL)) {
+		ulstr = g_utf8_strdown (ustr->str, -1);
+		g_string_free (ustr, TRUE);
 	}
+
 	if (!ulstr)
 		return res;
 
diff --git a/src/dialogs/ChangeLog b/src/dialogs/ChangeLog
index 0a680d2..ef2283e 100644
--- a/src/dialogs/ChangeLog
+++ b/src/dialogs/ChangeLog
@@ -1,3 +1,8 @@
+2011-09-13  Andreas J. Guelzow <aguelzow pyrshep ca>
+
+	* dialog-stf-main-page.c (stf_dialog_main_page_init): adjust call to
+	go_guess_encoding
+
 2011-09-12  Jean Brefort  <jean brefort normalesup org>
 
 	* autofilter-top10.ui: fixed radio buttons.
diff --git a/src/dialogs/dialog-stf-main-page.c b/src/dialogs/dialog-stf-main-page.c
index eb6d4ee..44d1560 100644
--- a/src/dialogs/dialog-stf-main-page.c
+++ b/src/dialogs/dialog-stf-main-page.c
@@ -345,7 +345,7 @@ stf_dialog_main_page_init (GtkBuilder *gui, StfDialogData *pagedata)
 
 	encoding_guess = go_guess_encoding (pagedata->raw_data, pagedata->raw_data_len,
 					    "ASCII",
-					    NULL);
+					    NULL, NULL);
 
 	pagedata->main.main_separated = GTK_RADIO_BUTTON (go_gtk_builder_get_widget (gui, "main_separated"));
 	pagedata->main.main_fixed     = GTK_RADIO_BUTTON (go_gtk_builder_get_widget (gui, "main_fixed"));
diff --git a/src/stf.c b/src/stf.c
index ba0e65f..6c23906 100644
--- a/src/stf.c
+++ b/src/stf.c
@@ -93,13 +93,11 @@ stf_warning (GOIOContext *context, char const *msg)
  * returns : a buffer containing the file contents
  **/
 static char *
-stf_open_and_read (GOIOContext *context, GsfInput *input, size_t *readsize)
+stf_open_and_read (G_GNUC_UNUSED GOIOContext *context, GsfInput *input, size_t *readsize)
 {
 	gpointer result;
 	gulong    allocsize;
 	gsf_off_t size = gsf_input_size (input);
-	char *cpointer;
-	int null_chars = 0;
 
 	if (gsf_input_seek (input, 0, G_SEEK_SET))
 		return NULL;
@@ -122,28 +120,6 @@ stf_open_and_read (GOIOContext *context, GsfInput *input, size_t *readsize)
 		g_free (result);
 		result = NULL;
 	}
-
-	cpointer = (char *)result;
-	while (*cpointer != 0)
-		cpointer++;
-	while (cpointer != ((char *)result + *readsize)) {
-		null_chars++;
-		*cpointer = ' ';
-		while (*cpointer != 0)
-			cpointer++;
-	}
-	if (null_chars > 0) {
-		gchar const *format;
-		gchar *msg;
-		format = ngettext ("The file contains %d NULL character. "
-				   "It has been changed to a space.",
-				   "The file contains %d NULL characters. "
-				   "They have been changed to spaces.",
-				   null_chars);
-		msg = g_strdup_printf (format, null_chars);
-		stf_warning (context, msg);
-		g_free (msg);
-	}
 	return result;
 }
 
@@ -238,7 +214,7 @@ resize_columns (Sheet *sheet)
  * Main routine, handles importing a file including all dialog mumbo-jumbo
  **/
 static void
-stf_read_workbook (GOFileOpener const *fo,  gchar const *enc,
+stf_read_workbook (G_GNUC_UNUSED GOFileOpener const *fo,  gchar const *enc,
 		   GOIOContext *context, gpointer wbv, GsfInput *input)
 {
 	DialogStfResult_t *dialogresult = NULL;
@@ -397,6 +373,36 @@ stf_text_to_columns (WorkbookControl *wbc, GOCmdContext *cc)
 	g_object_unref (G_OBJECT (buf));
 }
 
+static void
+clear_stray_NULs (GOIOContext *context, GString *utf8data)
+{
+	char *cpointer, *endpointer;
+	int null_chars = 0;
+
+	cpointer = utf8data->str;
+	endpointer = utf8data->str + utf8data->len;
+	while (*cpointer != 0)
+		cpointer++;
+	while (cpointer != endpointer) {
+		null_chars++;
+		*cpointer = ' ';
+		while (*cpointer != 0)
+			cpointer++;
+	}
+	if (null_chars > 0) {
+		gchar const *format;
+		gchar *msg;
+		format = ngettext ("The file contains %d NULL character. "
+				   "It has been changed to a space.",
+				   "The file contains %d NULL characters. "
+				   "They have been changed to spaces.",
+				   null_chars);
+		msg = g_strdup_printf (format, null_chars);
+		stf_warning (context, msg);
+		g_free (msg);
+	}
+}
+
 /**
  * stf_read_workbook_auto_csvtab
  * @fo       : file opener
@@ -408,14 +414,15 @@ stf_text_to_columns (WorkbookControl *wbc, GOCmdContext *cc)
  * Attempt to auto-detect CSV or tab-delimited file
  **/
 static void
-stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
+stf_read_workbook_auto_csvtab (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
 			       GOIOContext *context,
 			       gpointer wbv, GsfInput *input)
 {
 	Sheet *sheet, *old_sheet;
 	Workbook *book;
 	char *name;
-	char *data, *utf8data;
+	char *data;
+	GString *utf8data;
 	size_t data_len;
 	StfParseOptions_t *po;
 	const char *gsfname;
@@ -433,7 +440,7 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
 	if (!data)
 		return;
 
-	enc = go_guess_encoding (data, data_len, enc, &utf8data);
+	enc = go_guess_encoding (data, data_len, enc, &utf8data, NULL);
 	g_free (data);
 
 	if (!enc) {
@@ -442,6 +449,8 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
 		return;
 	}
 
+	clear_stray_NULs (context, utf8data);
+
 	/*
 	 * Try to get the filename we're reading from.  This is not a
 	 * great way.
@@ -452,14 +461,14 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
 		const char *ext = gsf_extension_pointer (gsfname);
 		gboolean iscsv = ext && strcasecmp (ext, "csv") == 0;
 		if (iscsv)
-			po = stf_parse_options_guess_csv (utf8data);
+			po = stf_parse_options_guess_csv (utf8data->str);
 		else
-			po = stf_parse_options_guess (utf8data);
+			po = stf_parse_options_guess (utf8data->str);
 	}
 
 	lines_chunk = g_string_chunk_new (100 * 1024);
 	lines = stf_parse_general (po, lines_chunk,
-				   utf8data, utf8data + strlen (utf8data));
+				   utf8data->str, utf8data->str + utf8data->len);
 	rows = lines->len;
 	cols = 0;
 	for (i = 0; i < rows; i++) {
@@ -475,7 +484,7 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
 	g_free (name);
 	workbook_sheet_attach (book, sheet);
 
-	if (stf_parse_sheet (po, utf8data, NULL, sheet, 0, 0)) {
+	if (stf_parse_sheet (po, utf8data->str, NULL, sheet, 0, 0)) {
 		workbook_recalc_all (book);
 		resize_columns (sheet);
 		if (po->cols_exceeded || po->rows_exceeded) {
@@ -496,13 +505,13 @@ stf_read_workbook_auto_csvtab (GOFileOpener const *fo, gchar const *enc,
 
 
 	stf_parse_options_free (po);
-	g_free (utf8data);
+	g_string_free (utf8data, TRUE);
 }
 
 /***********************************************************************************/
 
 static void
-stf_write_csv (GOFileSaver const *fs, GOIOContext *context,
+stf_write_csv (G_GNUC_UNUSED GOFileSaver const *fs, GOIOContext *context,
 	       gconstpointer wbv, GsfOutput *output)
 {
 	Sheet *sheet;
@@ -541,9 +550,8 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
 		guint8 const *header;
 		gsf_off_t i;
 		char const *enc = NULL;
-		char *header_utf8;
+		GString *header_utf8;
 		char const *p;
-		int try;
 		gboolean ok = TRUE;
 
 		if (gsf_input_seek (input, 0, G_SEEK_SET))
@@ -559,18 +567,11 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
 		if (NULL == (header = gsf_input_read (input, i, NULL)))
 			return FALSE;
 
-		/*
-		 * It is conceivable that encoding guessing could fail
-		 * if our truncated buffer had partial characters.  We
-		 * really need go_guess_encoding_truncated, but for now
-		 * let's just try cutting a byte away at a time.
-		 */
-		for (try = 0; !enc && try < MIN (i, 6); try++)
-			enc = go_guess_encoding (header, i - try, NULL, &header_utf8);
+		enc = go_guess_encoding (header, i, NULL, &header_utf8, NULL);
 		if (!enc)
 			return FALSE;
 
-		for (p = header_utf8; *p; p = g_utf8_next_char (p)) {
+		for (p = header_utf8->str; *p; p = g_utf8_next_char (p)) {
 			gunichar uc = g_utf8_get_char (p);
 			/* isprint might not be true for these: */
 			if (uc == '\n' || uc == '\t' || uc == '\r')
@@ -580,7 +581,7 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
 			 * http://en.wikipedia.org/wiki/Byte_Order_Mark for
 			 * background.
 			 */
-			if (p == header_utf8 && uc == 0x0000FEFF) {
+			if (p == header_utf8->str && uc == 0x0000FEFF) {
 				continue;
 			}
 			if (!g_unichar_isprint (uc)) {
@@ -589,7 +590,7 @@ csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
 			}
 		}
 
-		g_free (header_utf8);
+		g_string_free (header_utf8, TRUE);
 		return ok;
 	} else {
 		char const *name = gsf_input_name (input);
diff --git a/src/xml-sax-read.c b/src/xml-sax-read.c
index a609eff..1249ee9 100644
--- a/src/xml-sax-read.c
+++ b/src/xml-sax-read.c
@@ -462,7 +462,7 @@ xml_sax_wb (GsfXMLIn *xin, xmlChar const **attrs)
 				{ "http://www.gnome.org/gnumeric/v3", GNM_XML_V3 },
 				{ "http://www.gnome.org/gnumeric/v2", GNM_XML_V2 },
 				{ "http://www.gnome.org/gnumeric/", GNM_XML_V1 },
-				{ NULL }
+				{ NULL, 0}
 			};
 			int i;
 			for (i = 0 ; GnumericVersions [i].id != NULL ; ++i )
@@ -483,7 +483,7 @@ xml_sax_wb (GsfXMLIn *xin, xmlChar const **attrs)
 }
 
 static void
-xml_sax_document_meta (GsfXMLIn *xin, xmlChar const **attrs)
+xml_sax_document_meta (GsfXMLIn *xin, G_GNUC_UNUSED xmlChar const **attrs)
 {
 	XMLSaxParseState *state = (XMLSaxParseState *)xin->user_state;
 
@@ -934,7 +934,7 @@ xml_sax_page_break (GsfXMLIn *xin, xmlChar const **attrs)
 }
 
 static void
-xml_sax_page_breaks_begin (GsfXMLIn *xin, xmlChar const **attrs)
+xml_sax_page_breaks_begin (GsfXMLIn *xin, G_GNUC_UNUSED xmlChar const **attrs)
 {
 	XMLSaxParseState *state = (XMLSaxParseState *)xin->user_state;
 	xml_sax_must_have_sheet (state);
@@ -3307,7 +3307,7 @@ maybe_convert (GsfInput *input, gboolean quiet)
 	gsf_off_t input_size;
 	GString the_buffer, *buffer = &the_buffer;
 	guint ui;
-	char *converted;
+	GString *converted = NULL;
 	char const *encoding;
 	gboolean ok;
 	gboolean any_numbered = FALSE;
@@ -3358,19 +3358,21 @@ maybe_convert (GsfInput *input, gboolean quiet)
 		}
 	}
 
-	encoding = go_guess_encoding (buffer->str, buffer->len, NULL, &converted);
+	encoding = go_guess_encoding (buffer->str, buffer->len, NULL, &converted, NULL);
 	if (encoding && !any_numbered &&
-	    converted && strcmp (buffer->str, converted) == 0)
+	    converted && buffer->len == converted->len &&
+	    strcmp (buffer->str, converted->str) == 0)
 		quiet = TRUE;
 
 	g_free (buffer->str);
 
 	if (encoding) {
+		gsize len = converted->len;
 		g_object_unref (input);
 		if (!quiet)
 			g_warning ("Converted xml document with no explicit encoding from transliterated %s to UTF-8.",
 				   encoding);
-		return gsf_input_memory_new ((void *)converted, strlen (converted), TRUE);
+		return gsf_input_memory_new ((void *)g_string_free (converted, FALSE), len, TRUE);
 	} else {
 		if (!quiet)
 			g_warning ("Failed to convert xml document with no explicit encoding to UTF-8.");
@@ -3379,7 +3381,7 @@ maybe_convert (GsfInput *input, gboolean quiet)
 }
 
 static void
-gnm_xml_file_open (GOFileOpener const *fo, GOIOContext *io_context,
+gnm_xml_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
 		   gpointer wb_view, GsfInput *input)
 {
 	XMLSaxParseState state;
@@ -3494,7 +3496,7 @@ gnm_xml_probe_element (const xmlChar *name,
 }
 
 static gboolean
-xml_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
+xml_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
 {
 	if (pl == GO_FILE_PROBE_FILE_NAME) {
 		char const *name = gsf_input_name (input);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]