[glom] Import: Ignore newlines in quotes.
- From: Murray Cumming <murrayc src gnome org>
- To: svn-commits-list gnome org
- Subject: [glom] Import: Ignore newlines in quotes.
- Date: Fri, 10 Jul 2009 10:21:45 +0000 (UTC)
commit d2c6d449d83ad4ced51fed4ed00a33e2d0be02ec
Author: Murray Cumming <murrayc murrayc com>
Date: Fri Jul 10 12:11:02 2009 +0200
Import: Ignore newlines in quotes.
* glom/dialog_import_csv.cc: on_idle_parse(): Ignore newlines in quotes when
dividing the input into "lines" to give to handle_line().
advance_field(): Do not skip the character after "".
ChangeLog | 5 +++
glom/dialog_import_csv.cc | 89 ++++++++++++++++++++++++++++++++++-----------
glom/dialog_import_csv.h | 4 ++
3 files changed, 77 insertions(+), 21 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index f374fe4..5c4e9e2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2009-07-10 Murray Cumming <murrayc murrayc com>
+ * glom/dialog_import_csv.cc: on_idle_parse(): Ignore newlines in quotes when
+ dividing the input into "lines" to give to handle_line().
+
+2009-07-10 Murray Cumming <murrayc murrayc com>
+
Import: Handle "" as ". Fix a crash.
* glom/dialog_import_csv.[h|cc]: Use typedefs to simplify code.
diff --git a/glom/dialog_import_csv.cc b/glom/dialog_import_csv.cc
index eef6893..1554f2e 100644
--- a/glom/dialog_import_csv.cc
+++ b/glom/dialog_import_csv.cc
@@ -37,13 +37,12 @@ namespace
{
const gunichar DELIMITER = ',';
+const gunichar QUOTE = '\"';
static bool next_char_is_quote(const Glib::ustring::const_iterator& iter, const Glib::ustring::const_iterator& end)
{
if(iter == end)
return false;
-
- const gunichar quote_char = (gunichar)'\"';
//Look at the next character to see if it's really "" (an escaped "):
Glib::ustring::const_iterator iter_next = iter;
@@ -51,7 +50,7 @@ static bool next_char_is_quote(const Glib::ustring::const_iterator& iter, const
if(iter_next != end)
{
const gunichar c_next = *iter_next;
- if(c_next == quote_char)
+ if(c_next == QUOTE)
{
return true;
}
@@ -63,7 +62,6 @@ static bool next_char_is_quote(const Glib::ustring::const_iterator& iter, const
//Parse the field in a comma-separated line, returning the field including the quotes:
static Glib::ustring::const_iterator advance_field(const Glib::ustring::const_iterator& iter, const Glib::ustring::const_iterator& end, Glib::ustring& field)
{
- const gunichar quote_char = (gunichar)'\"';
bool inside_quotes = false;
//bool string_finished = false; //Ignore anything after "something", such as "something"else,
@@ -80,7 +78,7 @@ static Glib::ustring::const_iterator advance_field(const Glib::ustring::const_it
if(inside_quotes)
{
// End of quoted string?
- if(c == quote_char)
+ if(c == QUOTE)
{
if(next_char_is_quote(walk, end))
{
@@ -102,7 +100,7 @@ static Glib::ustring::const_iterator advance_field(const Glib::ustring::const_it
else
{
// Start of quoted string:
- if((c == quote_char))
+ if((c == QUOTE))
{
inside_quotes = true;
continue;
@@ -657,7 +655,7 @@ void Dialog_Import_CSV::encoding_error()
bool Dialog_Import_CSV::on_idle_parse()
{
- // The amount of bytes to process in one pass of the idle handler
+ // The amount of bytes to process in one pass of the idle handler:
static const guint CONVERT_BUFFER_SIZE = 1024;
const char* inbuffer = &m_raw[m_parser->input_position];
@@ -701,25 +699,67 @@ bool Dialog_Import_CSV::on_idle_parse()
m_parser->input_position += (inbuf - inbuffer);
// We now have outbuf - outbuffer bytes of valid UTF-8 in outbuffer.
- const char* prev = outbuffer;
- const char* pos;
- const char to_find[] = { '\r', '\n', '\0' };
-
- while( (pos = std::find_first_of<const char*>(prev, outbuf, to_find, to_find + sizeof(to_find))) != outbuf)
+ const char* prev_line_end = outbuffer;
+ const char* prev = prev_line_end;
+
+ //Identify the record rows in the .csv file.
+ //We can't just search for newlines because they may be inside quotes too.
+ //TODO: Use a regex instead, to more easily handle quotes?
+ bool in_quotes = false;
+ while(true)
{
- if(*pos == '\0')
+ //Note that, unlike std::string::find*, std::find* returns an iterator (char*), not a position.
+ //It returns outbuf if none is found.
+ const char newline_to_find[] = { '\r', '\n', '\0' };
+ const char* pos_newline = std::find_first_of<const char*>(prev, outbuf, newline_to_find, newline_to_find + sizeof(newline_to_find));
+
+ const char quote_to_find[] = {(char)QUOTE};
+ const char* pos_quote = std::find_first_of<const char*>(prev, outbuf, quote_to_find, quote_to_find + sizeof(quote_to_find));
+
+ //Examine the first character (quote or newline) that was found:
+ const char* pos = pos_newline;
+ if((pos_quote != outbuf) && pos_quote < pos)
+ pos = pos_quote;
+
+ if(pos == outbuf)
+ break;
+
+ char ch = *pos;
+
+ if(ch == '\0')
{
- // There is a nullbyte in the conversion. As normal text files don't
- // contain nullbytes, this only occurs when converting for example a UTF-16
- // file from ISO-8859-1 to UTF-8 (note that the UTF-16 file is valid ISO-8859-1,
+ // There is a null byte in the conversion. Because normal text files don't
+ // contain null bytes this only occurs when converting, for example, a UTF-16
+ // file from ISO-8859-1 to UTF-8 (note that the UTF-16 file is valid ISO-8859-1 -
// it just contains lots of nullbytes). We therefore produce an error here.
encoding_error();
return false;
}
+ else if(in_quotes)
+ {
+ //Ignore newlines inside quotes.
+
+ //End quote:
+ if(ch == (char)QUOTE)
+ in_quotes = false;
+
+ prev = pos + 1;
+ continue;
+ }
else
{
- m_parser->current_line.append(prev, pos - prev);
- ++ m_parser->line_number;
+ //Start quote:
+ if(ch == (char)QUOTE)
+ {
+ in_quotes = true;
+ prev = pos + 1;
+ continue;
+ }
+
+ //Found a newline (outside of quotes) that marks the end of the line:
+ m_parser->current_line.append(prev_line_end, pos - prev_line_end);
+ ++(m_parser->line_number);
+
if(!m_parser->current_line.empty())
handle_line(m_parser->current_line, m_parser->line_number);
@@ -729,7 +769,13 @@ bool Dialog_Import_CSV::on_idle_parse()
prev = pos + 1;
// Skip DOS-style linebreak (\r\n)
- if(*pos == '\r' && prev != outbuf && *prev == '\n') ++ prev;
+ if(ch == '\r'
+ && prev != outbuf && *prev == '\n')
+ {
+ ++prev;
+ }
+
+ prev_line_end = prev;
}
}
@@ -737,7 +783,8 @@ bool Dialog_Import_CSV::on_idle_parse()
m_parser->current_line.append(prev, outbuf - prev);
if(!m_stream && m_raw.size() == m_parser->input_position)
{
- ++ m_parser->line_number;
+ ++(m_parser->line_number);
+
// Handle last line, if nonempty
if(m_parser->current_line.empty())
handle_line(m_parser->current_line, m_parser->line_number);
@@ -872,7 +919,7 @@ void Dialog_Import_CSV::field_data_func(Gtk::CellRenderer* renderer, const Gtk::
{
sharedptr<Field> field = m_fields[column_number];
- if(row < m_rows.size())
+ if(row != -1 && (unsigned int)row < m_rows.size())
{
const type_row_strings& row_strings = m_rows[row];
if(column_number < row_strings.size())
diff --git a/glom/dialog_import_csv.h b/glom/dialog_import_csv.h
index 4e99efb..90dbcfc 100644
--- a/glom/dialog_import_csv.h
+++ b/glom/dialog_import_csv.h
@@ -88,6 +88,10 @@ private:
void encoding_error();
bool on_idle_parse();
+
+ /** Parse a row from a .csv file. Note that this "line" might have newline
+ * characters inside one field value, inside quotes.
+ */
void handle_line(const Glib::ustring& line, unsigned int line_number);
void line_data_func(Gtk::CellRenderer* renderer, const Gtk::TreeModel::iterator& iter);
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]