[glom/import_csv_refactored] CsvParser: Make this an actual parser, with a sane API.



commit 851201d8caf0a78f6b13a016bf554c57b6adcc1e
Author: Murray Cumming <murrayc murrayc com>
Date:   Fri Sep 18 14:35:24 2009 +0200

    CsvParser: Make this an actual parser, with a sane API.
    
    * glom/dialog_import_csv.[h|cc]:
    * glom/dialog_import_csv_progress.[h|cc]: Move into glom/import_csv/
    * glom/import_csv/: Added file_encodings.[h|cc], moving the encodings
    list into it. This is similar to mode_design/iso_codes.[h|cc].
    * glom/import_csv.[h|cc]: Rename to glom/import_csv/csv_parser.[h|cc]
    to match the class name.
    Move actual parsing (and the vector of row/column data) into
    CsvParser, leaving the dialogs to just respond to signals and get
    data via get methods.
    
    However, this breaks the import tests, because they add data to
    the parser directly using m_raw. They must be changed to read actual
    files, or maybe we could add some intermediate level of API that they
    could test.

 ChangeLog                                          |   19 +
 Makefile_glom.am                                   |   14 +-
 Makefile_tests.am                                  |    8 +-
 glom/frame_glom.cc                                 |    4 +-
 glom/import_csv.cc                                 |  324 -----------
 glom/import_csv.h                                  |  131 -----
 glom/import_csv/csv_parser.cc                      |  594 ++++++++++++++++++++
 glom/import_csv/csv_parser.h                       |  194 +++++++
 glom/{ => import_csv}/dialog_import_csv.cc         |  367 +++----------
 glom/{ => import_csv}/dialog_import_csv.h          |   52 +-
 .../{ => import_csv}/dialog_import_csv_progress.cc |    0
 glom/{ => import_csv}/dialog_import_csv_progress.h |    0
 glom/import_csv/file_encodings.cc                  |  148 +++++
 glom/import_csv/file_encodings.h                   |   62 ++
 glom/mode_design/iso_codes.h                       |    3 +-
 po/POTFILES.in                                     |    5 +-
 tests/import/test_parsing.cc                       |    4 +-
 tests/import/test_signals.cc                       |    4 +-
 18 files changed, 1139 insertions(+), 794 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 9d15e07..44394d1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2009-09-18  Murray Cumming  <murrayc murrayc com>
+
+	CsvParser: Make this an actual parser, with a sane API.
+
+	* glom/dialog_import_csv.[h|cc]:
+	* glom/dialog_import_csv_progress.[h|cc]: Move into glom/import_csv/
+	* glom/import_csv/: Added file_encodings.[h|cc], moving the encodings 
+	list into it. This is similar to mode_design/iso_codes.[h|cc].
+	* glom/import_csv.[h|cc]: Rename to glom/import_csv/csv_parser.[h|cc] 
+	to match the class name.
+	Move actual parsing (and the vector of row/column data) into 
+	CsvParser, leaving the dialogs to just respond to signals and get 
+	data via get methods.
+
+	However, this breaks the import tests, because they add data to 
+	the parser directly using m_raw. They must be changed to read actual 
+	files, or maybe we could add some intermediate level of API that they 
+	could test.
+
 2009-09-18  Murray Cumming  <murrayc murrayc-desktop>
 
 	CsvParser: Made some API private.
diff --git a/Makefile_glom.am b/Makefile_glom.am
index 5057657..01c03f6 100644
--- a/Makefile_glom.am
+++ b/Makefile_glom.am
@@ -36,10 +36,6 @@ glom_glom_SOURCES =							\
 	glom/dialog_existing_or_new.h					\
 	glom/dialog_glom.cc						\
 	glom/dialog_glom.h						\
-	glom/dialog_import_csv.cc					\
-	glom/dialog_import_csv.h					\
-	glom/dialog_import_csv_progress.cc				\
-	glom/dialog_import_csv_progress.h				\
 	glom/dialog_invalid_data.cc					\
 	glom/dialog_invalid_data.h					\
 	glom/dialog_progress_creating.cc				\
@@ -54,8 +50,6 @@ glom_glom_SOURCES =							\
 	glom/glom_postgres.h						\
 	glom/glom_privs.cc						\
 	glom/glom_privs.h						\
-	glom/import_csv.cc					\
-	glom/import_csv.h					\
 	glom/main.cc							\
 	glom/notebook_glom.cc						\
 	glom/notebook_glom.h						\
@@ -78,6 +72,14 @@ glom_glom_SOURCES =							\
 	glom/bakery/busy_cursor.h					\
 	glom/bakery/dialog_offersave.cc					\
 	glom/bakery/dialog_offersave.h					\
+	glom/import_csv/dialog_import_csv.cc				\
+	glom/import_csv/dialog_import_csv.h				\
+	glom/import_csv/dialog_import_csv_progress.cc			\
+	glom/import_csv/dialog_import_csv_progress.h			\
+	glom/import_csv/file_encodings.cc				\
+	glom/import_csv/file_encodings.h				\
+	glom/import_csv/csv_parser.cc					\
+	glom/import_csv/csv_parser.h					\
 	glom/mode_data/box_data.cc					\
 	glom/mode_data/box_data.h					\
 	glom/mode_data/box_data_calendar_related.cc			\
diff --git a/Makefile_tests.am b/Makefile_tests.am
index d8dfa7d..044443d 100644
--- a/Makefile_tests.am
+++ b/Makefile_tests.am
@@ -48,12 +48,12 @@ tests_test_parsing_time_SOURCES = tests/test_parsing_time.cc
 tests_test_signal_reemit_SOURCES = tests/test_signal_reemit.cc
 tests_test_load_python_library_SOURCES = tests/test_load_python_library.cc
 tests_import_test_parsing_SOURCES =	\
-	glom/import_csv.cc	\
-	glom/import_csv.h	\
+	glom/import_csv/csv_parser.cc	\
+	glom/import_csv/csv_parser.h	\
 	tests/import/test_parsing.cc
 tests_import_test_signals_SOURCES =	\
-	glom/import_csv.cc	\
-	glom/import_csv.h	\
+	glom/import_csv/csv_parser.cc	\
+	glom/import_csv/csv_parser.h	\
 	tests/import/test_signals.cc
 
 glom_libglom_test_connectionpool_LDADD = $(tests_ldadd)
diff --git a/glom/frame_glom.cc b/glom/frame_glom.cc
index 74e0bd3..e70e04e 100644
--- a/glom/frame_glom.cc
+++ b/glom/frame_glom.cc
@@ -23,8 +23,8 @@
 
 #include <glom/frame_glom.h>
 #include <glom/application.h>
-#include <glom/dialog_import_csv.h>
-#include <glom/dialog_import_csv_progress.h>
+#include <glom/import_csv/dialog_import_csv.h>
+#include <glom/import_csv/dialog_import_csv_progress.h>
 #include <libglom/appstate.h>
 
 #include <libglom/connectionpool.h>
diff --git a/glom/import_csv/csv_parser.cc b/glom/import_csv/csv_parser.cc
new file mode 100644
index 0000000..19889f9
--- /dev/null
+++ b/glom/import_csv/csv_parser.cc
@@ -0,0 +1,594 @@
+/* Glom
+ *
+ * Copyright (C) 2001-2004 Murray Cumming
+ * Copyright (C) 2009 Openismus GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "csv_parser.h"
+
+#include <cerrno>
+
+// On Windows, "iconv" seems to be a define for "libiconv", breaking the Glib::IConv::iconv() call.
+#ifdef iconv
+#undef iconv
+#endif
+
+namespace Glom
+{
+
+bool CsvParser::next_char_is_quote(const Glib::ustring::const_iterator& iter, const Glib::ustring::const_iterator& end)
+{
+  if(iter == end)
+    return false;
+
+  // Look at the next character to see if it's really "" (an escaped "):
+  Glib::ustring::const_iterator iter_next = iter;
+  ++iter_next;
+  if(iter_next != end)
+  {
+    const gunichar c_next = *iter_next;
+    if(c_next == CsvParser::QUOTE)
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+CsvParser::CsvParser(const std::string& encoding_charset)
+: m_raw(0),
+  m_encoding(encoding_charset),
+  m_input_position(0),
+  m_idle_connection(),
+  m_line_number(0),
+  m_state(STATE_NONE),
+  m_stream(),
+  m_rows()
+{
+}
+
+void CsvParser::set_file_and_start_parsing(const std::string& file_uri)
+{
+  m_file = Gio::File::create_for_uri(file_uri);
+  m_file->read_async(sigc::mem_fun(*this, &CsvParser::on_file_read));
+  set_state(CsvParser::STATE_PARSING);
+
+  // Query the display name of the file to set in the title:
+  m_file->query_info_async(sigc::mem_fun(*this, &CsvParser::on_file_query_info), G_FILE_ATTRIBUTE_STANDARD_DISPLAY_NAME);
+}
+
+ 
+CsvParser::~CsvParser()
+{
+  m_idle_connection.disconnect();
+}
+
+CsvParser::State CsvParser::get_state() const
+{
+  return m_state;
+}
+
+guint CsvParser::get_rows_count() const
+{
+  return m_rows.size();
+}
+
+bool CsvParser::get_rows_empty() const
+{
+  return m_rows.empty();
+}
+
+
+/// Get the number of columns of data in this row.
+guint CsvParser::get_cols_count(guint row_number) const
+{
+  if(row_number >= m_rows.size())
+    return 0;
+
+  return m_rows[row_number].size();
+}
+
+const Glib::ustring& CsvParser::get_data(guint row, guint col)
+{
+  static Glib::ustring empty_result;
+
+  if(row >= m_rows.size())
+  {
+    std::cerr << "CsvParser::get_data(): row out of range." << std::endl;
+    return empty_result;
+  }
+
+  const type_row_strings& row_data = m_rows[row];
+  if(col >= row_data.size())
+  {
+    std::cerr << "CsvParser::get_data(): col out of range." << std::endl;
+    return empty_result;
+  }
+
+  return row_data[col];
+}
+
+
+CsvParser::type_signal_file_read_error CsvParser::signal_file_read_error() const
+{
+  return m_signal_file_read_error;
+}
+
+CsvParser::type_signal_have_display_name CsvParser::signal_have_display_name() const
+{
+  return m_signal_have_display_name;
+}
+
+CsvParser::type_signal_encoding_error CsvParser::signal_encoding_error() const
+{
+  return m_signal_encoding_error;
+}
+
+CsvParser::type_signal_line_scanned CsvParser::signal_line_scanned() const
+{
+  return m_signal_line_scanned;
+}
+
+CsvParser::type_signal_state_changed CsvParser::signal_state_changed() const
+{
+  return m_signal_state_changed;
+}
+
+void CsvParser::set_encoding(const Glib::ustring& encoding_charset)
+{
+  if(m_encoding == encoding_charset)
+    return;
+
+  m_encoding = encoding_charset;
+  
+  //Stop parsing if the encoding changes.
+  //The caller should restart the parsing when wanted.
+  clear();
+  set_state(STATE_NONE);
+}
+
+// Parse one field of a comma-separated line into field (enclosing quotes removed, "" unescaped to "), returning where parsing stopped (the delimiter or end):
+// (But can it operate on non-UTF, read: binary, data?)
+Glib::ustring::const_iterator CsvParser::advance_field(const Glib::ustring::const_iterator& iter, const Glib::ustring::const_iterator& end, Glib::ustring& field)
+{
+  bool inside_quotes = false;
+  //bool string_finished = false; //Ignore anything after "something", such as "something"else,
+
+  field.clear();
+
+  Glib::ustring::const_iterator walk;
+  for(walk = iter; walk != end; ++walk)
+  {
+    const gunichar c = *walk;
+
+    //if(string_finished)
+    //  continue;
+
+    if(inside_quotes)
+    {
+      // End of quoted string?
+      if(c == CsvParser::QUOTE)
+      {
+        if(CsvParser::next_char_is_quote(walk, end))
+        {
+          // This is "" so it's not an end quote. Just add one quote:
+          field += c;
+          ++walk; //Skip the second ".
+        }
+        else
+        {
+          inside_quotes = false;
+          //string_finished = true; //Ignore anything else before the next comma.
+        }
+
+        continue;
+      }
+    }
+    else
+    {
+      // Start of quoted string:
+      if(c == CsvParser::QUOTE)
+      {
+        inside_quotes = true;
+        continue;
+      }
+      // End of field:
+      else if(c == CsvParser::DELIMITER)
+      {
+        break;
+      }
+
+      // Any other unquoted character belongs to the field, so fall through to append it.
+    }
+
+    field += c; // Just so that we don't need to iterate through the field again, since there is no Glib::ustring::substr(iter, iter)
+  }
+
+  // TODO: Throw error if still inside a quoted string?
+  //std::cout << "debug: field=" << field << std::endl;
+  return walk;
+}
+
+void CsvParser::clear()
+{
+  m_file.reset();
+  m_buffer.reset(0);
+
+  //m_stream.reset();
+  //m_raw.clear();
+  m_rows.clear();
+  // Set to current encoding I guess ...
+  //m_conv("UTF-8", encoding),
+  m_input_position= 0;
+  // Disconnect signal handlers, too? Nah, I don't think so ...
+  //m_idle_connection.disconnect();
+  m_line_number = 0;
+  set_state(STATE_NONE);
+}
+
+bool CsvParser::on_idle_parse()
+{
+  Glib::IConv conv("UTF-8", m_encoding);
+
+  // The amount of bytes to process in one pass of the idle handler:
+  static const guint CONVERT_BUFFER_SIZE = 1024;
+
+  const char* inbuffer = &m_raw[m_input_position];
+  char* inbuf = const_cast<char*>(inbuffer);
+  gsize inbytes = m_raw.size() - m_input_position;
+  char outbuffer[CONVERT_BUFFER_SIZE];
+  char* outbuf = outbuffer;
+  gsize outbytes = CONVERT_BUFFER_SIZE;
+
+  const std::size_t result = conv.iconv(&inbuf, &inbytes, &outbuf, &outbytes);
+  bool more_to_process = (inbytes != 0);
+
+  if(result == static_cast<size_t>(-1))
+  {
+    if(errno == EILSEQ)
+    {
+      // Invalid text in the current encoding.
+      set_state(STATE_ENCODING_ERROR);
+      signal_encoding_error().emit();
+      return false;
+    }
+
+    // If EINVAL is set, this means that an incomplete multibyte sequence was at
+    // the end of the input. We might have some more bytes, but those do not make
+    // up a whole character, so we need to wait for more input.
+    if(errno == EINVAL)
+    {
+      if(!m_stream)
+      {
+        // This means that we already reached the end of the file. The file
+        // should not end with an incomplete multibyte sequence.
+        set_state(STATE_ENCODING_ERROR);
+        signal_encoding_error().emit();
+        return false;
+      }
+      else
+      {
+        more_to_process = false;
+      }
+    }
+  }
+
+  m_input_position += (inbuf - inbuffer);
+
+  // We now have outbuf - outbuffer bytes of valid UTF-8 in outbuffer.
+  const char* prev_line_end = outbuffer;
+  const char* prev = prev_line_end;
+
+  // Identify the record rows in the .csv file.
+  // We can't just search for newlines because they may be inside quotes too. 
+  // TODO: Use a regex instead, to more easily handle quotes?
+  bool in_quotes = false;
+  while(true)
+  {
+    // Note that, unlike std::string::find*, std::find* returns an iterator (char*), not a position.
+    // It returns outbuf if none is found.
+    const char newline_to_find[] = { '\r', '\n', '\0' };
+    const char* pos_newline = std::find_first_of<const char*>(prev, outbuf, newline_to_find, newline_to_find + sizeof(newline_to_find));
+
+    const char quote_to_find[] = {(char)QUOTE};
+    const char* pos_quote = std::find_first_of<const char*>(prev, outbuf, quote_to_find, quote_to_find + sizeof(quote_to_find));
+
+    // Examine the first character (quote or newline) that was found:
+    const char* pos = pos_newline;
+    if((pos_quote != outbuf) && pos_quote < pos)
+      pos = pos_quote;
+
+    if(pos == outbuf)
+      break;
+
+    char ch = *pos;   
+
+    if(ch == '\0')
+    {
+      // There is a null byte in the conversion. Because normal text files don't
+      // contain null bytes this only occurs when converting, for example, a UTF-16
+      // file from ISO-8859-1 to UTF-8 (note that the UTF-16 file is valid ISO-8859-1 - 
+      // it just contains lots of nullbytes). We therefore produce an error here.
+      set_state(STATE_ENCODING_ERROR);
+      signal_encoding_error().emit();
+      return false;
+    }
+    else if(in_quotes)
+    {
+      // Ignore newlines inside quotes.
+
+      // End quote:
+      if(ch == (char)QUOTE)
+        in_quotes = false;
+
+      prev = pos + 1;
+      continue;
+    }
+    else
+    {
+      // Start quote:
+      if(ch == (char)QUOTE)
+      {
+        in_quotes = true;
+        prev = pos + 1;
+        continue;
+      }
+
+      // Found a newline (outside of quotes) that marks the end of the line:
+      m_current_line.append(prev_line_end, pos - prev_line_end);
+      ++m_line_number;
+
+      if(!m_current_line.empty())
+      {
+        do_line_scanned(m_current_line, m_line_number);
+      }
+
+      m_current_line.clear();
+
+      // Skip linebreak
+      prev = pos + 1;
+
+      // Skip DOS-style linebreak (\r\n)
+      if(ch == '\r' 
+         && prev != outbuf && *prev == '\n')
+      {
+         ++prev;
+      }
+
+      prev_line_end = prev;
+    }
+  }
+
+  // Append last chunk of this line
+  m_current_line.append(prev, outbuf - prev);
+  if(!m_stream && m_raw.size() == m_input_position)
+  {
+    ++m_line_number;
+
+    // Handle last line, if nonempty
+    if(!m_current_line.empty())
+    {
+      do_line_scanned(m_current_line, m_line_number);
+    }
+
+    // We have parsed the whole file. We have finished.
+    set_state(STATE_PARSED);
+  }
+
+  // Continue if there are more bytes to process
+  return more_to_process;
+}
+
+void CsvParser::do_line_scanned(const Glib::ustring& line, guint line_number)
+{
+  //std::cout << "debug: on_line_scanned=" << line_number << std::endl;
+  if(line.empty())
+   return;
+
+  m_rows.push_back(CsvParser::type_row_strings());
+  type_row_strings& row = m_rows.back();
+
+  Glib::ustring field;
+  //Gtk::TreeModelColumnRecord record;
+
+  // Parse first field:
+  Glib::ustring::const_iterator line_iter = CsvParser::advance_field(line.begin(), line.end(), field);
+  row.push_back(field);
+
+  // Parse more fields:
+  while(line_iter != line.end())
+  {
+    // Skip delimiter:
+    ++line_iter;
+
+    // Read field:
+    line_iter = advance_field(line_iter, line.end(), field);
+
+    // Add field to current row:
+    row.push_back(field);
+  }
+
+  signal_line_scanned().emit(line, line_number);
+}
+
+void CsvParser::on_file_read(const Glib::RefPtr<Gio::AsyncResult>& result)
+{
+#ifdef GLIBMM_EXCEPTIONS_ENABLED
+  try
+  {
+    m_stream = m_file->read_finish(result);
+
+    m_buffer.reset(new Buffer);
+    m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &CsvParser::on_stream_read));
+  }
+  catch(const Glib::Exception& error)
+  {
+    signal_file_read_error().emit( error.what() );
+    clear();
+    // TODO: Response?
+  }
+#else
+    std::auto_ptr<Glib::Error> error;
+    m_stream = m_file->read_finish(result, error);
+    if (!error.get())
+    {
+      m_buffer.reset(new Buffer);
+      m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &CsvParser::on_stream_read));
+    }
+    else
+    {
+      signal_file_read_error().emit( error->what() );
+      clear();
+    }
+#endif    
+}
+
+
+void CsvParser::on_stream_read(const Glib::RefPtr<Gio::AsyncResult>& result)
+{
+#ifdef GLIBMM_EXCEPTIONS_ENABLED
+  try
+  {
+    const gssize size = m_stream->read_finish(result);
+    m_raw.insert(m_raw.end(), m_buffer->buf, m_buffer->buf + size);
+
+    // If the parser already exists, but it is currently not parsing because it waits
+    // for new input, then continue parsing.
+    // TODO: Introduce CsvParser::is_idle_handler_connected() instead?
+    if(!m_idle_connection.connected())
+    {
+      m_idle_connection = Glib::signal_idle().connect(sigc::mem_fun(*this, &CsvParser::on_idle_parse));
+    }
+    // If the parser does not exist yet, then create a new parser, except when the
+    // current encoding does not work for the file, in which case the user must first
+    // choose another encoding.
+    else if(m_state != CsvParser::STATE_ENCODING_ERROR)
+    {
+      begin_parse();
+    }
+
+    if(size > 0)
+    {
+      // Read the next few bytes
+      m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &CsvParser::on_stream_read));
+    }
+    else
+    {
+      // Finished reading
+      m_buffer.reset(0);
+      m_stream.reset();
+      m_file.reset();
+    }
+  }
+  catch(const Glib::Exception& error)
+  {
+    signal_file_read_error().emit( error.what() );
+    clear();
+    // TODO: Response?
+  }
+#else
+    std::auto_ptr<Glib::Error> error;
+    const gssize size = m_stream->read_finish(result, error);
+    if (!error.get())
+    {
+      m_raw.insert(m_raw.end(), m_buffer->buf, m_buffer->buf + size);
+
+      // If the parser already exists, but it is currently not parsing because it waits
+      // for new input, then continue parsing.
+      if(!m_idle_connection.connected())
+      {
+        m_idle_connection = Glib::signal_idle().connect(sigc::mem_fun(*this, &CsvParser::on_idle_parse));
+      }
+      // If the parser does not exist yet, then create a new parser, except when the
+      // current encoding does not work for the file, in which case the user must first
+      // choose another encoding.
+      else if(m_state != CsvParser::STATE_ENCODING_ERROR)
+      {
+        begin_parse();
+      }
+
+      if(size > 0)
+      {
+        // Read the next few bytes
+        m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &CsvParser::on_stream_read));
+      }
+      else
+      {
+        // Finished reading
+        m_buffer.reset(0);
+        m_stream.reset();
+        m_file.reset();
+      }
+    }
+    if (error.get())
+    {
+      signal_file_read_error().emit( error->what() );
+      clear();
+    }
+#endif
+}
+
+
+void CsvParser::on_file_query_info(const Glib::RefPtr<Gio::AsyncResult>& result)
+{
+#ifdef GLIBMM_EXCEPTIONS_ENABLED
+  try
+  {
+    Glib::RefPtr<Gio::FileInfo> info = m_file->query_info_finish(result);
+    if(info)
+      signal_have_display_name().emit(info->get_display_name());
+  }
+  catch(const Glib::Exception& ex)
+  {
+    std::cerr << "Failed to fetch display name of uri " << m_file->get_uri() << ": " << ex.what() << std::endl;
+  }
+#else
+  std::auto_ptr<Glib::Error> error;
+  Glib::RefPtr<Gio::FileInfo> info = m_file->query_info_finish(result, error);
+  if (!error.get())
+  {
+    if(info)
+      signal_have_display_name().emit(info->get_display_name());
+  }
+  else
+    std::cerr << "Failed to fetch display name of uri " << m_file->get_uri() << ": " << error->what() << std::endl;
+#endif    
+}
+
+
+//TODO This seems to be superfluous - we already connect the idle handler elsewhere.
+void CsvParser::begin_parse()
+{
+  clear();
+
+  set_state(STATE_PARSING);
+  m_idle_connection = Glib::signal_idle().connect(sigc::mem_fun(*this, &CsvParser::on_idle_parse));
+}
+
+void CsvParser::set_state(State state)
+{
+  if(m_state == state)
+    return;
+
+  m_state = state;
+  signal_state_changed().emit();
+}
+
+
+} // namespace Glom
diff --git a/glom/import_csv/csv_parser.h b/glom/import_csv/csv_parser.h
new file mode 100644
index 0000000..fcb0893
--- /dev/null
+++ b/glom/import_csv/csv_parser.h
@@ -0,0 +1,194 @@
+/* Glom
+ *
+ * Copyright (C) 2001-2004 Murray Cumming
+ * Copyright (C) 2009 Openismus GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef GLOM_IMPORT_CSV_H
+#define GLOM_IMPORT_CSV_H
+
+#include "base_db.h"
+
+#include <memory>
+#include <giomm/asyncresult.h>
+#include <giomm/file.h>
+#include <giomm/inputstream.h>
+#include <gtkmm/liststore.h>
+
+namespace Glom
+{
+
+// We use the low-level Glib::IConv routines to progressively convert the
+// input data in an idle handler.
+
+/** Parses .csv (comma-separated values) text files.
+ * See http://en.wikipedia.org/wiki/Comma-separated_values for the file format.
+ *
+ * set_file_and_start_parsing() to start parsing.
+ * The data can then be read via get_data(), with get_rows_count() and get_cols_count().
+ * 
+ * The signals offer feedback while the parsing is happening.
+ */
+class CsvParser
+{
+public:
+
+  //TODO: Avoid having to specify an initial encoding.
+  explicit CsvParser(const std::string& encoding_charset);
+
+  ~CsvParser();
+
+  enum State {
+    STATE_NONE, 
+    STATE_PARSING,  /**< Parsing is in progress. */
+    STATE_ENCODING_ERROR, /**< An error happened while parsing. */
+    STATE_PARSED /**< Finished parsing. */
+  };
+
+  /// Get the current state of the parser.
+  State get_state() const;
+
+  /// Get the number of rows parsed so far.
+  guint get_rows_count() const;
+
+  bool get_rows_empty() const;
+
+  /// Get the number of columns of data in this row.
+  guint get_cols_count(guint row_number) const;
+
+  //The nasty reference return is for performance.
+  const Glib::ustring& get_data(guint row, guint col);
+
+  // Signals:
+  typedef sigc::signal<void, const Glib::ustring&> type_signal_file_read_error;
+
+  /** This signal will be emitted if the parser encounters an error while trying to open the file for reading.
+   */
+  type_signal_file_read_error signal_file_read_error() const;
+
+
+  typedef sigc::signal<void, const Glib::ustring&> type_signal_have_display_name;
+
+  /** This signal will be emitted when the parser has discovered the 
+   * display name for the file. This does not require any parsing of the contents, 
+   * but it is asynchronous, so CsvParser signals this as a convenience.
+   */
+  type_signal_have_display_name signal_have_display_name() const;
+
+
+  typedef sigc::signal<void> type_signal_encoding_error;
+
+  /** This signal will be emitted when the parser encounters an error while parsing.
+   * TODO: How do we discover what the error is?
+   */
+  type_signal_encoding_error signal_encoding_error() const;
+
+
+  typedef sigc::signal<void, std::string, unsigned int> type_signal_line_scanned;
+
+  /** This signal will be emitted each time the parser has scanned a line. TODO: Do we mean row instead of line? - A row can contain a newline.
+   */
+  type_signal_line_scanned signal_line_scanned() const;
+
+
+  typedef sigc::signal<void> type_signal_state_changed;
+
+  /** This signal will be emitted when the state changes.
+   */
+  type_signal_state_changed signal_state_changed() const;
+
+
+  /// Make parser object reusable.
+  void clear();
+
+  /** Change the encoding used when reading the file.
+   * This stops parsing. Call set_file_and_start_parsing() to restart the parser
+   * with the specified encoding. 
+   * See the FileEncoding namespace.
+   */
+  void set_encoding(const Glib::ustring& encoding_charset);
+
+  void set_file_and_start_parsing(const std::string& uri);
+
+private:
+
+  typedef std::vector<Glib::ustring> type_row_strings;
+  typedef std::vector<type_row_strings> type_rows;
+
+
+  // In order to not make the UI feel sluggish during larger imports we parse
+  // one chunk at a time in the idle handler.
+  bool on_idle_parse();
+
+  void begin_parse();
+
+  static const gunichar DELIMITER = ',';
+  static const gunichar QUOTE = '\"';
+
+  static bool next_char_is_quote(const Glib::ustring::const_iterator& iter, const Glib::ustring::const_iterator& end);
+
+  void do_line_scanned(const Glib::ustring& current_line, guint line_number);
+
+  //TODO: Document this:
+  static Glib::ustring::const_iterator advance_field(const Glib::ustring::const_iterator& iter, const Glib::ustring::const_iterator& end, Glib::ustring& field);
+
+  void on_file_read(const Glib::RefPtr<Gio::AsyncResult>& result);
+  void on_stream_read(const Glib::RefPtr<Gio::AsyncResult>& result);
+  void on_file_query_info(const Glib::RefPtr<Gio::AsyncResult>& result);
+
+  void set_state(State state);
+
+  // The raw data in the original encoding. We keep this so we can convert
+  // from the user-selected encoding to UTF-8 every time the user changes
+  // the encoding.
+  std::vector<char> m_raw;
+
+  std::string m_encoding;
+  std::vector<char>::size_type m_input_position;
+  std::string m_current_line;
+
+  sigc::connection m_idle_connection;
+  unsigned int m_line_number;
+
+  State m_state;
+
+  Glib::RefPtr<Gio::FileInputStream> m_stream;
+
+  // Parsed data:
+  type_rows m_rows;
+
+  type_signal_file_read_error m_signal_file_read_error;
+  type_signal_have_display_name m_signal_have_display_name;
+  type_signal_encoding_error m_signal_encoding_error;
+  type_signal_line_scanned m_signal_line_scanned;
+  type_signal_state_changed m_signal_state_changed;
+
+  Glib::RefPtr<Gio::File> m_file;
+
+  struct Buffer
+  {
+    char buf[1024];
+  };
+  std::auto_ptr<Buffer> m_buffer;
+};
+
+} //namespace Glom
+
+#endif //GLOM_IMPORT_CSV_H
+
+
diff --git a/glom/dialog_import_csv.cc b/glom/import_csv/dialog_import_csv.cc
similarity index 68%
rename from glom/dialog_import_csv.cc
rename to glom/import_csv/dialog_import_csv.cc
index fae6d04..e10bac3 100644
--- a/glom/dialog_import_csv.cc
+++ b/glom/import_csv/dialog_import_csv.cc
@@ -19,6 +19,7 @@
  */
 
 #include "dialog_import_csv.h"
+#include <glom/import_csv/file_encodings.h>
 #include <libglom/libglom_config.h>
 
 #include <libglom/data_structure/glomconversions.h>
@@ -32,62 +33,19 @@
 namespace
 {
 
-struct Encoding {
-  const char* name;
-  const char* charset;
-};
-
-//TODO: Can we get this from anywhere else, such as iso-codes? murrayc
-const Encoding ENCODINGS[] = {
-  { N_("Unicode"), "UTF-8" },
-  { N_("Unicode"), "UTF-16" },
-  { N_("Unicode"), "UTF-16BE" },
-  { N_("Unicode"), "UTF-16LE" },
-  { N_("Unicode"), "UTF-32" },
-  { N_("Unicode"), "UTF-7" },
-  { N_("Unicode"), "UCS-2" },
-  { N_("Unicode"), "UCS-4" },
-  { NULL, NULL }, // This just adds a separator in the combo box
-  { N_("Western"), "ISO-8859-1" },
-  { N_("Central European"), "ISO-8859-2" },
-  { N_("South European"), "ISO-8859-3" },
-  { N_("Baltic"), "ISO-8859-4" },
-  { N_("Cyrillic"), "ISO-8859-5" },
-  { N_("Arabic"), "ISO-8859-6" },
-  { N_("Greek"), "ISO-8859-7" },
-  { N_("Hebrew Visual"), "ISO-8859-8" },
-  { N_("Hebrew"), "ISO-8859-8-I" },
-  { N_("Turkish"), "ISO-8859-9" },
-  { N_("Nordic"), "ISO-8859-10" },
-  { N_("Baltic"), "ISO-8859-13" },
-  { N_("Celtic"), "ISO-8859-14" },
-  { N_("Western"), "ISO-8859-15" },
-  { N_("Romanian"), "ISO-8859-16" },
-  { NULL, NULL },
-  { N_("Central European"), "WINDOWS-1250" },
-  { N_("Cyrillic"), "WINDOWS-1251" },
-  { N_("Western"), "WINDOWS-1252" },
-  { N_("Greek"), "WINDOWS-1253" },
-  { N_("Turkish"), "WINDOWS-1254" },
-  { N_("Hebrew"), "WINDOWS-1255" },
-  { N_("Arabic"), "WINDOWS-1256" },
-  { N_("Baltic"), "WINDOWS-1257" },
-  { N_("Vietnamese"), "WINDOWS-1258" }
-};
-
 // When auto-detecting the encoding, we try to read the file in these
 // encodings, in order:
-const Encoding AUTODETECT_ENCODINGS[] = {
-  { N_("Unicode"), "UTF-8" },
-  { N_("Western"), "ISO-8859-1" },
-  { N_("Western"), "ISO-8859-15" },
-  { N_("Unicode"), "UTF-16" },
-  { N_("Unicode"), "UCS-2" },
-  { N_("Unicode"), "UCS-4" }
+const char* AUTODETECT_ENCODINGS_CHARSETS[] = {
+  "UTF-8",
+  "ISO-8859-1",
+  "ISO-8859-15",
+  "UTF-16",
+  "UCS-2",
+  "UCS-4"
 };
 
-const guint N_ENCODINGS = sizeof(ENCODINGS)/sizeof(ENCODINGS[0]);
-const guint N_AUTODETECT_ENCODINGS = sizeof(AUTODETECT_ENCODINGS)/sizeof(AUTODETECT_ENCODINGS[0]);
+const guint N_AUTODETECT_ENCODINGS_CHARSETS = sizeof(AUTODETECT_ENCODINGS_CHARSETS)/sizeof(AUTODETECT_ENCODINGS_CHARSETS[0]);
+
 
 Glib::ustring encoding_display(const Glib::ustring& name, const Glib::ustring& charset)
 {
@@ -119,6 +77,7 @@ Dialog_Import_CSV::Dialog_Import_CSV(BaseObjectType* cobject, const Glib::RefPtr
     throw std::runtime_error("Missing widgets from glade file for Dialog_Import_CSV");
 #endif
 
+  //Fill the list of encodings:
   m_encoding_model = Gtk::ListStore::create(m_encoding_columns);
 
   Gtk::TreeModel::iterator iter = m_encoding_model->append();
@@ -127,14 +86,17 @@ Dialog_Import_CSV::Dialog_Import_CSV(BaseObjectType* cobject, const Glib::RefPtr
   // Separator:
   m_encoding_model->append();
 
-  for(guint i = 0; i < N_ENCODINGS; ++ i)
+  const FileEncodings::type_list_encodings list_encodings =  FileEncodings::get_list_of_encodings();
+  for(FileEncodings::type_list_encodings::const_iterator encodings_iter = list_encodings.begin(); encodings_iter  != list_encodings.end(); encodings_iter ++)
   {
+    const FileEncodings::Encoding encoding = *encodings_iter;
+    if(encoding.get_name().empty())
+      continue;
+
     iter = m_encoding_model->append();
-    if(ENCODINGS[i].name != 0)
-    {
-      (*iter)[m_encoding_columns.m_col_name] = _(ENCODINGS[i].name);
-      (*iter)[m_encoding_columns.m_col_charset] = ENCODINGS[i].charset;
-    }
+    Gtk::TreeModel::Row row = *iter;
+    row[m_encoding_columns.m_col_name] = encoding.get_name();
+    row[m_encoding_columns.m_col_charset] = encoding.get_charset();
   }
 
   Gtk::CellRendererText* renderer = Gtk::manage(new Gtk::CellRendererText);
@@ -144,12 +106,15 @@ Dialog_Import_CSV::Dialog_Import_CSV(BaseObjectType* cobject, const Glib::RefPtr
   m_encoding_combo->set_row_separator_func(sigc::mem_fun(*this, &Dialog_Import_CSV::row_separator_func));
   m_encoding_combo->set_active(0);
 
-  m_encoding_combo->signal_changed().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_encoding_changed));
+  m_encoding_combo->signal_changed().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_combo_encoding_changed));
 
   // TODO: Reset parser encoding on selection changed.
   m_parser = std::auto_ptr<CsvParser>(new CsvParser(get_current_encoding().c_str()));
-  m_parser->signal_encoding_error().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_encoding_error));
-  m_parser->signal_line_scanned().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_line_scanned));
+  m_parser->signal_file_read_error().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_parser_file_read_error));
+  m_parser->signal_have_display_name().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_parser_have_display_name));
+  m_parser->signal_encoding_error().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_parser_encoding_error));
+  m_parser->signal_line_scanned().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_parser_line_scanned));
+  m_parser->signal_state_changed().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_parser_state_changed));
 
   m_first_line_as_title->signal_toggled().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_first_line_as_title_toggled));
   m_sample_rows->signal_changed().connect(sigc::mem_fun(*this, &Dialog_Import_CSV::on_sample_rows_changed));
@@ -240,14 +205,7 @@ void Dialog_Import_CSV::import(const Glib::ustring& uri, const Glib::ustring& in
     m_field_model_sorted = Gtk::TreeModelSort::create(m_field_model);
     m_field_model_sorted->set_sort_column(m_field_columns.m_col_field_name, Gtk::SORT_ASCENDING);
 
-
-    m_file = Gio::File::create_for_uri(uri);
-    m_file->read_async(sigc::mem_fun(*this, &Dialog_Import_CSV::on_file_read));
-
-    // Query the display name of the file to set in the title:
-    m_file->query_info_async(sigc::mem_fun(*this, &Dialog_Import_CSV::on_query_info), G_FILE_ATTRIBUTE_STANDARD_DISPLAY_NAME);
-
-    set_parser_state(CsvParser::STATE_PARSING);
+    m_parser->set_file_and_start_parsing(uri);
   }
 }
 
@@ -276,7 +234,7 @@ const Glib::ustring& Dialog_Import_CSV::get_data(guint row, guint col)
   if(m_first_line_as_title->get_active())
     ++row;
 
-  return m_parser->m_rows[row][col];
+  return m_parser->get_data(row, col);
 }
 
 void Dialog_Import_CSV::clear()
@@ -289,9 +247,7 @@ void Dialog_Import_CSV::clear()
   m_field_model.reset();
   m_field_model_sorted.reset();
   m_fields.clear();
-  m_file.reset();
   m_filename.clear();
-  m_buffer.reset(0);
   m_parser->clear();
   //m_parser.reset(0);
   m_encoding_info->set_text("");
@@ -300,7 +256,6 @@ void Dialog_Import_CSV::clear()
   m_first_line_as_title->set_sensitive(false);
   m_sample_rows->set_sensitive(false);
 
-  set_parser_state(CsvParser::STATE_NONE);
   validate_primary_key();
 }
 
@@ -323,151 +278,10 @@ bool Dialog_Import_CSV::row_separator_func(const Glib::RefPtr<Gtk::TreeModel>& /
   return (*iter)[m_encoding_columns.m_col_name] == "";
 }
 
-void Dialog_Import_CSV::on_query_info(const Glib::RefPtr<Gio::AsyncResult>& result)
-{
-#ifdef GLIBMM_EXCEPTIONS_ENABLED
-  try
-  {
-    Glib::RefPtr<Gio::FileInfo> info = m_file->query_info_finish(result);
-    m_filename = info->get_display_name();
-    set_title(m_filename + _(" - Import From CSV File"));
-  }
-  catch(const Glib::Exception& ex)
-  {
-    std::cerr << "Failed to fetch display name of uri " << m_file->get_uri() << ": " << ex.what() << std::endl;
-  }
-#else
-  std::auto_ptr<Glib::Error> error;
-  Glib::RefPtr<Gio::FileInfo> info = m_file->query_info_finish(result, error);
-  if (!error.get())
-  {
-    m_filename = info->get_display_name();
-    set_title(m_filename + _(" - Import From CSV File"));
-  }
-  else
-    std::cerr << "Failed to fetch display name of uri " << m_file->get_uri() << ": " << error->what() << std::endl;
-#endif    
-}
 
-void Dialog_Import_CSV::on_file_read(const Glib::RefPtr<Gio::AsyncResult>& result)
+void Dialog_Import_CSV::on_combo_encoding_changed()
 {
-#ifdef GLIBMM_EXCEPTIONS_ENABLED
-  try
-  {
-    m_parser->m_stream = m_file->read_finish(result);
-
-    m_buffer.reset(new Buffer);
-    m_parser->m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &Dialog_Import_CSV::on_stream_read));
-  }
-  catch(const Glib::Exception& error)
-  {
-    show_error_dialog(_("Could Not Open file"), Glib::ustring::compose(_("The file at \"%1\" could not be opened: %2"), m_file->get_uri(), error.what()));
-    clear();
-    // TODO: Response?
-  }
-#else
-    std::auto_ptr<Glib::Error> error;
-    m_parser->m_stream = m_file->read_finish(result, error);
-    if (!error.get())
-    {
-      m_buffer.reset(new Buffer);
-      m_parser->m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &Dialog_Import_CSV::on_stream_read));
-    }
-    else
-    {
-      show_error_dialog(_("Could Not Open file"), Glib::ustring::compose(_("The file at \"%1\" could not be opened: %2"), m_file->get_uri(), error->what()));
-      clear();
-    }
-#endif    
-}
-
-void Dialog_Import_CSV::on_stream_read(const Glib::RefPtr<Gio::AsyncResult>& result)
-{
-#ifdef GLIBMM_EXCEPTIONS_ENABLED
-  try
-  {
-    const gssize size = m_parser->m_stream->read_finish(result);
-    m_parser->m_raw.insert(m_parser->m_raw.end(), m_buffer->buf, m_buffer->buf + size);
-
-    // If the parser already exists, but it is currently not parsing because it waits
-    // for new input, then continue parsing.
-    // TODO: Introduce CsvParser::is_idle_handler_connected() instead?
-    if(m_parser.get() && !m_parser->m_idle_connection.connected())
-    {
-      m_parser->m_idle_connection = Glib::signal_idle().connect(sigc::mem_fun(*m_parser.get(), &CsvParser::on_idle_parse));
-    }
-    // If the parser does not exist yet, then create a new parser, except when the
-    // current encoding does not work for the file ,in which case the user must first
-    // choose another encoding.
-    else if(!m_parser.get() && get_parser_state() != CsvParser::STATE_ENCODING_ERROR)
-    {
-      begin_parse();
-    }
-
-    if(size > 0)
-    {
-      // Read the next few bytes
-      m_parser->m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &Dialog_Import_CSV::on_stream_read));
-    }
-    else
-    {
-      // Finished reading
-      m_buffer.reset(0);
-      m_parser->m_stream.reset();
-      m_file.reset();
-    }
-  }
-  catch(const Glib::Exception& error)
-  {
-    show_error_dialog(_("Could Not Read File"), Glib::ustring::compose(_("The file at \"%1\" could not be read: %2"), m_file->get_uri(), error.what()));
-    clear();
-    // TODO: Response?
-  }
-#else
-    std::auto_ptr<Glib::Error> error;
-    const gssize size = m_parser->m_stream->read_finish(result, error);
-    if (!error.get())
-    {
-      m_parser->m_raw.insert(m_parser->m_raw.end(), m_buffer->buf, m_buffer->buf + size);
-
-      // If the parser already exists, but it is currently not parsing because it waits
-      // for new input, then continue parsing.
-      if(m_parser.get() && !m_parser->m_idle_connection.connected())
-      {
-        m_parser->m_idle_connection = Glib::signal_idle().connect(sigc::mem_fun(*m_parser.get(), &CsvParser::on_idle_parse));
-      }
-      // If the parser does not exist yet, then create a new parser, except when the
-      // current encoding does not work for the file ,in which case the user must first
-      // choose another encoding.
-      else if(!m_parser.get() && get_parser_state() != CsvParser::ENCODING_ERROR)
-      {
-        begin_parse();
-      }
-
-      if(size > 0)
-      {
-        // Read the next few bytes
-        m_parser->m_stream->read_async(m_buffer->buf, sizeof(m_buffer->buf), sigc::mem_fun(*this, &Dialog_Import_CSV::on_stream_read));
-      }
-      else
-      {
-        // Finished reading
-        m_buffer.reset(0);
-        m_parser->m_stream.reset();
-        m_file.reset();
-      }
-    }
-    if (error.get())
-    {
-      show_error_dialog(_("Could Not Read File"), Glib::ustring::compose(_("The file at \"%1\" could not be read: %2"), m_file->get_uri(), error->what()));
-      clear();
-    }
-#endif
-}
-
-void Dialog_Import_CSV::on_encoding_changed()
-{
-  int active = m_encoding_combo->get_active_row_number();
+  const int active = m_encoding_combo->get_active_row_number();
   switch(active)
   {
   case -1: // No active item
@@ -482,16 +296,9 @@ void Dialog_Import_CSV::on_encoding_changed()
     break;
   }
 
-  // Reset current parsing process
-  // TODO: Troublesome. Parser now contains a bit more members, not sure we can simply reset it like that.
-  //m_parser.reset(0);
-  //m_parser->clear();
-  //m_parser->set_encoding(get_current_encoding().c_str());
-
-  // Parse from beginning with new encoding if we have already some data to
-  // parse.
-  if(!m_parser->m_raw.empty())
-    begin_parse();
+  // Parse from beginning with new encoding:
+  m_parser->set_encoding(get_current_encoding());
+  m_parser->set_file_and_start_parsing(m_filename);
 }
 
 void Dialog_Import_CSV::on_first_line_as_title_toggled()
@@ -529,7 +336,7 @@ void Dialog_Import_CSV::on_first_line_as_title_toggled()
     Gtk::TreeModel::Path path("1");
     Gtk::TreeModel::iterator iter = m_sample_model->get_iter(path);
 
-    if((!iter || (*iter)[m_sample_columns.m_col_row] != 0) && !m_parser->m_rows.empty() && m_sample_rows->get_value_as_int() > 0)
+    if((!iter || (*iter)[m_sample_columns.m_col_row] != 0) && !m_parser->get_rows_empty() && m_sample_rows->get_value_as_int() > 0)
     {
       // Add first row to model
       if(!iter)
@@ -597,17 +404,20 @@ Glib::ustring Dialog_Import_CSV::get_current_encoding() const
   {
     // Auto-Detect:
     g_assert(m_auto_detect_encoding != -1);
-    return AUTODETECT_ENCODINGS[m_auto_detect_encoding].charset;
+    return AUTODETECT_ENCODINGS_CHARSETS[m_auto_detect_encoding];
   }
 
-  // TODO: change return type?
-  return encoding.c_str();
+  return encoding;
 }
 
 void Dialog_Import_CSV::begin_parse()
 {
   if(m_auto_detect_encoding != -1)
-    m_encoding_info->set_text(Glib::ustring::compose(_("Encoding detected as: %1"), encoding_display(gettext(AUTODETECT_ENCODINGS[m_auto_detect_encoding].name), AUTODETECT_ENCODINGS[m_auto_detect_encoding].charset)));
+  {
+    const char* encoding_charset = AUTODETECT_ENCODINGS_CHARSETS[m_auto_detect_encoding];
+    const Glib::ustring encoding_name = FileEncodings::get_name_of_charset(encoding_charset);
+    m_encoding_info->set_text(Glib::ustring::compose(_("Encoding detected as: %1"), encoding_display(encoding_name, encoding_charset)));
+  }
   else
     m_encoding_info->set_text("");
 
@@ -619,17 +429,14 @@ void Dialog_Import_CSV::begin_parse()
   m_parser->clear();
 
   m_parser->set_encoding(get_current_encoding().c_str());
-  set_parser_state(CsvParser::STATE_PARSING);
 
   // Allow the Import button to be pressed when a field for the primary key
   // field is set. When the import button is pressed without the file being
   // fully loaded, the import progress waits for us to load the rest.
   validate_primary_key();
-
-  m_parser->m_idle_connection = Glib::signal_idle().connect(sigc::mem_fun(*m_parser.get(), &CsvParser::on_idle_parse));
 }
 
-void Dialog_Import_CSV::on_encoding_error()
+void Dialog_Import_CSV::on_parser_encoding_error()
 {
   m_parser->clear();
   // Clear sample preview (TODO: Let it visible, and only remove when reparsing?)
@@ -637,9 +444,6 @@ void Dialog_Import_CSV::on_encoding_error()
   m_sample_view->remove_all_columns();
   m_sample_view->set_model(m_sample_model); // Empty model
 
-  // TODO: move into parser.
-  set_parser_state(CsvParser::STATE_ENCODING_ERROR);
-
   // Don't allow the import button to be pressed when an error occured. This
   // would not make sense since we cleared all the parsed row data anyway.
   validate_primary_key();
@@ -648,7 +452,7 @@ void Dialog_Import_CSV::on_encoding_error()
   if(m_auto_detect_encoding != -1)
   {
     ++ m_auto_detect_encoding;
-    if(static_cast<guint>(m_auto_detect_encoding) < N_AUTODETECT_ENCODINGS)
+    if(static_cast<guint>(m_auto_detect_encoding) < N_AUTODETECT_ENCODINGS_CHARSETS)
       begin_parse();
     else
       m_encoding_info->set_text(_("Encoding detection failed. Please manually choose one from the box."));
@@ -662,39 +466,12 @@ void Dialog_Import_CSV::on_encoding_error()
 /*
  * No, this is wrong. Creating the tree model and handling a line from the CSV file are two separate steps. Proposal: Construct tree model *after* parsing, using row[0].
  */
-void Dialog_Import_CSV::on_line_scanned(const Glib::ustring& line, guint line_number)
+void Dialog_Import_CSV::on_parser_line_scanned(const Glib::ustring& line, guint row_number)
 {
-  std::cout << "debug: on_line_scanned=" << line << std::endl;
-  if(line.empty())
-   return;
-
-  m_parser->m_rows.push_back(CsvParser::type_row_strings());
-  CsvParser::type_row_strings& row = m_parser->m_rows.back();
-
-  Glib::ustring field;
-  //Gtk::TreeModelColumnRecord record;
-
-  // Parse first field:
-  Glib::ustring::const_iterator line_iter = CsvParser::advance_field(line.begin(), line.end(), field);
-  row.push_back(field);
-
-  // Parse more fields:
-  while(line_iter != line.end())
-  {
-    // Skip delimiter:
-    ++line_iter;
-
-    // Read field:
-    line_iter = CsvParser::advance_field(line_iter, line.end(), field);
-
-    // Add field to current row:
-    row.push_back(field);
-  }
-
   // This is the first line read if there is no model yet:
   if(!m_sample_model)
   {
-    setup_sample_model(row);
+    setup_sample_model(row_number);
     Gtk::TreeModel::iterator iter = m_sample_model->append();
     // -1 means the row to select target fields (see special handling in cell data funcs)
     (*iter)[m_sample_columns.m_col_row] = -1;
@@ -707,7 +484,7 @@ void Dialog_Import_CSV::on_line_scanned(const Glib::ustring& line, guint line_nu
 
   // Don't add if this is the first line and m_first_line_as_title is active:
   const guint parser_rows_count = m_parser->get_rows_count();
-  if(line_number > 1 || !m_first_line_as_title->get_active())
+  if(row_number > 1 || !m_first_line_as_title->get_active())
   {
     if(sample_rows < static_cast<guint>(m_sample_rows->get_value_as_int()))
     {
@@ -717,7 +494,7 @@ void Dialog_Import_CSV::on_line_scanned(const Glib::ustring& line, guint line_nu
   }
 }
 
-void Dialog_Import_CSV::setup_sample_model(CsvParser::type_row_strings& row)
+void Dialog_Import_CSV::setup_sample_model(guint data_row_number)
 {
   m_sample_model = Gtk::ListStore::create(m_sample_columns);
   m_sample_view->set_model(m_sample_model);
@@ -725,7 +502,7 @@ void Dialog_Import_CSV::setup_sample_model(CsvParser::type_row_strings& row)
   // Create field vector that contains the fields into which to import
   // the data.
   //m_fields.resize(row.size());
-  m_fields.resize(row.size());
+  m_fields.resize(m_parser->get_rows_count());
 
   // Start with a column showing the line number.
   Gtk::CellRendererText* text = Gtk::manage(new Gtk::CellRendererText);
@@ -734,9 +511,11 @@ void Dialog_Import_CSV::setup_sample_model(CsvParser::type_row_strings& row)
   col->set_cell_data_func(*text, sigc::mem_fun(*this, &Dialog_Import_CSV::line_data_func));
   m_sample_view->append_column(*col);
 
-  for(guint i = 0; i < row.size(); ++ i)
+  const guint cols_count = m_parser->get_cols_count(data_row_number);
+  for(guint i = 0; i < cols_count; ++ i)
   {
-    m_sample_view->append_column(*Gtk::manage(column_factory(row[i], i)));
+    const Glib::ustring& data = m_parser->get_data(data_row_number, i);
+    m_sample_view->append_column(*Gtk::manage(column_factory(data, i)));
   }
 }
 
@@ -814,11 +593,7 @@ void Dialog_Import_CSV::field_data_func(Gtk::CellRenderer* renderer, const Gtk::
 
       if(row != -1 && (unsigned int)row < m_parser->get_rows_count())
       {
-        const CsvParser::type_row_strings& row_strings = m_parser->m_rows[row];
-        if(column_number < row_strings.size())
-        {
-
-          const Glib::ustring& orig_text = row_strings[column_number];
+          const Glib::ustring& orig_text = m_parser->get_data(row, column_number);
 
           if(field)
           {
@@ -855,7 +630,6 @@ void Dialog_Import_CSV::field_data_func(Gtk::CellRenderer* renderer, const Gtk::
           }
 
           editable = false;
-        }
       }
     }
   }
@@ -906,17 +680,6 @@ void Dialog_Import_CSV::on_field_edited(const Glib::ustring& path, const Glib::u
   }
 }
 
-void Dialog_Import_CSV::set_parser_state(CsvParser::State state)
-{
-  // Calling the member of a member, introduced by refactoring. TODO: clean up set_parser_state() interface.
-  if(m_parser->get_state() != state)
-  {
-    m_parser->m_state = state;
-    // Should be emitted by parser?
-    m_signal_state_changed.emit();
-  }
-}
-
 void Dialog_Import_CSV::validate_primary_key()
 {
   if(get_parser_state() == (CsvParser::STATE_NONE | CsvParser::STATE_ENCODING_ERROR))
@@ -937,7 +700,7 @@ void Dialog_Import_CSV::validate_primary_key()
       // and the m_fields array is not up to date. It is set in handle_line()
       // when the first line is parsed.
       primary_key_selected = false;
-      if(!m_parser->m_rows.empty())
+      if(!m_parser->get_rows_empty())
       {
         for(type_vec_fields::iterator iter = m_fields.begin(); iter != m_fields.end(); ++ iter)
         {
@@ -967,5 +730,27 @@ void Dialog_Import_CSV::validate_primary_key()
   }
 }
 
+void Dialog_Import_CSV::on_parser_file_read_error(const Glib::ustring& error_message)
+{
+  const Glib::ustring details = Glib::ustring::compose(_("The file at \"%1\" could not be opened: %2"), m_filename, error_message);
+  show_error_dialog(_("Could Not Open file"), details); //Report the parser's file-read failure to the user.
+}
+
+void Dialog_Import_CSV::on_parser_have_display_name(const Glib::ustring& display_name)
+{
+  //Keep the name too, because the file-read error message uses m_filename:
+  m_filename = display_name;
+  set_title(display_name + _(" - Import From CSV File"));
+}
+
+void Dialog_Import_CSV::on_parser_state_changed()
+{
+  //Re-emit this via our own similarly-named signal, so that the progress dialog can respond:
+  m_signal_state_changed.emit();
+}
+
+Dialog_Import_CSV::type_signal_state_changed Dialog_Import_CSV::signal_state_changed() const // Accessor for the signal that on_parser_state_changed() emits.
+{
+  return m_signal_state_changed; // Returned by value, as the return type says.
+}
+
 } //namespace Glom
 
diff --git a/glom/dialog_import_csv.h b/glom/import_csv/dialog_import_csv.h
similarity index 84%
rename from glom/dialog_import_csv.h
rename to glom/import_csv/dialog_import_csv.h
index 1e9eb94..5a1856a 100644
--- a/glom/dialog_import_csv.h
+++ b/glom/import_csv/dialog_import_csv.h
@@ -21,7 +21,7 @@
 #ifndef GLOM_DIALOG_IMPORT_CSV_H
 #define GLOM_DIALOG_IMPORT_CSV_H
 
-#include "import_csv.h"
+#include <glom/import_csv/csv_parser.h>
 #include <glom/base_db.h>
 
 #include <memory>
@@ -44,13 +44,10 @@ class Dialog_Import_CSV
     public Base_DB
 {
 public:
-  typedef sigc::signal<void> SignalStateChanged;
-
   Dialog_Import_CSV(BaseObjectType* cobject, const Glib::RefPtr<Gtk::Builder>& builder);
 
   void import(const Glib::ustring& uri, const Glib::ustring& into_table);
 
-  //TODO: move into parser?
   CsvParser::State get_parser_state() const;
   Glib::ustring get_target_table_name() const;
   const Glib::ustring& get_filename() const;
@@ -60,39 +57,42 @@ public:
   sharedptr<const Field> get_field_for_column(unsigned int col) const;
   const Glib::ustring& get_data(unsigned int row, unsigned int col);
 
-  SignalStateChanged signal_state_changed() const { return m_signal_state_changed; }
+  typedef sigc::signal<void> type_signal_state_changed;
+
+  /** This signal will be emitted when the parser's state changes.
+   */
+  type_signal_state_changed signal_state_changed() const;
+
 
 private:
   void clear();
   void show_error_dialog(const Glib::ustring& primary, const Glib::ustring& secondary);
 
-  void encoding_data_func(const Gtk::TreeModel::iterator& iter, Gtk::CellRendererText& renderer);
-  bool row_separator_func(const Glib::RefPtr<Gtk::TreeModel>& model, const Gtk::TreeModel::iterator& iter) const;
-
-  void on_query_info(const Glib::RefPtr<Gio::AsyncResult>& result);
-  void on_file_read(const Glib::RefPtr<Gio::AsyncResult>& result);
-  void on_stream_read(const Glib::RefPtr<Gio::AsyncResult>& result);
-
-  void on_encoding_changed();
-  void on_first_line_as_title_toggled();
-  void on_sample_rows_changed();
-
   Glib::ustring get_current_encoding() const;
   void begin_parse();
-  void on_encoding_error();
 
-  bool on_idle_parse();
-
-  void on_line_scanned(const Glib::ustring& line, unsigned int line_number);
-  void setup_sample_model(CsvParser::type_row_strings& row);
+  void setup_sample_model(guint data_row_number);
   Gtk::TreeViewColumn* column_factory(const Glib::ustring& title, guint index);
   Gtk::CellRendererCombo* cell_factory(guint index);
 
+  //CellRenderer cell_data_func callbacks:
   void line_data_func(Gtk::CellRenderer* renderer, const Gtk::TreeModel::iterator& iter);
   void field_data_func(Gtk::CellRenderer* renderer, const Gtk::TreeModel::iterator& iter, unsigned int column_number);
   void on_field_edited(const Glib::ustring& path, const Glib::ustring& new_text, unsigned int column_number);
 
-  void set_parser_state(CsvParser::State state);
+  void encoding_data_func(const Gtk::TreeModel::iterator& iter, Gtk::CellRendererText& renderer);
+  bool row_separator_func(const Glib::RefPtr<Gtk::TreeModel>& model, const Gtk::TreeModel::iterator& iter) const;
+
+  void on_parser_file_read_error(const Glib::ustring& error_message);
+  void on_parser_have_display_name(const Glib::ustring& display_name);
+  void on_parser_encoding_error();
+  void on_parser_line_scanned(const Glib::ustring& line, unsigned int row_number);
+  void on_parser_state_changed();
+
+  void on_combo_encoding_changed();
+  void on_first_line_as_title_toggled();
+  void on_sample_rows_changed();
+
   void validate_primary_key();
 
   class EncodingColumns: public Gtk::TreeModelColumnRecord
@@ -141,14 +141,8 @@ private:
   Gtk::Label* m_advice_label;
   Gtk::Label* m_error_label;
 
-  Glib::RefPtr<Gio::File> m_file;
   Glib::ustring m_filename;
 
-  struct Buffer {
-    char buf[1024];
-  };
-  std::auto_ptr<Buffer> m_buffer;
-
   // Index into the ENCODINGS array (see dialog_import_csv.cc) for the
   // encoding that we currently try to read the data with, or -1 if
   // auto-detection is disabled.
@@ -158,7 +152,7 @@ private:
   typedef std::vector< sharedptr<Field> > type_vec_fields;
   type_vec_fields m_fields;
 
-  SignalStateChanged m_signal_state_changed;
+  type_signal_state_changed m_signal_state_changed;
 };
 
 } //namespace Glom
diff --git a/glom/dialog_import_csv_progress.cc b/glom/import_csv/dialog_import_csv_progress.cc
similarity index 100%
rename from glom/dialog_import_csv_progress.cc
rename to glom/import_csv/dialog_import_csv_progress.cc
diff --git a/glom/dialog_import_csv_progress.h b/glom/import_csv/dialog_import_csv_progress.h
similarity index 100%
rename from glom/dialog_import_csv_progress.h
rename to glom/import_csv/dialog_import_csv_progress.h
diff --git a/glom/import_csv/file_encodings.cc b/glom/import_csv/file_encodings.cc
new file mode 100644
index 0000000..9cc7ef5
--- /dev/null
+++ b/glom/import_csv/file_encodings.cc
@@ -0,0 +1,148 @@
+/* Glom
+ *
+ * Copyright (C) 2009 Openismus GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+//#include "config.h" //For ISO_CODES_PREFIX.
+
+#include <glom/import_csv/file_encodings.h>
+#include <algorithm>
+#include <glibmm/i18n.h>
+
+namespace Glom
+{
+
+namespace FileEncodings
+{
+
+Encoding::Encoding(const char* name, const char* charset) // Either argument may be null; get_name() and get_charset() then return empty strings.
+: m_name(name), m_charset(charset) // Only the pointers are stored; the strings are not copied.
+{
+}
+
+// Returns the charset identifier (such as "UTF-8"), or an empty string
+// if this Encoding was constructed with a null charset.
+Glib::ustring Encoding::get_charset() const
+{
+  const bool has_charset = (m_charset != 0);
+  return has_charset ? Glib::ustring(m_charset) : Glib::ustring();
+}
+
+// Returns the human-readable name (such as "Western"), or an empty string
+// if this Encoding was constructed with a null name.
+Glib::ustring Encoding::get_name() const
+{
+  const bool has_name = (m_name != 0);
+  return has_name ? Glib::ustring(m_name) : Glib::ustring();
+}
+
+/** A predicate for use with std::find_if() to find an Encoding with the charset.
+ * T_Element must provide get_charset(). Algorithms copy the predicate by value,
+ * so this is a plain value type: no virtual destructor is needed.
+ */
+template<class T_Element>
+class predicate_EncodingHasCharset
+{
+public:
+  // @param charset The charset identifier (such as "UTF-8") to look for.
+  explicit predicate_EncodingHasCharset(const Glib::ustring& charset)
+  : m_charset(charset)
+  {
+  }
+
+  // @result true if the element's charset is the one we are looking for.
+  bool operator() (const T_Element& element) const
+  {
+    return (element.get_charset() == m_charset);
+  }
+
+private:
+  Glib::ustring m_charset;
+};
+
+
+static type_list_encodings list_encodings; // Filled on the first call to get_list_of_encodings().
+
+static void add_encoding(const gchar* name, const gchar* encoding) // Appends to list_encodings; "encoding" is passed on as the Encoding's charset.
+{
+  list_encodings.push_back(Encoding(name, encoding));
+}
+
+type_list_encodings get_list_of_encodings() // Returns a copy of the list, which is built on the first call.
+{
+  if(!list_encodings.empty())
+    return list_encodings; // Already built by an earlier call.
+
+  //TODO: Can we get this from anywhere else, such as iso-codes? murrayc
+  //TODO: Make this generally more efficient.
+  add_encoding(_("Unicode"), "UTF-8");
+  add_encoding(_("Unicode"), "UTF-16");
+  add_encoding(_("Unicode"), "UTF-16BE");
+  add_encoding(_("Unicode"), "UTF-16LE");
+  add_encoding(_("Unicode"), "UTF-32");
+  add_encoding(_("Unicode"), "UTF-7");
+  add_encoding(_("Unicode"), "UCS-2");
+  add_encoding(_("Unicode"), "UCS-4");
+  add_encoding(0, 0); // A null entry (empty name and charset), meant as a separator in the combo box
+  add_encoding(_("Western"), "ISO-8859-1");
+  add_encoding(_("Central European"), "ISO-8859-2");
+  add_encoding(_("South European"), "ISO-8859-3");
+  add_encoding(_("Baltic"), "ISO-8859-4");
+  add_encoding(_("Cyrillic"), "ISO-8859-5");
+  add_encoding(_("Arabic"), "ISO-8859-6");
+  add_encoding(_("Greek"), "ISO-8859-7");
+  add_encoding(_("Hebrew Visual"), "ISO-8859-8");
+  add_encoding(_("Hebrew"), "ISO-8859-8-I");
+  add_encoding(_("Turkish"), "ISO-8859-9");
+  add_encoding(_("Nordic"), "ISO-8859-10");
+  add_encoding(_("Baltic"), "ISO-8859-13");
+  add_encoding(_("Celtic"), "ISO-8859-14");
+  add_encoding(_("Western"), "ISO-8859-15");
+  add_encoding(_("Romanian"), "ISO-8859-16");
+  add_encoding(0, 0); // A null entry (empty name and charset), meant as a separator in the combo box
+  add_encoding(_("Central European"), "WINDOWS-1250");
+  add_encoding(_("Cyrillic"), "WINDOWS-1251");
+  add_encoding(_("Western"), "WINDOWS-1252");
+  add_encoding(_("Greek"), "WINDOWS-1253");
+  add_encoding(_("Turkish"), "WINDOWS-1254");
+  add_encoding(_("Hebrew"), "WINDOWS-1255");
+  add_encoding(_("Arabic"), "WINDOWS-1256");
+  add_encoding(_("Baltic"), "WINDOWS-1257");
+  add_encoding(_("Vietnamese"), "WINDOWS-1258");
+
+  return list_encodings;
+}
+
+Glib::ustring get_name_of_charset(const Glib::ustring& charset)
+{
+  //Called only for its side effect of filling list_encodings on first use:
+  get_list_of_encodings();
+
+  //Look for the first entry that has this charset:
+  const predicate_EncodingHasCharset<Encoding> has_charset(charset);
+  const type_list_encodings::const_iterator found =
+    std::find_if(list_encodings.begin(), list_encodings.end(), has_charset);
+  //An unknown charset has no name, so return an empty string for it:
+  if(found == list_encodings.end())
+    return Glib::ustring();
+  return found->get_name();
+}
+
+} //namespace FileEncodings;
+
+} //namespace Glom
diff --git a/glom/import_csv/file_encodings.h b/glom/import_csv/file_encodings.h
new file mode 100644
index 0000000..588e90d
--- /dev/null
+++ b/glom/import_csv/file_encodings.h
@@ -0,0 +1,62 @@
+/* Glom
+ *
+ * Copyright (C) 2009  Openismus GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef GLOM_IMPORT_CSV_FILE_ENCODINGS_H
+#define GLOM_IMPORT_CSV_FILE_ENCODINGS_H
+
+#include <glibmm/ustring.h>
+#include <list>
+
+namespace Glom
+{
+
+namespace FileEncodings
+{
+
+class Encoding // A human-readable name paired with a charset identifier.
+{
+public:
+  Encoding(const char* name, const char* charset); // Either pointer may be null.
+
+  Glib::ustring get_charset() const; // Empty if the charset pointer is null.
+  Glib::ustring get_name() const; // Empty if the name pointer is null.
+
+private:
+  const char* m_name; // Only the pointer is kept; the string is not copied.
+  const char* m_charset; // Only the pointer is kept; the string is not copied.
+};
+
+typedef std::list<Encoding> type_list_encodings;
+
+/** Get a list of file encodings to offer to the user.
+ * Entries whose name is empty are separators between groups of encodings. */
+type_list_encodings get_list_of_encodings();
+
+/** Discover the human-readable name (such as "Western") of a charset
+ * (such as "ISO-8859-1"). The result is empty if the charset is not in the list.
+ */
+Glib::ustring get_name_of_charset(const Glib::ustring& charset);
+
+} //namespace FileEncodings
+
+} //namespace Glom
+
+#endif //GLOM_IMPORT_CSV_FILE_ENCODINGS_H
+
diff --git a/glom/mode_design/iso_codes.h b/glom/mode_design/iso_codes.h
index 5e11371..de12925 100644
--- a/glom/mode_design/iso_codes.h
+++ b/glom/mode_design/iso_codes.h
@@ -51,7 +51,8 @@ typedef std::list<Locale> type_list_locales;
 type_list_locales get_list_of_locales();
 
 Glib::ustring get_locale_name(const Glib::ustring& locale_id);
-}
+
+} //namespace IsoCodes
 
 } //namespace Glom
 
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 4ff5ac3..44fb128 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -8,8 +8,6 @@ glom/box_reports.cc
 glom.desktop.in.in
 glom/dialog_connection.cc
 glom/dialog_existing_or_new.cc
-glom/dialog_import_csv.cc
-glom/dialog_import_csv_progress.cc
 glom/filechooser_export.cc
 glom/frame_glom.cc
 glom/glom_developer.glade
@@ -20,6 +18,9 @@ glom/bakery/app.cc
 glom/bakery/app_withdoc_gtk.cc
 glom/bakery/app_withdoc.cc
 glom/bakery/dialog_offersave.cc
+glom/import_csv/dialog_import_csv.cc
+glom/import_csv/dialog_import_csv_progress.cc
+glom/import_csv/file_encodings.cc
 glom/libglom/connectionpool.cc
 glom/libglom/connectionpool_backends/postgres.cc
 glom/libglom/connectionpool_backends/postgres_central.cc
diff --git a/tests/import/test_parsing.cc b/tests/import/test_parsing.cc
index 19928d9..6874023 100644
--- a/tests/import/test_parsing.cc
+++ b/tests/import/test_parsing.cc
@@ -1,5 +1,5 @@
-#include <glom/import_csv.h>
-#include <glibmm/regex.h>
+#include <glom/import_csv/csv_parser.h>
+//#include <glibmm/regex.h>
 #include <iostream>
 #include <cstdlib>
 
diff --git a/tests/import/test_signals.cc b/tests/import/test_signals.cc
index e0866ae..f4d4677 100644
--- a/tests/import/test_signals.cc
+++ b/tests/import/test_signals.cc
@@ -1,5 +1,5 @@
-#include <glom/import_csv.h>
-#include <glibmm/regex.h>
+#include <glom/import_csv/csv_parser.h>
+//#include <glibmm/regex.h>
 #include <iostream>
 #include <stdexcept>
 #include <cstdlib>



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]