[glabels] Modified CSV parser.



commit bd3208b5cac24a8d6f231a72ed45bb0e73d1bb6e
Author: Jim Evins <evins snaught com>
Date:   Mon Feb 21 22:44:00 2011 -0500

    Modified CSV parser.
    
    Modified CSV parser to more closely follow the format outlined in RFC 4180.
    The primary difference is that quoted text cannot be embedded in the middle
    of a field; i.e., the quotes (") should envelop the entire field.  Legacy
    "escape" rules still apply.

 src/merge-text.c |  333 ++++++++++++++++++++----------------------------------
 1 files changed, 122 insertions(+), 211 deletions(-)
---
diff --git a/src/merge-text.c b/src/merge-text.c
index a702c3d..4c1f712 100644
--- a/src/merge-text.c
+++ b/src/merge-text.c
@@ -470,17 +470,14 @@ gl_merge_text_copy (glMerge       *dst_merge,
 /*                                                                           */
 /* Attempt to be a robust parser of various CSV (and similar) formats.       */
 /*                                                                           */
-/* Split into fields, accounting for:                                        */
-/*   - delimeters may be embedded in quoted text (")                         */
-/*   - delimeters may be "escaped" by a leading backslash (\)                */
-/*   - quotes may be embedded in quoted text as two adjacent quotes ("")     */
-/*   - quotes may be "escaped" either within or outside of quoted text.      */
-/*   - newlines may be embedded in quoted text, allowing a field to span     */
-/*     more than one line.                                                   */
+/* Based on CSV format described in RFC 4180 section 2.                      */
 /*                                                                           */
-/* This function does not do any parsing of the individual fields, other     */
-/* than to correctly interpet delimeters.  Actual parsing of the individual  */
-/* fields is done in parse_field().                                          */
+/* Additions to RFC 4180 rules:                                              */
+/*   - delimeters and other special characters may be "escaped" by a leading */
+/*     backslash (\)                                                         */
+/*   - C escape sequences for newline (\n) and tab (\t) are also translated. */
+/*   - if quoted text is not followed by a delimeter, any additional text is */
+/*     concatenated with quoted portion.                                     */
 /*                                                                           */
 /* Returns a list of fields.  A blank line is considered a line with one     */
 /* empty field.  Returns empty (NULL) when done.                             */
@@ -489,143 +486,119 @@ static GList *
 parse_line (FILE  *fp,
 	    gchar  delim )
 {
-	GList *list = NULL;
-	GString *string;
-	gint c;
-	enum { BEGIN, NORMAL, QUOTED, QUOTED_QUOTE1,
-               NORMAL_ESCAPED, QUOTED_ESCAPED, DONE } state;
+	GList   *list;
+	GString *field;
+	gint     c;
+	enum { DELIM,
+               QUOTED, QUOTED_QUOTE1, QUOTED_ESCAPED,
+               SIMPLE, SIMPLE_ESCAPED,
+               DONE } state;
 
 	if (fp == NULL) {
 		return NULL;
 	}
 	       
-	state = BEGIN;
-	string = g_string_new( "" );
+	state = DELIM;
+        list  = NULL;
+	field = g_string_new( "" );
 	while ( state != DONE ) {
 		c=getc (fp);
 
 		switch (state) {
 
-		case BEGIN:
-                        if ( c == delim )
-                        {
-                                /* first field is empty. */
-                                list = g_list_append (list, g_strdup (""));
-				state = NORMAL;
-                                break;
-                        }
+		case DELIM:
 			switch (c) {
-			case '"':
-                                string = g_string_append_c (string, c);
-				state = QUOTED;
-				break;
-			case '\\':
-                                string = g_string_append_c (string, c);
-				state = NORMAL_ESCAPED;
-				break;
 			case '\n':
-				/* treat as one empty field. */
+				/* last field is empty. */
 				list = g_list_append (list, g_strdup (""));
 				state = DONE;
 				break;
+			case '\r':
+				/* ignore */
+				state = DELIM;
+				break;
 			case EOF:
                                 /* end of file, no more lines. */
 				state = DONE;
 				break;
-			default:
-                                string = g_string_append_c (string, c);
-				state = NORMAL;
-				break;
-			}
-			break;
-
-		case NORMAL:
-                        if ( c == delim )
-                        {
-                                list = g_list_append (list, parse_field (string->str));
-                                string = g_string_assign( string, "" );
-                                state = NORMAL;
-                                break;
-                        }
-			switch (c) {
 			case '"':
-                                string = g_string_append_c (string, c);
+                                /* start a quoted field. */
 				state = QUOTED;
 				break;
 			case '\\':
-                                string = g_string_append_c (string, c);
-				state = NORMAL_ESCAPED;
-				break;
-			case '\n':
-			case EOF:
-				list = g_list_append (list, parse_field (string->str));
-				state = DONE;
+                                /* simple field, but 1st character is an escape. */
+				state = SIMPLE_ESCAPED;
 				break;
 			default:
-                                string = g_string_append_c (string, c);
-                                state = NORMAL;
+                                if ( c == delim )
+                                {
+                                        /* field is empty. */
+                                        list = g_list_append (list, g_strdup (""));
+                                        state = DELIM;
+                                }
+                                else
+                                {
+                                        /* begining of a simple field. */
+                                        field = g_string_append_c (field, c);
+                                        state = SIMPLE;
+                                }
 				break;
 			}
 			break;
 
 		case QUOTED:
 			switch (c) {
+			case EOF:
+				/* File ended mid way through quoted item, truncate field. */
+				list = g_list_append (list, g_strdup (field->str));
+				state = DONE;
+				break;
 			case '"':
-                                string = g_string_append_c (string, c);
+                                /* Possible end of field, but could be 1st of a pair. */
 				state = QUOTED_QUOTE1;
 				break;
 			case '\\':
-                                string = g_string_append_c (string, c);
+                                /* Escape next character, or special escape, e.g. \n. */
 				state = QUOTED_ESCAPED;
 				break;
-			case EOF:
-				/* File ended mid way through quoted item */
-				list = g_list_append (list, parse_field (string->str));
-				state = DONE;
-				break;
 			default:
-				string = g_string_append_c (string, c);
+                                /* Use character literally. */
+				field = g_string_append_c (field, c);
 				break;
 			}
 			break;
 
 		case QUOTED_QUOTE1:
-                        if ( c == delim )
-                        {
-                                list = g_list_append (list, parse_field (string->str));
-                                string = g_string_assign( string, "" );
-                                state = NORMAL;
-                                break;
-                        }
 			switch (c) {
-			case '"':
-				/* insert quotes in string, stay quoted. */
-				string = g_string_append_c (string, c);
-				state = QUOTED;
-				break;
 			case '\n':
 			case EOF:
 				/* line or file ended after quoted item */
-				list = g_list_append (list, parse_field (string->str));
+				list = g_list_append (list, g_strdup (field->str));
 				state = DONE;
 				break;
-			default:
-                                string = g_string_append_c (string, c);
-				state = NORMAL;
+			case '"':
+				/* second quote, insert and stay quoted. */
+				field = g_string_append_c (field, c);
+				state = QUOTED;
 				break;
-			}
-			break;
-
-		case NORMAL_ESCAPED:
-			switch (c) {
-			case EOF:
-				/* File ended mid way through quoted item */
-				list = g_list_append (list, parse_field (string->str));
-				state = DONE;
+			case '\r':
+				/* ignore and go to fallback */
+				state = SIMPLE;
 				break;
 			default:
-				string = g_string_append_c (string, c);
-				state = NORMAL;
+                                if ( c == delim )
+                                {
+                                        /* end of field. */
+                                        list = g_list_append (list, g_strdup (field->str));
+                                        field = g_string_assign( field, "" );
+                                        state = DELIM;
+                                }
+                                else
+                                {
+                                        /* fallback if not a delim or another quote. */
+                                        field = g_string_append_c (field, c);
+                                        state = SIMPLE;
+                                }
 				break;
 			}
 			break;
@@ -634,141 +607,82 @@ parse_line (FILE  *fp,
 			switch (c) {
 			case EOF:
 				/* File ended mid way through quoted item */
-				list = g_list_append (list, parse_field (string->str));
+				list = g_list_append (list, g_strdup (field->str));
 				state = DONE;
 				break;
-			default:
-				string = g_string_append_c (string, c);
+                        case 'n':
+                                /* Decode "\n" as newline. */
+				field = g_string_append_c (field, '\n');
 				state = QUOTED;
 				break;
-			}
-			break;
-
-		default:
-			g_assert_not_reached();
-			break;
-		}
-
-	}
-	g_string_free( string, TRUE );
-
-	return list;
-}
-
-
-/*---------------------------------------------------------------------------*/
-/* PRIVATE.  Parse field.                                                    */
-/*                                                                           */
-/*  - Strip leading and trailing white space, unless quoted.                 */
-/*  - Strip CR, unless escaped.                                              */
-/*  - Expand '\n' and '\t' into newline and tab characters.                  */
-/*  - Remove quotes, unless escaped (\" anywhere or "" within quotes)        */
-/*---------------------------------------------------------------------------*/
-static gchar *
-parse_field (gchar  *raw_field)
-{
-	GString *string;
-        gchar   *pass1_field, *c, *field;
-	enum { NORMAL, NORMAL_ESCAPED, QUOTED, QUOTED_ESCAPED, QUOTED_QUOTE1} state;
-
-
-        /*
-         * Pass 1: remove leading and trailing spaces.
-         */
-        pass1_field = g_strdup (raw_field);
-        g_strstrip (pass1_field);
-
-        /*
-         * Pass 2: resolve quoting and escaping.
-         */
-	state = NORMAL;
-	string = g_string_new( "" );
-        for ( c=pass1_field; *c != 0; c++ )
-        {
-		switch (state) {
-
-		case NORMAL:
-			switch (*c) {
-			case '\\':
-				state = NORMAL_ESCAPED;
-				break;
-			case '"':
+                        case 't':
+                                /* Decode "\t" as tab. */
+				field = g_string_append_c (field, '\t');
 				state = QUOTED;
 				break;
-			case '\r':
-				/* Strip CR. */
-				break;
 			default:
-                                string = g_string_append_c (string, *c);
+                                /* Use character literally. */
+				field = g_string_append_c (field, c);
+				state = QUOTED;
 				break;
 			}
 			break;
 
-		case NORMAL_ESCAPED:
-			switch (*c) {
-			case 'n':
-				string = g_string_append_c (string, '\n');
-				state = NORMAL;
-				break;
-			case 't':
-				string = g_string_append_c (string, '\t');
-				state = NORMAL;
+		case SIMPLE:
+			switch (c) {
+			case '\n':
+			case EOF:
+				/* line or file ended */
+				list = g_list_append (list, g_strdup (field->str));
+				state = DONE;
 				break;
-			default:
-				string = g_string_append_c (string, *c);
-				state = NORMAL;
+			case '\r':
+				/* ignore */
+				state = SIMPLE;
 				break;
-			}
-			break;
-
-		case QUOTED:
-			switch (*c) {
 			case '\\':
-				state = QUOTED_ESCAPED;
-				break;
-			case '"':
-				state = QUOTED_QUOTE1;
-				break;
-			case '\r':
-				/* Strip CR. */
+                                /* Escape next character, or special escape, e.g. \n. */
+				state = SIMPLE_ESCAPED;
 				break;
 			default:
-				string = g_string_append_c (string, *c);
+                                if ( c == delim )
+                                {
+                                        /* end of field. */
+                                        list = g_list_append (list, g_strdup (field->str));
+                                        field = g_string_assign( field, "" );
+                                        state = DELIM;
+                                }
+                                else
+                                {
+                                        /* Use character literally. */
+                                        field = g_string_append_c (field, c);
+                                        state = SIMPLE;
+                                }
 				break;
 			}
 			break;
 
-		case QUOTED_ESCAPED:
-			switch (*c) {
-			case 'n':
-				string = g_string_append_c (string, '\n');
-				state = QUOTED;
-				break;
-			case 't':
-				string = g_string_append_c (string, '\t');
-				state = QUOTED;
-				break;
-			default:
-				string = g_string_append_c (string, *c);
-				state = QUOTED;
+		case SIMPLE_ESCAPED:
+			switch (c) {
+			case EOF:
+				/* File ended mid way through quoted item */
+				list = g_list_append (list, g_strdup (field->str));
+				state = DONE;
 				break;
-			}
-			break;
-
-		case QUOTED_QUOTE1:
-			switch (*c) {
-			case '"':
-				/* insert quotes in string, stay quoted. */
-				string = g_string_append_c (string, *c);
-				state = QUOTED;
+                        case 'n':
+                                /* Decode "\n" as newline. */
+				field = g_string_append_c (field, '\n');
+				state = SIMPLE;
 				break;
-			case '\r':
-				/* Strip CR, return to QUOTED. */
-				state = QUOTED;
+                        case 't':
+                                /* Decode "\t" as tab. */
+				field = g_string_append_c (field, '\t');
+				state = SIMPLE;
 				break;
 			default:
-                                string = g_string_append_c (string, *c);
-				state = NORMAL;
+                                /* Use character literally. */
+				field = g_string_append_c (field, c);
+				state = SIMPLE;
 				break;
 			}
 			break;
@@ -779,12 +693,9 @@ parse_field (gchar  *raw_field)
 		}
 
 	}
+	g_string_free( field, TRUE );
 
-        field = g_strdup (string->str);
-	g_string_free( string, TRUE );
-        g_free (pass1_field);
-
-	return field;
+	return list;
 }
 
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]