G_SEEK_CUR and stateless iconv encodings



	Those of you who are familiar with the changes in GIOChannel
in glib 2.0 may remember that it is impossible to do certain things,
like seek with G_SEEK_CUR or mix reads and writes to a file, unless
the channel's encoding is NULL (binary) or UTF-8. The reason for these
difficulties is that certain things in the backend cannot be accomplished
with iconv() for channels with stateful encodings.
	It has occurred to me that there is no reason, if we had a way of
determining which encodings were and were not stateful, that the user
shouldn't be able to do everything with the stateless encodings that can
be done with UTF-8. It might be useful to have a larger set of encodings
(perhaps the various unicodes, UTF-16, and ASCII, in addition to UTF-8)
which are known to be stateless, for which we could provide random access
to files. I would appreciate suggestions as to what encodings this set
should contain.
	My question is, would this be an API change? It doesn't add any
new functions; it only touches the header file to add a bitflag to the
GIOChannel structure. The rest is changes to the backend, and (still
unimplemented) changes to the docs. I've attached the patch below
for anyone who's interested. The "UTF-8 like" encodings would be
specified in enc_list in check_encoding_stateful().

Ron Steinke

Index: glib/giochannel.c
===================================================================
RCS file: /cvs/gnome/glib/glib/giochannel.c,v
retrieving revision 1.32
diff -u -u -r1.32 giochannel.c
--- glib/giochannel.c	2002/01/28 21:17:45	1.32
+++ glib/giochannel.c	2002/01/29 01:00:13
@@ -62,6 +62,7 @@
 
 static GIOError		g_io_error_get_from_g_error	(GIOStatus    status,
 							 GError      *err);
+static gboolean		check_encoding_stateful		(const gchar *encoding);
 static void		g_io_channel_purge		(GIOChannel  *channel);
 static GIOStatus	g_io_channel_fill_buffer	(GIOChannel  *channel,
 							 GError     **err);
@@ -70,6 +71,20 @@
 							 gsize       *terminator_pos,
 							 GError     **error);
 
+static gboolean		check_encoding_stateful		(const gchar *encoding)
+{
+  /* Add UTF-8 if we make this API at some point */
+  const gchar *enc_list[] = {}; /* FIXME add stateless encodings */
+  const gint num_enc = sizeof(enc_list) / sizeof(gchar**);
+  gint i;
+
+  for(i = 0; i < num_enc; ++i)
+    if(strcmp(encoding, enc_list[i]) == 0)
+      return FALSE;
+
+  return TRUE; /* If we don't know an encoding is stateless, default to stateful */
+}
+
 void
 g_io_channel_init (GIOChannel *channel)
 {
@@ -86,6 +101,7 @@
   channel->partial_write_buf[0] = '\0';
   channel->use_buffer = TRUE;
   channel->do_encode = FALSE;
+  channel->is_stateful = FALSE;
   channel->close_on_unref = FALSE;
 }
 
@@ -819,26 +835,64 @@
       case G_SEEK_CUR: /* The user is seeking relative to the head of the buffer */
         if (channel->use_buffer)
           {
-            if (channel->do_encode && channel->encoded_read_buf
-                && channel->encoded_read_buf->len > 0)
+            if (channel->read_buf)
+              offset -= channel->read_buf->len;
+            if (channel->encoded_read_buf && channel->encoded_read_buf->len > 0)
               {
-                g_warning ("Seek type G_SEEK_CUR not allowed for this"
-                  " channel's encoding.\n");
-                return G_IO_STATUS_ERROR;
-              }
-          if (channel->read_buf)
-            offset -= channel->read_buf->len;
-          if (channel->encoded_read_buf)
-            {
-              g_assert (channel->encoded_read_buf->len == 0 || !channel->do_encode);
+                if (channel->is_stateful)
+                  {
+                    g_warning ("Seek type G_SEEK_CUR not allowed for this"
+                      " channel's encoding.\n");
+                    return G_IO_STATUS_ERROR;
+                  }
+
+                if (channel->do_encode)
+                  {
+                    gchar* inbuf = channel->encoded_read_buf->str;
+                    gsize inbytes = channel->encoded_read_buf->len;
+                    gchar buffer[G_IO_NICE_BUF_SIZE];
+
+                    /* Since the encoding isn't stateful, we use the
+                     * write converter to change the contents of the
+                     * read buffer back into their native encoding.
+                     */
 
-              /* If there's anything here, it's because the encoding is UTF-8,
-               * so we can just subtract the buffer length, the same as for
-               * the unencoded data.
-               */
+                    while(TRUE)
+                      {
+                        gchar* outbuf = buffer;
+                        gsize outbytes = sizeof(buffer) / sizeof(char*);
+                        size_t errnum;
+
+                        errnum = g_iconv(channel->write_cd, &inbuf, &inbytes,
+                                         &outbuf, &outbytes);
+
+                        offset -= outbuf - buffer;
+
+                        if(errnum != (size_t) -1)
+                          break;
+
+                        if(errno != E2BIG)
+                          {
+                            g_assert(errno != EBADF); /* Converter should be open */
+			    g_assert(errno != EINVAL); /* Should be valid UTF-8 */
+			    g_assert(errno != EILSEQ); /* Should be valid UTF-8 */
+			    /* This is really a conversion error, but it would be
+                             * confusing if the only error of type G_CONVERT_ERROR
+			     * that could be returned from this function were
+			     * G_CONVERT_ERROR_FAILED. */
+                            g_set_error (error, G_IO_CHANNEL_ERROR,
+					 G_IO_CHANNEL_ERROR_FAILED,
+		                         _("Unable to compute offset: %s"),
+					 strerror (errnum));
+                            return G_IO_STATUS_ERROR;
+                          }
 
-              offset -= channel->encoded_read_buf->len;
-            }
+                        g_assert(outbuf > buffer); /* Prevent infinite loop */
+                      }
+                  }
+                else /* UTF-8 or binary */
+                  offset -= channel->encoded_read_buf->len;
+              }
           }
         break;
       case G_SEEK_SET:
@@ -871,7 +925,7 @@
 
       if (channel->encoded_read_buf)
         {
-          g_assert (channel->encoded_read_buf->len == 0 || !channel->do_encode);
+          g_assert (channel->encoded_read_buf->len == 0 || !channel->is_stateful);
           g_string_truncate (channel->encoded_read_buf, 0);
         }
 
@@ -1036,14 +1090,14 @@
 			   GError      **error)
 {
   GIConv read_cd, write_cd;
-  gboolean did_encode;
+  gboolean did_encode, was_stateful;
 
   g_return_val_if_fail (channel != NULL, G_IO_STATUS_ERROR);
   g_return_val_if_fail ((error == NULL) || (*error == NULL), G_IO_STATUS_ERROR);
 
-  /* Make sure the encoded buffers are empty */
+  /* Make sure the encoded buffers are empty for a stateful encoding */
 
-  g_return_val_if_fail (!channel->do_encode || !channel->encoded_read_buf ||
+  g_return_val_if_fail (!channel->is_stateful || !channel->encoded_read_buf ||
 			channel->encoded_read_buf->len == 0, G_IO_STATUS_ERROR);
 
   if (!channel->use_buffer)
@@ -1061,6 +1115,7 @@
     }
 
   did_encode = channel->do_encode;
+  was_stateful = channel->is_stateful;
 
   if (!encoding || strcmp (encoding, "UTF8") == 0 || strcmp (encoding, "UTF-8") == 0)
     {
@@ -1127,30 +1182,79 @@
 
   /* The encoding is ok, so set the fields in channel */
 
-  if (channel->read_cd != (GIConv) -1)
-    g_iconv_close (channel->read_cd);
-  if (channel->write_cd != (GIConv) -1)
-    g_iconv_close (channel->write_cd);
-
   if (channel->encoded_read_buf && channel->encoded_read_buf->len > 0)
     {
-      g_assert (!did_encode); /* Encoding UTF-8, NULL doesn't use encoded_read_buf */
+      g_assert (!was_stateful);
 
-      /* This is just validated UTF-8, so we can copy it back into read_buf
-       * so it can be encoded in whatever the new encoding is.
-       */
+      if (!did_encode)
+        {
+          /* This is just validated UTF-8, so we can copy it back into read_buf
+           * so it can be encoded in whatever the new encoding is.
+           */
+
+          g_string_prepend_len (channel->read_buf, channel->encoded_read_buf->str,
+                                channel->encoded_read_buf->len);
+          g_string_truncate (channel->encoded_read_buf, 0);
+        }
+      else
+        {
+          gchar* inbuf = channel->encoded_read_buf->str;
+          gsize inbytes = channel->encoded_read_buf->len;
+          gsize cur_pos = 0;
+          gchar buffer[G_IO_NICE_BUF_SIZE];
+
+          /* Since the encoding isn't stateful, we use the
+           * write converter to change the contents of the
+           * read buffer back into their native encoding.
+           */
+
+          while(TRUE)
+            {
+              gchar *outbuf = buffer;
+              gsize step, outbytes = sizeof(buffer) / sizeof(char*);
+              size_t errnum;
+
+              errnum = g_iconv(channel->write_cd, &inbuf, &inbytes,
+                               &outbuf, &outbytes);
+
+              step = outbuf - buffer;
+              g_string_insert_len(channel->read_buf, cur_pos, buffer, step);
+              cur_pos += step;
+
+              if(errnum != (size_t) -1)
+                break;
 
-      g_string_prepend_len (channel->read_buf, channel->encoded_read_buf->str,
-                            channel->encoded_read_buf->len);
-      g_string_truncate (channel->encoded_read_buf, 0);
+              if(errno != E2BIG)
+                {
+                  g_assert(errno != EBADF); /* Converter should be open */
+	          g_assert(errno != EINVAL); /* Should be valid UTF-8 */
+	          g_assert(errno != EILSEQ); /* Should be valid UTF-8 */
+                  /* Return read_buf to it's previous state */
+                  g_string_erase(channel->read_buf, 0, cur_pos);
+                  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+                    _("Could not convert read buffer back to original character"
+                      " set: %s"), strerror (errno));
+                  return G_IO_STATUS_ERROR;
+                }
+
+              g_assert(step > 0); /* Prevent infinite loop */
+            } 
+        }
     }
 
+  if (channel->read_cd != (GIConv) -1)
+    g_iconv_close (channel->read_cd);
+  if (channel->write_cd != (GIConv) -1)
+    g_iconv_close (channel->write_cd);
+
   channel->read_cd = read_cd;
   channel->write_cd = write_cd;
 
   g_free (channel->encoding);
   channel->encoding = g_strdup (encoding);
 
+  channel->is_stateful = channel->do_encode && check_encoding_stateful(encoding);
+
   return G_IO_STATUS_NORMAL;
 }
 
@@ -1940,7 +2044,7 @@
   if (channel->is_seekable && (( BUF_LEN (channel->read_buf) > 0)
     || (BUF_LEN (channel->encoded_read_buf) > 0)))
     {
-      if (channel->do_encode && BUF_LEN (channel->encoded_read_buf) > 0)
+      if (channel->is_stateful && BUF_LEN (channel->encoded_read_buf) > 0)
         {
           g_warning("Mixed reading and writing not allowed on encoded files");
           return G_IO_STATUS_ERROR;
Index: glib/giochannel.h
===================================================================
RCS file: /cvs/gnome/glib/glib/giochannel.h,v
retrieving revision 1.17
diff -u -u -r1.17 giochannel.h
--- glib/giochannel.h	2001/09/10 23:59:33	1.17
+++ glib/giochannel.h	2002/01/29 01:00:13
@@ -127,6 +127,7 @@
   guint is_readable    : 1;	/* Cached GIOFlag */
   guint is_writeable   : 1;	/* ditto */
   guint is_seekable    : 1;	/* ditto */
+  guint is_stateful    : 1;	/* The GIConv conversion may be stateful */
 };
 
 typedef gboolean (*GIOFunc) (GIOChannel   *source,



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]