vte r2227 - in trunk: . src



Author: behdad
Date: Sat Nov 29 10:02:38 2008
New Revision: 2227
URL: http://svn.gnome.org/viewvc/vte?rev=2227&view=rev

Log:
2008-11-29  Behdad Esfahbod  <behdad gnome org>

        Bug 317236 – vte resynchronizes too late on invalid UTF-8

        * src/vteconv.c (_vte_conv_utf8_utf8): In our UTF-8 to UTF-8 converter,
        differentiate between an incomplete sequence and an ill-formed sequence
        at the end of the buffer.  Also clean up some minor inaccuracies (return
        value).



Modified:
   trunk/ChangeLog
   trunk/src/vteconv.c

Modified: trunk/src/vteconv.c
==============================================================================
--- trunk/src/vteconv.c	(original)
+++ trunk/src/vteconv.c	Sat Nov 29 10:02:38 2008
@@ -40,6 +40,7 @@
 	struct _vte_buffer *in_scratch, *out_scratch;
 };
 
+/* We can't use g_utf8_strlen as that's not nul-safe :( */
 static glong
 _vte_conv_utf8_strlen(const gchar *p, gssize max)
 {
@@ -63,7 +64,7 @@
 {
 	gboolean validated;
 	const gchar *endptr;
-	size_t length, bytes;
+	size_t bytes;
 	guint skip;
 
 	/* We don't tolerate shenanigans! */
@@ -74,27 +75,45 @@
 
 	/* Copy whatever data was validated. */
 	bytes = endptr - *inbuf;
-	length = _vte_conv_utf8_strlen(*inbuf, bytes);
 	memcpy(*outbuf, *inbuf, bytes);
 	*inbuf += bytes;
 	*outbuf += bytes;
 	*outbytes_left -= bytes;
 	*inbytes_left -= bytes;
 
-	/* Return the character count if everything looked good, else EILSEQ. */
+	/* Return 0 (number of non-reversible conversions performed) if everything
+	 * looked good, else EILSEQ. */
 	if (validated) {
-		return length;
+		return 0;
 	}
 
-	/* Determine why the end of the string is not valid. */
+	/* Determine why the end of the string is not valid.
+	 * We are pur b stards for running g_utf8_next_char() on an
+	 * invalid sequence. */
 	skip = g_utf8_next_char(*inbuf) - *inbuf;
-	if ((skip > *inbytes_left) || (skip <= 0)) {
-		/* We had enough bytes to validate the character, and
-		 * it failed, or it just doesn't look right. */
+	if (skip > *inbytes_left) {
+		/* We didn't have enough bytes to validate the character.
+		 * That qualifies for EINVAL, but only if the part of the
+		 * character that we have is a valid prefix to a character.
+		 * Differentiating those requires verifying that all the
+		 * remaining bytes after this one are UTF-8 continuation
+		 * bytes.  Actually even that is not quite enough as not
+		 * all continuation bytes are valid in the most strict
+		 * interpretation of UTF-8, but we don't care about that.
+		 */
+		size_t i;
+
+		for (i = 1; i < *inbytes_left; i++)
+			if (((*inbuf)[i] & 0xC0) != 0x80) {
+				/* Not a continuation byte */
+				errno = EILSEQ;
+				return (size_t) -1;
+			}
+
 		errno = EINVAL;
 	} else {
-		/* We didn't have enough bytes to validate the character, so
-		 * it failed. */
+		/* We had enough bytes to validate the character, and
+		 * it failed.  It just doesn't look right. */
 		errno = EILSEQ;
 	}
 	return (size_t) -1;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]