[gimp/gimp-2-6] Last part of fixes for handling UTF-8 coded strings (Bugs 572865 & 628893)



commit c00dfc97d7bf8f47e4bb888079d6675f3e791e1e
Author: Kevin Cozens <kcozens cvs gnome org>
Date:   Sun Sep 19 20:17:26 2010 -0400

    Last part of fixes for handling UTF-8 coded strings (Bugs 572865 & 628893)
    
    At end of a double quoted string, readstrexp() passed byte count instead
    of character count to mk_counted_string(). Cleaned up basic_inchar().
    It ignores bad UTF-8 coded characters when reading from file or buffer.

 plug-ins/script-fu/tinyscheme/scheme.c |  102 ++++++++++++++++----------------
 1 files changed, 51 insertions(+), 51 deletions(-)
---
diff --git a/plug-ins/script-fu/tinyscheme/scheme.c b/plug-ins/script-fu/tinyscheme/scheme.c
index ba40918..da7b484 100644
--- a/plug-ins/script-fu/tinyscheme/scheme.c
+++ b/plug-ins/script-fu/tinyscheme/scheme.c
@@ -1036,11 +1036,11 @@ static char *store_string(scheme *sc, int char_cnt,
        else
           len = q2 - str;
        q=(gchar*)sc->malloc(len+1);
-     }
-     else {
+     } else {
        len = g_unichar_to_utf8(fill, utf8);
        q=(gchar*)sc->malloc(char_cnt*len+1);
      }
+
      if(q==0) {
        sc->no_memory=1;
        return sc->strbuff;
@@ -1529,82 +1529,81 @@ static void port_close(scheme *sc, pointer p, int flag) {
   }
 }
 
+/* This routine will ignore byte sequences that are not valid UTF-8 */
 static gunichar basic_inchar(port *pt) {
-  int  len;
-
-  if(pt->kind&port_file) {
-    unsigned char utf8[7];
+  if(pt->kind & port_file) {
     int  c;
-    int  i;
 
     c = fgetc(pt->rep.stdio.file);
-    if (c == EOF) return EOF;
-    utf8[0] = c;
 
     while (TRUE)
       {
-        if (utf8[0] <= 0x7f)
+        if (c == EOF) return EOF;
+
+        if (c <= 0x7f)
+            return (gunichar) c;
+
+        /* Is this byte an invalid lead per RFC-3629? */
+        if (c < 0xc2 || c > 0xf4)
           {
-            return (gunichar) utf8[0];
+            /* Ignore invalid lead byte and get the next characer */
+            c = fgetc(pt->rep.stdio.file);
           }
-
-        /* Check for valid lead byte per RFC-3629 */
-        if (utf8[0] >= 0xc2 && utf8[0] <= 0xf4)
+        else    /* Byte is valid lead */
           {
-            len = utf8_length[utf8[0] & 0x3F];
+            unsigned char utf8[7];
+            int  len;
+            int  i;
+
+            utf8[0] = c;    /* Save the lead byte */
+
+            len = utf8_length[c & 0x3F];
             for (i = 1; i <= len; i++)
               {
                 c = fgetc(pt->rep.stdio.file);
-                if (c == EOF) return EOF;
-                utf8[i] = c;
-                if ((utf8[i] & 0xc0) != 0x80)
-                  {
+
+                /* Stop reading if this is not a continuation character */
+                if ((c & 0xc0) != 0x80)
                     break;
-                  }
+
+                utf8[i] = c;
               }
 
-            if (i > len)
+            if (i > len)    /* Read the expected number of bytes? */
               {
-                return g_utf8_get_char ((char *) utf8);
+                return g_utf8_get_char_validated ((char *) utf8,
+                                                  sizeof(utf8));
               }
 
-            /* we did not get enough continuation characters. */
-            utf8[0] = utf8[i];          /* ignore and restart */
+            /* Not enough continuation characters so ignore and restart */
           }
-        else
-          {
-            /* Everything else is invalid and will be ignored */
-            c = fgetc(pt->rep.stdio.file);
-            if (c == EOF) return EOF;
-            utf8[0] = c;
-          }
-      }
+      } /* end of while (TRUE) */
   } else {
-    if(*pt->rep.string.curr==0
-       || pt->rep.string.curr==pt->rep.string.past_the_end) {
-      return EOF;
-    } else {
-      gunichar c;
+    gunichar c;
+    int      len;
+
+    while (TRUE)
+    {
+      /* Found NUL or at end of input buffer? */
+      if (*pt->rep.string.curr == 0 ||
+          pt->rep.string.curr == pt->rep.string.past_the_end) {
+        return EOF;
+      }
 
       len = pt->rep.string.past_the_end - pt->rep.string.curr;
       c = g_utf8_get_char_validated(pt->rep.string.curr, len);
 
-      if (c < 0)
+      if (c >= 0)   /* Valid UTF-8 character? */
       {
-        pt->rep.string.curr = g_utf8_find_next_char(pt->rep.string.curr,
-                                                    pt->rep.string.past_the_end);
-        if (pt->rep.string.curr == NULL)
-            pt->rep.string.curr = pt->rep.string.past_the_end;
-        c = ' ';
-      }
-      else
-      {
-        len = g_unichar_to_utf8(c, NULL);
+        len = g_unichar_to_utf8(c, NULL);   /* Length of UTF-8 sequence */
         pt->rep.string.curr += len;
+        return c;
       }
 
-      return c;
-    }
+      /* Look for next valid UTF-8 character in buffer */
+      pt->rep.string.curr = g_utf8_find_next_char(pt->rep.string.curr,
+                                                  pt->rep.string.past_the_end);
+    } /* end of while (TRUE) */
   }
 }
 
@@ -1760,7 +1759,8 @@ static pointer readstrexp(scheme *sc) {
         break;
       case '"':
         *p=0;
-        return mk_counted_string(sc,sc->strbuff,p-sc->strbuff);
+        return mk_counted_string(sc,sc->strbuff,
+                                 g_utf8_strlen(sc->strbuff, sizeof(sc->strbuff)));
       default:
         len = g_unichar_to_utf8(c, p);
         p += len;
@@ -2749,7 +2749,7 @@ static pointer opexe_0(scheme *sc, enum scheme_opcodes op) {
           s_goto(sc,OP_EVAL);
 
      case OP_DEF1:  /* define */
-       x=find_slot_in_env(sc,sc->envir,sc->code,0);
+          x=find_slot_in_env(sc,sc->envir,sc->code,0);
           if (x != sc->NIL) {
                set_slot_in_env(sc, x, sc->value);
           } else {



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]