g_utf8_salvage



Here is a patch that adds a new function, g_utf8_salvage.  It does the
following:

/* Salvage a UTF-8 string, return a g_malloced string which is
   the same, but with invalid UTF-8 sequences replaced with
   U+FFFD */

gchar    *g_utf8_salvage (const gchar  *str);

This follows the handling set out in Markus Kuhn's test file
UTF-8-test.txt, which is available from
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
and seems as good a reference as any (it is also what xterm does).

(It includes my previous patch)

(It also adds _uc variants of g_string_append_c and g_string_insert_c.  If
 this is thought a good idea, I can add _uc variants of all the others too.)

-- 
Robert
Index: gutf8.c
===================================================================
RCS file: /cvs/gnome/glib/gutf8.c,v
retrieving revision 1.7
diff -u -r1.7 gutf8.c
--- gutf8.c	2000/09/18 14:55:24	1.7
+++ gutf8.c	2000/11/08 04:03:22
@@ -548,6 +548,12 @@
                  gint          max_len,
                  const gchar **end)
 {
+  static int min_ucs_for_len[] = 
+  { 0, 0, 0x0080, 
+          0x0800, 
+         0x10000, 
+      0x00200000, 
+      0x04000000 };
 
   const gchar *p;
   gboolean retval = TRUE;
@@ -581,12 +587,22 @@
         
       UTF8_GET (result, p, i, mask, len);
 
-      if (result == (gunichar)-1)
+      if (result == (gunichar)-1 ||
+	  (result >= 0xd800 &&
+	   result <= 0xdfff) ||
+	  result == 0xfffe ||
+	  result == 0xffff)
         {
           retval = FALSE;
           break;
         }
       
+      if (result < min_ucs_for_len[len])
+	{
+	  retval = FALSE;
+	  break;
+	}
+
       p += len;
     }
 
@@ -596,4 +612,44 @@
   return retval;
 }
 
+/* Salvage a UTF-8 string: return a g_malloced copy of STR in which
+   every invalid UTF-8 sequence is replaced with U+FFFD.  A NULL STR
+   fails the g_return_val_if_fail check and yields NULL.  */
+gchar *
+g_utf8_salvage (const gchar *str)
+{
+  GString *new_str;
+  const gchar *end, *error;
+  gchar *retval;
+
+  g_return_val_if_fail (str != NULL, NULL);
+
+  new_str = g_string_new (NULL);
+  end = str + strlen (str);  /* measure once instead of on every pass */
+  while (*str)
+    {
+      gint skip;
+
+      error = NULL;
+      if (g_utf8_validate (str, end - str, &error))
+	break;  /* the remainder is valid; it is copied after the loop */
+
+      /* Keep the valid prefix, then stand in U+FFFD for the bad bytes.  */
+      if (error > str)
+	g_string_append_len (new_str, str, error - str);
+      g_string_append_uc (new_str, 0xfffd);
+
+      /* Step over the offending byte plus at most the continuation
+         bytes it announces, so one sequence yields one U+FFFD.  */
+      skip = g_utf8_skip[(guchar)*error];
+      str = error + 1;
+      while (--skip > 0 && (((guchar)*str) & 0xc0) == 0x80)
+	str++;
+    }
 
+  /* Whatever remains (possibly nothing) is valid; copy it verbatim.  */
+  g_string_append (new_str, str);
+  retval = new_str->str;
+  g_string_free (new_str, FALSE);
+  return retval;
+}
Index: gunicode.h
===================================================================
RCS file: /cvs/gnome/glib/gunicode.h,v
retrieving revision 1.10
diff -u -r1.10 gunicode.h
--- gunicode.h	2000/10/19 15:21:03	1.10
+++ gunicode.h	2000/11/08 04:03:23
@@ -191,6 +191,12 @@
                           gint          max_len,
                           const gchar **end);
 
+/* Salvage a UTF-8 string, return a g_malloced string which is
+   the same, but with invalid UTF-8 sequences replaced with
+   U+FFFD */
+
+gchar    *g_utf8_salvage (const gchar  *str);
+
 G_END_DECLS
 
 #endif /* __G_UNICODE_H__ */
Index: gstring.c
===================================================================
RCS file: /cvs/gnome/glib/gstring.c,v
retrieving revision 1.21
diff -u -r1.21 gstring.c
--- gstring.c	2000/10/27 02:46:03	1.21
+++ gstring.c	2000/11/08 04:03:27
@@ -430,6 +430,15 @@
 }
 
 GString*
+g_string_append_uc (GString *fstring,
+		    gunichar    c)
+{
+  g_return_val_if_fail (fstring != NULL, NULL);
+
+  return g_string_insert_uc (fstring, -1, c);
+}
+
+GString*
 g_string_prepend (GString     *fstring,
 		  const gchar *val)
 {
@@ -469,6 +478,20 @@
   g_return_val_if_fail (pos <= fstring->len, fstring);
   
   return g_string_insert_len (fstring, pos, val, -1);
+}
+
+/* Insert the character VAL, encoded as UTF-8, into FSTRING at POS.  */
+GString*
+g_string_insert_uc (GString     *fstring,
+		    gint         pos,
+		    gunichar     val)
+{
+  gchar utf8[10];  /* ample: g_unichar_to_utf8 writes at most 6 bytes */
+
+  g_return_val_if_fail (fstring != NULL, NULL);
+  g_return_val_if_fail (pos <= fstring->len, fstring);
+  /* Pass the encoded length so U+0000 is kept and no strlen is needed.  */
+  return g_string_insert_len (fstring, pos, utf8, g_unichar_to_utf8 (val, utf8));
 }
 
 GString*
Index: gstring.h
===================================================================
RCS file: /cvs/gnome/glib/gstring.h,v
retrieving revision 1.2
diff -u -r1.2 gstring.h
--- gstring.h	2000/10/27 02:46:03	1.2
+++ gstring.h	2000/11/08 04:03:27
@@ -28,6 +28,7 @@
 #define __G_STRING_H__
 
 #include <gtypes.h>
+#include <gunicode.h>
 
 G_BEGIN_DECLS
 
@@ -76,6 +77,8 @@
                                          gint             len);
 GString*     g_string_append_c          (GString	 *string,
 					 gchar		  c);
+GString*     g_string_append_uc         (GString         *string,
+                                         gunichar         c);
 GString*     g_string_prepend           (GString	 *string,
 					 const gchar	 *val);
 GString*     g_string_prepend_c         (GString	 *string,
@@ -89,6 +92,9 @@
 GString*     g_string_insert_c          (GString	 *string,
 					 gint		  pos,
 					 gchar		  c);
+GString*     g_string_insert_uc         (GString	 *string,
+					 gint		  pos,
+					 gunichar	  wc);
 GString*     g_string_erase	        (GString	 *string,
 					 gint		  pos,
 					 gint		  len);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]