[gtkhtml] Bug #610797 - Improve UTF-8 text sanitizing



commit 17e5d90b89266958253c5df6b2f1e0f7c3575cfa
Author: Milan Crha <mcrha redhat com>
Date:   Wed Jul 14 21:37:36 2010 +0200

    Bug #610797 - Improve UTF-8 text sanitizing

 gtkhtml/htmlengine-edit-cut-and-paste.c |   12 ++-
 gtkhtml/htmltext.c                      |  119 ++++++++++++++++++------------
 gtkhtml/htmltext.h                      |    3 +-
 3 files changed, 81 insertions(+), 53 deletions(-)
---
diff --git a/gtkhtml/htmlengine-edit-cut-and-paste.c b/gtkhtml/htmlengine-edit-cut-and-paste.c
index 4f80060..aee80df 100644
--- a/gtkhtml/htmlengine-edit-cut-and-paste.c
+++ b/gtkhtml/htmlengine-edit-cut-and-paste.c
@@ -1394,15 +1394,17 @@ use_pictograms (HTMLEngine *e)
 }
 
 void
-html_engine_insert_text_with_extra_attributes (HTMLEngine *e, const gchar *text, gint len, PangoAttrList *attrs)
+html_engine_insert_text_with_extra_attributes (HTMLEngine *e, const gchar *ptext, gint len, PangoAttrList *attrs)
 {
-	gchar *nl;
+	gchar *nl, *text = NULL;
 	gint alen;
 	gsize bytes;
 
-	bytes = html_text_sanitize (&text, &len);
-	if (!len)
+	bytes = html_text_sanitize (ptext, &text, &len);
+	if (!len || !text) {
+		g_free (text);
 		return;
+	}
 
 	html_undo_level_begin (e->undo, "Insert text", "Delete text");
 	/* FIXME add insert text event */
@@ -1446,6 +1448,8 @@ html_engine_insert_text_with_extra_attributes (HTMLEngine *e, const gchar *text,
 		}
 	} while (nl);
 	html_undo_level_end (e->undo, e);
+
+	g_free (text);
 }
 
 void
diff --git a/gtkhtml/htmltext.c b/gtkhtml/htmltext.c
index 7609a26..00ace28 100644
--- a/gtkhtml/htmltext.c
+++ b/gtkhtml/htmltext.c
@@ -2874,66 +2874,91 @@ html_text_class_init (HTMLTextClass *klass,
 	parent_class = &html_object_class;
 }
 
+/* almost identical copy of glib's _g_utf8_make_valid(); @len is a character count, or -1 for the whole NUL-terminated string */
 static gchar *
-offset_to_pointer_validated (const gchar *str, glong offset, gint *chars_out)
-{
-	const gchar *s = str;
-	glong chars = 0;
-
-	if (offset < 0) {
-		while (*s) {
-			gunichar wc = g_utf8_get_char_validated (s, -1);
-			if (wc == (gunichar)-1 || wc == (gunichar)-2)
-				return NULL;
-			s = g_utf8_next_char (s);
-			chars++;
-		}
+_html_text_utf8_make_valid (const gchar *name, gint len)
+{
+	GString *string;
+	const gchar *remainder, *invalid;
+	gint remaining_bytes, valid_bytes, total_bytes;
+
+	g_return_val_if_fail (name != NULL, NULL);
 
+	string = NULL;
+	remainder = name;
+	if (len == -1) {
+		remaining_bytes = strlen (name);
 	} else {
-		while (offset-- && *s) {
-			gunichar wc = g_utf8_get_char_validated (s, -1);
-			if (wc == (gunichar)-1 || wc == (gunichar)-2)
-				return NULL;
-			s = g_utf8_next_char (s);
-			chars++;
+		const gchar *start = name, *end = name;
+
+		while (len > 0) {
+			gunichar uc = g_utf8_get_char_validated (end, -1);
+
+			if (uc == (gunichar) -2 || uc == (gunichar) -1) {
+				end++;
+			} else if (uc == 0) {
+				break;
+			} else {
+				end = g_utf8_next_char (end);
+			}
+
+			len--;
 		}
+
+		remaining_bytes = end - start;
 	}
 
-	*chars_out = chars;
+	total_bytes = remaining_bytes;
 
-	return (gchar *)s;
+	while (remaining_bytes != 0) {
+		if (g_utf8_validate (remainder, remaining_bytes, &invalid)) 
+			break;
+		valid_bytes = invalid - remainder;
+    
+		if (string == NULL) 
+			string = g_string_sized_new (remaining_bytes);
+
+		g_string_append_len (string, remainder, valid_bytes);
+		/* append U+FFFD REPLACEMENT CHARACTER */
+		g_string_append (string, "\357\277\275");
+
+		remaining_bytes -= valid_bytes + 1;
+		remainder = invalid + 1;
+	}
+  
+	if (string == NULL)
+		return g_strndup (name, total_bytes);
+  
+	g_string_append (string, remainder);
+
+	g_assert (g_utf8_validate (string->str, -1, NULL));
+
+	return g_string_free (string, FALSE);
 }
 
 /**
  * html_text_sanitize:
- * @str: text string (in/out)
+ * @str_in: text string to sanitize (in)
+ * @str_out: newly allocated, sanitized copy of the text; free with g_free() (out)
  * @len: length of text, in characters (in/out). (A value of
  *       -1 on input means to use all characters in @str)
  *
- * Validates a UTF-8 string up to the given number of characters;
- * if the string is invalid, on output, "[?]" will be stored in
- * @str and 3 in @len, otherwise @str will be left unchanged,
- * and @len will be left unchanged if non-negative, otherwise
- * replaced with the number of characters in @str.
+ * Validates a UTF-8 string up to the given number of characters, replacing each invalid byte with U+FFFD.
  *
  * Return value: number of bytes in the output value of @str
  **/
 gsize
-html_text_sanitize (const gchar **str, gint *len)
+html_text_sanitize (const gchar *str_in, gchar **str_out, gint *len)
 {
-	gchar *end;
-
-	g_return_val_if_fail (str != NULL, 0);
+	g_return_val_if_fail (str_in != NULL, 0);
+	g_return_val_if_fail (str_out != NULL, 0);
 	g_return_val_if_fail (len != NULL, 0);
 
-	end = offset_to_pointer_validated (*str, *len, len);
-	if (end) {
-		return end - *str;
-	} else {
-		*str = "[?]";
-		*len = 3;
-		return 3;
-	}
+	*str_out = _html_text_utf8_make_valid (str_in, *len);
+	g_return_val_if_fail (*str_out != NULL, 0);
+
+	*len = g_utf8_strlen (*str_out, -1);
+	return strlen (*str_out);
 }
 
 void
@@ -2948,10 +2973,8 @@ html_text_init (HTMLText *text,
 
 	html_object_init (HTML_OBJECT (text), HTML_OBJECT_CLASS (klass));
 
-	text->text_bytes = html_text_sanitize (&str, &len);
+	text->text_bytes = html_text_sanitize (str, &text->text, &len);
 	text->text_len = len;
-	text->text = g_memdup (str, text->text_bytes + 1);
-	text->text [text->text_bytes] = '\0';
 
 	text->font_style    = font_style;
 	text->face          = NULL;
@@ -3030,11 +3053,10 @@ void
 html_text_set_text (HTMLText *text, const gchar *new_text)
 {
 	g_free (text->text);
+	text->text = NULL;
 	text->text_len = -1;
-	text->text_bytes = html_text_sanitize (&new_text,
+	text->text_bytes = html_text_sanitize (new_text, &text->text,
 					       (gint *)&text->text_len);
-	text->text = g_memdup (new_text, text->text_bytes + 1);
-	text->text [text->text_bytes] = '\0';
 	html_object_change_set (HTML_OBJECT (text), HTML_CHANGE_ALL);
 }
 
@@ -3271,13 +3293,13 @@ html_text_trail_space_width (HTMLText *text, HTMLPainter *painter)
 }
 
 void
-html_text_append (HTMLText *text, const gchar *str, gint len)
+html_text_append (HTMLText *text, const gchar *pstr, gint len)
 {
-	gchar *to_delete;
+	gchar *to_delete, *str = NULL;
 	guint bytes;
 
 	to_delete       = text->text;
-	bytes = html_text_sanitize (&str, &len);
+	bytes = html_text_sanitize (pstr, &str, &len);
 	text->text_len += len;
 	text->text      = g_malloc (text->text_bytes + bytes + 1);
 
@@ -3287,6 +3309,7 @@ html_text_append (HTMLText *text, const gchar *str, gint len)
 	text->text[text->text_bytes] = '\0';
 
 	g_free (to_delete);
+	g_free (str);
 
 	html_object_change_set (HTML_OBJECT (text), HTML_CHANGE_ALL);
 }
diff --git a/gtkhtml/htmltext.h b/gtkhtml/htmltext.h
index 69616a5..ab86337 100644
--- a/gtkhtml/htmltext.h
+++ b/gtkhtml/htmltext.h
@@ -286,7 +286,8 @@ void              html_text_set_color_in_range           (HTMLText           *te
 void              html_text_set_color                    (HTMLText           *text,
 							  HTMLColor          *color);
 
-gsize             html_text_sanitize                     (const gchar       **str,
+gsize             html_text_sanitize                     (const gchar       *str_in,
+							  gchar            **str_out,
 							  gint               *len);
 
 Link     *html_link_new                 (gchar *url,



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]