gtkhtml r9115 - trunk/gtkhtml



Author: mcrha
Date: Fri Jan 23 12:12:00 2009
New Revision: 9115
URL: http://svn.gnome.org/viewvc/gtkhtml?rev=9115&view=rev

Log:
2009-01-23  Denis Pauk  <pauk denis gmail com>

	** Fix for bug #567697

	* htmlentity.h: (INVALID_ENTITY_CHARACTER_MARKER):
	* htmlentity.c: (html_entity_parse):
	* htmltokenizer.c: (html_tokenizer_convert_entity),
	(convert_text_encoding):
	Do not strip invalid entities; return them as text instead.



Modified:
   trunk/gtkhtml/ChangeLog
   trunk/gtkhtml/htmlentity.c
   trunk/gtkhtml/htmlentity.h
   trunk/gtkhtml/htmltokenizer.c

Modified: trunk/gtkhtml/htmlentity.c
==============================================================================
--- trunk/gtkhtml/htmlentity.c	(original)
+++ trunk/gtkhtml/htmlentity.c	Fri Jan 23 12:12:00 2009
@@ -870,6 +870,6 @@
 {
 	struct _EntityEntry * result = html_entity_hash( s, len);
 	if (result == NULL )
-		return ' ';
+		return INVALID_ENTITY_CHARACTER_MARKER;
 	return result->value;
 }

Modified: trunk/gtkhtml/htmlentity.h
==============================================================================
--- trunk/gtkhtml/htmlentity.h	(original)
+++ trunk/gtkhtml/htmlentity.h	Fri Jan 23 12:12:00 2009
@@ -30,6 +30,7 @@
 /* We name it with correct unicode name, but OK, later... Lauris */
 /* char used for &nbsp; - must correspond to table below */
 #define ENTITY_NBSP 160
+#define INVALID_ENTITY_CHARACTER_MARKER '?'
 #define IS_UTF8_NBSP(s) (*s == (guchar)0xc2 && *(s + 1) == (guchar)0xa0)
 
 gulong html_entity_parse (const gchar *s, guint len);

Modified: trunk/gtkhtml/htmltokenizer.c
==============================================================================
--- trunk/gtkhtml/htmltokenizer.c	(original)
+++ trunk/gtkhtml/htmltokenizer.c	Fri Jan 23 12:12:00 2009
@@ -41,7 +41,6 @@
 static guint html_tokenizer_signals[HTML_TOKENIZER_LAST_SIGNAL] = { 0 };
 
 #define TOKEN_BUFFER_SIZE (1 << 10)
-#define INVALID_CHARACTER_MARKER '?'
 
 #define dt(x)
 
@@ -470,11 +469,13 @@
 		if (read_pos < full_pos)
 			if (*read_pos == '&') {
 				/*value to add*/
-				gunichar value = INVALID_CHARACTER_MARKER;
+				gunichar value = INVALID_ENTITY_CHARACTER_MARKER;
 				/*skip not needed &*/
 				read_pos ++;
 				count_chars = strcspn (read_pos, ";");
 				if (count_chars < 14 && count_chars > 1) {
+					/*save for recovery*/
+					gchar save_gchar = *(read_pos + count_chars);
 					*(read_pos + count_chars)=0;
 					/* &#******; */
 					if (*read_pos == '#') {
@@ -488,9 +489,18 @@
 					} else {
 						value = html_entity_parse (read_pos, strlen (read_pos));
 					}
-					read_pos += (count_chars + 1);
-					write_pos += g_unichar_to_utf8 (value, write_pos);
+					if(value != INVALID_ENTITY_CHARACTER_MARKER){							
+						write_pos += g_unichar_to_utf8 (value, write_pos);
+						read_pos += (count_chars + 1);
+					} else {
+						/*recovery old value - it's not entity*/
+						write_pos += g_unichar_to_utf8 ('&', write_pos);
+						*(read_pos + count_chars) = save_gchar;
+					}					
 				}
+				else
+					/*very large string*/
+					write_pos += g_unichar_to_utf8 ('&', write_pos);
 			}
 	}
 	*write_pos = 0;
@@ -527,7 +537,7 @@
 			g_iconv (iconv_cd, (gchar **)&current, &currlength, &newbuffer, &newlength);
 			if (currlength > 0) {
 				g_warning ("IconvError=%s", current);
-				*newbuffer = INVALID_CHARACTER_MARKER;
+				*newbuffer = INVALID_ENTITY_CHARACTER_MARKER;
 				newbuffer ++;
 				current ++;
 				currlength --;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]