[libsoup/content-sniffing] Initial implementation of the Text or Binary algorithm

From: Gustavo Noronha Silva <gns src gnome org>
To: svn-commits-list gnome org
Subject: [libsoup/content-sniffing] Initial implementation of the Text or Binary algorithm
Date: Wed, 17 Jun 2009 23:31:14 -0400 (EDT)
commit 8940fedc741f0048d9becaeacf38b80799306224
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Thu Jun 18 00:27:03 2009 -0300

    Initial implementation of the Text or Binary algorithm
    
    This is a very simply written implementation of the HTML5 algorithm
    that sniffs content when the server says it is 'text/plain'. It
    detects if the content seems to be binary using a simple test table,
    and avoids privilege escalation from text/plain to types known to be
    scriptable. Tests included.

 libsoup/soup-content-sniffer.c |  251 +++++++++++++++++++++++++++++++++++++++-
 libsoup/soup-content-sniffer.h |    3 +-
 libsoup/soup-message-io.c      |   15 +--
 tests/resources/home.gif       |  Bin 0 -> 995 bytes
 tests/resources/test.html      |   10 ++
 tests/sniffing-test.c          |   74 ++++++++++++
 6 files changed, 339 insertions(+), 14 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 5ce0644..ecb925c 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -33,7 +33,7 @@
  * Since: 2.27.3
  **/
 
-static char* sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboolean *uncertain);
+static char* sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer);
 static gsize get_buffer_size (SoupContentSniffer *sniffer);
 
 static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
@@ -80,18 +80,242 @@ soup_content_sniffer_new ()
 	return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
 }
 
+/* This table is based on the HTML5 spec;
+ * See 2.7.4 Content-Type sniffing: unknown type
+ */
+struct _type_info {
+	const gboolean has_ws;       /* if there is insignificant
+				      * whitespace in the pattern */
+	const char *mask;
+	const char *pattern;
+	const guint pattern_length;
+	const char *sniffed_type;
+	const gboolean scriptable;
+};
+
+static struct _type_info types_table[] = {
+	{ FALSE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
+	  "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
+	  14,
+	  "text/html",
+	  TRUE },
+
+	{ TRUE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF",
+	  " \x3C\x48\x54\x4D\x4C",
+	  5,
+	  "text/html",
+	  TRUE },
+
+	{ TRUE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF",
+	  " \x3C\x48\x45\x41\x44",
+	  5,
+	  "text/html",
+	  TRUE },
+
+	{ TRUE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+	  " \x3C\x53\x43\x52\x49\x50\x54",
+	  7,
+	  "text/html",
+	  TRUE },
+
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF",
+	  "\x25\x50\x44\x46\x2D",
+	  5,
+	  "application/pdf",
+	  TRUE },
+
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
+	  11,
+	  "application/postscript",
+	  FALSE },
+
+	/* BOMs go here */
+
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x47\x49\x46\x38\x37\x61",
+	  6,
+	  "image/gif",
+	  FALSE },
+
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x47\x49\x46\x38\x39\x61",
+	  6,
+	  "image/gif",
+	  FALSE },
+
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
+	  8,
+	  "image/png",
+	  FALSE },
+
+	{ FALSE,
+	  "\xFF\xFF\xFF",
+	  "\xFF\xD8\xFF",
+	  3,
+	  "image/jpeg",
+	  FALSE },
+
+	{ FALSE,
+	  "\xFF\xFF",
+	  "\x42\x4D",
+	  2,
+	  "image/bmp",
+	  FALSE },
+
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF",
+	  "\x00\x00\x01\x00",
+	  4,
+	  "image/vnd.microsoft.icon",
+	  FALSE },
+
+	/* Marks the end */
+	{ FALSE,
+	  NULL,
+	  NULL,
+	  0,
+	  NULL,
+	  FALSE },
+};
+
+/* Whether a given byte looks like it might be part of binary content.
+ * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
+ * which is BSD-licensed
+ */
+static char kByteLooksBinary[] = {
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
+};
+
+
+/* HTML5: 2.7.3 Content-Type sniffing: text or binary */
+static char*
+sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	const char *resource = buffer->data;
+	int resource_length = MIN(512, buffer->length);
+	gboolean looks_binary = FALSE;
+	int i;
+
+	/* Detecting UTF-16BE, UTF-16LE, and UTF-8 BOMs means it's text/plain */
+	if (resource_length >= 4) {
+		if ((resource[0] == 0xfe && resource[1] == 0xff) ||
+		    (resource[0] == 0xff && resource[1] == 0xfe) ||
+		    (resource[0] == 0xef && resource[1] == 0xbb && resource[2] == 0xbf))
+			return g_strdup ("text/plain");
+	}
+
+	/* Look to see if any of the first n bytes looks binary */
+	for (i = 0; i < resource_length; i++) {
+		if (kByteLooksBinary[(unsigned char)resource[i]]) {
+			looks_binary = TRUE;
+			break;
+		}
+	}
+
+	if (!looks_binary)
+		return g_strdup ("text/plain");
+
+	/* HTML5: 2.7.4 Content-Type sniffing: unknown type
+	 *
+	 * This will probably live in its own function, since it is
+	 * used by other parts of the algorithm
+	 */
+	for (i = 0; types_table[i].pattern != NULL ; i++) {
+		struct _type_info *type_row = &(types_table[i]);
+
+		if (type_row->scriptable)
+			continue;
+
+		if (type_row->has_ws) {
+			int index_stream = 0;
+			int index_pattern = 0;
+			gboolean skip_row = FALSE;
+
+			while (index_stream < resource_length) {
+				/* Skip insignificant white space ("WS" in the spec) */
+				if (type_row->pattern[index_pattern] == ' ') {
+					if (resource[index_stream] == '\x09' ||
+					    resource[index_stream] == '\x0a' ||
+					    resource[index_stream] == '\x0c' ||
+					    resource[index_stream] == '\x0d' ||
+					    resource[index_stream] == '\x20')
+						index_stream++;
+					else
+						index_pattern++;
+				} else {
+					if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
+						skip_row = TRUE;
+						break;
+					}
+					index_pattern++;
+					index_stream++;
+				}
+			}
+
+			if (skip_row)
+				continue;
+
+			if (index_pattern > type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		} else {
+			int j;
+
+			if (resource_length < type_row->pattern_length)
+				continue;
+
+			for (j = 0; j < type_row->pattern_length; j++) {
+				if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
+					break;
+			}
+
+			/* This means our comparison above matched completely */
+			if (j == type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		}
+	}
+
+	return g_strdup ("application/octet-stream");
+}
+
 static char*
-sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboolean *uncertain)
+sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
 {
 	SoupURI *uri;
 	char *uri_path;
 	char *content_type;
 	char *mime_type;
+	gboolean uncertain;
 
 	uri = soup_message_get_uri (msg);
 	uri_path = soup_uri_to_string (uri, TRUE);
 
-	content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, uncertain);
+	content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain);
 	mime_type = g_content_type_get_mime_type (content_type);
 
 	g_free (uri_path);
@@ -100,6 +324,27 @@ sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboole
 	return mime_type;
 }
 
+static char*
+sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	const char *content_type;
+
+	content_type = soup_message_headers_get_one (msg->response_headers, "Content-Type");
+
+	if (content_type == NULL)
+		return sniff_gio (sniffer, msg, buffer);
+
+	/* If we got text/plain, use text_or_binary */
+	if (g_str_equal (content_type, "text/plain") ||
+	     g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
+	     g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
+	     g_str_equal (content_type, "text/plain; charset=UTF-8")) {
+		return sniff_text_or_binary (sniffer, msg, buffer);
+	}
+
+	return sniff_gio (sniffer, msg, buffer);
+}
+
 static gsize
 get_buffer_size (SoupContentSniffer *sniffer)
 {
diff --git a/libsoup/soup-content-sniffer.h b/libsoup/soup-content-sniffer.h
index 77123ed..ad2116a 100644
--- a/libsoup/soup-content-sniffer.h
+++ b/libsoup/soup-content-sniffer.h
@@ -31,8 +31,7 @@ typedef struct {
 
 	char* (*sniff)              (SoupContentSniffer *sniffer,
 				     SoupMessage *msg,
-				     SoupBuffer *buffer,
-				     gboolean *uncertain);
+				     SoupBuffer *buffer);
 	gsize (*get_buffer_size)    (SoupContentSniffer *sniffer);
 
 	/* Padding for future expansion */
diff --git a/libsoup/soup-message-io.c b/libsoup/soup-message-io.c
index 8c29acc..48e2fb6 100644
--- a/libsoup/soup-message-io.c
+++ b/libsoup/soup-message-io.c
@@ -224,18 +224,15 @@ io_sniff_content (SoupMessage *msg)
 	SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
 	SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (priv->sniffer);
 	char *sniffed_mime_type;
-	gboolean uncertain;
 
 	io->delay_got_chunks = FALSE;
 
-	sniffed_mime_type = content_sniffer_class->sniff (priv->sniffer, msg, sniffed_buffer, &uncertain);
-	if (!uncertain) {
-		SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
-		soup_message_content_sniffed (msg, sniffed_mime_type);
-		g_free (sniffed_mime_type);
-		sniffed_mime_type = NULL;
-		SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
-	}
+	sniffed_mime_type = content_sniffer_class->sniff (priv->sniffer, msg, sniffed_buffer);
+	SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+	soup_message_content_sniffed (msg, sniffed_mime_type);
+	g_free (sniffed_mime_type);
+	sniffed_mime_type = NULL;
+	SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
 	g_free (sniffed_mime_type);
 
 	SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
diff --git a/tests/resources/home.gif b/tests/resources/home.gif
new file mode 100644
index 0000000..55e1d59
Binary files /dev/null and b/tests/resources/home.gif differ
diff --git a/tests/resources/test.html b/tests/resources/test.html
new file mode 100644
index 0000000..5a6cc0c
--- /dev/null
+++ b/tests/resources/test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title></title>
+</head>
+<body>
+<h1>GNOME!</h1>
+</body>
+</html>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 7adf202..040ac89 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -47,6 +47,29 @@ server_callback (SoupServer *server, SoupMessage *msg,
 					   contents,
 					   length);
 	}
+
+	if (g_str_has_prefix (path, "/text_or_binary/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/plain",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
 }
 
 static gboolean
@@ -186,6 +209,40 @@ do_signals_test (gboolean should_content_sniff,
 	g_main_loop_unref (loop);
 }
 
+static void
+sniffing_content_sniffed (SoupMessage *msg, char *content_type, gpointer data)
+{
+	char *expected_type = (char*)data;
+
+	if (strcmp (content_type, expected_type)) {
+		debug_printf (1, "  sniffing failed! expected %s, got %s\n",
+			      expected_type, content_type);
+		errors++;
+	}
+}
+
+static void
+test_sniffing (const char *path, const char *expected_type)
+{
+	SoupURI *uri = soup_uri_new_with_base (base_uri, path);
+	SoupMessage *msg = soup_message_new_from_uri ("GET", uri);
+	GMainLoop *loop = g_main_loop_new (NULL, TRUE);
+
+	g_object_connect (msg,
+			  "signal::content_sniffed", sniffing_content_sniffed, expected_type,
+			  NULL);
+
+	g_object_ref (msg);
+
+	soup_session_queue_message (session, msg, finished, loop);
+
+	g_main_loop_run (loop);
+
+	soup_uri_free (uri);
+	g_object_unref (msg);
+	g_main_loop_unref (loop);
+}
+
 int
 main (int argc, char **argv)
 {
@@ -220,6 +277,23 @@ main (int argc, char **argv)
 	do_signals_test (TRUE, TRUE, TRUE);
 	do_signals_test (TRUE, TRUE, FALSE);
 
+	/* Test the text_or_binary sniffing path */
+
+	/* GIF is a 'safe' type */
+	test_sniffing ("/text_or_binary/home.gif", "image/gif");
+
+	/* With our current code, no sniffing is done using GIO, so
+	 * the mbox will be identified as text/plain; should we change
+	 * this?
+	 */
+	test_sniffing ("/text_or_binary/mbox", "text/plain");
+
+	/* HTML is considered unsafe for this algorithm, since it is
+	 * scriptable, so going from text/plain to text/html is
+	 * considered 'privilege escalation'
+	 */
+	test_sniffing ("/text_or_binary/test.html", "text/plain");
+
 	soup_uri_free (base_uri);
 
 	test_cleanup ();
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]