[libsoup/content-sniffing] Initial implementation of the Text or Binary algorithm
- From: Gustavo Noronha Silva <gns src gnome org>
- To: svn-commits-list gnome org
- Subject: [libsoup/content-sniffing] Initial implementation of the Text or Binary algorithm
- Date: Wed, 17 Jun 2009 23:31:14 -0400 (EDT)
commit 8940fedc741f0048d9becaeacf38b80799306224
Author: Gustavo Noronha Silva <gns gnome org>
Date: Thu Jun 18 00:27:03 2009 -0300
Initial implementation of the Text or Binary algorithm
This is a very simply written implementation of the HTML5 algorithm
that sniffs content when the server says it is 'text/plain'. It
detects if the content seems to be binary using a simple test table,
and avoids privilege escalation from text/plain to types known to be
scriptable. Tests included.
libsoup/soup-content-sniffer.c | 251 +++++++++++++++++++++++++++++++++++++++-
libsoup/soup-content-sniffer.h | 3 +-
libsoup/soup-message-io.c | 15 +--
tests/resources/home.gif | Bin 0 -> 995 bytes
tests/resources/test.html | 10 ++
tests/sniffing-test.c | 74 ++++++++++++
6 files changed, 339 insertions(+), 14 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 5ce0644..ecb925c 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -33,7 +33,7 @@
* Since: 2.27.3
**/
-static char* sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboolean *uncertain);
+static char* sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer);
static gsize get_buffer_size (SoupContentSniffer *sniffer);
static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
@@ -80,18 +80,242 @@ soup_content_sniffer_new ()
return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
}
+/* This table is based on the HTML5 spec;
+ * See 2.7.4 Content-Type sniffing: unknown type
+ */
+struct _type_info { /* one row of the signature table used for sniffing */
+ const gboolean has_ws; /* if there is insignificant
+ * whitespace in the pattern */
+ const char *mask; /* ANDed with each input byte before it is compared */
+ const char *pattern; /* expected bytes; ' ' marks skippable WS in has_ws rows */
+ const guint pattern_length; /* bytes to match, not counting the WS marker */
+ const char *sniffed_type; /* MIME type returned when the row matches */
+ const gboolean scriptable; /* such rows are skipped by sniff_text_or_binary */
+};
+
+static struct _type_info types_table[] = { /* scanned in order; the first matching row wins */
+ { FALSE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", /* 0xDF drops bit 0x20, so ASCII letters match in either case */
+ "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C", /* "<!DOCTYPE HTML" */
+ 14,
+ "text/html",
+ TRUE },
+
+ { TRUE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF",
+ " \x3C\x48\x54\x4D\x4C", /* WS "<HTML" */
+ 5,
+ "text/html",
+ TRUE },
+
+ { TRUE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF",
+ " \x3C\x48\x45\x41\x44", /* WS "<HEAD" */
+ 5,
+ "text/html",
+ TRUE },
+
+ { TRUE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+ " \x3C\x53\x43\x52\x49\x50\x54", /* WS "<SCRIPT" */
+ 7,
+ "text/html",
+ TRUE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF",
+ "\x25\x50\x44\x46\x2D", /* "%PDF-" */
+ 5,
+ "application/pdf",
+ TRUE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D", /* "%!PS-Adobe-" */
+ 11,
+ "application/postscript",
+ FALSE },
+
+ /* BOMs go here */
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x47\x49\x46\x38\x37\x61", /* "GIF87a" */
+ 6,
+ "image/gif",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x47\x49\x46\x38\x39\x61", /* "GIF89a" */
+ 6,
+ "image/gif",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", /* 0x89 "PNG" CR LF 0x1A LF */
+ 8,
+ "image/png",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF",
+ "\xFF\xD8\xFF",
+ 3,
+ "image/jpeg",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF",
+ "\x42\x4D", /* "BM" */
+ 2,
+ "image/bmp",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF",
+ "\x00\x00\x01\x00",
+ 4,
+ "image/vnd.microsoft.icon",
+ FALSE },
+
+ /* Marks the end: the lookup loop stops at the first NULL pattern */
+ { FALSE,
+ NULL,
+ NULL,
+ 0,
+ NULL,
+ FALSE },
+};
+
+/* Whether a given byte looks like it might be part of binary content;
+ * indexed by the byte's unsigned value, a nonzero entry meaning "binary".
+ * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
+ * which is BSD-licensed */
+static char kByteLooksBinary[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF
+};
+
+
+/* HTML5: 2.7.3 Content-Type sniffing: text or binary
+ *
+ * Inspects at most the first 512 bytes of @buffer and returns a newly
+ * allocated MIME type string (release it with g_free()):
+ * "text/plain" if a BOM is found or no byte looks binary, otherwise
+ * the first non-scriptable types_table match, otherwise
+ * "application/octet-stream".
+ */
+static char*
+sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	/* Bytes are read as unsigned: where plain char is signed, a byte
+	 * can never compare equal to constants such as 0xfe or 0xff.
+	 */
+	const guchar *resource = (const guchar *)buffer->data;
+	int resource_length = MIN(512, buffer->length);
+	gboolean looks_binary = FALSE;
+	int i;
+
+	/* Detecting UTF-16BE, UTF-16LE, and UTF-8 BOMs means it's text/plain */
+	if (resource_length >= 4) {
+		if ((resource[0] == 0xfe && resource[1] == 0xff) ||
+		    (resource[0] == 0xff && resource[1] == 0xfe) ||
+		    (resource[0] == 0xef && resource[1] == 0xbb && resource[2] == 0xbf))
+			return g_strdup ("text/plain");
+	}
+
+	/* Look to see if any of the first n bytes looks binary */
+	for (i = 0; i < resource_length; i++) {
+		if (kByteLooksBinary[resource[i]]) {
+			looks_binary = TRUE;
+			break;
+		}
+	}
+
+	if (!looks_binary)
+		return g_strdup ("text/plain");
+
+	/* HTML5: 2.7.4 Content-Type sniffing: unknown type
+	 *
+	 * This will probably live in its own function, since it is
+	 * used by other parts of the algorithm
+	 */
+	for (i = 0; types_table[i].pattern != NULL ; i++) {
+		struct _type_info *type_row = &(types_table[i]);
+
+		/* Going from text/plain to a scriptable type would be a
+		 * privilege escalation, so those rows are never considered.
+		 */
+		if (type_row->scriptable)
+			continue;
+
+		if (type_row->has_ws) {
+			int index_stream = 0;
+			guint index_pattern = 0;
+			gboolean skip_row = FALSE;
+
+			/* The pattern holds pattern_length bytes plus the WS
+			 * marker; stop once it is used up so that mask and
+			 * pattern are never read past their last byte.
+			 */
+			while (index_stream < resource_length &&
+			       index_pattern <= type_row->pattern_length) {
+				/* Skip insignificant white space ("WS" in the spec) */
+				if (type_row->pattern[index_pattern] == ' ') {
+					if (resource[index_stream] == '\x09' ||
+					    resource[index_stream] == '\x0a' ||
+					    resource[index_stream] == '\x0c' ||
+					    resource[index_stream] == '\x0d' ||
+					    resource[index_stream] == '\x20')
+						index_stream++;
+					else
+						index_pattern++;
+				} else {
+					guchar mask = (guchar)type_row->mask[index_pattern];
+					guchar wanted = (guchar)type_row->pattern[index_pattern];
+
+					if ((mask & resource[index_stream]) != wanted) {
+						skip_row = TRUE;
+						break;
+					}
+					index_pattern++;
+					index_stream++;
+				}
+			}
+
+			if (skip_row)
+				continue;
+
+			/* Only a fully consumed pattern counts as a match */
+			if (index_pattern > type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		} else {
+			guint j;
+
+			if (resource_length < type_row->pattern_length)
+				continue;
+
+			for (j = 0; j < type_row->pattern_length; j++) {
+				guchar mask = (guchar)type_row->mask[j];
+				guchar wanted = (guchar)type_row->pattern[j];
+
+				if ((mask & resource[j]) != wanted)
+					break;
+			}
+
+			/* This means our comparison above matched completely */
+			if (j == type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		}
+	}
+
+	return g_strdup ("application/octet-stream");
+}
+
static char*
-sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboolean *uncertain)
+sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
{
SoupURI *uri;
char *uri_path;
char *content_type;
char *mime_type;
+ gboolean uncertain;
uri = soup_message_get_uri (msg);
uri_path = soup_uri_to_string (uri, TRUE);
- content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, uncertain);
+ content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain);
mime_type = g_content_type_get_mime_type (content_type);
g_free (uri_path);
@@ -100,6 +324,27 @@ sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboole
return mime_type;
}
+/* Sends the four exact text/plain Content-Type values through
+ * sniff_text_or_binary; a missing or different header goes to sniff_gio.
+ */
+static char*
+sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	static const char *const plain_text_values[] = {
+		"text/plain",
+		"text/plain; charset=ISO-8859-1",
+		"text/plain; charset=iso-8859-1",
+		"text/plain; charset=UTF-8",
+		NULL
+	};
+	const char *header_value = soup_message_headers_get_one (msg->response_headers, "Content-Type");
+	int n;
+
+	for (n = 0; header_value != NULL && plain_text_values[n] != NULL; n++) {
+		if (g_str_equal (header_value, plain_text_values[n]))
+			return sniff_text_or_binary (sniffer, msg, buffer);
+	}
+	return sniff_gio (sniffer, msg, buffer);
+}
+
static gsize
get_buffer_size (SoupContentSniffer *sniffer)
{
diff --git a/libsoup/soup-content-sniffer.h b/libsoup/soup-content-sniffer.h
index 77123ed..ad2116a 100644
--- a/libsoup/soup-content-sniffer.h
+++ b/libsoup/soup-content-sniffer.h
@@ -31,8 +31,7 @@ typedef struct {
char* (*sniff) (SoupContentSniffer *sniffer,
SoupMessage *msg,
- SoupBuffer *buffer,
- gboolean *uncertain);
+ SoupBuffer *buffer);
gsize (*get_buffer_size) (SoupContentSniffer *sniffer);
/* Padding for future expansion */
diff --git a/libsoup/soup-message-io.c b/libsoup/soup-message-io.c
index 8c29acc..48e2fb6 100644
--- a/libsoup/soup-message-io.c
+++ b/libsoup/soup-message-io.c
@@ -224,18 +224,15 @@ io_sniff_content (SoupMessage *msg)
SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (priv->sniffer);
char *sniffed_mime_type;
- gboolean uncertain;
io->delay_got_chunks = FALSE;
- sniffed_mime_type = content_sniffer_class->sniff (priv->sniffer, msg, sniffed_buffer, &uncertain);
- if (!uncertain) {
- SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
- soup_message_content_sniffed (msg, sniffed_mime_type);
- g_free (sniffed_mime_type);
- sniffed_mime_type = NULL;
- SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
- }
+ sniffed_mime_type = content_sniffer_class->sniff (priv->sniffer, msg, sniffed_buffer);
+ SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+ soup_message_content_sniffed (msg, sniffed_mime_type);
+ g_free (sniffed_mime_type);
+ sniffed_mime_type = NULL;
+ SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
g_free (sniffed_mime_type);
SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
diff --git a/tests/resources/home.gif b/tests/resources/home.gif
new file mode 100644
index 0000000..55e1d59
Binary files /dev/null and b/tests/resources/home.gif differ
diff --git a/tests/resources/test.html b/tests/resources/test.html
new file mode 100644
index 0000000..5a6cc0c
--- /dev/null
+++ b/tests/resources/test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title></title>
+</head>
+<body>
+<h1>GNOME!</h1>
+</body>
+</html>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 7adf202..040ac89 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -47,6 +47,29 @@ server_callback (SoupServer *server, SoupMessage *msg,
contents,
length);
}
+
+ if (g_str_has_prefix (path, "/text_or_binary/")) {
+ char *base_name = g_path_get_basename (path);
+ char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+ g_file_get_contents (file_name,
+ &contents, &length,
+ &error);
+
+ g_free (base_name);
+ g_free (file_name);
+
+ if (error) {
+ g_error ("%s", error->message);
+ g_error_free (error);
+ exit (1);
+ }
+
+ soup_message_set_response (msg, "text/plain",
+ SOUP_MEMORY_TAKE,
+ contents,
+ length);
+ }
}
static gboolean
@@ -186,6 +209,40 @@ do_signals_test (gboolean should_content_sniff,
g_main_loop_unref (loop);
}
+/* "content_sniffed" handler: @data is the expected type; a mismatch
+ * is logged and added to the error count.
+ */
+static void
+sniffing_content_sniffed (SoupMessage *msg, char *content_type, gpointer data)
+{
+	char *wanted = (char*)data;
+
+	if (strcmp (content_type, wanted) == 0)
+		return;
+
+	debug_printf (1, " sniffing failed! expected %s, got %s\n",
+		      wanted, content_type);
+	errors++;
+}
+
+/* Queues a GET for @path and runs a main loop, with
+ * sniffing_content_sniffed connected to compare against @expected_type.
+ */
+static void
+test_sniffing (const char *path, const char *expected_type)
+{
+	SoupURI *target;
+	SoupMessage *request;
+	GMainLoop *main_loop;
+
+	target = soup_uri_new_with_base (base_uri, path);
+	request = soup_message_new_from_uri ("GET", target);
+	main_loop = g_main_loop_new (NULL, TRUE);
+
+	g_object_connect (request,
+			  "signal::content_sniffed", sniffing_content_sniffed, expected_type,
+			  NULL);
+	g_object_ref (request);
+
+	soup_session_queue_message (session, request, finished, main_loop);
+	g_main_loop_run (main_loop);
+
+	soup_uri_free (target);
+	g_object_unref (request);
+	g_main_loop_unref (main_loop);
+}
+
int
main (int argc, char **argv)
{
@@ -220,6 +277,23 @@ main (int argc, char **argv)
do_signals_test (TRUE, TRUE, TRUE);
do_signals_test (TRUE, TRUE, FALSE);
+ /* Test the text_or_binary sniffing path */
+
+ /* GIF is a 'safe' type */
+ test_sniffing ("/text_or_binary/home.gif", "image/gif");
+
+ /* With our current code, no sniffing is done using GIO, so
+ * the mbox will be identified as text/plain; should we change
+ * this?
+ */
+ test_sniffing ("/text_or_binary/mbox", "text/plain");
+
+ /* HTML is considered unsafe for this algorithm, since it is
+ * scriptable, so going from text/plain to text/html is
+ * considered 'privilege escalation'
+ */
+ test_sniffing ("/text_or_binary/test.html", "text/plain");
+
soup_uri_free (base_uri);
test_cleanup ();
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]