[libsoup] Implement content sniffing



commit 3c9f3cdffc32126700f25d8a0c55f68b6f587bde
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Wed Jun 17 20:53:17 2009 -0300

    Implement content sniffing
    
    The implementation is based on the draft spec on Content-Type
    Processing Model (draft-abarth-mime-sniff-01). It is a spinoff from
    the HTML5 spec.
    
    Soup now provides a SoupContentSniffer session feature, which hooks
    into the message I/O, and delays emissions of the got-chunk signal to
    be able to figure out the Content-Type of messages from the actual
    content received, in some cases.
    
    GIO is also used to sniff content, whenever the spec allows further
    sniffing.
    
    http://bugzilla.gnome.org/show_bug.cgi?id=572589

 .gitignore                     |    1 +
 libsoup/Makefile.am            |    2 +
 libsoup/soup-content-sniffer.c |  570 ++++++++++++++++++++++++++++++++++++++++
 libsoup/soup-content-sniffer.h |   57 ++++
 libsoup/soup-marshal.list      |    1 +
 libsoup/soup-message-headers.c |   19 ++-
 libsoup/soup-message-io.c      |  128 +++++++++-
 libsoup/soup-message-private.h |    5 +
 libsoup/soup-message.c         |   57 ++++
 libsoup/soup-message.h         |    1 +
 libsoup/soup.h                 |    1 +
 tests/Makefile.am              |    3 +
 tests/resources/atom.xml       |   35 +++
 tests/resources/home.gif       |  Bin 0 -> 995 bytes
 tests/resources/mbox           |   16 ++
 tests/resources/rss20.xml      |   26 ++
 tests/resources/test.html      |   10 +
 tests/sniffing-test.c          |  429 ++++++++++++++++++++++++++++++
 18 files changed, 1356 insertions(+), 5 deletions(-)
---
diff --git a/.gitignore b/.gitignore
index b0cd3a4..1bb227d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,6 +69,7 @@ tests/redirect-test
 tests/server-auth-test
 tests/simple-httpd
 tests/simple-proxy
+tests/sniffing-test
 tests/ssl-test
 tests/streaming-test
 tests/timeout-test
diff --git a/libsoup/Makefile.am b/libsoup/Makefile.am
index 949f243..2d3a6ea 100644
--- a/libsoup/Makefile.am
+++ b/libsoup/Makefile.am
@@ -55,6 +55,7 @@ soup_headers =			\
 	soup-auth-domain.h	\
 	soup-auth-domain-basic.h  \
 	soup-auth-domain-digest.h \
+	soup-content-sniffer.h  \
 	soup-cookie.h		\
 	soup-cookie-jar.h	\
 	soup-cookie-jar-text.h	\
@@ -119,6 +120,7 @@ libsoup_2_4_la_SOURCES =		\
 	soup-auth-manager-ntlm.c	\
 	soup-connection.h		\
 	soup-connection.c		\
+	soup-content-sniffer.c		\
 	soup-cookie.c			\
 	soup-cookie-jar.c		\
 	soup-cookie-jar-text.c		\
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
new file mode 100644
index 0000000..5fdee5c
--- /dev/null
+++ b/libsoup/soup-content-sniffer.c
@@ -0,0 +1,570 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * soup-content-sniffer.c
+ *
+ * Copyright (C) 2009 Gustavo Noronha Silva.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include <gio/gio.h>
+
+#include "soup-content-sniffer.h"
+#include "soup-enum-types.h"
+#include "soup-message.h"
+#include "soup-message-private.h"
+#include "soup-session-feature.h"
+#include "soup-uri.h"
+
+/**
+ * SECTION:soup-content-sniffer
+ * @short_description: Content sniffing for #SoupSession
+ *
+ * A #SoupContentSniffer tries to detect the actual content type of
+ * the files that are being downloaded by looking at some of the data
+ * before the #SoupMessage emits its first #SoupMessage::got-chunk signal.
+ * #SoupContentSniffer implements #SoupSessionFeature, so you can add
+ * content sniffing to a session with soup_session_add_feature() or
+ * soup_session_add_feature_by_type().
+ *
+ * Since: 2.27.3
+ **/
+
+static char *sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params);
+static gsize get_buffer_size (SoupContentSniffer *sniffer);
+
+static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
+
+static void request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
+static void request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
+
+G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
+			 G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
+						soup_content_sniffer_session_feature_init))
+
+static void
+soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
+{
+	/* Intentionally empty: no per-instance state is set up here. */
+}
+
+static void
+soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
+{
+	/* Install this file's default implementations of the two
+	 * virtual methods declared in SoupContentSnifferClass.
+	 */
+	content_sniffer_class->get_buffer_size = get_buffer_size;
+	content_sniffer_class->sniff = sniff;
+}
+
+static void
+soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
+					   gpointer interface_data)
+{
+	/* Hook the sniffer into message queueing: each queued message
+	 * is handed to request_queued(), each unqueued one to
+	 * request_unqueued(). @interface_data is not used.
+	 */
+	feature_interface->request_unqueued = request_unqueued;
+	feature_interface->request_queued = request_queued;
+}
+
+/**
+ * soup_content_sniffer_new:
+ *
+ * Creates a new #SoupContentSniffer.
+ *
+ * Returns: a new #SoupContentSniffer
+ *
+ * Since: 2.27.3
+ **/
+SoupContentSniffer *
+soup_content_sniffer_new (void)
+{
+	/* (void) matches the prototype declared in soup-content-sniffer.h */
+	return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
+}
+
+/**
+ * soup_content_sniffer_sniff:
+ * @sniffer: a #SoupContentSniffer
+ * @msg: the #SoupMessage whose content is being sniffed
+ * @buffer: a #SoupBuffer holding the data to inspect; must not be %NULL
+ * @params: passed through unchanged to the class's sniff implementation
+ *
+ * Calls the sniff virtual method of @sniffer's class with the given
+ * arguments and returns its result.
+ *
+ * Returns: the value returned by the sniff implementation (the
+ * default implementation in this file returns a newly allocated
+ * string), or %NULL if one of the argument checks fails.
+ **/
+char *
+soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
+			    SoupMessage *msg, SoupBuffer *buffer,
+			    GHashTable **params)
+{
+	g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
+	g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
+	g_return_val_if_fail (buffer != NULL, NULL);
+
+	/* Dispatch through the class so subclasses can override sniffing */
+	return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
+}
+
+/* This table is based on the HTML5 spec;
+ * See 2.7.4 Content-Type sniffing: unknown type
+ */
+typedef struct {
+	/* @has_ws is TRUE if @pattern contains "generic" whitespace */
+	gboolean    has_ws;
+	/* Each resource byte is ANDed with the corresponding @mask byte
+	 * before being compared with @pattern */
+	const char *mask;
+	/* Expected bytes; in @has_ws rows a ' ' stands for a run of
+	 * zero or more whitespace bytes in the resource */
+	const char *pattern;
+	/* Length used when matching; in the @has_ws rows of types_table
+	 * this is one less than the number of bytes in @pattern */
+	guint       pattern_length;
+	/* MIME type reported when the row matches */
+	const char *sniffed_type;
+	/* Rows with @scriptable set are skipped on the text-or-binary path */
+	gboolean    scriptable;
+} SoupContentSnifferPattern;
+
+/* Signature table consulted by sniff_unknown() and sniff_images().
+ * Rows are tried in order; the first match wins.
+ */
+static SoupContentSnifferPattern types_table[] = {
+	/* "<!DOCTYPE HTML", letters matched case-insensitively */
+	{ FALSE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
+	  "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
+	  14,
+	  "text/html",
+	  TRUE },
+
+	/* optional whitespace, then "<HTML" */
+	{ TRUE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF",
+	  " \x3C\x48\x54\x4D\x4C",
+	  5,
+	  "text/html",
+	  TRUE },
+
+	/* optional whitespace, then "<HEAD" */
+	{ TRUE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF",
+	  " \x3C\x48\x45\x41\x44",
+	  5,
+	  "text/html",
+	  TRUE },
+
+	/* optional whitespace, then "<SCRIPT" */
+	{ TRUE,
+	  "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+	  " \x3C\x53\x43\x52\x49\x50\x54",
+	  7,
+	  "text/html",
+	  TRUE },
+
+	/* "%PDF-" */
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF",
+	  "\x25\x50\x44\x46\x2D",
+	  5,
+	  "application/pdf",
+	  TRUE },
+
+	/* "%!PS-Adobe-" */
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
+	  11,
+	  "application/postscript",
+	  FALSE },
+
+	/* UTF-16BE BOM (FE FF); the next two bytes are ignored */
+	{ FALSE,
+	  "\xFF\xFF\x00\x00",
+	  "\xFE\xFF\x00\x00",
+	  4,
+	  "text/plain",
+	  FALSE },
+
+	/* UTF-16LE BOM (FF FE); the next two bytes are ignored.
+	 * This row used to test for FF FF, which is not a BOM and
+	 * disagreed with the check in sniff_text_or_binary().
+	 */
+	{ FALSE,
+	  "\xFF\xFF\x00\x00",
+	  "\xFF\xFE\x00\x00",
+	  4,
+	  "text/plain",
+	  FALSE },
+
+	/* UTF-8 BOM (EF BB BF); the next byte is ignored */
+	{ FALSE,
+	  "\xFF\xFF\xFF\x00",
+	  "\xEF\xBB\xBF\x00",
+	  4,
+	  "text/plain",
+	  FALSE },
+
+	/* "GIF87a" */
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x47\x49\x46\x38\x37\x61",
+	  6,
+	  "image/gif",
+	  FALSE },
+
+	/* "GIF89a" */
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x47\x49\x46\x38\x39\x61",
+	  6,
+	  "image/gif",
+	  FALSE },
+
+	/* PNG signature */
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+	  "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
+	  8,
+	  "image/png",
+	  FALSE },
+
+	/* JPEG start-of-image marker */
+	{ FALSE,
+	  "\xFF\xFF\xFF",
+	  "\xFF\xD8\xFF",
+	  3,
+	  "image/jpeg",
+	  FALSE },
+
+	/* "BM" */
+	{ FALSE,
+	  "\xFF\xFF",
+	  "\x42\x4D",
+	  2,
+	  "image/bmp",
+	  FALSE },
+
+	/* 00 00 01 00 */
+	{ FALSE,
+	  "\xFF\xFF\xFF\xFF",
+	  "\x00\x00\x01\x00",
+	  4,
+	  "image/vnd.microsoft.icon",
+	  FALSE }
+};
+
+/* Whether a given byte looks like it might be part of binary content.
+ * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
+ * which is BSD-licensed
+ *
+ * The table has one entry per possible byte value and is indexed by
+ * the byte converted to unsigned char; a non-zero entry marks the
+ * byte as binary-looking. Only control characters below 0x20 are
+ * flagged; TAB, LF, FF, CR and ESC (0x09, 0x0A, 0x0C, 0x0D, 0x1B)
+ * are not.
+ */
+static char byte_looks_binary[] = {
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  /* 0x00 - 0x0F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  /* 0x10 - 0x1F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x20 - 0x2F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x30 - 0x3F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40 - 0x4F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x50 - 0x5F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60 - 0x6F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x70 - 0x7F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80 - 0x8F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90 - 0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA0 - 0xAF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB0 - 0xBF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xC0 - 0xCF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xD0 - 0xDF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xE0 - 0xEF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xF0 - 0xFF */
+};
+
+/* Platform sniffing: asks GIO to guess the content type of @buffer,
+ * giving it the string form of @msg's URI as a file name hint, and
+ * returns the matching MIME type. The caller frees the result with
+ * g_free().
+ */
+static char *
+sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	gboolean guess_uncertain;
+	char *name_hint;
+	char *gio_type;
+	char *result;
+
+	name_hint = soup_uri_to_string (soup_message_get_uri (msg), TRUE);
+	gio_type = g_content_type_guess (name_hint,
+					 (const guchar*)buffer->data,
+					 buffer->length,
+					 &guess_uncertain);
+	g_free (name_hint);
+
+	/* GIO content types are platform specific; convert to MIME */
+	result = g_content_type_get_mime_type (gio_type);
+	g_free (gio_type);
+
+	return result;
+}
+
+/* HTML5: 2.7.4 Content-Type sniffing: unknown type
+ *
+ * Matches the first bytes of @buffer (at most 512) against
+ * types_table and returns a newly allocated MIME type string. When
+ * no row matches, GIO is asked for a guess; if that yields nothing
+ * usable, "application/octet-stream" is returned.
+ *
+ * When @for_text_or_binary is TRUE the scriptable rows are ignored,
+ * and a GIO guess naming one of the scriptable types is discarded.
+ */
+static char*
+sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
+	       SoupBuffer *buffer, gboolean for_text_or_binary)
+{
+	const char *resource = buffer->data;
+	int resource_length = MIN (512, buffer->length);
+	char *gio_guess;
+	int i;
+
+	for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
+		SoupContentSnifferPattern *type_row = &(types_table[i]);
+
+		/* The scriptable types should be skipped for the text
+		 * or binary path, but considered for other paths */
+		if (for_text_or_binary && type_row->scriptable)
+			continue;
+
+		if (type_row->has_ws) {
+			int index_stream = 0;
+			int index_pattern = 0;
+			gboolean skip_row = FALSE;
+
+			/* Stop as soon as the whole pattern has been
+			 * consumed: for these rows pattern_length is
+			 * the index of the last pattern byte, so going
+			 * any further would read past the end of the
+			 * pattern and mask strings.
+			 */
+			while (index_stream < resource_length &&
+			       index_pattern <= type_row->pattern_length) {
+				/* Skip insignificant white space ("WS" in the spec) */
+				if (type_row->pattern[index_pattern] == ' ') {
+					if (resource[index_stream] == '\x09' ||
+					    resource[index_stream] == '\x0a' ||
+					    resource[index_stream] == '\x0c' ||
+					    resource[index_stream] == '\x0d' ||
+					    resource[index_stream] == '\x20')
+						index_stream++;
+					else
+						index_pattern++;
+				} else {
+					if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
+						skip_row = TRUE;
+						break;
+					}
+					index_pattern++;
+					index_stream++;
+				}
+			}
+
+			if (skip_row)
+				continue;
+
+			/* Every pattern byte was matched */
+			if (index_pattern > type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		} else {
+			int j;
+
+			if (resource_length < type_row->pattern_length)
+				continue;
+
+			for (j = 0; j < type_row->pattern_length; j++) {
+				if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
+					break;
+			}
+
+			/* This means our comparison above matched completely */
+			if (j == type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		}
+	}
+
+	/* The spec allows us to use platform sniffing to find out
+	 * about other types that are not covered, but we need to be
+	 * careful to not escalate privileges, if on text or binary.
+	 */
+	gio_guess = sniff_gio (sniffer, msg, buffer);
+
+	/* GIO may not know a MIME type for its guess, in which case
+	 * there is nothing to compare against the table.
+	 */
+	if (for_text_or_binary && gio_guess != NULL) {
+		for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
+			SoupContentSnifferPattern *type_row = &(types_table[i]);
+
+			if (!g_ascii_strcasecmp (type_row->sniffed_type, gio_guess) &&
+			    type_row->scriptable) {
+				g_free (gio_guess);
+				gio_guess = NULL;
+				break;
+			}
+		}
+	}
+
+	if (gio_guess)
+		return gio_guess;
+
+	return g_strdup ("application/octet-stream");
+}
+
+/* HTML5: 2.7.3 Content-Type sniffing: text or binary
+ *
+ * Returns a newly allocated MIME type string: "text/plain" when the
+ * data starts with a UTF-16BE, UTF-16LE or UTF-8 byte order mark, or
+ * when none of the inspected bytes (at most the first 512) is
+ * flagged in byte_looks_binary; otherwise whatever sniff_unknown()
+ * returns on its text-or-binary path.
+ */
+static char*
+sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg,
+		      SoupBuffer *buffer)
+{
+	/* The bytes are compared with values above 0x7F and used as
+	 * table indexes, so they must be read as unsigned: where plain
+	 * char is signed, a char can never compare equal to 0xFE, 0xFF
+	 * or 0xEF and the BOM checks would never succeed.
+	 */
+	const guchar *resource = (const guchar *)buffer->data;
+	int resource_length = MIN (512, buffer->length);
+	gboolean looks_binary = FALSE;
+	int i;
+
+	/* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
+	if (resource_length >= 4) {
+		if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
+		    (resource[0] == 0xFF && resource[1] == 0xFE) ||
+		    (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
+			return g_strdup ("text/plain");
+	}
+
+	/* Look to see if any of the first n bytes looks binary */
+	for (i = 0; i < resource_length; i++) {
+		if (byte_looks_binary[resource[i]]) {
+			looks_binary = TRUE;
+			break;
+		}
+	}
+
+	if (!looks_binary)
+		return g_strdup ("text/plain");
+
+	return sniff_unknown (sniffer, msg, buffer, TRUE);
+}
+
+/* Image sniffing: if the data starts with the signature of one of
+ * the "image/" rows of types_table, returns a copy of that row's
+ * type; otherwise returns a copy of @content_type. The result is
+ * newly allocated.
+ */
+static char*
+sniff_images (SoupContentSniffer *sniffer, SoupMessage *msg,
+	      SoupBuffer *buffer, const char *content_type)
+{
+	const char *data = buffer->data;
+	int available = MIN (512, buffer->length);
+	int row;
+
+	for (row = 0; row < G_N_ELEMENTS (types_table); row++) {
+		SoupContentSnifferPattern *candidate = &(types_table[row]);
+
+		/* Only image rows whose signature fits in the data are
+		 * of interest. All of them use an all-\xFF mask, so a
+		 * plain memcmp is enough.
+		 */
+		if (!(available < candidate->pattern_length) &&
+		    g_str_has_prefix (candidate->sniffed_type, "image/") &&
+		    memcmp (candidate->pattern, data, candidate->pattern_length) == 0)
+			return g_strdup (candidate->sniffed_type);
+	}
+
+	return g_strdup (content_type);
+}
+
+/* Feed or HTML sniffing, for data labelled text/html.
+ *
+ * Skips an optional UTF-8 BOM, white space, comments ("<!--" to
+ * "-->"), other "<!" declarations and "<?" processing instructions,
+ * then looks at the first remaining tag. Returns a newly allocated
+ * string: "application/rss+xml" if that tag starts with "rss",
+ * "application/atom+xml" if it starts with "feed", and "text/html"
+ * in every other case, including when the inspected data (at most
+ * 512 bytes) runs out before a decision can be made.
+ */
+static char*
+sniff_feed_or_html (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	/* Read the bytes as unsigned so the BOM values above 0x7F can
+	 * compare equal where plain char is signed.
+	 */
+	const guchar *resource = (const guchar *)buffer->data;
+	int resource_length = MIN (512, buffer->length);
+	int pos = 0;
+
+	/* Skip a leading UTF-8 BOM */
+	if (resource_length >= 3 &&
+	    resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
+		pos = 3;
+
+ look_for_tag:
+	/* Skip insignificant white space */
+	while (pos < resource_length &&
+	       ((resource[pos] == '\x09') ||
+		(resource[pos] == '\x20') ||
+		(resource[pos] == '\x0A') ||
+		(resource[pos] == '\x0D')))
+		pos++;
+
+	/* Out of data, or != < */
+	if (pos >= resource_length || resource[pos] != '\x3C')
+		return g_strdup ("text/html");
+
+	pos++;
+
+	/* Skipping comments: "!--" up to and including "-->" */
+	if (pos + 3 <= resource_length &&
+	    memcmp (resource + pos, "!--", 3) == 0) {
+		pos = pos + 3;
+
+		while (pos + 3 <= resource_length &&
+		       memcmp (resource + pos, "-->", 3) != 0)
+			pos++;
+
+		/* The comment is not closed within the data we have */
+		if (pos + 3 > resource_length)
+			return g_strdup ("text/html");
+
+		pos = pos + 3;
+
+		goto look_for_tag;
+	}
+
+	if (pos >= resource_length)
+		return g_strdup ("text/html");
+
+	/* == ! */
+	if (resource[pos] == '\x21') {
+		/* Skip to just past the closing '>' */
+		do {
+			pos++;
+		} while (pos < resource_length && resource[pos] != '\x3E');
+
+		if (pos >= resource_length)
+			return g_strdup ("text/html");
+
+		pos++;
+
+		goto look_for_tag;
+	} else if (resource[pos] == '\x3F') { /* ? */
+		/* Skip to just past the closing "?>" */
+		do {
+			pos++;
+		} while (pos + 2 <= resource_length &&
+			 memcmp (resource + pos, "?>", 2) != 0);
+
+		if (pos + 2 > resource_length)
+			return g_strdup ("text/html");
+
+		pos = pos + 2;
+
+		goto look_for_tag;
+	}
+
+	if (pos + 3 <= resource_length &&
+	    memcmp (resource + pos, "rss", 3) == 0)
+		return g_strdup ("application/rss+xml");
+
+	if (pos + 4 <= resource_length &&
+	    memcmp (resource + pos, "feed", 4) == 0)
+		return g_strdup ("application/atom+xml");
+
+	return g_strdup ("text/html");
+}
+
+/* Default implementation of the sniff virtual method. Picks a
+ * sniffing strategy from the response's Content-Type header and
+ * returns the resulting MIME type as a newly allocated string.
+ */
+static char*
+sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params)
+{
+	const char *official_type;
+	const char *raw_header;
+
+	official_type = soup_message_headers_get_content_type (msg->response_headers, params);
+	raw_header = soup_message_headers_get_one (msg->response_headers, "Content-Type");
+
+	/* Missing or placeholder types: sniff from scratch. The spec
+	 * requires these comparisons to be ASCII-case-insensitive.
+	 */
+	if (official_type == NULL)
+		return sniff_unknown (sniffer, msg, buffer, FALSE);
+
+	if (g_ascii_strcasecmp (official_type, "unknown/unknown") == 0 ||
+	    g_ascii_strcasecmp (official_type, "application/unknown") == 0 ||
+	    g_ascii_strcasecmp (official_type, "*/*") == 0)
+		return sniff_unknown (sniffer, msg, buffer, FALSE);
+
+	/* XML types are never sniffed */
+	if (g_str_has_suffix (official_type, "+xml") ||
+	    g_ascii_strcasecmp (official_type, "text/xml") == 0 ||
+	    g_ascii_strcasecmp (official_type, "application/xml") == 0)
+		return g_strdup (official_type);
+
+	/* 2.7.5 Content-Type sniffing: image
+	 *
+	 * The spec wants "image/svg+xml" to keep its official type
+	 * because it is an XML type. That case never gets here: the
+	 * "+xml" test above has already returned. Keep the two in
+	 * this order if you refactor.
+	 */
+	if (g_ascii_strncasecmp (official_type, "image/", 6) == 0)
+		return sniff_images (sniffer, msg, buffer, official_type);
+
+	/* Only these exact header values take the text-or-binary path */
+	if (g_str_equal (raw_header, "text/plain") ||
+	    g_str_equal (raw_header, "text/plain; charset=ISO-8859-1") ||
+	    g_str_equal (raw_header, "text/plain; charset=iso-8859-1") ||
+	    g_str_equal (raw_header, "text/plain; charset=UTF-8"))
+		return sniff_text_or_binary (sniffer, msg, buffer);
+
+	if (g_ascii_strcasecmp (official_type, "text/html") == 0)
+		return sniff_feed_or_html (sniffer, msg, buffer);
+
+	/* Anything else is trusted as is */
+	return g_strdup (official_type);
+}
+
+/* Default implementation of the get_buffer_size virtual method:
+ * always returns 512, the same cap the sniffing functions in this
+ * file apply to the amount of data they inspect.
+ */
+static gsize
+get_buffer_size (SoupContentSniffer *sniffer)
+{
+	return 512;
+}
+
+/* "got-headers" handler connected by request_queued(): flags @msg
+ * for content sniffing and records how many bytes the sniffer's
+ * class asks for.
+ */
+static void
+soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
+{
+	SoupMessagePrivate *msg_priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+
+	msg_priv->should_sniff_content = TRUE;
+	msg_priv->bytes_for_sniffing =
+		SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer);
+}
+
+/* SoupSessionFeature::request_queued implementation: stores a
+ * reference to the sniffer in @msg's private data and starts
+ * listening for the message's "got-headers" signal.
+ */
+static void
+request_queued (SoupSessionFeature *feature, SoupSession *session,
+		SoupMessage *msg)
+{
+	SoupMessagePrivate *msg_priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+
+	/* Reference dropped again in request_unqueued() */
+	msg_priv->sniffer = g_object_ref (feature);
+
+	g_signal_connect (msg, "got-headers",
+			  G_CALLBACK (soup_content_sniffer_got_headers_cb),
+			  feature);
+}
+
+/* SoupSessionFeature::request_unqueued implementation: undoes
+ * request_queued() by releasing the sniffer reference held by @msg
+ * and disconnecting the "got-headers" handler.
+ */
+static void
+request_unqueued (SoupSessionFeature *feature, SoupSession *session,
+		  SoupMessage *msg)
+{
+	SoupMessagePrivate *msg_priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+
+	g_object_unref (msg_priv->sniffer);
+	msg_priv->sniffer = NULL;
+
+	g_signal_handlers_disconnect_by_func (msg,
+					      soup_content_sniffer_got_headers_cb,
+					      feature);
+}
diff --git a/libsoup/soup-content-sniffer.h b/libsoup/soup-content-sniffer.h
new file mode 100644
index 0000000..a8aa915
--- /dev/null
+++ b/libsoup/soup-content-sniffer.h
@@ -0,0 +1,57 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009 Gustavo Noronha Silva.
+ */
+
+#ifndef SOUP_CONTENT_SNIFFER_H
+#define SOUP_CONTENT_SNIFFER_H 1
+
+#include <libsoup/soup-types.h>
+#include <libsoup/soup-message-body.h>
+
+G_BEGIN_DECLS
+
+/* Standard GObject type macros for SoupContentSniffer. Each macro
+ * must expand its own parameter: the class check takes @klass.
+ */
+#define SOUP_TYPE_CONTENT_SNIFFER            (soup_content_sniffer_get_type ())
+#define SOUP_CONTENT_SNIFFER(obj)            (G_TYPE_CHECK_INSTANCE_CAST ((obj), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSniffer))
+#define SOUP_CONTENT_SNIFFER_CLASS(klass)    (G_TYPE_CHECK_CLASS_CAST ((klass), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSnifferClass))
+#define SOUP_IS_CONTENT_SNIFFER(obj)         (G_TYPE_CHECK_INSTANCE_TYPE ((obj), SOUP_TYPE_CONTENT_SNIFFER))
+#define SOUP_IS_CONTENT_SNIFFER_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), SOUP_TYPE_CONTENT_SNIFFER))
+#define SOUP_CONTENT_SNIFFER_GET_CLASS(obj)  (G_TYPE_INSTANCE_GET_CLASS ((obj), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSnifferClass))
+
+typedef struct _SoupContentSnifferPrivate SoupContentSnifferPrivate;
+
+/* Instance structure of the content sniffer object. */
+typedef struct {
+	GObject parent;
+
+	/* Pointer to private data of the opaque type declared above */
+	SoupContentSnifferPrivate *priv;
+} SoupContentSniffer;
+
+/* Class structure: holds the two virtual methods that
+ * implementations of the sniffer provide.
+ */
+typedef struct {
+	GObjectClass parent_class;
+
+	/* Invoked by soup_content_sniffer_sniff() with the same
+	 * arguments; its return value is handed back to the caller.
+	 */
+	char* (*sniff)              (SoupContentSniffer *sniffer,
+				     SoupMessage *msg,
+				     SoupBuffer *buffer,
+				     GHashTable **params);
+	/* Returns a size in bytes for @sniffer */
+	gsize (*get_buffer_size)    (SoupContentSniffer *sniffer);
+
+	/* Padding for future expansion */
+	void (*_libsoup_reserved1) (void);
+	void (*_libsoup_reserved2) (void);
+	void (*_libsoup_reserved3) (void);
+	void (*_libsoup_reserved4) (void);
+	void (*_libsoup_reserved5) (void);
+} SoupContentSnifferClass;
+
+GType               soup_content_sniffer_get_type (void);
+
+SoupContentSniffer *soup_content_sniffer_new      (void);
+
+char               *soup_content_sniffer_sniff    (SoupContentSniffer *sniffer,
+						   SoupMessage *msg,
+						   SoupBuffer *buffer,
+						   GHashTable **params);
+
+G_END_DECLS
+
+#endif /* SOUP_CONTENT_SNIFFER_H */
diff --git a/libsoup/soup-marshal.list b/libsoup/soup-marshal.list
index 1a43570..d0c53ef 100644
--- a/libsoup/soup-marshal.list
+++ b/libsoup/soup-marshal.list
@@ -6,3 +6,4 @@ NONE:OBJECT,OBJECT
 NONE:OBJECT,POINTER
 NONE:BOXED,BOXED
 NONE:OBJECT,OBJECT,BOOLEAN
+NONE:STRING,BOXED
diff --git a/libsoup/soup-message-headers.c b/libsoup/soup-message-headers.c
index f0abb78..185346e 100644
--- a/libsoup/soup-message-headers.c
+++ b/libsoup/soup-message-headers.c
@@ -226,6 +226,20 @@ find_header (SoupHeader *hdr_array, const char *interned_name, int nth)
 	return -1;
 }
 
+/* Searches the @length entries of @hdr_array from the last one
+ * backwards for headers whose name is @interned_name (names are
+ * compared by pointer, so the name must already be interned).
+ * Matches are counted from the end: @nth == 0 selects the last
+ * matching header. Returns its index, or -1 if there is no such
+ * header.
+ */
+static int
+find_last_header (SoupHeader *hdr_array, guint length, const char *interned_name, int nth)
+{
+	int i;
+
+	/* Valid indexes are 0 .. length - 1; starting at @length
+	 * would read one element past the end of the array. The cast
+	 * keeps an empty array from wrapping around in unsigned
+	 * arithmetic.
+	 */
+	for (i = (int)length - 1; i >= 0; i--) {
+		if (hdr_array[i].name == interned_name) {
+			if (nth-- == 0)
+				return i;
+		}
+	}
+	return -1;
+}
+
 /**
  * soup_message_headers_remove:
  * @hdrs: a #SoupMessageHeaders
@@ -277,12 +291,15 @@ const char *
 soup_message_headers_get_one (SoupMessageHeaders *hdrs, const char *name)
 {
 	SoupHeader *hdr_array = (SoupHeader *)(hdrs->array->data);
+	guint hdr_length = hdrs->array->len;
 	int index;
 
 	g_return_val_if_fail (name != NULL, NULL);
 
 	name = intern_header_name (name, NULL);
-	index = find_header (hdr_array, name, 0);
+
+	index = find_last_header (hdr_array, hdr_length, name, 0);
+
 	return (index == -1) ? NULL : hdr_array[index].value;
 }
 
diff --git a/libsoup/soup-message-io.c b/libsoup/soup-message-io.c
index 8e04b66..10657b7 100644
--- a/libsoup/soup-message-io.c
+++ b/libsoup/soup-message-io.c
@@ -18,6 +18,7 @@
 #include "soup-misc.h"
 #include "soup-socket.h"
 #include "soup-ssl.h"
+#include "soup-uri.h"
 
 typedef enum {
 	SOUP_MESSAGE_IO_CLIENT,
@@ -53,6 +54,11 @@ typedef struct {
 	SoupMessageBody      *read_body;
 	goffset               read_length;
 
+	gboolean              acked_content_sniff_decision;
+	gboolean              delay_got_chunks;
+	SoupMessageBody      *delayed_chunk_data;
+	gsize                 delayed_chunk_length;
+
 	SoupMessageIOState    write_state;
 	SoupEncoding          write_encoding;
 	GString              *write_buf;
@@ -105,6 +111,9 @@ soup_message_io_cleanup (SoupMessage *msg)
 	if (io->write_chunk)
 		soup_buffer_free (io->write_chunk);
 
+	if (io->delayed_chunk_data)
+		soup_message_body_free (io->delayed_chunk_data);
+
 	g_slice_free (SoupMessageIOData, io);
 }
 
@@ -207,6 +216,35 @@ io_disconnected (SoupSocket *sock, SoupMessage *msg)
 	io_error (sock, msg, NULL);
 }
 
+/* Runs the content sniffer over the data spooled in
+ * io->delayed_chunk_data, emits content-sniffed with the result, and
+ * then delivers the spooled data in a single got-chunk emission.
+ *
+ * Returns TRUE when both emissions completed, FALSE when one of the
+ * cancelled-or-paused checks returned early. After an early return
+ * from the first check, io->delayed_chunk_data is still set and
+ * io->delay_got_chunks is FALSE.
+ */
+static gboolean
+io_sniff_content (SoupMessage *msg)
+{
+	SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+	SoupMessageIOData *io = priv->io_data;
+	SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+	char *sniffed_mime_type;
+	GHashTable *params = NULL;
+
+	io->delay_got_chunks = FALSE;
+
+	sniffed_mime_type = soup_content_sniffer_sniff (priv->sniffer, msg, sniffed_buffer, &params);
+	SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+	soup_message_content_sniffed (msg, sniffed_mime_type, params);
+	g_free (sniffed_mime_type);
+	if (params)
+		g_hash_table_destroy (params);
+	/* Release the flattened copy before the check below, which
+	 * may return: otherwise the buffer would leak on that path.
+	 */
+	soup_buffer_free (sniffed_buffer);
+	SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+
+	/* Still running: flatten again for the got-chunk emission */
+	sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+
+	/* NOTE(review): io is used between the callback and the
+	 * check that follows it; confirm io cannot have been freed
+	 * by a handler at that point.
+	 */
+	SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+	soup_message_got_chunk (msg, sniffed_buffer);
+	soup_buffer_free (sniffed_buffer);
+	soup_message_body_free (io->delayed_chunk_data);
+	io->delayed_chunk_data = NULL;
+	SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+
+	return TRUE;
+}
+
 /* Reads data from io->sock into io->read_meta_buf. If @to_blank is
  * %TRUE, it reads up until a blank line ("CRLF CRLF" or "LF LF").
  * Otherwise, it reads up until a single CRLF or LF.
@@ -294,6 +332,21 @@ read_body_chunk (SoupMessage *msg)
 	GError *error = NULL;
 	SoupBuffer *buffer;
 
+	if (!io->acked_content_sniff_decision) {
+		/* The content sniffer feature decides whether a
+		 * message needs to be sniffed while handling
+		 * got-headers, but the message may be paused in a
+		 * user handler, so we need to make sure the signal is
+		 * emitted, or delay_got_chunks is correctly setup
+		 * here.
+		 */
+		if (priv->should_sniff_content)
+			io->delay_got_chunks = TRUE;
+		else if (priv->sniffer)
+			soup_message_content_sniffed (msg, NULL, NULL);
+		io->acked_content_sniff_decision = TRUE;
+	}
+
 	while (read_to_eof || io->read_length > 0) {
 		if (priv->chunk_allocator) {
 			buffer = priv->chunk_allocator (msg, io->read_length, priv->chunk_allocator_data);
@@ -324,10 +377,24 @@ read_body_chunk (SoupMessage *msg)
 
 			io->read_length -= nread;
 
-			SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
-			soup_message_got_chunk (msg, buffer);
-			soup_buffer_free (buffer);
-			SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+			if (io->delay_got_chunks) {
+				if (!io->delayed_chunk_data)
+					io->delayed_chunk_data = soup_message_body_new ();
+
+				soup_message_body_append_buffer (io->delayed_chunk_data, buffer);
+				io->delayed_chunk_length += buffer->length;
+
+				/* We already have enough data to perform sniffing, so do it */
+				if (io->delayed_chunk_length > priv->bytes_for_sniffing) {
+					if (!io_sniff_content (msg))
+						return FALSE;
+				}
+			} else {
+				SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+				soup_message_got_chunk (msg, buffer);
+				soup_buffer_free (buffer);
+				SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+			}
 			continue;
 		}
 
@@ -675,6 +742,23 @@ io_read (SoupSocket *sock, SoupMessage *msg)
 	guint status;
 
  read_more:
+	/* We have delayed chunks, but are no longer delaying, so this
+	 * means we already sniffed but the message got paused while
+	 * content-sniffed was being handled, in which case we did not
+	 * emit the necessary got-chunk; See also the handling for
+	 * state SOUP_MESSAGE_IO_STATE_BODY in the switch below.
+	 */
+	if (io->delayed_chunk_data && !io->delay_got_chunks) {
+		SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+
+		SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+		soup_message_got_chunk (msg, sniffed_buffer);
+		soup_buffer_free (sniffed_buffer);
+		soup_message_body_free (io->delayed_chunk_data);
+		io->delayed_chunk_data = NULL;
+		SOUP_MESSAGE_IO_RETURN_IF_CANCELLED_OR_PAUSED;
+	}
+
 	switch (io->read_state) {
 	case SOUP_MESSAGE_IO_STATE_NOT_STARTED:
 		return;
@@ -782,6 +866,39 @@ io_read (SoupSocket *sock, SoupMessage *msg)
 			return;
 
 	got_body:
+		/* A chunk of data may have been read and the emission
+		 * of got_chunk delayed because we wanted to wait for
+		 * more chunks to arrive, for doing content sniffing,
+		 * but the body was too small, so we need to check if
+		 * an emission is in order here, along with the
+		 * sniffing, if we haven't done it yet, of course.
+		 */
+		if (io->delayed_chunk_data) {
+			if (io->delay_got_chunks) {
+				if (!io_sniff_content (msg))
+					return;
+			} else {
+				SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+
+				SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+				soup_message_got_chunk (msg, sniffed_buffer);
+				soup_buffer_free (sniffed_buffer);
+				soup_message_body_free (io->delayed_chunk_data);
+				io->delayed_chunk_data = NULL;
+
+				/* If we end up returning, read_state
+				 * needs to be set to IO_STATE_BODY,
+				 * and read_length must be 0; since we
+				 * may be coming from STATE_TRAILERS,
+				 * or may be doing a read-to-eof, we
+				 * sanitize these here.
+				 */
+				io->read_state = SOUP_MESSAGE_IO_STATE_BODY;
+				io->read_length = 0;
+				SOUP_MESSAGE_IO_RETURN_IF_CANCELLED_OR_PAUSED;
+			}
+		}
+
 		io->read_state = SOUP_MESSAGE_IO_STATE_FINISHING;
 
 		SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
@@ -885,6 +1002,9 @@ new_iostate (SoupMessage *msg, SoupSocket *sock, SoupMessageIOMode mode,
 	io->read_state  = SOUP_MESSAGE_IO_STATE_NOT_STARTED;
 	io->write_state = SOUP_MESSAGE_IO_STATE_NOT_STARTED;
 
+	if (priv->should_sniff_content)
+		io->delay_got_chunks = TRUE;
+
 	if (priv->io_data)
 		soup_message_io_cleanup (msg);
 	priv->io_data = io;
diff --git a/libsoup/soup-message-private.h b/libsoup/soup-message-private.h
index f47251a..999c335 100644
--- a/libsoup/soup-message-private.h
+++ b/libsoup/soup-message-private.h
@@ -9,6 +9,7 @@
 #include "soup-message.h"
 #include "soup-auth.h"
 #include "soup-connection.h"
+#include "soup-content-sniffer.h"
 
 typedef enum {
 	SOUP_MESSAGE_IO_STATUS_IDLE,
@@ -29,6 +30,10 @@ typedef struct {
 	guint              msg_flags;
 	gboolean           server_side;
 
+	SoupContentSniffer *sniffer;
+	gboolean           should_sniff_content;
+	gsize              bytes_for_sniffing;
+
 	SoupHTTPVersion    http_version, orig_http_version;
 
 	SoupURI           *uri;
diff --git a/libsoup/soup-message.c b/libsoup/soup-message.c
index 5475bb7..f614946 100644
--- a/libsoup/soup-message.c
+++ b/libsoup/soup-message.c
@@ -99,6 +99,7 @@ enum {
 	GOT_HEADERS,
 	GOT_CHUNK,
 	GOT_BODY,
+	CONTENT_SNIFFED,
 
 	RESTARTED,
 	FINISHED,
@@ -402,6 +403,44 @@ soup_message_class_init (SoupMessageClass *message_class)
 			      G_TYPE_NONE, 0);
 
 	/**
+	 * SoupMessage::content-sniffed:
+	 * @msg: the message
+	 * @type: the content type that we got from sniffing
+	 * @params: a #GHashTable with the parameters
+	 *
+	 * This signal is emitted after %got-headers, and before the
+	 * first %got-chunk. If content sniffing is disabled, or no
+	 * content sniffing will be performed, due to the sniffer
+	 * deciding to trust the Content-Type sent by the server, this
+	 * signal is emitted immediately after %got_headers, and @type
+	 * is %NULL.
+	 *
+	 * If the #SoupContentSniffer feature is enabled, and the
+	 * sniffer decided to perform sniffing, the first %got_chunk
+	 * emission may be delayed, so that the sniffer has enough
+	 * data to correctly sniff the content. It notifies the
+	 * library user that the content has been sniffed, and allows
+	 * it to change the header contents in the message, if
+	 * desired.
+	 *
+	 * After this signal is emitted, the data that was spooled so
+	 * that sniffing could be done is delivered on the first
+	 * emission of %got_chunk.
+	 *
+	 * Since: 2.27.3
+	 **/
+	signals[CONTENT_SNIFFED] =
+		g_signal_new ("content_sniffed",
+			      G_OBJECT_CLASS_TYPE (object_class),
+			      G_SIGNAL_RUN_FIRST,
+			      0,
+			      NULL, NULL,
+			      soup_marshal_NONE__STRING_BOXED,
+			      G_TYPE_NONE, 2,
+			      G_TYPE_STRING,
+			      G_TYPE_HASH_TABLE);
+
+	/**
 	 * SoupMessage::restarted:
 	 * @msg: the message
 	 *
@@ -858,6 +897,24 @@ soup_message_got_body (SoupMessage *msg)
 	g_signal_emit (msg, signals[GOT_BODY], 0);
 }
 
+/**
+ * soup_message_content_sniffed:
+ * @msg: a #SoupMessage
+ * @content_type: a string with the sniffed content type
+ * @params: a #GHashTable with the parameters
+ *
+ * Emits the %content_sniffed signal, indicating that the IO layer
+ * finished sniffing the content type for @msg. If content sniffing
+ * will not be performed, due to the sniffer deciding to trust the
+ * Content-Type sent by the server, this signal is emitted immediately
+ * after %got_headers, with %NULL as @content_type.
+ **/
+void
+soup_message_content_sniffed (SoupMessage *msg, const char *content_type, GHashTable *params)
+{
+	/* @content_type and @params become the signal's two arguments */
+	g_signal_emit (msg, signals[CONTENT_SNIFFED], 0, content_type, params);
+}
+
 static void
 restarted (SoupMessage *req)
 {
diff --git a/libsoup/soup-message.h b/libsoup/soup-message.h
index 1b850be..b940ac6 100644
--- a/libsoup/soup-message.h
+++ b/libsoup/soup-message.h
@@ -155,6 +155,7 @@ void soup_message_got_informational   (SoupMessage *msg);
 void soup_message_got_headers         (SoupMessage *msg);
 void soup_message_got_chunk           (SoupMessage *msg, SoupBuffer *chunk);
 void soup_message_got_body            (SoupMessage *msg);
+void soup_message_content_sniffed     (SoupMessage *msg, const char *content_type, GHashTable *params);
 void soup_message_restarted           (SoupMessage *msg);
 void soup_message_finished            (SoupMessage *msg);
 
diff --git a/libsoup/soup.h b/libsoup/soup.h
index 496a4c1..ddb73f7 100644
--- a/libsoup/soup.h
+++ b/libsoup/soup.h
@@ -15,6 +15,7 @@ extern "C" {
 #include <libsoup/soup-auth-domain.h>
 #include <libsoup/soup-auth-domain-basic.h>
 #include <libsoup/soup-auth-domain-digest.h>
+#include <libsoup/soup-content-sniffer.h>
 #include <libsoup/soup-cookie.h>
 #include <libsoup/soup-cookie-jar.h>
 #include <libsoup/soup-cookie-jar-text.h>
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0d46df5..ca8158d 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -26,6 +26,7 @@ noinst_PROGRAMS =	\
 	redirect-test	\
 	simple-httpd	\
 	simple-proxy	\
+	sniffing-test   \
 	streaming-test	\
 	timeout-test	\
 	uri-parsing	\
@@ -58,6 +59,7 @@ redirect_test_SOURCES = redirect-test.c $(TEST_SRCS)
 server_auth_test_SOURCES = server-auth-test.c $(TEST_SRCS)
 simple_httpd_SOURCES = simple-httpd.c
 simple_proxy_SOURCES = simple-proxy.c
+sniffing_test_SOURCES = sniffing-test.c  $(TEST_SRCS)
 ssl_test_SOURCES = ssl-test.c $(TEST_SRCS)
 streaming_test_SOURCES = streaming-test.c $(TEST_SRCS)
 timeout_test_SOURCES = timeout-test.c $(TEST_SRCS)
@@ -87,6 +89,7 @@ TESTS =			\
 	misc-test	\
 	ntlm-test	\
 	redirect-test	\
+	sniffing-test	\
 	streaming-test	\
 	timeout-test	\
 	uri-parsing	\
diff --git a/tests/resources/atom.xml b/tests/resources/atom.xml
new file mode 100644
index 0000000..962ecf4
--- /dev/null
+++ b/tests/resources/atom.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xmlns:planet="http://planet.libsouprocks.net/" xmlns:indexing="urn:atom-extension:indexing" indexing:index="no"><access:restriction xmlns:access="http://www.bloglines.com/about/specs/fac-1.0" relationship="deny"/>
+  <title>A small ATOM feed</title>
+  <updated>2009-07-02T10:27:44Z</updated>
+  <generator>kov</generator>
+  <author>
+    <name>Anonymous Coward</name>
+  </author>
+  <id>http://libsoup.rocks/atom.xml</id>
+  <link href="http://libsoup.rocks/atom.xml" rel="self" type="application/atom+xml"/>
+  <link href="http://libsoup.rocks/" rel="alternate"/>
+
+  <entry xml:lang="en">
+    <id>http://libsoup.rocks/so/much/</id>
+    <link href="http://libsoup.rocks/so/much/" rel="alternate" type="text/html"/>
+    <title>One post too many</title>
+    <summary>woo [...]</summary>
+    <content type="xhtml"><div xmlns="http://www.w3.org/1999/xhtml"><p>woohoo</p></div>
+    </content>
+    <updated>2009-07-02T10:38:28Z</updated>
+    <category term="Category1"/>
+    <category term="Personal"/>
+    <author>
+      <name>kov</name>
+    </author>
+    <source>
+      <id>http://libsoup.rocks/blog</id>
+      <link href="http://libsoup.rocks/blog/feed" rel="self" type="application/atom+xml"/>
+      <link href="http://libsoup.rocks/blog" rel="alternate" type="text/html"/>
+      <subtitle>Just stuff to test libsoup</subtitle>
+      <title>Random stuff to test libsoup</title>
+      <updated>2009-07-02T00:38:29Z</updated>
+    </source>
+  </entry>
+</feed>
diff --git a/tests/resources/home.gif b/tests/resources/home.gif
new file mode 100644
index 0000000..55e1d59
Binary files /dev/null and b/tests/resources/home.gif differ
diff --git a/tests/resources/mbox b/tests/resources/mbox
new file mode 100644
index 0000000..929ad2b
--- /dev/null
+++ b/tests/resources/mbox
@@ -0,0 +1,16 @@
+From email here Wed Jun 17 21:20:48 2009
+Return-path: <email here>
+Envelope-to: email here
+Delivery-date: Wed, 17 Jun 2009 21:20:48 -0300
+Received: from email by here.domain with local (Exim 4.69)
+	(envelope-from <email here>)
+	id 1MH5N2-0008Lq-7c
+	for email here; Wed, 17 Jun 2009 21:20:48 -0300
+To: email here
+Subject: This is just so that I have a mailbox
+Message-Id: <E1MH5N2-0008Lq-7c here domain>
+From: A Nice User <email here>
+Date: Wed, 17 Jun 2009 21:20:48 -0300
+
+This is a dumb email.
+
diff --git a/tests/resources/rss20.xml b/tests/resources/rss20.xml
new file mode 100644
index 0000000..d64bdda
--- /dev/null
+++ b/tests/resources/rss20.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<rss version="2.0">
+
+<channel>
+	<title>A small RSS</title>
+	<link>http://libsoup.rocks/</link>
+	<language>en</language>
+	<description>A small RSS to test libsoup</description>
+
+<item>
+	<title>One post too many</title>
+	<guid isPermaLink="true">http://libsoup.rocks/so/much/</guid>
+	<link>http://libsoup.rocks/so/much/</link>
+	<description>&lt;p&gt;woohoo&lt;/p&gt;</description>
+	<pubDate>Wed, 02 Jul 2009 10:26:28 +0000</pubDate>
+</item>
+<item>
+	<title>GCDS will rock</title>
+	<guid isPermaLink="true">http://libsoup.rocks/so/much/again/</guid>
+	<link>http://libsoup.rocks/so/much/again/</link>
+	<description>&lt;p&gt;I mean, really.&lt;/p&gt;</description>
+	<pubDate>Wed, 02 Jul 2009 10:26:28 +0000</pubDate>
+</item>
+
+</channel>
+</rss>
diff --git a/tests/resources/test.html b/tests/resources/test.html
new file mode 100644
index 0000000..5a6cc0c
--- /dev/null
+++ b/tests/resources/test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title></title>
+</head>
+<body>
+<h1>GNOME!</h1>
+</body>
+</html>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
new file mode 100644
index 0000000..ad2690f
--- /dev/null
+++ b/tests/sniffing-test.c
@@ -0,0 +1,429 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009 Gustavo Noronha Silva <gns gnome org>.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <libsoup/soup.h>
+
+#include "test-utils.h"
+
+SoupSession *session;		/* client session shared by all the tests */
+SoupURI *base_uri;		/* base URI of the local test server */
+SoupMessageBody *chunk_data;	/* chunks collected by got_chunk(), if any */
+
+static void
+server_callback (SoupServer *server, SoupMessage *msg,
+		 const char *path, GHashTable *query,
+		 SoupClientContext *context, gpointer data)
+{
+	GError *error = NULL;
+	char *chunked;
+	char *contents;
+	gsize length;
+
+	if (msg->method != SOUP_METHOD_GET) {
+		soup_message_set_status (msg, SOUP_STATUS_NOT_IMPLEMENTED);
+		return;
+	}
+
+	soup_message_set_status (msg, SOUP_STATUS_OK);
+
+	if (query) {
+		chunked = g_hash_table_lookup (query, "chunked");
+		if (chunked && g_str_equal (chunked, "yes"))
+			soup_message_headers_set_encoding (msg->response_headers,
+							   SOUP_ENCODING_CHUNKED);
+	}
+	/* Serve a fixture from resources/, with the Content-Type picked by the path */
+	if (!strcmp (path, "/mbox")) {
+		g_file_get_contents ("resources/mbox",
+				     &contents, &length,
+				     &error);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/plain",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
+
+	if (g_str_has_prefix (path, "/text_or_binary/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/plain",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
+
+	if (g_str_has_prefix (path, "/unknown/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "UNKNOWN/unknown",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
+
+	if (g_str_has_prefix (path, "/type/")) {
+		char **components = g_strsplit (path, "/", 4);
+		char *ptr;
+
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		/* Hack to allow passing type in the URI: the last '_' stands for '/' */
+		if ((ptr = g_strrstr (components[2], "_")) != NULL)
+			*ptr = '/';
+
+		soup_message_set_response (msg, components[2],
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+
+		g_strfreev (components);
+	}
+
+	if (g_str_has_prefix (path, "/multiple_headers/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/xml",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+
+		soup_message_headers_append (msg->response_headers,
+					     "Content-Type", "text/plain");
+	}
+
+}
+
+static gboolean
+unpause_msg (gpointer data)
+{
+	/* One-shot idle callback: resume the message, then go away */
+	soup_session_unpause_message (session, (SoupMessage *) data);
+	return FALSE;
+}
+
+
+static void
+content_sniffed (SoupMessage *msg, char *content_type, GHashTable *params, gpointer data)
+{
+	GObject *tracker = G_OBJECT (msg);
+
+	/* got-chunk must never be seen before content-sniffed */
+	if (g_object_get_data (tracker, "got-chunk") != NULL) {
+		debug_printf (1, "  got-chunk got emitted before content-sniffed\n");
+		errors++;
+	}
+	g_object_set_data (tracker, "content-sniffed", GINT_TO_POINTER (TRUE));
+
+	if (!GPOINTER_TO_INT (data))
+		return;
+	soup_session_pause_message (session, msg);
+	g_idle_add (unpause_msg, msg);
+}
+
+static void
+got_headers (SoupMessage *msg, gpointer data)
+{
+	GObject *tracker = G_OBJECT (msg);
+
+	/* content-sniffed must never be seen before got-headers */
+	if (g_object_get_data (tracker, "content-sniffed") != NULL) {
+		debug_printf (1, "  content-sniffed got emitted before got-headers\n");
+		errors++;
+	}
+	g_object_set_data (tracker, "got-headers", GINT_TO_POINTER (TRUE));
+
+	if (!GPOINTER_TO_INT (data))
+		return;
+	soup_session_pause_message (session, msg);
+	g_idle_add (unpause_msg, msg);
+}
+
+static void
+got_chunk (SoupMessage *msg, SoupBuffer *chunk, gpointer data)
+{
+	g_object_set_data (G_OBJECT (msg), "got-chunk", GINT_TO_POINTER (TRUE));
+
+	/* data is the should_accumulate flag; collect chunks only when unset */
+	if (GPOINTER_TO_INT (data))
+		return;
+
+	if (chunk_data == NULL)
+		chunk_data = soup_message_body_new ();
+	soup_message_body_append_buffer (chunk_data, chunk);
+}
+
+static void
+finished (SoupSession *session, SoupMessage *msg, gpointer data)
+{
+	/* data is the GMainLoop the test is running; stop it */
+	g_main_loop_quit ((GMainLoop *) data);
+}
+
+static void
+do_signals_test (gboolean should_content_sniff,
+		 gboolean should_pause,
+		 gboolean should_accumulate,
+		 gboolean chunked_encoding)
+{
+	SoupURI *uri = soup_uri_new_with_base (base_uri, "/mbox");
+	SoupMessage *msg;
+	GMainLoop *loop = g_main_loop_new (NULL, TRUE);
+	char *contents;
+	gsize length;
+	GError *error = NULL;
+	SoupBuffer *body;
+	/* The message copies the URI, so set the query before creating it */
+	if (chunked_encoding)
+		soup_uri_set_query (uri, "chunked=yes");
+	msg = soup_message_new_from_uri ("GET", uri);
+	soup_message_body_set_accumulate (msg->response_body, should_accumulate);
+
+	g_object_connect (msg,
+			  "signal::got-headers", got_headers, GINT_TO_POINTER (should_pause),
+			  "signal::got-chunk", got_chunk, GINT_TO_POINTER (should_accumulate),
+			  "signal::content_sniffed", content_sniffed, GINT_TO_POINTER (should_pause),
+			  NULL);
+
+	g_object_ref (msg);
+	soup_session_queue_message (session, msg, finished, loop);
+
+	g_main_loop_run (loop);
+
+	if (!should_content_sniff &&
+	    g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, "  content-sniffed got emitted without a sniffer\n");
+		errors++;
+	} else if (should_content_sniff &&
+		   !g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, "  content-sniffed did not get emitted\n");
+		errors++;
+	}
+
+	g_file_get_contents ("resources/mbox",
+			     &contents, &length,
+			     &error);
+
+	if (error) {
+		g_error ("%s", error->message);
+		g_error_free (error);
+		exit (1);
+	}
+
+	if (!should_accumulate) {
+		body = soup_message_body_flatten (chunk_data);
+		soup_message_body_free (chunk_data);
+		chunk_data = NULL;
+	} else
+		body = soup_message_body_flatten (msg->response_body);
+
+	/* Only compare the bytes when the sizes agree, so that memcmp
+	 * never reads past the end of the shorter buffer. */
+	if (body->length != length) {
+		debug_printf (1, "  lengths do not match\n");
+		errors++;
+	} else if (memcmp (body->data, contents, length)) {
+		debug_printf (1, "  downloaded data does not match\n");
+		errors++;
+	}
+
+	g_free (contents);
+	soup_buffer_free (body);
+
+	soup_uri_free (uri);
+	g_object_unref (msg);
+	g_main_loop_unref (loop);
+}
+
+static void
+sniffing_content_sniffed (SoupMessage *msg, char *content_type, GHashTable *params, gpointer data)
+{
+	const char *expected_type = data;
+	/* The signal may carry a NULL type; count that as a mismatch */
+	if (!content_type || strcmp (content_type, expected_type)) {
+		debug_printf (1, "  sniffing failed! expected %s, got %s\n",
+			      expected_type, content_type ? content_type : "(null)");
+		errors++;
+	}
+}
+
+static void
+test_sniffing (const char *path, const char *expected_type)
+{
+	GMainLoop *main_loop = g_main_loop_new (NULL, TRUE);
+	SoupURI *target = soup_uri_new_with_base (base_uri, path);
+	SoupMessage *request = soup_message_new_from_uri ("GET", target);
+
+	/* The handler compares the sniffed type against expected_type */
+	g_signal_connect (request, "content_sniffed",
+			  G_CALLBACK (sniffing_content_sniffed),
+			  (gpointer) expected_type);
+
+	/* Hold our own reference until the final unref below */
+	g_object_ref (request);
+	soup_session_queue_message (session, request, finished, main_loop);
+	g_main_loop_run (main_loop);
+
+	g_main_loop_unref (main_loop);
+	g_object_unref (request);
+	soup_uri_free (target);
+}
+
+int
+main (int argc, char **argv)
+{
+	SoupServer *server;
+	SoupContentSniffer *sniffer;
+
+	test_init (argc, argv, NULL);
+
+	server = soup_test_server_new (TRUE);
+	soup_server_add_handler (server, NULL, server_callback, NULL, NULL);
+	base_uri = soup_uri_new ("http://127.0.0.1/");
+	soup_uri_set_port (base_uri, soup_server_get_port (server));
+
+	session = soup_session_async_new ();
+
+	/* No sniffer, no content_sniffed should be emitted */
+	do_signals_test (FALSE, FALSE, FALSE, FALSE);
+	do_signals_test (FALSE, FALSE, FALSE, TRUE);
+	do_signals_test (FALSE, FALSE, TRUE, FALSE);
+	do_signals_test (FALSE, FALSE, TRUE, TRUE);
+
+	do_signals_test (FALSE, TRUE, TRUE, FALSE);
+	do_signals_test (FALSE, TRUE, TRUE, TRUE);
+	do_signals_test (FALSE, TRUE, FALSE, FALSE);
+	do_signals_test (FALSE, TRUE, FALSE, TRUE);
+
+	sniffer = soup_content_sniffer_new ();
+	soup_session_add_feature (session, (SoupSessionFeature*)sniffer);
+
+	/* Now, with a sniffer, content_sniffed must be emitted after
+	 * got-headers, and before got-chunk.
+	 */
+	do_signals_test (TRUE, FALSE, FALSE, FALSE);
+	do_signals_test (TRUE, FALSE, FALSE, TRUE);
+	do_signals_test (TRUE, FALSE, TRUE, FALSE);
+	do_signals_test (TRUE, FALSE, TRUE, TRUE);
+
+	do_signals_test (TRUE, TRUE, TRUE, FALSE);
+	do_signals_test (TRUE, TRUE, TRUE, TRUE);
+	do_signals_test (TRUE, TRUE, FALSE, FALSE);
+	do_signals_test (TRUE, TRUE, FALSE, TRUE);
+
+	/* Test the text_or_binary sniffing path */
+
+	/* GIF is a 'safe' type */
+	test_sniffing ("/text_or_binary/home.gif", "image/gif");
+
+	/* With our current code, no sniffing is done using GIO, so
+	 * the mbox will be identified as text/plain; should we change
+	 * this?
+	 */
+	test_sniffing ("/text_or_binary/mbox", "text/plain");
+
+	/* HTML is considered unsafe for this algorithm, since it is
+	 * scriptable, so going from text/plain to text/html is
+	 * considered 'privilege escalation'
+	 */
+	test_sniffing ("/text_or_binary/test.html", "text/plain");
+
+	/* Test the unknown sniffing path */
+
+	test_sniffing ("/unknown/test.html", "text/html");
+	test_sniffing ("/unknown/home.gif", "image/gif");
+	test_sniffing ("/unknown/mbox", "application/mbox");
+
+	/* Test the XML sniffing path */
+
+	test_sniffing ("/type/text_xml/home.gif", "text/xml");
+	test_sniffing ("/type/anice_type+xml/home.gif", "anice/type+xml");
+	test_sniffing ("/type/application_xml/home.gif", "application/xml");
+
+	/* Test the image sniffing path */
+
+	test_sniffing ("/type/image_png/home.gif", "image/gif");
+
+	/* Test the feed or html path */
+
+	test_sniffing ("/type/text_html/test.html", "text/html");
+	test_sniffing ("/type/text_html/rss20.xml", "application/rss+xml");
+	test_sniffing ("/type/text_html/atom.xml", "application/atom+xml");
+
+	/* The spec tells us to only use the last Content-Type header */
+
+	test_sniffing ("/multiple_headers/home.gif", "image/gif");
+
+	soup_uri_free (base_uri);
+
+	test_cleanup ();
+	return errors != 0;
+}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]