[libsoup/content-sniffing] Refactor the handling of the unknown path, to handle more cases

From: Gustavo Noronha Silva <gns src gnome org>
To: svn-commits-list gnome org
Subject: [libsoup/content-sniffing] Refactor the handling of the unknown path, to handle more cases
Date: Thu, 18 Jun 2009 21:57:12 -0400 (EDT)
commit c406891948e46be6b7cdec262989f6c2c375959a
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Thu Jun 18 22:16:21 2009 -0300

    Refactor the handling of the unknown path, to handle more cases
    
    The unknown handling was first written for the text or binary
    algorithm, but it is also used stand-alone, when handling empty,
    unknown/unknown, or application/unknown content types.

 libsoup/soup-content-sniffer.c |   92 +++++++++++++++++++++++----------------
 tests/sniffing-test.c          |   29 +++++++++++++
 2 files changed, 83 insertions(+), 38 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 74884a4..a7a51f9 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -231,44 +231,20 @@ static char kByteLooksBinary[] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
 };
 
-
-/* HTML5: 2.7.3 Content-Type sniffing: text or binary */
+/* HTML5: 2.7.4 Content-Type sniffing: unknown type */
 static char*
-sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, gboolean for_text_or_binary)
 {
 	const char *resource = buffer->data;
 	int resource_length = MIN(512, buffer->length);
-	gboolean looks_binary = FALSE;
 	int i;
 
-	/* Detecting UTF-16BE, UTF-16LE, and UTF-8 BOMs means it's text/plain */
-	if (resource_length >= 4) {
-		if ((resource[0] == 0xfe && resource[1] == 0xff) ||
-		    (resource[0] == 0xff && resource[1] == 0xfe) ||
-		    (resource[0] == 0xef && resource[1] == 0xbb && resource[2] == 0xbf))
-			return g_strdup ("text/plain");
-	}
-
-	/* Look to see if any of the first n bytes looks binary */
-	for (i = 0; i < resource_length; i++) {
-		if (kByteLooksBinary[(unsigned char)resource[i]]) {
-			looks_binary = TRUE;
-			break;
-		}
-	}
-
-	if (!looks_binary)
-		return g_strdup ("text/plain");
-
-	/* HTML5: 2.7.4 Content-Type sniffing: unknown type
-	 *
-	 * This will probably live in its own function, since it is
-	 * used by other parts of the algorithm
-	 */
 	for (i = 0; types_table[i].pattern != NULL ; i++) {
 		struct _type_info *type_row = &(types_table[i]);
 
-		if (type_row->scriptable)
+		/* The scriptable types should be skipped for the text
+		 * or binary path, but considered for other paths */
+		if (for_text_or_binary && type_row->scriptable)
 			continue;
 
 		if (type_row->has_ws) {
@@ -322,6 +298,37 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer
 	return g_strdup ("application/octet-stream");
 }
 
+/* HTML5: 2.7.3 Content-Type sniffing: text or binary
+ *
+ * Returns a newly-allocated string: "text/plain" when the first bytes
+ * (at most 512) start with a BOM (checked only when 4 or more bytes are
+ * available) or hold no binary-looking byte; else sniff_unknown()'s result.
+ */
+static char*
+sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	/* Read the data as unsigned bytes: where plain char is signed, a
+	 * char can never compare equal to 0xfe, 0xff, 0xef, 0xbb or 0xbf */
+	const unsigned char *resource = (const unsigned char *)buffer->data;
+	int resource_length = MIN(512, buffer->length);
+	int i;
+
+	/* Detecting UTF-16BE, UTF-16LE, and UTF-8 BOMs means it's text/plain */
+	if (resource_length >= 4) {
+		if ((resource[0] == 0xfe && resource[1] == 0xff) ||
+		    (resource[0] == 0xff && resource[1] == 0xfe) ||
+		    (resource[0] == 0xef && resource[1] == 0xbb && resource[2] == 0xbf))
+			return g_strdup ("text/plain");
+	}
+
+	/* Any binary-looking byte sends us to the unknown type algorithm */
+	for (i = 0; i < resource_length; i++) {
+		if (kByteLooksBinary[resource[i]])
+			return sniff_unknown (sniffer, msg, buffer, TRUE);
+	}
+	return g_strdup ("text/plain");
+}
+
 static char*
 sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
 {
@@ -346,18 +353,25 @@ sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
 static char*
 sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
 {
+	const char *content_type_with_params;
 	const char *content_type;
 
-	content_type = soup_message_headers_get_one (msg->response_headers, "Content-Type");
+	content_type = soup_message_headers_get_content_type (msg->response_headers, NULL);
+
+	/* These comparisons are done in an ASCII-case-insensitive
+	 * manner because the spec requires it */
+	if ((content_type == NULL) ||
+	    !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
+	    !g_ascii_strcasecmp (content_type, "application/unknown"))
+		return sniff_unknown (sniffer, msg, buffer, FALSE);
 
-	if (content_type == NULL)
-		return sniff_gio (sniffer, msg, buffer);
+	content_type_with_params = soup_message_headers_get_one (msg->response_headers, "Content-Type");
 
 	/* If we got text/plain, use text_or_binary */
-	if (g_str_equal (content_type, "text/plain") ||
-	     g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
-	     g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
-	     g_str_equal (content_type, "text/plain; charset=UTF-8")) {
+	if (g_str_equal (content_type_with_params, "text/plain") ||
+	     g_str_equal (content_type_with_params, "text/plain; charset=ISO-8859-1") ||
+	     g_str_equal (content_type_with_params, "text/plain; charset=iso-8859-1") ||
+	     g_str_equal (content_type_with_params, "text/plain; charset=UTF-8")) {
 		return sniff_text_or_binary (sniffer, msg, buffer);
 	}
 
@@ -378,8 +392,10 @@ soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniff
 	const char *content_type = soup_message_headers_get_content_type (msg->response_headers, NULL);
 
 	if ((content_type == NULL)
-	    || (strcmp (content_type, "application/octet-stream") == 0)
-	    || (strcmp (content_type, "text/plain") == 0)) {
+	    || (g_ascii_strcasecmp (content_type, "application/octet-stream") == 0)
+	    || (g_ascii_strcasecmp (content_type, "text/plain") == 0)
+	    || (g_ascii_strcasecmp (content_type, "unknown/unknown") == 0)
+	    || (g_ascii_strcasecmp (content_type, "application/unknown") == 0)) {
 		priv->should_sniff_content = TRUE;
 		priv->bytes_for_sniffing = content_sniffer_class->get_buffer_size (sniffer);
 	}
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 040ac89..a23ab3a 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -70,6 +70,29 @@ server_callback (SoupServer *server, SoupMessage *msg,
 					   contents,
 					   length);
 	}
+
+	if (g_str_has_prefix (path, "/unknown/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "UNKNOWN/unknown",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
 }
 
 static gboolean
@@ -294,6 +317,12 @@ main (int argc, char **argv)
 	 */
 	test_sniffing ("/text_or_binary/test.html", "text/plain");
 
+	/* Test the unknown sniffing path */
+
+	test_sniffing ("/unknown/test.html", "text/html");
+	test_sniffing ("/unknown/home.gif", "image/gif");
+	test_sniffing ("/unknown/mbox", "application/octet-stream");
+
 	soup_uri_free (base_uri);
 
 	test_cleanup ();
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]