[libsoup: 2/10] sniffing: Implement the check-apache-bug flag



commit 6510806d97713450625bbd648d3ce6cd953a4df9
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Sun Dec 8 19:11:21 2013 +0100

    sniffing: Implement the check-apache-bug flag
    
    Run the text or binary algorithm when some specific text/plain Content-Types
    are provided, since older versions of Apache would send that type for binary
    files. http://mimesniff.spec.whatwg.org/#dfnReturnLink-0

 libsoup/soup-content-sniffer.c |   37 ++++++++++++++++++++++++++++---------
 tests/resources/text.txt       |    1 +
 tests/sniffing-test.c          |   11 ++++++++++-
 tests/soup-tests.gresource.xml |    1 +
 4 files changed, 40 insertions(+), 10 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index d2a0808..e16658b 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -2,7 +2,7 @@
 /*
  * soup-content-sniffer.c
  *
- * Copyright (C) 2009 Gustavo Noronha Silva.
+ * Copyright (C) 2009, 2013 Gustavo Noronha Silva.
  *
  * This code implements the following specification:
  *
@@ -297,7 +297,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
        return g_strdup ("text/plain");
 }
 
-/* HTML5: 2.7.3 Content-Type sniffing: text or binary */
+/* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */
 static char*
 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
 {
@@ -306,15 +306,20 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        gboolean looks_binary = FALSE;
        int i;
 
-       /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
-       if (resource_length >= 4) {
+       /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */
+       if (resource_length >= 2) {
                if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
-                   (resource[0] == 0xFF && resource[1] == 0xFE) ||
-                   (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
+                   (resource[0] == 0xFF && resource[1] == 0xFE))
                        return g_strdup ("text/plain");
        }
 
-       /* Look to see if any of the first n bytes looks binary */
+       /* 3. UTF-8 BOM. */
+       if (resource_length >= 3) {
+               if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
+                       return g_strdup ("text/plain");
+       }
+
+       /* 4. Look to see if any of the first n bytes looks binary */
        for (i = 0; i < resource_length; i++) {
                if (byte_looks_binary[resource[i]]) {
                        looks_binary = TRUE;
@@ -325,6 +330,9 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        if (!looks_binary)
                return g_strdup ("text/plain");
 
+       /* 5. Execute 7.1 Identifying a resource with an unknown MIME type.
+        * TODO: sniff-scriptable needs to be unset.
+        */
        return sniff_unknown (sniffer, buffer, TRUE);
 }
 
@@ -472,14 +480,25 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
 
        content_type = soup_message_headers_get_content_type (msg->response_headers, params);
 
-       /* These comparisons are done in an ASCII-case-insensitive
-        * manner because the spec requires it */
+       /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
+
+       /* 1. Unknown/undefined supplied type respecting sniff-scriptable. */
        if ((content_type == NULL) ||
            !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
            !g_ascii_strcasecmp (content_type, "application/unknown") ||
            !g_ascii_strcasecmp (content_type, "*/*"))
                return sniff_unknown (sniffer, buffer, FALSE);
 
+       /* TODO: 2. no-sniff flag handling. */
+
+       /* 3. check-for-apache-bug */
+       if ((content_type != NULL) &&
+           (g_str_equal (content_type, "text/plain") ||
+            g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
+            g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
+            g_str_equal (content_type, "text/plain; charset=UTF-8")))
+               return sniff_text_or_binary (sniffer, buffer);
+
        if (g_str_has_suffix (content_type, "+xml") ||
            !g_ascii_strcasecmp (content_type, "text/xml") ||
            !g_ascii_strcasecmp (content_type, "application/xml"))
diff --git a/tests/resources/text.txt b/tests/resources/text.txt
new file mode 100644
index 0000000..ff7066f
--- /dev/null
+++ b/tests/resources/text.txt
@@ -0,0 +1 @@
+This is just text.
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 532e6ed..b668f40 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -49,7 +49,7 @@ server_callback (SoupServer *server, SoupMessage *msg,
                                             "Content-Type", "text/plain");
        }
 
-       if (g_str_has_prefix (path, "/text_or_binary/")) {
+       if (g_str_has_prefix (path, "/text_or_binary/") || g_str_has_prefix (path, "/apache_bug/")) {
                char *base_name = g_path_get_basename (path);
 
                response = soup_test_load_resource (base_name, &error);
@@ -442,6 +442,15 @@ main (int argc, char **argv)
                              GINT_TO_POINTER (TRUE),
                              do_signals_tests);
 
+       /* Test the apache bug sniffing path */
+       g_test_add_data_func ("/sniffing/apache-bug/binary",
+                             "/apache_bug/text_binary.txt => application/octet-stream",
+                             do_sniffing_test);
+       g_test_add_data_func ("/sniffing/apache-bug/text",
+                             "/apache_bug/text.txt => text/plain",
+                             do_sniffing_test);
+
+       /* GIF is a 'safe' type */
        g_test_add_data_func ("/sniffing/type/gif",
                              "text_or_binary/home.gif => image/gif",
                              do_sniffing_test);
diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml
index 9b580a3..2fe21dd 100644
--- a/tests/soup-tests.gresource.xml
+++ b/tests/soup-tests.gresource.xml
@@ -12,6 +12,7 @@
     <file>resources/ps_binary.ps</file>
     <file>resources/rss20.xml</file>
     <file>resources/test.html</file>
+    <file>resources/text.txt</file>
     <file>resources/text_binary.txt</file>
   </gresource>
 </gresources>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]