[libsoup/content-sniffing-update: 4/9] Adjust the general unknown MIME type algorithm



commit 83532082f2b4f6407802469fc8231ff872da6e00
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Mon Dec 9 11:54:29 2013 +0100

    Adjust the general unknown MIME type algorithm
    
    This change adjusts the pattern matching table to the current form of the
    MIMESNIFF spec, adding a check for a tag-terminating byte and using the formally
    defined sniff-scriptable flag.

 libsoup/soup-content-sniffer.c   |  239 +++++++++++++++++++++++++++++---------
 tests/resources/html_binary.html |    2 +-
 2 files changed, 186 insertions(+), 55 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 0564bbe..60b7f4c 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -77,12 +77,15 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
 {
 }
 
-/* This table is based on the HTML5 spec;
- * See 2.7.4 Content-Type sniffing: unknown type
+/* This table is based on the MIMESNIFF spec;
+ * See 7.1 Identifying a resource with an unknown MIME type
  */
 typedef struct {
        /* @has_ws is TRUE if @pattern contains "generic" whitespace */
        gboolean      has_ws;
+       /* @has_tag_termination is TRUE if we should check for a tag-terminating
+        * byte (0x20 " " or 0x3E ">") after the pattern match. */
+       gboolean      has_tag_termination;
        const guchar *mask;
        const guchar *pattern;
        guint         pattern_length;
@@ -90,111 +93,234 @@ typedef struct {
        gboolean      scriptable;
 } SoupContentSnifferPattern;
 
+
+/* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
+ * is allowed. Those spaces are marked with \x00 on the mask.
+ */
 static SoupContentSnifferPattern types_table[] = {
-       { FALSE,
-         (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
-         (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
+       /* Scriptable types. */
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <!DOCTYPE HTML",
          14,
          "text/html",
          TRUE },
 
-       { TRUE,
-         (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
-         (const guchar *)" \x3C\x48\x54\x4D\x4C",
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <HTML",
          5,
          "text/html",
          TRUE },
 
-       { TRUE,
-         (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
-         (const guchar *)" \x3C\x48\x45\x41\x44",
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <HEAD",
          5,
          "text/html",
          TRUE },
 
-       { TRUE,
-         (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
-         (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <SCRIPT",
+         7,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <IFRAME",
          7,
          "text/html",
          TRUE },
 
-       { FALSE,
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xFF",
+         (const guchar *)" <H1",
+         3,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF",
+         (const guchar *)" <DIV",
+         4,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <FONT",
+         5,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <TABLE",
+         6,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF",
+         (const guchar *)" <A",
+         2,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <STYLE",
+         6,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <TITLE",
+         6,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF",
+         (const guchar *)" <B",
+         2,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+         (const guchar *)" <BODY",
+         5,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF\xDF",
+         (const guchar *)" <BR",
+         3,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xDF",
+         (const guchar *)" <P",
+         2,
+         "text/html",
+         TRUE },
+
+       { TRUE, TRUE,
+         (const guchar *)"\x00\xFF\xFF\xFF\xFF",
+         (const guchar *)" <!--",
+         4,
+         "text/html",
+         TRUE },
+
+       { TRUE, FALSE,
+         (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF",
+         (const guchar *)" <?xml",
+         5,
+         "text/html",
+         TRUE },
+
+       { FALSE, FALSE,
          (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x25\x50\x44\x46\x2D",
+         (const guchar *)"%PDF-",
          5,
          "application/pdf",
          TRUE },
 
-       { FALSE,
+       /* Non-scriptable types. */
+       { FALSE, FALSE,
          (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
+         (const guchar *)"%!PS-Adobe-",
          11,
          "application/postscript",
          FALSE },
 
-       { FALSE,
+       { FALSE, FALSE, /* UTF-16BE BOM */
          (const guchar *)"\xFF\xFF\x00\x00",
          (const guchar *)"\xFE\xFF\x00\x00",
          4,
          "text/plain",
          FALSE },
 
-       { FALSE,
-         (const guchar *)"\xFF\xFF\x00\x00",
+       { FALSE, FALSE, /* UTF-16LE BOM */
          (const guchar *)"\xFF\xFF\x00\x00",
+         (const guchar *)"\xFF\xFE\x00\x00",
          4,
          "text/plain",
          FALSE },
 
-       { FALSE,
+       { FALSE, FALSE, /* UTF-8 BOM */
          (const guchar *)"\xFF\xFF\xFF\x00",
          (const guchar *)"\xEF\xBB\xBF\x00",
          4,
          "text/plain",
          FALSE },
 
-       { FALSE,
+       /* Images. */
+
+       { FALSE, FALSE, /* Windows icon signature. */
+         (const guchar *)"\xFF\xFF\xFF\xFF",
+         (const guchar *)"\x00\x00\x01\x00",
+         4,
+         "image/x-icon",
+         FALSE },
+
+       { FALSE, FALSE, /* Windows cursor signature. */
+         (const guchar *)"\xFF\xFF\xFF\xFF",
+         (const guchar *)"\x00\x00\x02\x00",
+         4,
+         "image/x-icon",
+         FALSE },
+
+       { FALSE, FALSE, /* BMP. */
+         (const guchar *)"\xFF\xFF",
+         (const guchar *)"BM",
+         2,
+         "image/bmp",
+         FALSE },
+
+    { FALSE, FALSE, /* GIF. */
          (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x47\x49\x46\x38\x37\x61",
+         (const guchar *)"GIF87a",
          6,
          "image/gif",
          FALSE },
 
-       { FALSE,
+       { FALSE, FALSE, /* GIF. */
          (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x47\x49\x46\x38\x39\x61",
+         (const guchar *)"GIF89a",
          6,
          "image/gif",
          FALSE },
 
-       { FALSE,
+       { FALSE, FALSE, /* WEBP. */
+         (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
+         (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
+         14,
+         "image/webp",
+         FALSE },
+
+       { FALSE, FALSE, /* PNG. */
          (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
+         (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
          8,
          "image/png",
          FALSE },
 
-       { FALSE,
+       { FALSE, FALSE, /* JPEG. */
          (const guchar *)"\xFF\xFF\xFF",
          (const guchar *)"\xFF\xD8\xFF",
          3,
          "image/jpeg",
          FALSE },
 
-       { FALSE,
-         (const guchar *)"\xFF\xFF",
-         (const guchar *)"\x42\x4D",
-         2,
-         "image/bmp",
-         FALSE },
-
-       { FALSE,
-         (const guchar *)"\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x00\x00\x01\x00",
-         4,
-         "image/vnd.microsoft.icon",
-         FALSE }
+       /* TODO: audio/video, archive type. */
 };
 
 /* Whether a given byte looks like it might be part of binary content.
@@ -223,7 +349,7 @@ static char byte_looks_binary[] = {
 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
 static char*
 sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
-              gboolean for_text_or_binary)
+              gboolean sniff_scriptable)
 {
        const guchar *resource = (const guchar *)buffer->data;
        int resource_length = MIN (512, buffer->length);
@@ -232,9 +358,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
        for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
                SoupContentSnifferPattern *type_row = &(types_table[i]);
 
-               /* The scriptable types should be skiped for the text
-                * or binary path, but considered for other paths */
-               if (for_text_or_binary && type_row->scriptable)
+               if (!sniff_scriptable && type_row->scriptable)
                        continue;
 
                if (type_row->has_ws) {
@@ -267,8 +391,14 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
                        if (skip_row)
                                continue;
 
-                       if (index_pattern > type_row->pattern_length)
+                       if (index_pattern > type_row->pattern_length) {
+                               if (type_row->has_tag_termination &&
+                                       resource[index_stream] != '\x20' &&
+                                   resource[index_stream] != '\x3E')
+                                       continue;
+
                                return g_strdup (type_row->sniffed_type);
+                       }
                } else {
                        int j;
 
@@ -286,9 +416,6 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
                }
        }
 
-       if (for_text_or_binary)
-               return g_strdup ("application/octet-stream");
-
        for (i = 0; i < resource_length; i++) {
                if (byte_looks_binary[resource[i]])
                        return g_strdup ("application/octet-stream");
@@ -478,21 +605,25 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
 {
        const char *content_type;
        const char *x_content_type_options;
+       gboolean no_sniff = FALSE;
 
        content_type = soup_message_headers_get_content_type (msg->response_headers, params);
 
        /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
 
-       /* 1. Unknown/undefined supplied type respecting sniff-scritable. */
+       x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options");
+       if (!g_strcmp0 (x_content_type_options, "nosniff"))
+               no_sniff = TRUE;
+
+       /* 1. Unknown/undefined supplied type with sniff-scriptable = !nosniff. */
        if ((content_type == NULL) ||
            !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
            !g_ascii_strcasecmp (content_type, "application/unknown") ||
            !g_ascii_strcasecmp (content_type, "*/*"))
-               return sniff_unknown (sniffer, buffer, FALSE);
+               return sniff_unknown (sniffer, buffer, !no_sniff);
 
        /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */
-       x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options");
-       if (!g_strcmp0 (x_content_type_options, "nosniff"))
+       if (no_sniff)
                return g_strdup (content_type);
 
        /* 3. check-for-apache-bug */
diff --git a/tests/resources/html_binary.html b/tests/resources/html_binary.html
index 9200dd4..d443048 100644
--- a/tests/resources/html_binary.html
+++ b/tests/resources/html_binary.html
@@ -1 +1 @@
-<HTML 
+<HTML


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]