[libsoup/content-sniffing-update: 7/8] Bring image sniffing up-to-date with the MIMESNIFF spec



commit a3f8439058153fd83ae722202aa9019b50a13fd3
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Tue Dec 10 17:17:40 2013 +0100

    Bring image sniffing up-to-date with the MIMESNIFF spec

 libsoup/soup-content-sniffer.c |  216 +++++++++++++++++++++-------------------
 tests/resources/home.jpg       |  Bin 0 -> 1074 bytes
 tests/resources/home.png       |  Bin 0 -> 313 bytes
 tests/resources/tux.webp       |  Bin 0 -> 17128 bytes
 tests/sniffing-test.c          |    3 +
 5 files changed, 118 insertions(+), 101 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 11211c6..3c16544 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -40,7 +40,6 @@ static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterfa
 static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
 static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);
 
-
 G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
                         G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
                                                soup_content_sniffer_session_feature_init)
@@ -77,6 +76,105 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
 {
 }
 
+typedef struct {
+       const guchar *mask;
+       const guchar *pattern;
+       guint         pattern_length;
+       const char   *sniffed_type;
+} SoupContentSnifferMediaPattern;
+
+static char*
+sniff_media (SoupContentSniffer *sniffer,
+            SoupBuffer *buffer,
+            SoupContentSnifferMediaPattern table[],
+            int table_length)
+{
+       const guchar *resource = (const guchar *)buffer->data;
+       int resource_length = MIN (512, buffer->length);
+       int i;
+
+       for (i = 0; i < table_length; i++) {
+               SoupContentSnifferMediaPattern *type_row = &(table[i]);
+               int j;
+
+               if (resource_length < type_row->pattern_length)
+                       continue;
+
+               for (j = 0; j < type_row->pattern_length; j++) {
+                       if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
+                               break;
+               }
+
+               /* This means our comparison above matched completely */
+               if (j == type_row->pattern_length)
+                       return g_strdup (type_row->sniffed_type);
+       }
+
+       return NULL;
+}
+
+/* This table is based on the MIMESNIFF spec;
+ * See 6.1 Matching an image type pattern
+ */
+static SoupContentSnifferMediaPattern image_types_table[] = {
+
+       /* Windows icon signature. */
+       { (const guchar *)"\xFF\xFF\xFF\xFF",
+         (const guchar *)"\x00\x00\x01\x00",
+         4,
+         "image/x-icon" },
+
+       /* Windows cursor signature. */
+       { (const guchar *)"\xFF\xFF\xFF\xFF",
+         (const guchar *)"\x00\x00\x02\x00",
+         4,
+         "image/x-icon" },
+
+       /* BMP. */
+       { (const guchar *)"\xFF\xFF",
+         (const guchar *)"BM",
+         2,
+         "image/bmp" },
+
+       /* GIFs. */
+       { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
+         (const guchar *)"GIF87a",
+         6,
+         "image/gif" },
+
+       { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
+         (const guchar *)"GIF89a",
+         6,
+         "image/gif" },
+
+       /* WEBP. */
+       { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
+         (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
+         14,
+         "image/webp" },
+
+       /* PNG. */
+       { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+         (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
+         8,
+         "image/png" },
+
+       /* JPEG. */
+       { (const guchar *)"\xFF\xFF\xFF",
+         (const guchar *)"\xFF\xD8\xFF",
+         3,
+         "image/jpeg" },
+};
+
+static char*
+sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer)
+{
+       return sniff_media (sniffer,
+                           buffer,
+                           image_types_table,
+                           G_N_ELEMENTS (image_types_table));
+}
+
 /* This table is based on the MIMESNIFF spec;
  * See 7.1 Identifying a resource with an unknown MIME type
  */
@@ -84,7 +182,7 @@ typedef struct {
        /* @has_ws is TRUE if @pattern contains "generic" whitespace */
        gboolean      has_ws;
        /* @has_tag_termination is TRUE if we should check for a tag-terminating
-        * byte (0x20 " " or 0x3E ">") after the pattern match. */
+                * byte (0x20 " " or 0x3E ">") after the pattern match. */
        gboolean      has_tag_termination;
        const guchar *mask;
        const guchar *pattern;
@@ -93,7 +191,6 @@ typedef struct {
        gboolean      scriptable;
 } SoupContentSnifferPattern;
 
-
 /* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
  * is allowed. Those spaces are marked with \x00 on the mask.
  */
@@ -261,66 +358,6 @@ static SoupContentSnifferPattern types_table[] = {
          4,
          "text/plain",
          FALSE },
-
-       /* Images. */
-
-       { FALSE, FALSE, /* Windows icon signature. */
-         (const guchar *)"\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x00\x00\x01\x00",
-         4,
-         "image/x-icon",
-         FALSE },
-
-       { FALSE, FALSE, /* Windows cursor signature. */
-         (const guchar *)"\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x00\x00\x02\x00",
-         4,
-         "image/x-icon",
-         FALSE },
-
-       { FALSE, FALSE, /* BMP. */
-         (const guchar *)"\xFF\xFF",
-         (const guchar *)"BM",
-         2,
-         "image/bmp",
-         FALSE },
-
-    { FALSE, FALSE, /* GIF. */
-         (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"GIF87a",
-         6,
-         "image/gif",
-         FALSE },
-
-       { FALSE, FALSE, /* GIF. */
-         (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"GIF89a",
-         6,
-         "image/gif",
-         FALSE },
-
-       { FALSE, FALSE, /* WEBP. */
-         (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
-         14,
-         "image/webp",
-         FALSE },
-
-       { FALSE, FALSE, /* PNG. */
-         (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
-         (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
-         8,
-         "image/png",
-         FALSE },
-
-       { FALSE, FALSE, /* JPEG. */
-         (const guchar *)"\xFF\xFF\xFF",
-         (const guchar *)"\xFF\xD8\xFF",
-         3,
-         "image/jpeg",
-         FALSE },
-
-       /* TODO: audio/video, archive type. */
 };
 
 /* Whether a given byte looks like it might be part of binary content.
@@ -351,6 +388,7 @@ static char*
 sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
               gboolean sniff_scriptable)
 {
+    char *sniffed_type = NULL;
        const guchar *resource = (const guchar *)buffer->data;
        int resource_length = MIN (512, buffer->length);
        int i;
@@ -416,6 +454,12 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
                }
        }
 
+       sniffed_type = sniff_images (sniffer, buffer);
+
+       if (sniffed_type != NULL)
+               return sniffed_type;
+
+
        for (i = 0; i < resource_length; i++) {
                if (byte_looks_binary[resource[i]])
                        return g_strdup ("application/octet-stream");
@@ -463,33 +507,6 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        return sniff_unknown (sniffer, buffer, TRUE);
 }
 
-static char*
-sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
-             const char *content_type)
-{
-       const guchar *resource = (const guchar *)buffer->data;
-       int resource_length = MIN (512, buffer->length);
-       int i;
-
-       for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
-               SoupContentSnifferPattern *type_row = &(types_table[i]);
-
-               if (resource_length < type_row->pattern_length)
-                       continue;
-
-               if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
-                       continue;
-
-               /* All of the image types use all-\xFF for the mask,
-                * so we can just memcmp.
-                */
-               if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
-                       return g_strdup (type_row->sniffed_type);
-       }
-
-       return g_strdup (content_type);
-}
-
 static gboolean skip_insignificant_space (const gchar *resource, int *pos, int resource_length)
 {
        while ((resource[*pos] == '\x09') ||
@@ -644,6 +661,7 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
 {
        const char *content_type;
        const char *x_content_type_options;
+       char *sniffed_type = NULL;
        gboolean no_sniff = FALSE;
 
        content_type = soup_message_headers_get_content_type (msg->response_headers, params);
@@ -683,18 +701,14 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
        if (!g_ascii_strcasecmp (content_type, "text/html"))
                return sniff_feed_or_html (sniffer, buffer);
 
-       /* 2.7.5 Content-Type sniffing: image
-        * The spec says:
-        *
-        *   If the resource's official type is "image/svg+xml", then
-        *   the sniffed type of the resource is its official type (an
-        *   XML type)
-        *
-        * The XML case is handled by the if above; if you refactor
-        * this code, keep this in mind.
+       /* 6. Image types.
         */
-       if (!g_ascii_strncasecmp (content_type, "image/", 6))
-               return sniff_images (sniffer, buffer, content_type);
+       if (!g_ascii_strncasecmp (content_type, "image/", 6)) {
+               sniffed_type = sniff_images (sniffer, buffer);
+               if (sniffed_type != NULL)
+                       return sniffed_type;
+               return g_strdup (content_type);
+       }
 
        /* If we got text/plain, use text_or_binary */
        if (g_str_equal (content_type, "text/plain")) {
diff --git a/tests/resources/home.jpg b/tests/resources/home.jpg
new file mode 100644
index 0000000..ac1f3bb
Binary files /dev/null and b/tests/resources/home.jpg differ
diff --git a/tests/resources/home.png b/tests/resources/home.png
new file mode 100644
index 0000000..0bb82ba
Binary files /dev/null and b/tests/resources/home.png differ
diff --git a/tests/resources/tux.webp b/tests/resources/tux.webp
new file mode 100644
index 0000000..8764f06
Binary files /dev/null and b/tests/resources/tux.webp differ
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index c1fe56b..d53d06f 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -596,6 +596,9 @@ main (int argc, char **argv)
        /* Test the image sniffing path */
 
        test_sniffing ("/type/image_png/home.gif", "image/gif");
+       test_sniffing ("/type/image_gif/home.png", "image/png");
+       test_sniffing ("/type/image_png/home.jpg", "image/jpeg");
+       test_sniffing ("/type/image_png/tux.webp", "image/webp");
 
        /* The spec tells us to only use the last Content-Type header */
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]