[libsoup/content-sniffing-update: 6/8] Bring feed vs HTML up-to-date with the MIMESNIFF spec

From: Gustavo Noronha Silva <gns src gnome org>
To: commits-list gnome org
Cc:
Subject: [libsoup/content-sniffing-update: 6/8] Bring feed vs HTML up-to-date with the MIMESNIFF spec
Date: Tue, 10 Dec 2013 18:48:38 +0000 (UTC)
commit 6fcb86b308d2402cb8da84548c3b0f3bcc5276c6
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Mon Dec 9 16:20:02 2013 +0100

    Bring feed vs HTML up-to-date with the MIMESNIFF spec
    
    * decide on that before doing the image sniffing to match the spec
    * use const gchar* and g_str_has_prefix for comparisons to make it
      more legible
    * deal with rdf:RDF tags

 libsoup/soup-content-sniffer.c |  116 +++++++++++++++++++++++++++-------------
 tests/resources/feed.rdf       |   32 +++++++++++
 tests/sniffing-test.c          |    9 ++--
 3 files changed, 115 insertions(+), 42 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index d5deb43..55ac5e8 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -490,10 +490,25 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
        return g_strdup (content_type);
 }
 
+static gboolean skip_insignificant_space (const gchar *resource, int *pos, int resource_length)
+{
+       while ((resource[*pos] == '\x09') ||
+              (resource[*pos] == '\x20') ||
+              (resource[*pos] == '\x0A') ||
+              (resource[*pos] == '\x0D')) {
+               *pos = *pos + 1;
+
+               if (*pos > resource_length)
+                       return TRUE;
+       }
+
+       return FALSE;
+}
+
 static char*
 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
 {
-       const guchar *resource = (const guchar *)buffer->data;
+       const gchar *resource = (const gchar *)buffer->data;
        int resource_length = MIN (512, buffer->length);
        int pos = 0;
 
@@ -508,19 +523,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        if (pos > resource_length)
                goto text_html;
 
-       /* Skip insignificant white space */
-       while ((resource[pos] == '\x09') ||
-              (resource[pos] == '\x20') ||
-              (resource[pos] == '\x0A') ||
-              (resource[pos] == '\x0D')) {
-               pos++;
-
-               if (pos > resource_length)
-                       goto text_html;
-       }
+       if (skip_insignificant_space (resource, &pos, resource_length))
+               goto text_html;
 
-       /* != < */
-       if (resource[pos] != '\x3C')
+       if (resource[pos] != '<')
                return g_strdup ("text/html");
 
        pos++;
@@ -528,73 +534,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        if ((pos + 2) > resource_length)
                goto text_html;
 
-       /* Skipping comments */
-       if ((resource[pos] == '\x2D') ||
-           (resource[pos+1] == '\x2D') ||
-           (resource[pos+2] == '\x3E')) {
+       /* Skip comments. */
+       if (g_str_has_prefix (resource + pos, "!--")) {
                pos = pos + 3;
 
                if ((pos + 2) > resource_length)
                        goto text_html;
 
-               while ((resource[pos] != '\x2D') &&
-                      (resource[pos+1] != '\x2D') &&
-                      (resource[pos+2] != '\x3E')) {
+               while (!g_str_has_prefix (resource + pos, "-->")) {
                        pos++;
 
                        if ((pos + 2) > resource_length)
                                goto text_html;
                }
 
+               pos = pos + 3;
+
                goto look_for_tag;
        }
 
        if (pos > resource_length)
                goto text_html;
 
-       /* == ! */
-       if (resource[pos] == '\x21') {
+       if (resource[pos] == '!') {
                do {
                        pos++;
 
                        if (pos > resource_length)
                                goto text_html;
-               } while (resource[pos] != '\x3E');
+               } while (resource[pos] != '>');
 
                pos++;
 
                goto look_for_tag;
-       } else if (resource[pos] == '\x3F') { /* ? */
+       } else if (resource[pos] == '?') {
                do {
                        pos++;
 
                        if ((pos + 1) > resource_length)
                                goto text_html;
-               } while ((resource[pos] != '\x3F') &&
-                        (resource[pos+1] != '\x3E'));
+               } while (!g_str_has_prefix (resource + pos, "?>"));
 
                pos = pos + 2;
 
                goto look_for_tag;
        }
 
-       if ((pos + 2) > resource_length)
+       if ((pos + 3) > resource_length)
                goto text_html;
 
-       if ((resource[pos] == '\x72') &&
-           (resource[pos+1] == '\x73') &&
-           (resource[pos+2] == '\x73'))
+       if (g_str_has_prefix (resource + pos, "rss"))
                return g_strdup ("application/rss+xml");
 
-       if ((pos + 3) > resource_length)
+       if ((pos + 4) > resource_length)
                goto text_html;
 
-       if ((resource[pos] == '\x66') &&
-           (resource[pos+1] == '\x65') &&
-           (resource[pos+2] == '\x65') &&
-           (resource[pos+3] == '\x64'))
+       if (g_str_has_prefix (resource + pos, "feed"))
                return g_strdup ("application/atom+xml");
 
+       if ((pos + 7) > resource_length)
+               goto text_html;
+
+       if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
+               pos = pos + 7;
+
+               if (skip_insignificant_space (resource, &pos, resource_length))
+                       goto text_html;
+
+               if ((pos + 32) > resource_length)
+                       goto text_html;
+
+               if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
+                       pos = pos + 32;
+
+                       if (skip_insignificant_space (resource, &pos, resource_length))
+                               goto text_html;
+
+                       if ((pos + 55) > resource_length)
+                               goto text_html;
+
+                       if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
+                               return g_strdup ("application/rss+xml");
+               }
+
+               if ((pos + 55) > resource_length)
+                       goto text_html;
+
+               if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
+                       pos = pos + 55;
+
+                       if (skip_insignificant_space (resource, &pos, resource_length))
+                               goto text_html;
+
+                       if ((pos + 32) > resource_length)
+                               goto text_html;
+
+                       if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
+                               return g_strdup ("application/rss+xml");
+               }
+       }
+
  text_html:
        return g_strdup ("text/html");
 }
@@ -641,6 +680,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
            !g_ascii_strcasecmp (content_type, "application/xml"))
                return g_strdup (content_type);
 
+       /* 5. Distinguish feed from HTML. */
+       if (!g_ascii_strcasecmp (content_type, "text/html"))
+               return sniff_feed_or_html (sniffer, buffer);
+
        /* 2.7.5 Content-Type sniffing: image
         * The spec says:
         *
@@ -659,9 +702,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
                return sniff_text_or_binary (sniffer, buffer);
        }
 
-       if (!g_ascii_strcasecmp (content_type, "text/html"))
-               return sniff_feed_or_html (sniffer, buffer);
-
        return g_strdup (content_type);
 }
 
diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf
new file mode 100644
index 0000000..f3d9e27
--- /dev/null
+++ b/tests/resources/feed.rdf
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<!-- RDF Site Summary (RSS) 1.0
+     http://groups.yahoo.com/group/rss-dev/files/specification.html
+     Section 5.3
+  -->
+
+<rdf:RDF 
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns="http://purl.org/rss/1.0/">
+
+  <channel rdf:about="http://www.xml.com/xml/news.rss">
+    <title>XML.com</title>
+    <link>http://xml.com/pub</link>
+    <description>
+    XML.com features a rich mix of information and services 
+      for the XML community.
+    </description>
+
+    <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" />
+
+    <items>
+      <rdf:Seq>
+        <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" />
+        <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" />
+      </rdf:Seq>
+    </items>
+
+    <textinput rdf:resource="http://search.xml.com" />
+  </channel>
+
+</rdf:RDF>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index ab8dbf3..c1fe56b 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -586,15 +586,16 @@ main (int argc, char **argv)
        test_sniffing ("/type/anice_type+xml/home.gif", "anice/type+xml");
        test_sniffing ("/type/application_xml/home.gif", "application/xml");
 
-       /* Test the image sniffing path */
-
-       test_sniffing ("/type/image_png/home.gif", "image/gif");
-
        /* Test the feed or html path */
 
        test_sniffing ("/type/text_html/test.html", "text/html");
        test_sniffing ("/type/text_html/rss20.xml", "application/rss+xml");
        test_sniffing ("/type/text_html/atom.xml", "application/atom+xml");
+       test_sniffing ("/type/text_html/feed.rdf", "application/rss+xml");
+
+       /* Test the image sniffing path */
+
+       test_sniffing ("/type/image_png/home.gif", "image/gif");
 
        /* The spec tells us to only use the last Content-Type header */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]