[libsoup: 7/10] sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec



commit cd4f6a94f9275670091326a5aec8a07bce7f8d79
Author: Gustavo Noronha Silva <gns gnome org>
Date:   Mon Dec 9 16:20:02 2013 +0100

    sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec
    
    * decide on that before doing the image sniffing to match the spec
    * use const char* and g_str_has_prefix for comparisons to make it
      more legible
    * deal with rdf:RDF tags

 libsoup/soup-content-sniffer.c |  117 +++++++++++++++++++++++++++-------------
 tests/resources/feed.rdf       |   32 +++++++++++
 tests/sniffing-test.c          |   13 +++--
 tests/soup-tests.gresource.xml |    1 +
 4 files changed, 120 insertions(+), 43 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 154df84..5b768bb 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -491,10 +491,26 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
        return g_strdup (content_type);
 }
 
+static gboolean
+skip_insignificant_space (const char *resource, int *pos, int resource_length)
+{
+       while ((resource[*pos] == '\x09') ||
+              (resource[*pos] == '\x20') ||
+              (resource[*pos] == '\x0A') ||
+              (resource[*pos] == '\x0D')) {
+               *pos = *pos + 1;
+
+               if (*pos > resource_length)
+                       return TRUE;
+       }
+
+       return FALSE;
+}
+
 static char*
 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
 {
-       const guchar *resource = (const guchar *)buffer->data;
+       const char *resource = (const char *)buffer->data;
        int resource_length = MIN (512, buffer->length);
        int pos = 0;
 
@@ -509,19 +525,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        if (pos > resource_length)
                goto text_html;
 
-       /* Skip insignificant white space */
-       while ((resource[pos] == '\x09') ||
-              (resource[pos] == '\x20') ||
-              (resource[pos] == '\x0A') ||
-              (resource[pos] == '\x0D')) {
-               pos++;
-
-               if (pos > resource_length)
-                       goto text_html;
-       }
+       if (skip_insignificant_space (resource, &pos, resource_length))
+               goto text_html;
 
-       /* != < */
-       if (resource[pos] != '\x3C')
+       if (resource[pos] != '<')
                return g_strdup ("text/html");
 
        pos++;
@@ -529,73 +536,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
        if ((pos + 2) > resource_length)
                goto text_html;
 
-       /* Skipping comments */
-       if ((resource[pos] == '\x2D') ||
-           (resource[pos+1] == '\x2D') ||
-           (resource[pos+2] == '\x3E')) {
+       /* Skip comments. */
+       if (g_str_has_prefix (resource + pos, "!--")) {
                pos = pos + 3;
 
                if ((pos + 2) > resource_length)
                        goto text_html;
 
-               while ((resource[pos] != '\x2D') &&
-                      (resource[pos+1] != '\x2D') &&
-                      (resource[pos+2] != '\x3E')) {
+               while (!g_str_has_prefix (resource + pos, "-->")) {
                        pos++;
 
                        if ((pos + 2) > resource_length)
                                goto text_html;
                }
 
+               pos = pos + 3;
+
                goto look_for_tag;
        }
 
        if (pos > resource_length)
                goto text_html;
 
-       /* == ! */
-       if (resource[pos] == '\x21') {
+       if (resource[pos] == '!') {
                do {
                        pos++;
 
                        if (pos > resource_length)
                                goto text_html;
-               } while (resource[pos] != '\x3E');
+               } while (resource[pos] != '>');
 
                pos++;
 
                goto look_for_tag;
-       } else if (resource[pos] == '\x3F') { /* ? */
+       } else if (resource[pos] == '?') {
                do {
                        pos++;
 
                        if ((pos + 1) > resource_length)
                                goto text_html;
-               } while ((resource[pos] != '\x3F') &&
-                        (resource[pos+1] != '\x3E'));
+               } while (!g_str_has_prefix (resource + pos, "?>"));
 
                pos = pos + 2;
 
                goto look_for_tag;
        }
 
-       if ((pos + 2) > resource_length)
+       if ((pos + 3) > resource_length)
                goto text_html;
 
-       if ((resource[pos] == '\x72') &&
-           (resource[pos+1] == '\x73') &&
-           (resource[pos+2] == '\x73'))
+       if (g_str_has_prefix (resource + pos, "rss"))
                return g_strdup ("application/rss+xml");
 
-       if ((pos + 3) > resource_length)
+       if ((pos + 4) > resource_length)
                goto text_html;
 
-       if ((resource[pos] == '\x66') &&
-           (resource[pos+1] == '\x65') &&
-           (resource[pos+2] == '\x65') &&
-           (resource[pos+3] == '\x64'))
+       if (g_str_has_prefix (resource + pos, "feed"))
                return g_strdup ("application/atom+xml");
 
+       if ((pos + 7) > resource_length)
+               goto text_html;
+
+       if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
+               pos = pos + 7;
+
+               if (skip_insignificant_space (resource, &pos, resource_length))
+                       goto text_html;
+
+               if ((pos + 32) > resource_length)
+                       goto text_html;
+
+               if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
+                       pos = pos + 32;
+
+                       if (skip_insignificant_space (resource, &pos, resource_length))
+                               goto text_html;
+
+                       if ((pos + 55) > resource_length)
+                               goto text_html;
+
+                       if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
+                               return g_strdup ("application/rss+xml");
+               }
+
+               if ((pos + 55) > resource_length)
+                       goto text_html;
+
+               if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
+                       pos = pos + 55;
+
+                       if (skip_insignificant_space (resource, &pos, resource_length))
+                               goto text_html;
+
+                       if ((pos + 32) > resource_length)
+                               goto text_html;
+
+                       if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
+                               return g_strdup ("application/rss+xml");
+               }
+       }
+
  text_html:
        return g_strdup ("text/html");
 }
@@ -641,6 +681,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
            !g_ascii_strcasecmp (content_type, "application/xml"))
                return g_strdup (content_type);
 
+       /* 5. Distinguish feed from HTML. */
+       if (!g_ascii_strcasecmp (content_type, "text/html"))
+               return sniff_feed_or_html (sniffer, buffer);
+
        /* 2.7.5 Content-Type sniffing: image
         * The spec says:
         *
@@ -659,9 +703,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
                return sniff_text_or_binary (sniffer, buffer);
        }
 
-       if (!g_ascii_strcasecmp (content_type, "text/html"))
-               return sniff_feed_or_html (sniffer, buffer);
-
        return g_strdup (content_type);
 }
 
diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf
new file mode 100644
index 0000000..f3d9e27
--- /dev/null
+++ b/tests/resources/feed.rdf
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<!-- RDF Site Summary (RSS) 1.0
+     http://groups.yahoo.com/group/rss-dev/files/specification.html
+     Section 5.3
+  -->
+
+<rdf:RDF 
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns="http://purl.org/rss/1.0/">
+
+  <channel rdf:about="http://www.xml.com/xml/news.rss">
+    <title>XML.com</title>
+    <link>http://xml.com/pub</link>
+    <description>
+    XML.com features a rich mix of information and services 
+      for the XML community.
+    </description>
+
+    <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" />
+
+    <items>
+      <rdf:Seq>
+        <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" />
+        <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" />
+      </rdf:Seq>
+    </items>
+
+    <textinput rdf:resource="http://search.xml.com" />
+  </channel>
+
+</rdf:RDF>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 2dc9fb2..498df97 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -539,11 +539,6 @@ main (int argc, char **argv)
                              "type/application_xml/home.gif => application/xml",
                              do_sniffing_test);
 
-       /* Test the image sniffing path */
-       g_test_add_data_func ("/sniffing/type/image",
-                             "type/image_png/home.gif => image/gif",
-                             do_sniffing_test);
-
        /* Test the feed or html path */
        g_test_add_data_func ("/sniffing/type/html/html",
                              "type/text_html/test.html => text/html",
@@ -554,6 +549,14 @@ main (int argc, char **argv)
        g_test_add_data_func ("/sniffing/type/html/atom",
                              "type/text_html/atom.xml => application/atom+xml",
                              do_sniffing_test);
+       g_test_add_data_func ("/sniffing/type/html/rdf",
+                             "type/text_html/feed.rdf => application/rss+xml",
+                             do_sniffing_test);
+
+       /* Test the image sniffing path */
+       g_test_add_data_func ("/sniffing/type/image",
+                             "type/image_png/home.gif => image/gif",
+                             do_sniffing_test);
 
        /* The spec tells us to only use the last Content-Type header */
        g_test_add_data_func ("/sniffing/multiple-headers",
diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml
index d24a04b..320cd63 100644
--- a/tests/soup-tests.gresource.xml
+++ b/tests/soup-tests.gresource.xml
@@ -3,6 +3,7 @@
   <gresource prefix="/org/gnome/libsoup/tests">
     <file>index.txt</file>
     <file>resources/atom.xml</file>
+    <file>resources/feed.rdf</file>
     <file>resources/home.gif</file>
     <file>resources/html_binary.html</file>
     <file>resources/leading_space.html</file>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]