[libsoup: 7/10] sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec
- From: Dan Winship <danw src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libsoup: 7/10] sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec
- Date: Mon, 17 Feb 2014 17:30:06 +0000 (UTC)
commit cd4f6a94f9275670091326a5aec8a07bce7f8d79
Author: Gustavo Noronha Silva <gns gnome org>
Date: Mon Dec 9 16:20:02 2013 +0100
sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec
* decide between feed and HTML before doing the image sniffing, to match the spec
* use const char* and g_str_has_prefix for comparisons to make it
more legible
* deal with rdf:RDF tags
libsoup/soup-content-sniffer.c | 117 +++++++++++++++++++++++++++-------------
tests/resources/feed.rdf | 32 +++++++++++
tests/sniffing-test.c | 13 +++--
tests/soup-tests.gresource.xml | 1 +
4 files changed, 120 insertions(+), 43 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index 154df84..5b768bb 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -491,10 +491,26 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
return g_strdup (content_type);
}
+/*
+ * skip_insignificant_space:
+ * @resource: the bytes being sniffed
+ * @pos: in/out index into @resource; advanced past any run of
+ *   tab (0x09), space (0x20), LF (0x0A) and CR (0x0D) bytes
+ * @resource_length: number of bytes of @resource that may be examined
+ *
+ * Returns: %TRUE if the end of the examinable data was reached, i.e.
+ * there is no byte left to inspect at @pos; %FALSE if @pos now indexes
+ * a non-whitespace byte inside the data.
+ */
+static gboolean
+skip_insignificant_space (const char *resource, int *pos, int resource_length)
+{
+	/* Test the bound before every read: valid indices are
+	 * 0 .. resource_length - 1, so resource[resource_length] must
+	 * never be dereferenced. */
+	while ((*pos < resource_length) &&
+	       ((resource[*pos] == '\x09') ||
+	        (resource[*pos] == '\x20') ||
+	        (resource[*pos] == '\x0A') ||
+	        (resource[*pos] == '\x0D')))
+		*pos = *pos + 1;
+
+	/* Callers index resource[*pos] right after a FALSE return, so
+	 * landing exactly on resource_length must count as "ran out". */
+	return *pos >= resource_length;
+}
+
static char*
sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
{
- const guchar *resource = (const guchar *)buffer->data;
+ const char *resource = (const char *)buffer->data;
int resource_length = MIN (512, buffer->length);
int pos = 0;
@@ -509,19 +525,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if (pos > resource_length)
goto text_html;
- /* Skip insignificant white space */
- while ((resource[pos] == '\x09') ||
- (resource[pos] == '\x20') ||
- (resource[pos] == '\x0A') ||
- (resource[pos] == '\x0D')) {
- pos++;
-
- if (pos > resource_length)
- goto text_html;
- }
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
- /* != < */
- if (resource[pos] != '\x3C')
+ if (resource[pos] != '<')
return g_strdup ("text/html");
pos++;
@@ -529,73 +536,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if ((pos + 2) > resource_length)
goto text_html;
- /* Skipping comments */
- if ((resource[pos] == '\x2D') ||
- (resource[pos+1] == '\x2D') ||
- (resource[pos+2] == '\x3E')) {
+ /* Skip comments. */
+ if (g_str_has_prefix (resource + pos, "!--")) {
pos = pos + 3;
if ((pos + 2) > resource_length)
goto text_html;
- while ((resource[pos] != '\x2D') &&
- (resource[pos+1] != '\x2D') &&
- (resource[pos+2] != '\x3E')) {
+ while (!g_str_has_prefix (resource + pos, "-->")) {
pos++;
if ((pos + 2) > resource_length)
goto text_html;
}
+ pos = pos + 3;
+
goto look_for_tag;
}
if (pos > resource_length)
goto text_html;
- /* == ! */
- if (resource[pos] == '\x21') {
+ if (resource[pos] == '!') {
do {
pos++;
if (pos > resource_length)
goto text_html;
- } while (resource[pos] != '\x3E');
+ } while (resource[pos] != '>');
pos++;
goto look_for_tag;
- } else if (resource[pos] == '\x3F') { /* ? */
+ } else if (resource[pos] == '?') {
do {
pos++;
if ((pos + 1) > resource_length)
goto text_html;
- } while ((resource[pos] != '\x3F') &&
- (resource[pos+1] != '\x3E'));
+ } while (!g_str_has_prefix (resource + pos, "?>"));
pos = pos + 2;
goto look_for_tag;
}
- if ((pos + 2) > resource_length)
+ if ((pos + 3) > resource_length)
goto text_html;
- if ((resource[pos] == '\x72') &&
- (resource[pos+1] == '\x73') &&
- (resource[pos+2] == '\x73'))
+ if (g_str_has_prefix (resource + pos, "rss"))
return g_strdup ("application/rss+xml");
- if ((pos + 3) > resource_length)
+ if ((pos + 4) > resource_length)
goto text_html;
- if ((resource[pos] == '\x66') &&
- (resource[pos+1] == '\x65') &&
- (resource[pos+2] == '\x65') &&
- (resource[pos+3] == '\x64'))
+ if (g_str_has_prefix (resource + pos, "feed"))
return g_strdup ("application/atom+xml");
+ if ((pos + 7) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
+ pos = pos + 7;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 32) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
+ pos = pos + 32;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 55) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos,
"xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
+ return g_strdup ("application/rss+xml");
+ }
+
+ if ((pos + 55) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos,
"xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
+ pos = pos + 55;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 32) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
+ return g_strdup ("application/rss+xml");
+ }
+ }
+
text_html:
return g_strdup ("text/html");
}
@@ -641,6 +681,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
!g_ascii_strcasecmp (content_type, "application/xml"))
return g_strdup (content_type);
+ /* 5. Distinguish feed from HTML. */
+ if (!g_ascii_strcasecmp (content_type, "text/html"))
+ return sniff_feed_or_html (sniffer, buffer);
+
/* 2.7.5 Content-Type sniffing: image
* The spec says:
*
@@ -659,9 +703,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
return sniff_text_or_binary (sniffer, buffer);
}
- if (!g_ascii_strcasecmp (content_type, "text/html"))
- return sniff_feed_or_html (sniffer, buffer);
-
return g_strdup (content_type);
}
diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf
new file mode 100644
index 0000000..f3d9e27
--- /dev/null
+++ b/tests/resources/feed.rdf
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<!-- RDF Site Summary (RSS) 1.0
+ http://groups.yahoo.com/group/rss-dev/files/specification.html
+ Section 5.3
+ -->
+
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns="http://purl.org/rss/1.0/">
+
+ <channel rdf:about="http://www.xml.com/xml/news.rss">
+ <title>XML.com</title>
+ <link>http://xml.com/pub</link>
+ <description>
+ XML.com features a rich mix of information and services
+ for the XML community.
+ </description>
+
+ <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" />
+
+ <items>
+ <rdf:Seq>
+ <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" />
+ <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" />
+ </rdf:Seq>
+ </items>
+
+ <textinput rdf:resource="http://search.xml.com" />
+ </channel>
+
+</rdf:RDF>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 2dc9fb2..498df97 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -539,11 +539,6 @@ main (int argc, char **argv)
"type/application_xml/home.gif => application/xml",
do_sniffing_test);
- /* Test the image sniffing path */
- g_test_add_data_func ("/sniffing/type/image",
- "type/image_png/home.gif => image/gif",
- do_sniffing_test);
-
/* Test the feed or html path */
g_test_add_data_func ("/sniffing/type/html/html",
"type/text_html/test.html => text/html",
@@ -554,6 +549,14 @@ main (int argc, char **argv)
g_test_add_data_func ("/sniffing/type/html/atom",
"type/text_html/atom.xml => application/atom+xml",
do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/html/rdf",
+ "type/text_html/feed.rdf => application/rss+xml",
+ do_sniffing_test);
+
+ /* Test the image sniffing path */
+ g_test_add_data_func ("/sniffing/type/image",
+ "type/image_png/home.gif => image/gif",
+ do_sniffing_test);
/* The spec tells us to only use the last Content-Type header */
g_test_add_data_func ("/sniffing/multiple-headers",
diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml
index d24a04b..320cd63 100644
--- a/tests/soup-tests.gresource.xml
+++ b/tests/soup-tests.gresource.xml
@@ -3,6 +3,7 @@
<gresource prefix="/org/gnome/libsoup/tests">
<file>index.txt</file>
<file>resources/atom.xml</file>
+ <file>resources/feed.rdf</file>
<file>resources/home.gif</file>
<file>resources/html_binary.html</file>
<file>resources/leading_space.html</file>
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]