[libsoup/content-sniffing-update: 6/7] Bring feed vs HTML up-to-date with the MIMESNIFF spec
- From: Gustavo Noronha Silva <gns src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libsoup/content-sniffing-update: 6/7] Bring feed vs HTML up-to-date with the MIMESNIFF spec
- Date: Tue, 10 Dec 2013 16:31:48 +0000 (UTC)
commit 6fcb86b308d2402cb8da84548c3b0f3bcc5276c6
Author: Gustavo Noronha Silva <gns gnome org>
Date: Mon Dec 9 16:20:02 2013 +0100
Bring feed vs HTML up-to-date with the MIMESNIFF spec
* decide on that before doing the image sniffing to match the spec
* use const gchar* and g_str_has_prefix for comparisons to make it
more legible
* deal with rdf:RDF tags
libsoup/soup-content-sniffer.c | 116 +++++++++++++++++++++++++++-------------
tests/resources/feed.rdf | 32 +++++++++++
tests/sniffing-test.c | 9 ++--
3 files changed, 115 insertions(+), 42 deletions(-)
---
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index d5deb43..55ac5e8 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -490,10 +490,25 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
return g_strdup (content_type);
}
+static gboolean skip_insignificant_space (const gchar *resource, int *pos, int resource_length)
+{
+ while ((resource[*pos] == '\x09') ||
+ (resource[*pos] == '\x20') ||
+ (resource[*pos] == '\x0A') ||
+ (resource[*pos] == '\x0D')) {
+ *pos = *pos + 1;
+
+ if (*pos > resource_length)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
static char*
sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
{
- const guchar *resource = (const guchar *)buffer->data;
+ const gchar *resource = (const gchar *)buffer->data;
int resource_length = MIN (512, buffer->length);
int pos = 0;
@@ -508,19 +523,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if (pos > resource_length)
goto text_html;
- /* Skip insignificant white space */
- while ((resource[pos] == '\x09') ||
- (resource[pos] == '\x20') ||
- (resource[pos] == '\x0A') ||
- (resource[pos] == '\x0D')) {
- pos++;
-
- if (pos > resource_length)
- goto text_html;
- }
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
- /* != < */
- if (resource[pos] != '\x3C')
+ if (resource[pos] != '<')
return g_strdup ("text/html");
pos++;
@@ -528,73 +534,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if ((pos + 2) > resource_length)
goto text_html;
- /* Skipping comments */
- if ((resource[pos] == '\x2D') ||
- (resource[pos+1] == '\x2D') ||
- (resource[pos+2] == '\x3E')) {
+ /* Skip comments. */
+ if (g_str_has_prefix (resource + pos, "!--")) {
pos = pos + 3;
if ((pos + 2) > resource_length)
goto text_html;
- while ((resource[pos] != '\x2D') &&
- (resource[pos+1] != '\x2D') &&
- (resource[pos+2] != '\x3E')) {
+ while (!g_str_has_prefix (resource + pos, "-->")) {
pos++;
if ((pos + 2) > resource_length)
goto text_html;
}
+ pos = pos + 3;
+
goto look_for_tag;
}
if (pos > resource_length)
goto text_html;
- /* == ! */
- if (resource[pos] == '\x21') {
+ if (resource[pos] == '!') {
do {
pos++;
if (pos > resource_length)
goto text_html;
- } while (resource[pos] != '\x3E');
+ } while (resource[pos] != '>');
pos++;
goto look_for_tag;
- } else if (resource[pos] == '\x3F') { /* ? */
+ } else if (resource[pos] == '?') {
do {
pos++;
if ((pos + 1) > resource_length)
goto text_html;
- } while ((resource[pos] != '\x3F') &&
- (resource[pos+1] != '\x3E'));
+ } while (!g_str_has_prefix (resource + pos, "?>"));
pos = pos + 2;
goto look_for_tag;
}
- if ((pos + 2) > resource_length)
+ if ((pos + 3) > resource_length)
goto text_html;
- if ((resource[pos] == '\x72') &&
- (resource[pos+1] == '\x73') &&
- (resource[pos+2] == '\x73'))
+ if (g_str_has_prefix (resource + pos, "rss"))
return g_strdup ("application/rss+xml");
- if ((pos + 3) > resource_length)
+ if ((pos + 4) > resource_length)
goto text_html;
- if ((resource[pos] == '\x66') &&
- (resource[pos+1] == '\x65') &&
- (resource[pos+2] == '\x65') &&
- (resource[pos+3] == '\x64'))
+ if (g_str_has_prefix (resource + pos, "feed"))
return g_strdup ("application/atom+xml");
+ if ((pos + 7) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
+ pos = pos + 7;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 32) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
+ pos = pos + 32;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 55) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos,
"xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
+ return g_strdup ("application/rss+xml");
+ }
+
+ if ((pos + 55) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos,
"xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
+ pos = pos + 55;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 32) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
+ return g_strdup ("application/rss+xml");
+ }
+ }
+
text_html:
return g_strdup ("text/html");
}
@@ -641,6 +680,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
!g_ascii_strcasecmp (content_type, "application/xml"))
return g_strdup (content_type);
+ /* 5. Distinguish feed from HTML. */
+ if (!g_ascii_strcasecmp (content_type, "text/html"))
+ return sniff_feed_or_html (sniffer, buffer);
+
/* 2.7.5 Content-Type sniffing: image
* The spec says:
*
@@ -659,9 +702,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
return sniff_text_or_binary (sniffer, buffer);
}
- if (!g_ascii_strcasecmp (content_type, "text/html"))
- return sniff_feed_or_html (sniffer, buffer);
-
return g_strdup (content_type);
}
diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf
new file mode 100644
index 0000000..f3d9e27
--- /dev/null
+++ b/tests/resources/feed.rdf
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<!-- RDF Site Summary (RSS) 1.0
+ http://groups.yahoo.com/group/rss-dev/files/specification.html
+ Section 5.3
+ -->
+
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns="http://purl.org/rss/1.0/">
+
+ <channel rdf:about="http://www.xml.com/xml/news.rss">
+ <title>XML.com</title>
+ <link>http://xml.com/pub</link>
+ <description>
+ XML.com features a rich mix of information and services
+ for the XML community.
+ </description>
+
+ <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" />
+
+ <items>
+ <rdf:Seq>
+ <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" />
+ <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" />
+ </rdf:Seq>
+ </items>
+
+ <textinput rdf:resource="http://search.xml.com" />
+ </channel>
+
+</rdf:RDF>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index ab8dbf3..c1fe56b 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -586,15 +586,16 @@ main (int argc, char **argv)
test_sniffing ("/type/anice_type+xml/home.gif", "anice/type+xml");
test_sniffing ("/type/application_xml/home.gif", "application/xml");
- /* Test the image sniffing path */
-
- test_sniffing ("/type/image_png/home.gif", "image/gif");
-
/* Test the feed or html path */
test_sniffing ("/type/text_html/test.html", "text/html");
test_sniffing ("/type/text_html/rss20.xml", "application/rss+xml");
test_sniffing ("/type/text_html/atom.xml", "application/atom+xml");
+ test_sniffing ("/type/text_html/feed.rdf", "application/rss+xml");
+
+ /* Test the image sniffing path */
+
+ test_sniffing ("/type/image_png/home.gif", "image/gif");
/* The spec tells us to only use the last Content-Type header */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]