[totem-pl-parser] podcast: Always prefer recent date as publish date for feeds



commit d8021d71d32764e7d67c5b3650953707619de392
Author: crvi <crvisqr gmail com>
Date:   Thu Oct 1 00:44:26 2020 +0530

    podcast: Always prefer recent date as publish date for feeds
    
    For RSS feed channels, <lastBuildDate> and <pubDate> do have
    different meanings; please refer to [1] for more details. But podcast
    providers use them in their own ways. Some use only <pubDate>
    (e.g. podbean), some only <lastBuildDate> (e.g. anchor.fm,
    buzzsprout), and some use both (e.g. podigee, soundcloud). Podigee
    seems to update <lastBuildDate> for any modification to the feed
    contents (including episode additions), which in fact should be
    covered by <pubDate>.
    
    In short, we are only interested in the more recent timestamp of these
    two tags when both are available.
    
    [1] https://www.rssboard.org/rss-profile#element-channel-lastbuilddate

 plparse/tests/parser.c                | 33 +++++++++++++++++++++++++++++++++
 plparse/tests/podcast-image-url.1.rss |  4 ++--
 plparse/totem-pl-parser-podcast.c     | 25 +++++++++++++++++++++++--
 3 files changed, 58 insertions(+), 4 deletions(-)
---
diff --git a/plparse/tests/parser.c b/plparse/tests/parser.c
index ca97945..f71091d 100644
--- a/plparse/tests/parser.c
+++ b/plparse/tests/parser.c
@@ -883,6 +883,38 @@ test_parsing_item_image (void)
        g_free (uri);
 }
 
+/*
+ * Check the feed-level publish date reported for podcast feeds that carry
+ * neither, one, or both of <lastBuildDate> and <pubDate>, with the two tags
+ * appearing in either order when both are present.
+ */
+static void
+test_parsing_feed_pubdate (void)
+{
+       char *uri;
+
+       /* no <lastBuildDate> or <pubDate> */
+       uri = get_relative_uri (TEST_SRCDIR "585407.rss");
+       g_assert_cmpstr (parser_test_get_playlist_field (uri, TOTEM_PL_PARSER_FIELD_PUB_DATE), ==, NULL);
+       g_free (uri);
+
+       /* only <lastBuildDate> */
+       uri = get_relative_uri (TEST_SRCDIR "791154-kqed.rss");
+       g_assert_cmpstr (parser_test_get_playlist_field (uri, TOTEM_PL_PARSER_FIELD_PUB_DATE), ==, "Mon, 04 Dec 2017 08:01:09 +0000");
+       g_free (uri);
+
+       /* same <lastBuildDate> and <pubDate> */
+       uri = get_relative_uri (TEST_SRCDIR "560051.xml");
+       g_assert_cmpstr (parser_test_get_playlist_field (uri, TOTEM_PL_PARSER_FIELD_PUB_DATE), ==, "Mon, 8 Dec 2008 13:20:00 CST");
+       g_free (uri);
+
+       /* <pubDate> followed by <lastBuildDate> */
+       uri = get_relative_uri (TEST_SRCDIR "podcast-empty-description.rss");
+       g_assert_cmpstr (parser_test_get_playlist_field (uri, TOTEM_PL_PARSER_FIELD_PUB_DATE), ==, "Sun, 26 Jul 2020 20:07:40 +0000");
+       g_free (uri);
+
+       /* <lastBuildDate> followed by <pubDate>; the fixture's <pubDate> is the older of the two */
+       uri = get_relative_uri (TEST_SRCDIR "podcast-image-url.1.rss");
+       g_assert_cmpstr (parser_test_get_playlist_field (uri, TOTEM_PL_PARSER_FIELD_PUB_DATE), ==, "Wed, 23 Aug 2017 01:55:17 +0000");
+       g_free (uri);
+
+}
+
 static void
 test_parsing_hadess (void)
 {
@@ -1568,6 +1600,7 @@ main (int argc, char *argv[])
                g_test_add_func ("/parser/parsing/podcast_item_description", test_parsing_item_description);
                g_test_add_func ("/parser/parsing/podcast_feed_image", test_parsing_feed_image);
                g_test_add_func ("/parser/parsing/podcast_item_image", test_parsing_item_image);
+               g_test_add_func ("/parser/parsing/podcast_feed_pubdate", test_parsing_feed_pubdate);
                g_test_add_func ("/parser/parsing/live_streaming", test_parsing_live_streaming);
                g_test_add_func ("/parser/parsing/xml_mixed_cdata", test_parsing_xml_mixed_cdata);
                g_test_add_func ("/parser/parsing/m3u_streaming", test_parsing_m3u_streaming);
diff --git a/plparse/tests/podcast-image-url.1.rss b/plparse/tests/podcast-image-url.1.rss
index a28eb88..b331bb9 100644
--- a/plparse/tests/podcast-image-url.1.rss
+++ b/plparse/tests/podcast-image-url.1.rss
@@ -5,8 +5,8 @@
         <atom:link href="http://feeds.soundcloud.com/users/soundcloud:users:320899690/sounds.rss?before=336780890" rel="next" type="application/rss+xml"/>
         <title>Exit Poll New England</title>
         <link>http://soundcloud.com/exitpollnewengland</link>
-        <pubDate>Wed, 23 Aug 2017 01:55:17 +0000</pubDate>
         <lastBuildDate>Wed, 23 Aug 2017 01:55:17 +0000</lastBuildDate>
+        <pubDate>Mon, 07 Aug 2017 02:08:50 +0000</pubDate>
         <ttl>60</ttl>
         <language>en</language>
         <copyright>All rights reserved</copyright>
@@ -54,4 +54,4 @@
       <itunes:image href="http://i1.sndcdn.com/artworks-000237209681-dqpcbk-t3000x3000.jpg"/>
     </item>
       </channel>
-    </rss>
\ No newline at end of file
+    </rss>
diff --git a/plparse/totem-pl-parser-podcast.c b/plparse/totem-pl-parser-podcast.c
index 237795e..3c445eb 100644
--- a/plparse/totem-pl-parser-podcast.c
+++ b/plparse/totem-pl-parser-podcast.c
@@ -142,6 +142,26 @@ set_longer_description (xml_node_t *node, const char **description)
        }
 }
 
+/*
+ * Point *date at node's text when it is the better candidate for the feed's
+ * publish date.
+ *
+ * @node: the <lastBuildDate> or <pubDate> element being visited; ignored
+ *        when it carries no text
+ * @date: in/out; the date string chosen so far, or NULL if none was seen yet
+ *
+ * The first date seen is taken as-is. After that, a candidate replaces the
+ * stored date only when the candidate parses and is strictly more recent, or
+ * when the stored date itself does not parse. No string is copied: *date
+ * ends up aliasing node->data.
+ */
+static void
+set_recent_date (xml_node_t *node, const char **date)
+{
+       /*
+        * totem_pl_parser_parse_date() is documented to return -1 on error;
+        * in its unsigned return type that is the largest possible value, so
+        * it must be checked explicitly rather than compared as a timestamp.
+        */
+       const guint64 parse_failed = (guint64) -1;
+
+       if (node->data == NULL)
+               return;
+
+       if (*date) {
+               guint64 old, new;
+
+               old = totem_pl_parser_parse_date (*date, FALSE);
+               new = totem_pl_parser_parse_date (node->data, FALSE);
+
+               /* never let an unparseable candidate displace the stored date */
+               if (new == parse_failed)
+                       return;
+
+               /* prefer recent date; an unparseable stored date always loses */
+               if (old != parse_failed && new <= old)
+                       return;
+       }
+
+       *date = node->data;
+}
+
 static TotemPlParserResult
 parse_rss_item (TotemPlParser *parser, xml_node_t *parent)
 {
@@ -312,8 +332,9 @@ parse_rss_items (TotemPlParser *parser, const char *uri, xml_node_t *parent)
                        if (href != NULL)
                                img = href;
                } else if (g_ascii_strcasecmp (node->name, "lastBuildDate") == 0
-                        || g_ascii_strcasecmp (node->name, "pubDate") == 0) {
-                       pub_date = node->data;
+                          || (g_ascii_strcasecmp (node->name, "pubDate") == 0)) {
+                       /* prefer recent of <lastBuildDate> and <pubDate> date */
+                       set_recent_date (node, &pub_date);
                } else if (g_ascii_strcasecmp (node->name, "copyright") == 0) {
                        copyright = node->data;
                }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]