[rhythmbox] podcast: remove tags and decode entities in html podcast descriptions
- From: Jonathan Matthew <jmatthew src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [rhythmbox] podcast: remove tags and decode entities in html podcast descriptions
- Date: Sun, 19 Jun 2016 09:18:53 +0000 (UTC)
commit c1cc0d663f55ba6a4de371579a0e34293f39a504
Author: Jonathan Matthew <jonathan d14n org>
Date: Sun Jun 19 17:02:58 2016 +1000
podcast: remove tags and decode entities in html podcast descriptions
podcast/rb-podcast-properties-dialog.c | 156 +++++++++++++++++++++++++++++++-
1 files changed, 153 insertions(+), 3 deletions(-)
---
diff --git a/podcast/rb-podcast-properties-dialog.c b/podcast/rb-podcast-properties-dialog.c
index 8529ff1..22d3085 100644
--- a/podcast/rb-podcast-properties-dialog.c
+++ b/podcast/rb-podcast-properties-dialog.c
@@ -30,6 +30,7 @@
#include <string.h>
#include <time.h>
+#include <errno.h>
#include <glib/gi18n.h>
#include <gtk/gtk.h>
@@ -110,6 +111,142 @@ enum
G_DEFINE_TYPE (RBPodcastPropertiesDialog, rb_podcast_properties_dialog, GTK_TYPE_DIALOG)
+/* list of HTML-ish strings that we search for to distinguish plain text from HTML podcast
+ * descriptions. we don't really have anything else to go on - regular content type
+ * sniffing only works for proper HTML documents, but these are just tiny fragments, usually
+ * with some simple formatting tags. if we find any of these in a podcast description,
+ * we'll strip HTML tags and attempt to decode entities.
+ */
+static const char *html_clues[] = {
+ "<p>",
+ "<a ",
+ "<b>",
+ "<i>",
+ "<ul>",
+ "<br",
+ "<img ",
+ "<",
+ ">",
+ "&",
+ """,
+ "'",
+ "‘",
+ "’",
+ "“",
+ "”",
+ "&#",
+};
+
+static char *
+unhtml (const char *str)
+{
+ const char *p;
+ char *out, *o, *e;
+ enum {
+ NORMAL,
+ TAG,
+ ENTITY,
+ BAD_ENTITY
+ } state;
+ char entity[6];
+ int elen;
+
+ out = g_malloc (strlen (str) + 1);
+
+ p = str;
+ o = out;
+ state = NORMAL;
+ while (*p != '\0') {
+ switch (state) {
+ case TAG:
+ if (*p == '>') {
+ state = NORMAL;
+ }
+ break;
+
+ case BAD_ENTITY:
+ switch (*p) {
+ case ';':
+ case ' ':
+ *o++ = '?';
+ state = NORMAL;
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case ENTITY:
+ if (*p == ';' || *p == ' ') {
+ *e++ = '\0';
+ if (strncmp (entity, "amp", sizeof(entity)) == 0) {
+ *o++ = '&';
+ } else if (strncmp (entity, "lt", sizeof(entity)) == 0) {
+ *o++ = '<';
+ } else if (strncmp (entity, "quot", sizeof(entity)) == 0) {
+ *o++ = '"';
+ } else if (strncmp (entity, "apos", sizeof(entity)) == 0) {
+ *o++ = '\'';
+ } else if (strncmp (entity, "nbsp", sizeof(entity)) == 0) {
+ *o++ = ' ';
+ } else if (entity[0] == '#') {
+ int base = 10;
+ char *str = entity + 1;
+ char *end = NULL;
+ gulong l;
+
+ if (str[0] == 'x') {
+ base = 16;
+ str++;
+ }
+
+ errno = 0;
+ l = strtoul (str, &end, base);
+ if (end == str || errno != 0 || *end != '\0') {
+ *o++ = '?';
+ } else {
+ o += g_unichar_to_utf8 (l, o);
+ }
+ } else if (elen == 0) {
+ /* bare ampersand */
+ *o++ = '&';
+ *o++ = *p;
+ } else {
+ /* unsupported entity */
+ *o++ = '?';
+ }
+ state = NORMAL;
+ break;
+ }
+ elen++;
+ if (elen == sizeof(entity)) {
+ state = BAD_ENTITY;
+ break;
+ }
+ *e++ = *p;
+ break;
+
+ case NORMAL:
+ switch (*p) {
+ case '<':
+ state = TAG;
+ break;
+ case '&':
+ state = ENTITY;
+ e = entity;
+ elen = 0;
+ break;
+ default:
+ *o++ = *p;
+ }
+ }
+ p++;
+ }
+
+ *o++ = '\0';
+ return out;
+}
+
static void
rb_podcast_properties_dialog_class_init (RBPodcastPropertiesDialogClass *klass)
{
@@ -538,9 +675,22 @@ rb_podcast_properties_dialog_update_date (RBPodcastPropertiesDialog *dialog)
static void
rb_podcast_properties_dialog_update_description (RBPodcastPropertiesDialog *dialog)
{
- const char *str;
- str = rhythmdb_entry_get_string (dialog->priv->current_entry, RHYTHMDB_PROP_DESCRIPTION);
- gtk_label_set_text (GTK_LABEL (dialog->priv->description), str);
+ int i;
+ const char *desc;
+
+ desc = rhythmdb_entry_get_string (dialog->priv->current_entry, RHYTHMDB_PROP_DESCRIPTION);
+ for (i = 0; i < G_N_ELEMENTS (html_clues); i++) {
+ if (g_strstr_len (desc, -1, html_clues[i]) != NULL) {
+ char *text;
+
+ text = unhtml (desc);
+ gtk_label_set_text (GTK_LABEL (dialog->priv->description), text);
+ g_free (text);
+ return;
+ }
+ }
+
+ gtk_label_set_text (GTK_LABEL (dialog->priv->description), desc);
}
static char *
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]