[gnome-software/wip/hughsie/steam: 1/2] Add some HTML processing code
- From: Richard Hughes <rhughes src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-software/wip/hughsie/steam: 1/2] Add some HTML processing code
- Date: Thu, 1 Oct 2015 17:07:13 +0000 (UTC)
commit c30636662dd51a3cbf7d48c5f212d8c3ce1cd4e7
Author: Richard Hughes <richard hughsie com>
Date: Thu Oct 1 14:18:56 2015 +0100
Add some HTML processing code
This will be used in the future to parse descriptions from Steam.
src/plugins/Makefile.am | 1 +
src/plugins/gs-html-utils.c | 248 +++++++++++++++++++++++++++++++++++++++++++
src/plugins/gs-html-utils.h | 34 ++++++
src/plugins/gs-self-test.c | 39 +++++++
4 files changed, 322 insertions(+), 0 deletions(-)
---
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
index 3c187f5..713f3c7 100644
--- a/src/plugins/Makefile.am
+++ b/src/plugins/Makefile.am
@@ -180,6 +180,7 @@ check_PROGRAMS = \
gs-self-test
gs_self_test_SOURCES = \
+ gs-html-utils.c \
gs-moduleset.c \
gs-self-test.c
diff --git a/src/plugins/gs-html-utils.c b/src/plugins/gs-html-utils.c
new file mode 100644
index 0000000..8e2eff8
--- /dev/null
+++ b/src/plugins/gs-html-utils.c
@@ -0,0 +1,248 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2014 Richard Hughes <richard hughsie com>
+ *
+ * Licensed under the GNU General Public License Version 2
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <glib.h>
+#include <appstream-glib.h>
+
+#include "gs-html-utils.h"
+
+typedef enum { /* what the text callback does with character data it receives */
+ GS_HTML_UTILS_ACTION_IGNORE, /* text is discarded */
+ GS_HTML_UTILS_ACTION_PARA, /* each kept line is wrapped in <p>...</p> */
+ GS_HTML_UTILS_ACTION_LI, /* each kept line is wrapped in <li>...</li> */
+ GS_HTML_UTILS_ACTION_LAST /* end marker; the text callback emits nothing for it */
+} GsHtmlUtilsAction;
+
+typedef struct { /* state shared by the GMarkup callbacks through user_data */
+ GsHtmlUtilsAction action; /* current mode, switched by the start/end element callbacks */
+ GString *str; /* output buffer the callbacks append converted markup to */
+} GsHtmlUtilsHelper;
+
+/**
+ * gs_html_utils_start_cb:
+ * @context: the markup parser context (unused)
+ * @element_name: name of the element being opened
+ * @attribute_names: attribute names (unused)
+ * @attribute_values: attribute values (unused)
+ * @user_data: a #GsHtmlUtilsHelper
+ * @error: return location for a #GError (never set here)
+ *
+ * Start-element handler: <li> and <p> select how following text is wrapped,
+ * <ul> and <ol> both open a <ul> in the output, <book> is accepted silently
+ * and anything else is reported with a warning.
+ **/
+static void
+gs_html_utils_start_cb (GMarkupParseContext *context,
+			const gchar *element_name,
+			const gchar **attribute_names,
+			const gchar **attribute_values,
+			gpointer user_data,
+			GError **error)
+{
+	GsHtmlUtilsHelper *state = user_data;
+
+	if (g_strcmp0 (element_name, "book") == 0) {
+		/* nothing to emit for this element */
+	} else if (g_strcmp0 (element_name, "li") == 0) {
+		state->action = GS_HTML_UTILS_ACTION_LI;
+	} else if (g_strcmp0 (element_name, "p") == 0) {
+		state->action = GS_HTML_UTILS_ACTION_PARA;
+	} else if (g_strcmp0 (element_name, "ul") == 0 ||
+		   g_strcmp0 (element_name, "ol") == 0) {
+		/* ordered lists are written out as unordered ones */
+		g_string_append (state->str, "<ul>");
+	} else {
+		g_warning ("unhandled START %s", element_name);
+	}
+}
+
+/**
+ * gs_html_utils_end_cb:
+ * @context: the markup parser context (unused)
+ * @element_name: name of the element being closed
+ * @user_data: a #GsHtmlUtilsHelper
+ * @error: return location for a #GError (never set here)
+ *
+ * End-element handler: </p> switches text handling to "ignore", </ul> and
+ * </ol> both close a <ul> in the output, </book> and </li> change nothing
+ * and anything else is reported with a warning.
+ **/
+static void
+gs_html_utils_end_cb (GMarkupParseContext *context,
+		      const gchar *element_name,
+		      gpointer user_data,
+		      GError **error)
+{
+	GsHtmlUtilsHelper *state = user_data;
+	gboolean is_noop;
+	gboolean is_list;
+
+	is_noop = g_strcmp0 (element_name, "book") == 0 ||
+		  g_strcmp0 (element_name, "li") == 0;
+	is_list = g_strcmp0 (element_name, "ul") == 0 ||
+		  g_strcmp0 (element_name, "ol") == 0;
+
+	if (is_noop) {
+		/* the mode is deliberately left as it was */
+	} else if (g_strcmp0 (element_name, "p") == 0) {
+		state->action = GS_HTML_UTILS_ACTION_IGNORE;
+	} else if (is_list) {
+		g_string_append (state->str, "</ul>");
+	} else {
+		g_warning ("unhandled END %s", element_name);
+	}
+}
+
+/**
+ * gs_html_utils_text_cb:
+ * @context: the markup parser context (unused)
+ * @text: character data, not NUL-terminated
+ * @text_len: length of @text in bytes
+ * @user_data: a #GsHtmlUtilsHelper
+ * @error: return location for a #GError (never set here)
+ *
+ * Text handler: escapes @text, splits it into lines, and appends every
+ * stripped line of at least 15 bytes to the output, wrapped in the element
+ * selected by the current mode. Nothing is appended in any other mode.
+ **/
+static void
+gs_html_utils_text_cb (GMarkupParseContext *context,
+		       const gchar *text,
+		       gsize text_len,
+		       gpointer user_data,
+		       GError **error)
+{
+	GsHtmlUtilsHelper *state = user_data;
+	const gchar *tag;
+	g_autofree gchar *escaped = NULL;
+	g_auto(GStrv) lines = NULL;
+	gchar **iter;
+
+	/* the mode cannot change while this callback runs, so pick the
+	 * wrapping element once up front */
+	switch (state->action) {
+	case GS_HTML_UTILS_ACTION_PARA:
+		tag = "p";
+		break;
+	case GS_HTML_UTILS_ACTION_LI:
+		tag = "li";
+		break;
+	default:
+		return;
+	}
+
+	escaped = g_markup_escape_text (text, text_len);
+	lines = g_strsplit (escaped, "\n", -1);
+	for (iter = lines; *iter != NULL; iter++) {
+		const gchar *line = g_strstrip (*iter);
+
+		/* this also rejects empty lines */
+		if (strlen (line) < 15)
+			continue;
+		g_string_append_printf (state->str, "<%s>%s</%s>", tag, line, tag);
+	}
+}
+
+/**
+ * gs_html_utils_strreplace:
+ * @str: buffer to edit in place
+ * @search: literal text to look for
+ * @replace: text substituted for every occurrence of @search
+ *
+ * Replaces all occurrences of @search in @str with @replace. The buffer is
+ * left untouched when @search does not occur at all.
+ **/
+static void
+gs_html_utils_strreplace (GString *str, const gchar *search, const gchar *replace)
+{
+	g_auto(GStrv) pieces = NULL;
+	g_autofree gchar *joined = NULL;
+
+	/* only rebuild the buffer when there is something to replace */
+	if (g_strstr_len (str->str, -1, search) != NULL) {
+		pieces = g_strsplit (str->str, search, -1);
+		joined = g_strjoinv (replace, pieces);
+		g_string_assign (str, joined);
+	}
+}
+
+/**
+ * gs_html_utils_erase:
+ * @str: buffer to edit in place
+ * @start: literal text that opens a section to delete, e.g. "<img"
+ * @end: literal text that closes the section, e.g. ">"
+ *
+ * Deletes every section of @str that begins with @start and runs up to and
+ * including the next occurrence of @end. Nothing is inserted in place of the
+ * deleted text.
+ *
+ * A @start with no @end after it is left in place, and an empty @start or
+ * @end makes the call a no-op.
+ **/
+static void
+gs_html_utils_erase (GString *str, const gchar *start, const gchar *end)
+{
+	const gsize end_len = strlen (end);
+
+	/* an empty delimiter would match everywhere and never make progress */
+	if (start[0] == '\0' || end_len == 0)
+		return;
+
+	for (;;) {
+		const gchar *begin;
+		const gchar *finish;
+
+		/* always rescan from the top: removing a section can join the
+		 * text on either side of it into a new @start */
+		begin = strstr (str->str, start);
+		if (begin == NULL)
+			break;
+
+		/* this is the first opener, so if nothing closes it then
+		 * nothing can close a later one either */
+		finish = strstr (begin, end);
+		if (finish == NULL)
+			break;
+
+		g_string_erase (str,
+				(gssize) (begin - str->str),
+				(gssize) ((gsize) (finish - begin) + end_len));
+	}
+}
+
+/**
+ * gs_html_utils_parse_description:
+ * @html: HTML fragment to convert, must not be %NULL
+ * @error: return location for a #GError, or %NULL
+ *
+ * Converts a loosely-formed HTML description into markup that only uses
+ * <p>, <ul> and <li>. Images, line breaks, headings, spans and links are
+ * removed, inline formatting tags are dropped, and the result is checked
+ * with as_markup_convert_simple() before it is returned.
+ *
+ * Returns: (transfer full): newly allocated markup to be freed with
+ * g_free(), or %NULL with @error set if the input could not be parsed or
+ * the result did not validate
+ **/
+gchar *
+gs_html_utils_parse_description (const gchar *html, GError **error)
+{
+	GsHtmlUtilsHelper helper;
+	GMarkupParser parser = {
+		gs_html_utils_start_cb,
+		gs_html_utils_end_cb,
+		gs_html_utils_text_cb,
+		NULL,
+		NULL };
+	g_autofree gchar *check = NULL;
+	g_autoptr(GMarkupParseContext) ctx = NULL;
+	g_autoptr(GString) out = NULL;
+	g_autoptr(GString) str = NULL;
+
+	g_return_val_if_fail (html != NULL, NULL);
+
+	/* set up XML parser; the context and the output buffer are released
+	 * automatically on every return path */
+	out = g_string_new ("");
+	helper.action = GS_HTML_UTILS_ACTION_PARA;
+	helper.str = out;
+	ctx = g_markup_parse_context_new (&parser, G_MARKUP_TREAT_CDATA_AS_TEXT, &helper, NULL);
+
+	/* wrap in a single root element so that bare text is a valid document */
+	str = g_string_new ("");
+	g_string_append_printf (str, "<book>%s</book>", html);
+
+	/* convert win32 line endings */
+	g_strdelimit (str->str, "\r", '\n');
+
+	/* tidy up non-compliant HTML5 */
+	gs_html_utils_erase (str, "<img", ">");
+	gs_html_utils_erase (str, "<br", ">");
+
+	/* kill anything that's not wanted */
+	gs_html_utils_erase (str, "<h1", "</h1>");
+	gs_html_utils_erase (str, "<h2", "</h2>");
+	gs_html_utils_erase (str, "<span", "</span>");
+	gs_html_utils_erase (str, "<a", ">");
+	gs_html_utils_erase (str, "</a", ">");
+
+	/* drop inline formatting, keeping the text inside it */
+	gs_html_utils_strreplace (str, "<i>", "");
+	gs_html_utils_strreplace (str, "</i>", "");
+	gs_html_utils_strreplace (str, "<u>", "");
+	gs_html_utils_strreplace (str, "</u>", "");
+	gs_html_utils_strreplace (str, "<b>", "");
+	gs_html_utils_strreplace (str, "</b>", "");
+	gs_html_utils_strreplace (str, "<blockquote>", "");
+	gs_html_utils_strreplace (str, "</blockquote>", "");
+	gs_html_utils_strreplace (str, "<strong>", "");
+	gs_html_utils_strreplace (str, "</strong>", "");
+
+	/* use UTF-8: the parser only understands the five predefined XML
+	 * entities, so HTML-only ones have to be converted before parsing */
+	gs_html_utils_strreplace (str, "&trade;", "™");
+	gs_html_utils_strreplace (str, "&reg;", "®");
+
+	/* parse */
+	if (!g_markup_parse_context_parse (ctx, str->str, -1, error))
+		return NULL;
+
+	/* verify this is valid AppStream markup */
+	check = as_markup_convert_simple (out->str, error);
+	if (check == NULL)
+		return NULL;
+
+	return g_strdup (out->str);
+}
diff --git a/src/plugins/gs-html-utils.h b/src/plugins/gs-html-utils.h
new file mode 100644
index 0000000..35cc262
--- /dev/null
+++ b/src/plugins/gs-html-utils.h
@@ -0,0 +1,34 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2014 Richard Hughes <richard hughsie com>
+ *
+ * Licensed under the GNU General Public License Version 2
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef __GS_STEAM_DESCRIPTION_H
+#define __GS_STEAM_DESCRIPTION_H
+
+#include <glib-object.h>
+
+/* G_BEGIN_DECLS / G_END_DECLS must be paired so the header also works
+ * when included from C++ */
+G_BEGIN_DECLS
+
+/* converts an HTML fragment to <p>/<ul>/<li> markup; free the result with g_free() */
+gchar	*gs_html_utils_parse_description	(const gchar	*html,
+						 GError		**error);
+
+G_END_DECLS
+
+#endif /* __GS_STEAM_DESCRIPTION_H */
+
diff --git a/src/plugins/gs-self-test.c b/src/plugins/gs-self-test.c
index 6b45897..c81fad4 100644
--- a/src/plugins/gs-self-test.c
+++ b/src/plugins/gs-self-test.c
@@ -27,6 +27,44 @@
#include <gtk/gtk.h>
#include "gs-moduleset.h"
+#include "gs-html-utils.h"
+
+/* self test for gs_html_utils_parse_description(): plain text, messy HTML
+ * with unwanted elements, and a list */
+static void
+html_utils_func (void)
+{
+	const gchar *input;
+	g_autofree gchar *out_complex = NULL;
+	g_autofree gchar *out_list = NULL;
+	g_autofree gchar *out_simple = NULL;
+	g_autoptr(GError) error = NULL;
+
+	/* simple, from meta */
+	input = "This game is simply awesome™ in every way!";
+	out_simple = gs_html_utils_parse_description (input, &error);
+	g_assert_no_error (error);
+	g_assert_cmpstr (out_simple, ==, "<p>This game is simply awesome™ in every way!</p>");
+
+	/* complex non-compliant HTML, from div */
+	input = " <h1>header</h1>"
+		" <p>First line of the <i>description</i> is okay...</p>"
+		" <img src=\"moo.png\">"
+		" <img src=\"png\">"
+		" <p>Second <strong>line</strong> is <a href=\"#moo\">even</a> better!</p>";
+	out_complex = gs_html_utils_parse_description (input, &error);
+	g_assert_no_error (error);
+	g_assert_cmpstr (out_complex, ==, "<p>First line of the description is okay...</p>"
+					  "<p>Second line is even better!</p>");
+
+	/* complex list */
+	input = " <ul>"
+		" <li>First line of the list</li>"
+		" <li>Second line of the list</li>"
+		" </ul>";
+	out_list = gs_html_utils_parse_description (input, &error);
+	g_assert_no_error (error);
+	g_assert_cmpstr (out_list, ==, "<ul><li>First line of the list</li>"
+				       "<li>Second line of the list</li></ul>");
+}
static void
moduleset_func (void)
@@ -76,6 +114,7 @@ main (int argc, char **argv)
/* tests go here */
g_test_add_func ("/moduleset", moduleset_func);
+ g_test_add_func ("/html-utils", html_utils_func);
return g_test_run ();
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]