[tracker] Add EPub extractor.
- From: Carlos Garnacho <carlosg src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] Add EPub extractor.
- Date: Wed, 27 Apr 2011 15:50:53 +0000 (UTC)
commit 05262264494de6fc3fdcbb978f7dd589c885a8a8
Author: Carlos Garnacho <carlosg gnome org>
Date: Wed Apr 13 12:48:46 2011 +0200
Add EPub extractor.
Fixes GB#642288. At the moment title/author/creation date/text content
are extracted.
src/tracker-extract/10-epub.rule.in | 3 +
src/tracker-extract/Makefile.am | 17 ++-
src/tracker-extract/tracker-extract-epub.c | 373 ++++++++++++++++++++++++++++
3 files changed, 392 insertions(+), 1 deletions(-)
---
diff --git a/src/tracker-extract/10-epub.rule.in b/src/tracker-extract/10-epub.rule.in
new file mode 100644
index 0000000..d859d80
--- /dev/null
+++ b/src/tracker-extract/10-epub.rule.in
@@ -0,0 +1,3 @@
+[ExtractorRule]
+ModulePath= modulesdir@/libextract-epub.so
+MimeTypes=application/epub+zip
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 4e052a7..73ebf4c 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -18,6 +18,7 @@ AM_CPPFLAGS = \
# date.
rules_in_files = \
10-abw.rule.in \
+ 10-epub.rule.in \
10-flac.rule.in \
10-gif.rule.in \
10-html.rule.in \
@@ -104,10 +105,11 @@ endif
if HAVE_LIBGSF
modules_LTLIBRARIES += \
+ libextract-epub.la \
libextract-msoffice.la \
libextract-msoffice-xml.la \
libextract-oasis.la
-rules_DATA += 10-oasis.rule 10-msoffice.rule 11-msoffice-xml.rule
+rules_DATA += 10-epub.rule 10-oasis.rule 10-msoffice.rule 11-msoffice-xml.rule
endif
if HAVE_POPPLER
@@ -211,6 +213,19 @@ libextract_oasis_la_LIBADD = \
$(TRACKER_EXTRACT_MODULES_LIBS) \
$(LIBGSF_LIBS)
+# EPub
+libextract_epub_la_SOURCES = tracker-extract-epub.c
+libextract_epub_la_CFLAGS = \
+ $(TRACKER_EXTRACT_MODULES_CFLAGS) \
+ $(LIBGSF_CFLAGS)
+libextract_epub_la_LDFLAGS = $(module_flags)
+libextract_epub_la_LIBADD = \
+ $(top_builddir)/src/libtracker-extract/libtracker-extract- TRACKER_API_VERSION@.la \
+ $(top_builddir)/src/libtracker-common/libtracker-common.la \
+ $(BUILD_LIBS) \
+ $(TRACKER_EXTRACT_MODULES_LIBS) \
+ $(LIBGSF_LIBS)
+
# PNG
libextract_png_la_SOURCES = tracker-extract-png.c
libextract_png_la_CFLAGS = \
diff --git a/src/tracker-extract/tracker-extract-epub.c b/src/tracker-extract/tracker-extract-epub.c
new file mode 100644
index 0000000..3034fa7
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-epub.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-main.h"
+#include "tracker-gsf.h"
+#include "tracker-read.h"
+
+#include <unistd.h>
+
+typedef enum {
+ OPF_TAG_TYPE_UNKNOWN,
+ OPF_TAG_TYPE_TITLE,
+ OPF_TAG_TYPE_AUTHOR,
+ OPF_TAG_TYPE_CREATED
+} OPFTagType;
+
+typedef struct {
+ TrackerSparqlBuilder *preupdate;
+ TrackerSparqlBuilder *metadata;
+ OPFTagType element;
+ GList *pages;
+ guint in_metadata : 1;
+ guint in_manifest : 1;
+} OPFData;
+
+typedef struct {
+ GString *contents;
+ gsize limit;
+} OPFContentData;
+
+/* Methods to parse the container.xml file
+ * pointing to the real metadata/content
+ */
+static void
+container_xml_start_element_handler (GMarkupParseContext *context,
+ const gchar *element_name,
+ const gchar **attribute_names,
+ const gchar **attribute_values,
+ gpointer user_data,
+ GError **error)
+{
+ gchar **path_out = user_data;
+ gint i;
+
+ if (g_strcmp0 (element_name, "rootfile") != 0) {
+ return;
+ }
+
+ for (i = 0; attribute_names[i] != NULL; i++) {
+ if (g_strcmp0 (attribute_names[i], "full-path") == 0) {
+ if (!*path_out) {
+ *path_out = g_strdup (attribute_values[i]);
+ }
+ break;
+ }
+ }
+}
+
+/* Methods to parse the OPF document metadata/layout */
+static void
+opf_xml_start_element_handler (GMarkupParseContext *context,
+ const gchar *element_name,
+ const gchar **attribute_names,
+ const gchar **attribute_values,
+ gpointer user_data,
+ GError **error)
+{
+ OPFData *data = user_data;
+ gint i;
+
+ if (g_strcmp0 (element_name, "metadata") == 0) {
+ data->in_metadata = TRUE;
+ } else if (g_strcmp0 (element_name, "manifest") == 0) {
+ data->in_manifest = TRUE;
+ } else if (data->in_metadata) {
+ /* epub metadata */
+ if (g_strcmp0 (element_name, "dc:title") == 0) {
+ data->element = OPF_TAG_TYPE_TITLE;
+ } else if (g_strcmp0 (element_name, "dc:creator") == 0) {
+ for (i = 0; attribute_names[i] != NULL; i++) {
+ if (g_strcmp0 (attribute_names[i], "opf:role") == 0 &&
+ g_strcmp0 (attribute_values[i], "aut") == 0) {
+ data->element = OPF_TAG_TYPE_AUTHOR;
+ break;
+ }
+ }
+ } else if (g_strcmp0 (element_name, "dc:date") == 0) {
+ for (i = 0; attribute_names[i] != NULL; i++) {
+ if (g_strcmp0 (attribute_names[i], "opf:event") == 0 &&
+ g_strcmp0 (attribute_values[i], "original-publication") == 0) {
+ data->element = OPF_TAG_TYPE_CREATED;
+ break;
+ }
+ }
+ }
+ } else if (data->in_manifest &&
+ g_strcmp0 (element_name, "item") == 0) {
+ const gchar *rel_path = NULL;
+ gboolean is_xhtml = FALSE;
+
+ /* Keep list of xhtml documents for plain text extraction */
+ for (i = 0; attribute_names[i] != NULL; i++) {
+ if (g_strcmp0 (attribute_names[i], "href") == 0) {
+ rel_path = attribute_values[i];
+ } else if (g_strcmp0 (attribute_names[i], "media-type") == 0 &&
+ g_strcmp0 (attribute_values[i], "application/xhtml+xml") == 0) {
+ is_xhtml = TRUE;
+ }
+ }
+
+ if (is_xhtml && rel_path) {
+ data->pages = g_list_append (data->pages, g_strdup (rel_path));
+ }
+ }
+}
+
+static void
+opf_xml_end_element_handler (GMarkupParseContext *context,
+ const gchar *element_name,
+ gpointer user_data,
+ GError **error)
+{
+ OPFData *data = user_data;
+
+ if (g_strcmp0 (element_name, "metadata") == 0) {
+ data->in_metadata = FALSE;
+ } else if (g_strcmp0 (element_name, "manifest") == 0) {
+ data->in_manifest = FALSE;
+ } else {
+ data->element = OPF_TAG_TYPE_UNKNOWN;
+ }
+}
+
+static void
+opf_xml_text_handler (GMarkupParseContext *context,
+ const gchar *text,
+ gsize text_len,
+ gpointer user_data,
+ GError **error)
+{
+ OPFData *data = user_data;
+ gchar *date;
+
+ switch (data->element) {
+ case OPF_TAG_TYPE_AUTHOR:
+ tracker_sparql_builder_predicate (data->metadata, "nco:publisher");
+
+ tracker_sparql_builder_object_blank_open (data->metadata);
+ tracker_sparql_builder_predicate (data->metadata, "a");
+ tracker_sparql_builder_object (data->metadata, "nco:Contact");
+
+ tracker_sparql_builder_predicate (data->metadata, "nco:fullname");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ tracker_sparql_builder_object_blank_close (data->metadata);
+ break;
+ case OPF_TAG_TYPE_TITLE:
+ tracker_sparql_builder_predicate (data->metadata, "nie:title");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ break;
+ case OPF_TAG_TYPE_CREATED:
+ date = tracker_date_guess (text);
+ tracker_sparql_builder_predicate (data->metadata, "nie:contentCreated");
+ tracker_sparql_builder_object_unvalidated (data->metadata, date);
+ g_free (date);
+ break;
+ case OPF_TAG_TYPE_UNKNOWN:
+ default:
+ break;
+ }
+}
+
+/* Methods to extract XHTML text content */
+static void
+content_xml_text_handler (GMarkupParseContext *context,
+ const gchar *text,
+ gsize text_len,
+ gpointer user_data,
+ GError **error)
+{
+ OPFContentData *content_data = user_data;
+ gsize written_bytes = 0;
+
+ if (text_len <= 0) {
+ return;
+ }
+
+ if (tracker_text_validate_utf8 (text,
+ MIN (text_len, content_data->limit),
+ &content_data->contents,
+ &written_bytes)) {
+ if (content_data->contents->str[content_data->contents->len - 1] != ' ') {
+ g_string_append_c (content_data->contents, ' ');
+ }
+ }
+
+ content_data->limit -= written_bytes;
+}
+
+static gchar *
+extract_opf_path (const gchar *uri)
+{
+ GMarkupParseContext *context;
+ gchar *path = NULL;
+ GError *error = NULL;
+ GMarkupParser parser = {
+ container_xml_start_element_handler,
+ NULL, NULL, NULL, NULL
+ };
+
+ /* Create parsing context */
+ context = g_markup_parse_context_new (&parser, 0, &path, NULL);
+
+ /* Load the internal container file from the Zip archive,
+ * and parse it to extract the .opf file to get metadata from
+ */
+ tracker_gsf_parse_xml_in_zip (uri, "META-INF/container.xml", context, &error);
+ g_markup_parse_context_free (context);
+
+ if (error || !path) {
+ g_warning ("Could not get EPUB container.xml file: %s\n",
+ (error) ? error->message : "No error provided");
+ g_error_free (error);
+ return NULL;
+ }
+
+ return path;
+}
+
+static gchar *
+extract_opf_contents (const gchar *uri,
+ const gchar *content_prefix,
+ GList *content_files)
+{
+ OPFContentData content_data = { 0 };
+ GMarkupParseContext *context;
+ TrackerConfig *config;
+ GError *error = NULL;
+ GList *l;
+ GMarkupParser xml_parser = {
+ NULL, NULL,
+ content_xml_text_handler,
+ NULL, NULL
+ };
+
+ config = tracker_main_get_config ();
+ context = g_markup_parse_context_new (&xml_parser, 0, &content_data, NULL);
+
+ content_data.contents = g_string_new ("");
+ content_data.limit = (gsize) tracker_config_get_max_bytes (config);
+
+ g_debug ("Extracting up to %" G_GSIZE_FORMAT " bytes of content", content_data.limit);
+
+ for (l = content_files; l; l = l->next) {
+ gchar *path;
+
+ /* Page file is relative to OPF file location */
+ path = g_build_filename (content_prefix, l->data, NULL);
+ tracker_gsf_parse_xml_in_zip (uri, path, context, &error);
+ g_free (path);
+
+ if (error) {
+ g_warning ("Error extracting EPUB contents: %s\n",
+ error->message);
+ break;
+ }
+
+ if (content_data.limit <= 0) {
+ /* Reached plain text extraction limit */
+ break;
+ }
+ }
+
+ g_markup_parse_context_free (context);
+
+ return g_string_free (content_data.contents, FALSE);
+}
+
+static gboolean
+extract_opf (const gchar *uri,
+ const gchar *opf_path,
+ TrackerSparqlBuilder *preupdate,
+ TrackerSparqlBuilder *metadata)
+{
+ GMarkupParseContext *context;
+ OPFData data = { 0 };
+ GError *error = NULL;
+ gchar *dirname, *contents;
+ GMarkupParser opf_parser = {
+ opf_xml_start_element_handler,
+ opf_xml_end_element_handler,
+ opf_xml_text_handler,
+ NULL, NULL
+ };
+
+ g_debug ("Extracting OPF file contents from EPUB '%s'", uri);
+
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nfo:TextDocument");
+
+ data.metadata = metadata;
+ data.preupdate = preupdate;
+
+ /* Create parsing context */
+ context = g_markup_parse_context_new (&opf_parser, 0, &data, NULL);
+
+ /* Load the internal container file from the Zip archive,
+ * and parse it to extract the .opf file to get metadata from
+ */
+ tracker_gsf_parse_xml_in_zip (uri, opf_path, context, &error);
+ g_markup_parse_context_free (context);
+
+ if (error) {
+ g_warning ("Could not get EPUB '%s' file: %s\n", opf_path,
+ (error) ? error->message : "No error provided");
+ g_error_free (error);
+ return FALSE;
+ }
+
+ dirname = g_path_get_dirname (opf_path);
+ contents = extract_opf_contents (uri, dirname, data.pages);
+ g_free (dirname);
+
+ if (contents && *contents) {
+ tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (metadata, contents);
+ }
+
+ g_list_foreach (data.pages, (GFunc) g_free, NULL);
+ g_list_free (data.pages);
+ g_free (contents);
+
+ return TRUE;
+}
+
+G_MODULE_EXPORT gboolean
+tracker_extract_get_metadata (const gchar *uri,
+ const gchar *mime_used,
+ TrackerSparqlBuilder *preupdate,
+ TrackerSparqlBuilder *metadata,
+ GString *where)
+{
+ gchar *opf_path;
+
+ opf_path = extract_opf_path (uri);
+
+ if (!opf_path) {
+ return FALSE;
+ }
+
+ extract_opf (uri, opf_path, preupdate, metadata);
+ g_free (opf_path);
+
+ return TRUE;
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]