Cheers,
Jason
plain text document attachment (tracker-imagemagick-extract-xmp.patch)
Index: src/tracker-extract/tracker-extract-imagemagick.c
===================================================================
--- src/tracker-extract/tracker-extract-imagemagick.c (revision 598)
+++ src/tracker-extract/tracker-extract-imagemagick.c (working copy)
@@ -35,7 +35,7 @@
gint exit_status;
/* imagemagick crashes trying to extract from xcf files */
- if (g_str_has_suffix (filename, '.xcf')) {
+ if (g_str_has_suffix (filename, ".xcf")) {
return;
}
@@ -60,5 +60,16 @@
g_hash_table_insert (metadata, g_strdup ("Image:Comments"), g_strdup (g_strescape
(lines[2], "")));
}
}
+
+ gchar *xmp;
+ argv[0] = g_strdup ("convert");
+ argv[1] = g_strdup (filename);
+ argv[2] = g_strdup ("xmp:-");
+ argv[3] = NULL;
+
+ if (tracker_spawn (argv, 10, &xmp, &exit_status)) {
+ if (exit_status == EXIT_SUCCESS) {
+ tracker_read_xmp(xmp,strlen(xmp),metadata);
+ }
+ }
}
-
plain text document attachment
(tracker-msoffice-extract-license.patch)
Index: src/tracker-extract/tracker-extract-msoffice.c
===================================================================
--- src/tracker-extract/tracker-extract-msoffice.c (revision 598)
+++ src/tracker-extract/tracker-extract-msoffice.c (working copy)
@@ -118,7 +118,26 @@
}
}
+/*
+ * gsf_doc_meta_data_foreach() callback used for the
+ * DocumentSummaryInformation stream.
+ *
+ * key       - property name (string)
+ * value     - the GsfDocProp holding the property
+ * user_data - GHashTable receiving the extracted metadata
+ *
+ * Only the "CreativeCommons_LicenseURL" property is consumed; its value is
+ * stored under "File:License".  All other properties are ignored.
+ */
+static void
+doc_metadata_cb (gpointer key, gpointer value, gpointer user_data)
+{
+	const gchar  *prop_name  = (const gchar *) key;
+	GHashTable   *table      = (GHashTable *) user_data;
+	GValue const *prop_value = gsf_doc_prop_get_val ((GsfDocProp *) value);
+
+	if (strcmp (prop_name, "CreativeCommons_LicenseURL") != 0) {
+		return;
+	}
+
+	add_gvalue_in_hash_table (table, "File:License", prop_value);
+}
+
+
void
tracker_extract_msoffice (gchar *filename, GHashTable *metadata)
{
@@ -145,25 +164,37 @@
}
stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
- g_object_unref (G_OBJECT (infile));
-
- if (!stream) {
- gsf_shutdown ();
- return;
+ if (stream) {
+ md = gsf_doc_meta_data_new ();
+
+ if (gsf_msole_metadata_read (stream, md)) {
+ gsf_shutdown ();
+ return;
+ }
+
+ gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
+
+ g_object_unref (G_OBJECT (md));
+ g_object_unref (G_OBJECT (stream));
}
- md = gsf_doc_meta_data_new ();
-
- if (gsf_msole_metadata_read (stream, md)) {
- gsf_shutdown ();
- return;
+ stream = gsf_infile_child_by_name (infile, "\05DocumentSummaryInformation");
+ if (stream) {
+ md = gsf_doc_meta_data_new ();
+
+ if (gsf_msole_metadata_read (stream, md)) {
+ gsf_shutdown ();
+ return;
+ }
+
+ gsf_doc_meta_data_foreach (md, doc_metadata_cb, metadata);
+
+ g_object_unref (G_OBJECT (md));
+ g_object_unref (G_OBJECT (stream));
}
- gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
+ g_object_unref (G_OBJECT (infile));
- g_object_unref (G_OBJECT (md));
- g_object_unref (G_OBJECT (stream));
-
gsf_shutdown ();
}
plain text document attachment (tracker-pdf-extract-xmp.patch)
Index: src/tracker-extract/tracker-extract-pdf.c
===================================================================
--- src/tracker-extract/tracker-extract-pdf.c (revision 598)
+++ src/tracker-extract/tracker-extract-pdf.c (working copy)
@@ -26,6 +26,8 @@
#include <string.h>
#include <glib.h>
+#include "tracker-extract.h"
+
void tracker_extract_pdf (gchar *filename, GHashTable *metadata)
{
PopplerDocument *document;
@@ -34,6 +36,7 @@
gchar *author;
gchar *subject;
gchar *keywords;
+ gchar *metadata_xml;
GTime creation_date;
GError *error = NULL;
@@ -50,6 +53,7 @@
"subject", &subject,
"keywords", &keywords,
"creation-date", &creation_date,
+ "metadata", &metadata_xml,
NULL);
if (title && strlen (title))
@@ -71,10 +75,15 @@
g_hash_table_insert (metadata, g_strdup ("Doc:PageCount"),
g_strdup_printf ("%d", poppler_document_get_n_pages (document)));
+ if ( metadata_xml ) {
+ tracker_read_xmp (metadata_xml,strlen(metadata_xml),metadata);
+ }
+
g_free (title);
g_free (author);
g_free (subject);
g_free (keywords);
+ g_free (metadata_xml);
g_object_unref (document);
}
plain text document attachment (tracker-png-extract-xmp.patch)
Index: src/tracker-extract/tracker-extract-png.c
===================================================================
--- src/tracker-extract/tracker-extract-png.c (revision 598)
+++ src/tracker-extract/tracker-extract-png.c (working copy)
@@ -20,6 +20,8 @@
#include "config.h"
+#include "tracker-extract.h"
+
#include <stdio.h>
#include <glib.h>
#include <png.h>
@@ -79,15 +81,23 @@
g_strdup_printf ("%ld", height));
}
-
if (png_get_text (png_ptr, info_ptr, &text_ptr, &num_text) > 0) {
- for (i = 0; i < num_text; i++) {
- for (j=0; tagmap[j].type; j++) {
- if (strcasecmp (tagmap[j].name, text_ptr[i].key) == 0) {
- if (text_ptr[i].text && strlen (text_ptr[i].text) > 0) {
- g_hash_table_insert (metadata, g_strdup
(tagmap[j].type), g_strdup (text_ptr[i].text));
+ for (i = 0; i < num_text; i++) {
+ if ( text_ptr[i].key != NULL ) {
+ #if defined(HAVE_EXEMPI) && defined(PNG_iTXt_SUPPORTED)
+ if (strcmp("XML:com.adobe.xmp",text_ptr[i].key) == 0) {
+
tracker_read_xmp(text_ptr[i].text,text_ptr[i].itxt_length,metadata);
+ continue;
+ }
+ #endif
+
+ for (j=0; tagmap[j].type; j++) {
+ if (strcasecmp (tagmap[j].name, text_ptr[i].key) == 0) {
+ if (text_ptr[i].text && strlen (text_ptr[i].text) >
0) {
+ g_hash_table_insert (metadata, g_strdup
(tagmap[j].type), g_strdup (text_ptr[i].text));
+ }
+ break;
}
- break;
}
}
}
plain text document attachment (tracker-extract-html.patch)
Index: src/tracker-extract/tracker-extract.c
===================================================================
--- src/tracker-extract/tracker-extract.c (revision 598)
+++ src/tracker-extract/tracker-extract.c (working copy)
@@ -52,6 +52,9 @@
void tracker_extract_totem (gchar *, GHashTable *);
void tracker_extract_oasis (gchar *, GHashTable *);
void tracker_extract_ps (gchar *, GHashTable *);
+#ifdef HAVE_LIBXML2
+void tracker_extract_html (gchar *, GHashTable *);
+#endif
#ifdef HAVE_POPPLER
void tracker_extract_pdf (gchar *, GHashTable *);
#endif
@@ -84,6 +87,10 @@
/* Document extractors */
{ "application/vnd.oasis.opendocument.*", tracker_extract_oasis },
{ "application/postscript", tracker_extract_ps },
+#ifdef HAVE_LIBXML2
+ { "text/html", tracker_extract_html },
+ { "application/xhtml+xml", tracker_extract_html },
+#endif
#ifdef HAVE_POPPLER
{ "application/pdf", tracker_extract_pdf },
#endif
Index: src/tracker-extract/Makefile.am
===================================================================
--- src/tracker-extract/Makefile.am (revision 598)
+++ src/tracker-extract/Makefile.am (working copy)
@@ -5,6 +5,7 @@
$(LIBGSF_CFLAGS) \
$(LIBGSF_CFLAGS) \
$(GSTREAMER_CFLAGS) \
+ $(LIBXML2_CFLAGS) \
$(XINE_CFLAGS)
bin_PROGRAMS = tracker-extract
@@ -33,6 +34,7 @@
tracker-extract-imagemagick.c \
tracker-extract-mplayer.c \
tracker-extract-totem.c \
+ tracker-extract-html.c \
$(video_sources)
tracker_extract_LDADD = $(GLIB2_LIBS) \
@@ -41,4 +43,5 @@
$(LIBEXIF_LIBS) \
$(LIBGSF_LIBS) \
$(GSTREAMER_LIBS) \
+ $(LIBXML2_LIBS) \
$(XINE_LIBS)
Index: configure.ac
===================================================================
--- configure.ac (revision 598)
+++ configure.ac (working copy)
@@ -605,8 +605,25 @@
[ AC_DEFINE(IOPRIO_SUPPORT,[],[Define ioprio support]) ioprio_support=yes ])
AC_MSG_RESULT([$ioprio_support])
-#####################################################
+##################################################################
+# check for libxml2
+##################################################################
+LIBXML2_REQUIRED=0.6
+
+AC_ARG_ENABLE(libxml2, AC_HELP_STRING([--disable-libxml2],[Disable HTML/XML extractors (full-text will
still be available)]),,[enable_libxml2=yes])
+if test "x$enable_libxml2" = "xyes"; then
+ PKG_CHECK_MODULES(LIBXML2, [
+ libxml-2.0 >= $LIBXML2_REQUIRED],
+ [have_libxml2=yes] , [have_libxml2=no])
+ AC_SUBST(LIBXML2_CFLAGS)
+ AC_SUBST(LIBXML2_LIBS)
+else
+ have_libxml2="no (disabled)"
+fi
+AM_CONDITIONAL(HAVE_LIBXML2, test "$have_libxml2" = "yes")
+test "$have_libxml2" = "yes" && AC_DEFINE(HAVE_LIBXML2, [], [Define if we have libxml2])
+
AC_CONFIG_FILES([
Makefile
tracker.pc
@@ -675,6 +692,7 @@
exif (jpeg): $have_libexif
gsf: $have_libgsf
video files: $videos_are_handled ($videos_handler)
+ xml/html formats: $have_libxml2
"
if test "x$enable_external_sqlite" = "xyes"; then
Index: src/tracker-extract/tracker-extract-html.c
===================================================================
--- src/tracker-extract/tracker-extract-html.c (revision 0)
+++ src/tracker-extract/tracker-extract-html.c (revision 0)
@@ -0,0 +1,161 @@
+/* Tracker Extract - extracts embedded metadata from files
+ * Copyright (C) 2007, Jason Kivlighn (jkivlighn gmail com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+
+#ifdef HAVE_LIBXML2
+
+#include <string.h>
+#include <glib.h>
+#include <libxml/HTMLparser.h>
+
+/*
+ * Parser state: identifies the element whose text content the characters()
+ * callback should capture next.  startElement() sets it when a tag of
+ * interest is opened; characters() resets it to -1 ("nothing pending").
+ */
+typedef enum {
+ READ_TITLE, /* a <title> tag was just opened; next text is the title */
+ } tag_type;
+
+/*
+ * User data handed to the SAX callbacks while parsing one HTML file.
+ */
+typedef struct {
+ GHashTable *metadata; /* table the callbacks insert extracted key/value pairs into */
+ tag_type current; /* pending text capture, or -1 when none is pending */
+} HTMLParseInfo;
+
+/*
+ * Reports whether a SAX attribute array contains a given attribute.
+ *
+ * atts - NULL-terminated array of alternating name/value entries as passed
+ *        to the startElement callback; may itself be NULL when the tag
+ *        carries no attributes
+ * attr - attribute name to look for (compared case-sensitively)
+ * val  - required attribute value, or NULL to accept any value
+ *
+ * Returns TRUE if some attribute named attr exists and, when val is given,
+ * has exactly that value; FALSE otherwise.
+ */
+gboolean
+has_attribute (const xmlChar **atts, const char *attr, const char *val)
+{
+	int i;
+
+	/* The parser passes NULL rather than an empty array for bare tags. */
+	if (!atts) {
+		return FALSE;
+	}
+
+	for (i = 0; atts[i]; i += 2) {
+		if (strcmp ((const char *) atts[i], attr) != 0) {
+			continue;
+		}
+
+		if (!val) {
+			return TRUE;
+		}
+
+		/* Valueless attributes (e.g. <a rel>) have a NULL value slot. */
+		if (atts[i + 1] && strcmp ((const char *) atts[i + 1], val) == 0) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Finds the value of the first attribute named attr.
+ *
+ * atts - NULL-terminated array of alternating name/value entries as passed
+ *        to the startElement callback; may itself be NULL when the tag
+ *        carries no attributes
+ * attr - attribute name to look for (compared case-sensitively)
+ *
+ * Returns a pointer into atts (not a copy), or NULL when the attribute is
+ * absent.  The value of a valueless attribute may also be NULL.
+ */
+const xmlChar *
+lookup_attribute (const xmlChar **atts, const char *attr)
+{
+	int i;
+
+	/* The parser passes NULL rather than an empty array for bare tags. */
+	if (!atts) {
+		return NULL;
+	}
+
+	for (i = 0; atts[i]; i += 2) {
+		if (strcmp ((const char *) atts[i], attr) == 0) {
+			return atts[i + 1];
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * SAX start-tag callback.
+ *
+ * info - the HTMLParseInfo for the current parse
+ * name - tag name
+ * atts - attribute name/value array for the tag
+ *
+ * Handles three tags:
+ *   <a rel="license" href="...">  -> stores href under "File:License",
+ *                                    unless an "about" attribute is present
+ *   <title>                       -> arms the characters() callback
+ *   <meta name="Author"|"DC.Description" content="...">
+ *                                 -> stores content under "Doc:Author" /
+ *                                    "Doc:Comments"
+ */
+void
+startElement (void *info, const xmlChar *name, const xmlChar **atts)
+{
+	HTMLParseInfo *parse_info = (HTMLParseInfo *) info;
+	const char    *tag = (const char *) name;
+	const xmlChar *content;
+
+	if (strcmp (tag, "a") == 0) {
+		/* RDFa license triple; skip links that describe another document */
+		if (!has_attribute (atts, "rel", "license") || has_attribute (atts, "about", NULL)) {
+			return;
+		}
+
+		content = lookup_attribute (atts, "href");
+		if (content) {
+			g_hash_table_insert (parse_info->metadata,
+			                     g_strdup ("File:License"),
+			                     g_strdup ((char *) content));
+		}
+		return;
+	}
+
+	if (strcmp (tag, "title") == 0) {
+		parse_info->current = READ_TITLE;
+		return;
+	}
+
+	if (strcmp (tag, "meta") != 0) {
+		return;
+	}
+
+	if (has_attribute (atts, "name", "Author")) {
+		content = lookup_attribute (atts, "content");
+		if (content) {
+			g_hash_table_insert (parse_info->metadata,
+			                     g_strdup ("Doc:Author"),
+			                     g_strdup ((char *) content));
+		}
+	}
+
+	if (has_attribute (atts, "name", "DC.Description")) {
+		content = lookup_attribute (atts, "content");
+		if (content) {
+			g_hash_table_insert (parse_info->metadata,
+			                     g_strdup ("Doc:Comments"),
+			                     g_strdup ((char *) content));
+		}
+	}
+}
+
+/*
+ * SAX character-data callback.
+ *
+ * info - the HTMLParseInfo for the current parse
+ * ch   - text buffer; NOT NUL-terminated, only the first len bytes are valid
+ * len  - number of valid bytes in ch
+ *
+ * When startElement() has armed a capture (currently only READ_TITLE), the
+ * text is copied into the metadata table under "Doc:Title".  The pending
+ * capture is cleared on every call, so only the first text chunk following
+ * the tag is recorded.
+ */
+void
+characters (void *info, const xmlChar *ch, int len)
+{
+	HTMLParseInfo *parse_info = (HTMLParseInfo *) info;
+
+	switch (parse_info->current) {
+	case READ_TITLE:
+		/* Copy exactly len bytes: the SAX buffer has no terminator. */
+		if (ch && len > 0) {
+			g_hash_table_insert (parse_info->metadata,
+			                     g_strdup ("Doc:Title"),
+			                     g_strndup ((const gchar *) ch, len));
+		}
+		break;
+	default:
+		break;
+	}
+
+	parse_info->current = -1;
+}
+
+/*
+ * Extracts metadata from an HTML/XHTML file.
+ *
+ * filename - path of the file to parse
+ * metadata - hash table that receives the extracted key/value pairs
+ *
+ * The file is run through libxml2's HTML SAX parser with only the
+ * startElement and characters callbacks installed; every other handler
+ * slot is left NULL.
+ */
+void tracker_extract_html (gchar* filename, GHashTable *metadata)
+{
+	xmlSAXHandler handler = { NULL };
+	HTMLParseInfo info;
+	htmlDocPtr    doc;
+
+	/* Only the two callbacks we care about; the rest stay NULL. */
+	handler.startElement = startElement;
+	handler.characters   = characters;
+	handler.initialized  = 1;
+
+	info.metadata = metadata;
+	info.current  = -1;
+
+	doc = htmlSAXParseFile (filename, NULL, &handler, &info);
+	if (doc) {
+		xmlFreeDoc (doc);
+	}
+}
+
+#else
+#warning "Not building HTML metadata extractor."
+#endif
_______________________________________________
tracker-list mailing list
tracker-list gnome org
http://mail.gnome.org/mailman/listinfo/tracker-list