[Tracker] Extracting Embedded Licenses



Hi,

imagemagick: Uses 'convert filename xmp:-' to output an image's embedded
XMP.  This works for at least JPEG and TIFF files.  For JPEGs, however,
ImageMagick outputs the namespace and XMP, separated by \0.  I'm not
sure how I can handle this, without simply assuming that 'convert'
returned two null-terminated strings.  Nevertheless, this extracts the
XMP from TIFF files.

msoffice: Extends the msoffice extractor to also parse the
DocumentSummaryInformation infile, which contains user-defined metadata,
along with license metadata embedded by the MSOffice Creative Commons Add-in.

pdf: Extends the pdf extractor to read a PDF's metadata stream and parse
it as XMP.  I'm still awaiting poppler extending the glib bindings to
allow reading the metadata stream.  Until then, it will simply never
find the metadata stream and go on without error.

png: Adds a check for the XML:com.adobe.xmp iTXt field, and parses it as
XMP.

html: Adds a new html parser using libxml2.  Parses the document,
checking for RDFa licenses.  It also checks for other basic HTML
properties like title and author.

There are also several XML formats I'd like to parse for license data,
particularly SVG and SMIL.  Would this be doable, and if so, how should
I go about it?  Write new extractors for each format or is this too much
overhead?  These could use GMarkupParse, rather than bringing in libxml2
like the HTML parser.

Cheers,
Jason

Index: src/tracker-extract/tracker-extract-imagemagick.c
===================================================================
--- src/tracker-extract/tracker-extract-imagemagick.c   (revision 598)
+++ src/tracker-extract/tracker-extract-imagemagick.c   (working copy)
@@ -35,7 +35,7 @@
        gint           exit_status;
 
        /* imagemagick crashes trying to extract from xcf files */
-       if (g_str_has_suffix (filename, '.xcf')) {
+       if (g_str_has_suffix (filename, ".xcf")) {
                return;
        }
 
@@ -60,5 +60,16 @@
                        g_hash_table_insert (metadata, g_strdup ("Image:Comments"), g_strdup (g_strescape 
(lines[2], "")));
                }
        }
+
+       gchar         *xmp;
+       argv[0] = g_strdup ("convert");
+       argv[1] = g_strdup (filename);
+       argv[2] = g_strdup ("xmp:-");
+       argv[3] = NULL;
+
+       if (tracker_spawn (argv, 10, &xmp, &exit_status)) {
+               if (exit_status == EXIT_SUCCESS) {
+                       tracker_read_xmp(xmp,strlen(xmp),metadata);
+               }
+       }
 }
-
Index: src/tracker-extract/tracker-extract-msoffice.c
===================================================================
--- src/tracker-extract/tracker-extract-msoffice.c      (revision 598)
+++ src/tracker-extract/tracker-extract-msoffice.c      (working copy)
@@ -118,7 +118,26 @@
        }
 }
 
+/*
+ * gsf_doc_meta_data_foreach callback for the DocumentSummaryInformation
+ * stream: stores the "CreativeCommons_LicenseURL" property as File:License.
+ */
+static void
+doc_metadata_cb (gpointer key, gpointer value, gpointer user_data)
+{
+       const gchar  *prop_name = key;
+       GHashTable   *table = user_data;
+       GValue const *prop_val = gsf_doc_prop_get_val ((GsfDocProp *) value);
 
+       /* every other property name is ignored */
+       if (strcmp (prop_name, "CreativeCommons_LicenseURL") != 0) {
+               return;
+       }
+
+       add_gvalue_in_hash_table (table, "File:License", prop_val);
+}
+
+
 void
 tracker_extract_msoffice (gchar *filename, GHashTable *metadata)
 {
@@ -145,25 +164,37 @@
        }
 
        stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
-       g_object_unref (G_OBJECT (infile));
-
-       if (!stream) {
-               gsf_shutdown ();
-               return;
+       if (stream) {
+               md = gsf_doc_meta_data_new ();
+       
+               if (gsf_msole_metadata_read (stream, md)) {
+                       gsf_shutdown ();
+                       return;
+               }
+       
+               gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
+       
+               g_object_unref (G_OBJECT (md));
+               g_object_unref (G_OBJECT (stream));
        }
 
-       md = gsf_doc_meta_data_new ();
-
-       if (gsf_msole_metadata_read (stream, md)) {
-               gsf_shutdown ();
-               return;
+       stream = gsf_infile_child_by_name (infile, "\05DocumentSummaryInformation");
+       if (stream) {
+               md = gsf_doc_meta_data_new ();
+       
+               if (gsf_msole_metadata_read (stream, md)) {
+                       gsf_shutdown ();
+                       return;
+               }
+       
+               gsf_doc_meta_data_foreach (md, doc_metadata_cb, metadata);
+       
+               g_object_unref (G_OBJECT (md));
+               g_object_unref (G_OBJECT (stream));
        }
 
-       gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
+       g_object_unref (G_OBJECT (infile));
 
-       g_object_unref (G_OBJECT (md));
-       g_object_unref (G_OBJECT (stream));
-
        gsf_shutdown ();
 }
 
Index: src/tracker-extract/tracker-extract-pdf.c
===================================================================
--- src/tracker-extract/tracker-extract-pdf.c   (revision 598)
+++ src/tracker-extract/tracker-extract-pdf.c   (working copy)
@@ -26,6 +26,8 @@
 #include <string.h>
 #include <glib.h>
 
+#include "tracker-extract.h"
+
 void tracker_extract_pdf (gchar *filename, GHashTable *metadata)
 {
        PopplerDocument *document;
@@ -34,6 +36,7 @@
        gchar           *author;
        gchar           *subject;
        gchar           *keywords;
+       gchar           *metadata_xml;
        GTime            creation_date;
        GError          *error = NULL;
 
@@ -50,6 +53,7 @@
                "subject", &subject,
                "keywords", &keywords,
                "creation-date", &creation_date,
+               "metadata", &metadata_xml,
                NULL);
 
        if (title && strlen (title))
@@ -71,10 +75,15 @@
        g_hash_table_insert (metadata, g_strdup ("Doc:PageCount"),
                g_strdup_printf ("%d", poppler_document_get_n_pages (document)));
 
+       if ( metadata_xml ) {
+               tracker_read_xmp (metadata_xml,strlen(metadata_xml),metadata);
+       }
+
        g_free (title);
        g_free (author);
        g_free (subject);
        g_free (keywords);
+       g_free (metadata_xml);
        g_object_unref (document);
 }
 
Index: src/tracker-extract/tracker-extract-png.c
===================================================================
--- src/tracker-extract/tracker-extract-png.c   (revision 598)
+++ src/tracker-extract/tracker-extract-png.c   (working copy)
@@ -20,6 +20,8 @@
 
 #include "config.h"
 
+#include "tracker-extract.h"
+
 #include <stdio.h>
 #include <glib.h>
 #include <png.h>
@@ -79,15 +81,23 @@
                                             g_strdup_printf ("%ld", height));
                }
 
-
                if (png_get_text (png_ptr, info_ptr, &text_ptr, &num_text) > 0) {
-                       for (i = 0; i < num_text; i++) {
-                               for (j=0; tagmap[j].type; j++) {
-                                       if (strcasecmp (tagmap[j].name,  text_ptr[i].key) == 0) {
-                                               if (text_ptr[i].text && strlen (text_ptr[i].text) > 0) {
-                                                       g_hash_table_insert (metadata, g_strdup 
(tagmap[j].type), g_strdup (text_ptr[i].text));
+                       for (i = 0; i < num_text; i++) {
+                               if ( text_ptr[i].key != NULL ) {
+                                       #if defined(HAVE_EXEMPI) && defined(PNG_iTXt_SUPPORTED)
+                                       if (strcmp("XML:com.adobe.xmp",text_ptr[i].key) == 0) {
+                                               
tracker_read_xmp(text_ptr[i].text,text_ptr[i].itxt_length,metadata);
+                                               continue;
+                                       }
+                                       #endif
+       
+                                       for (j=0; tagmap[j].type; j++) {
+                                               if (strcasecmp (tagmap[j].name,  text_ptr[i].key) == 0) {
+                                                       if (text_ptr[i].text && strlen (text_ptr[i].text) > 
0) {
+                                                               g_hash_table_insert (metadata, g_strdup 
(tagmap[j].type), g_strdup (text_ptr[i].text));
+                                                       }
+                                                       break;
                                                }
-                                               break;
                                        }
                                }
                        }
Index: src/tracker-extract/tracker-extract.c
===================================================================
--- src/tracker-extract/tracker-extract.c       (revision 598)
+++ src/tracker-extract/tracker-extract.c       (working copy)
@@ -52,6 +52,9 @@
 void tracker_extract_totem     (gchar *, GHashTable *);
 void tracker_extract_oasis     (gchar *, GHashTable *);
 void tracker_extract_ps                (gchar *, GHashTable *);
+#ifdef HAVE_LIBXML2
+void tracker_extract_html              (gchar *, GHashTable *);
+#endif
 #ifdef HAVE_POPPLER
 void tracker_extract_pdf       (gchar *, GHashTable *);
 #endif
@@ -84,6 +87,10 @@
        /* Document extractors */
        { "application/vnd.oasis.opendocument.*",       tracker_extract_oasis           },
        { "application/postscript",                     tracker_extract_ps              },
+#ifdef HAVE_LIBXML2
+       { "text/html",                                                  tracker_extract_html    },
+       { "application/xhtml+xml",      tracker_extract_html    },
+#endif
 #ifdef HAVE_POPPLER
        { "application/pdf",                            tracker_extract_pdf             },
 #endif
Index: src/tracker-extract/Makefile.am
===================================================================
--- src/tracker-extract/Makefile.am     (revision 598)
+++ src/tracker-extract/Makefile.am     (working copy)
@@ -5,6 +5,7 @@
        $(LIBGSF_CFLAGS)                        \
        $(LIBGSF_CFLAGS)                        \
        $(GSTREAMER_CFLAGS)                     \
+       $(LIBXML2_CFLAGS)                       \
        $(XINE_CFLAGS)
 
 bin_PROGRAMS = tracker-extract
@@ -33,6 +34,7 @@
        tracker-extract-imagemagick.c           \
        tracker-extract-mplayer.c               \
        tracker-extract-totem.c                 \
+       tracker-extract-html.c                  \
        $(video_sources)
 
 tracker_extract_LDADD = $(GLIB2_LIBS)          \
@@ -41,4 +43,5 @@
        $(LIBEXIF_LIBS)                         \
        $(LIBGSF_LIBS)                          \
        $(GSTREAMER_LIBS)                       \
+       $(LIBXML2_LIBS)                         \
        $(XINE_LIBS)
Index: configure.ac
===================================================================
--- configure.ac        (revision 598)
+++ configure.ac        (working copy)
@@ -605,8 +605,25 @@
    [ AC_DEFINE(IOPRIO_SUPPORT,[],[Define ioprio support]) ioprio_support=yes ])
 AC_MSG_RESULT([$ioprio_support])
 
-#####################################################
+##################################################################
+# check for libxml2
+##################################################################
 
+LIBXML2_REQUIRED=0.6
+
+AC_ARG_ENABLE(libxml2, AC_HELP_STRING([--disable-libxml2],[Disable HTML/XML extractors (full-text will still 
be available)]),,[enable_libxml2=yes])
+if test "x$enable_libxml2" = "xyes"; then
+       PKG_CHECK_MODULES(LIBXML2, [
+               libxml-2.0 >= $LIBXML2_REQUIRED],
+               [have_libxml2=yes] , [have_libxml2=no])
+       AC_SUBST(LIBXML2_CFLAGS)
+       AC_SUBST(LIBXML2_LIBS)
+else
+       have_libxml2="no (disabled)"
+fi
+AM_CONDITIONAL(HAVE_LIBXML2, test "$have_libxml2" = "yes")
+test "$have_libxml2" = "yes" && AC_DEFINE(HAVE_LIBXML2, [], [Define if we have libxml2])
+
 AC_CONFIG_FILES([
        Makefile
        tracker.pc
@@ -675,6 +692,7 @@
        exif (jpeg):                            $have_libexif
        gsf:                                    $have_libgsf
        video files:                            $videos_are_handled ($videos_handler)
+       xml/html formats:                       $have_libxml2
 
 "
 if test "x$enable_external_sqlite" = "xyes"; then
Index: src/tracker-extract/tracker-extract-html.c
===================================================================
--- src/tracker-extract/tracker-extract-html.c  (revision 0)
+++ src/tracker-extract/tracker-extract-html.c  (revision 0)
@@ -0,0 +1,161 @@
+/* Tracker Extract - extracts embedded metadata from files
+ * Copyright (C) 2007, Jason Kivlighn (jkivlighn gmail com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#ifdef HAVE_LIBXML2
+
+#include <string.h>
+#include <glib.h>
+#include <libxml/HTMLparser.h>
+
+typedef enum { /* what the next characters() callback should capture */
+               READ_TITLE, /* set by startElement() on <title>; characters() then stores Doc:Title */
+       } tag_type;
+
+typedef struct { /* state handed to the SAX callbacks as their user-data pointer */
+       GHashTable *metadata; /* table the callbacks insert extracted values into */
+       tag_type current; /* pending capture; initialised and reset to -1 (nothing pending) */
+} HTMLParseInfo;
+
+/* TRUE if atts (name/value pairs, NULL-terminated) has attr; a non-NULL val must also equal its value. */
+gboolean
+has_attribute( const xmlChar ** atts, const char *attr, const char*val )
+{
+       int i;
+       /* atts is NULL for an element without attributes; a valueless attribute has a NULL value */
+       for ( i = 0; atts && atts[i]; i+=2 ) {
+               if ( strcmp((char*)atts[i],attr) == 0 &&
+                    ( !val || ( atts[i+1] && strcmp((char*)atts[i+1],val) == 0 ) ) ) {
+                       return TRUE;
+               }
+       }
+       return FALSE;
+}
+
+/* Returns the value stored for attr in atts (not copied), or NULL if attr is absent. */
+const xmlChar *
+lookup_attribute( const xmlChar **atts, const char *attr )
+{
+       int i;
+       /* libxml2 passes atts == NULL for an element that has no attributes */
+       for ( i = 0; atts && atts[i]; i+=2 ) {
+               if ( strcmp((char*)atts[i],attr) == 0 ) {
+                       return atts[i+1];
+               }
+       }
+       return NULL;
+}
+
+/* SAX startElement callback: records license, author and description metadata; flags <title> for characters(). */
+void
+startElement (void * info, const xmlChar * name, const xmlChar ** atts)
+{
+       HTMLParseInfo *parse = (HTMLParseInfo *) info;
+       const char    *tag = (const char *) name;
+       const xmlChar *value;
+       if ( strcmp(tag,"title") == 0 ) {
+               parse->current = READ_TITLE;
+       } else if ( strcmp(tag,"a") == 0 ) {
+               /* RDFa license link; ignored when an "about" attribute refers it to another document */
+               if ( has_attribute(atts,"rel","license") && !has_attribute(atts,"about",NULL) ) {
+                       value = lookup_attribute(atts,"href");
+                       if ( value ) {
+                               g_hash_table_insert (parse->metadata, g_strdup ("File:License"), g_strdup( (char*)value ));
+                       }
+               }
+       } else if ( strcmp(tag,"meta") == 0 ) {
+               if ( has_attribute(atts,"name","Author") ) {
+                       value = lookup_attribute(atts,"content");
+                       if ( value ) {
+                               g_hash_table_insert (parse->metadata, g_strdup ("Doc:Author"), g_strdup( (char*)value ));
+                       }
+               }
+               if ( has_attribute(atts,"name","DC.Description") ) {
+                       value = lookup_attribute(atts,"content");
+                       if ( value ) {
+                               g_hash_table_insert (parse->metadata, g_strdup ("Doc:Comments"), g_strdup( (char*)value ));
+                       }
+               }
+       }
+}
+
+/* SAX characters callback: stores the first text run after <title> as Doc:Title, then clears the flag. */
+void
+characters(void * info, const xmlChar * ch, int len)
+{
+       HTMLParseInfo *parse = (HTMLParseInfo *) info;
+
+       if ( parse->current == READ_TITLE ) {
+               /* SAX delivers len bytes at ch with no promise of a trailing NUL, so bound the copy */
+               g_hash_table_insert (parse->metadata, g_strdup ("Doc:Title"), g_strndup( (const char*)ch, len ));
+       }
+       /* any text run, title or not, ends a pending capture */
+       parse->current = -1;
+}
+
+void tracker_extract_html (gchar* filename, GHashTable *metadata) /* parse filename as HTML; the SAX callbacks fill metadata */
+{
+       xmlSAXHandler SAXHandlerStruct = { /* positional initialiser: entries follow xmlSAXHandler's field order */
+                       NULL, /* internalSubset */
+                       NULL, /* isStandalone */
+                       NULL, /* hasInternalSubset */
+                       NULL, /* hasExternalSubset */
+                       NULL, /* resolveEntity */
+                       NULL, /* getEntity */
+                       NULL, /* entityDecl */
+                       NULL, /* notationDecl */
+                       NULL, /* attributeDecl */
+                       NULL, /* elementDecl */
+                       NULL, /* unparsedEntityDecl */
+                       NULL, /* setDocumentLocator */
+                       NULL, /* startDocument */
+                       NULL, /* endDocument */
+                       startElement, /* startElement */
+                       NULL, /* endElement */
+                       NULL, /* reference */
+                       characters, /* characters */
+                       NULL, /* ignorableWhitespace */
+                       NULL, /* processingInstruction */
+                       NULL, /* comment */
+                       NULL, /* warning */
+                       NULL, /* error */
+                       NULL, /* fatalError */
+                       NULL, /* getParameterEntity */
+                       NULL, /* cdataBlock */
+                       NULL, /* externalSubset */
+                       1,    /* initialized */
+                       NULL, /* _private */
+                       NULL, /* startElementNs */
+                       NULL, /* endElementNs */
+                       NULL  /* serror */
+       };
+       /* -1 is not READ_TITLE, so no title capture is pending at start */
+       HTMLParseInfo   info = { metadata, -1 };
+       /* &info is the user-data pointer handed to startElement() and characters() */
+       htmlDocPtr doc;
+       doc = htmlSAXParseFile(filename, NULL, &SAXHandlerStruct, &info);
+       if ( doc ) {
+               xmlFreeDoc(doc);
+       }
+}
+
+#else
+#warning "Not building HTML metadata extractor."
+#endif


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]