Re: [Tracker] Extracting Embedded Licenses



On Mon, 2007-06-18 at 12:49 -0700, Jason Kivlighn wrote:
Hi,

imagemagick: Uses 'convert filename xmp:-' to output an image's embedded
XMP.  This works for at least JPEG and TIFF files.  For JPEGs, however,
ImageMagick outputs the namespace and XMP, separated by \0.  I'm not
sure how I can handle this, without simply assuming that 'convert'
returned two null-terminated strings.  Nevertheless, this extracts the
XMP from TIFF files.

msoffice: Extends the msoffice extractor to also parse the
DocumentSummaryInformation infile, which contains user-defined metadata,
along with license metadata embedded by the MSOffice Creative Commons Add-in.

Is this the old msoffice file format? It would be great to also account
for the new msxml and best practices for this. Is there any info on
this?

pdf: Extends the pdf extractor to read a PDF's metadata stream and parse
it as XMP.  I'm still awaiting poppler extending the glib bindings to
allow reading the metadata stream.  Until then, it will simply never
find the metadata stream and go on without error.

Did you provide them a patch? Speed that ish up with patches!

png: Adds a check for the XML:com.adobe.xmp iTXt field, and parses it as
XMP.

gr8!

html: Adds a new html parser using libxml2.  Parses the document,
checking for RDFa licenses.  It also checks for other basic HTML
properties like title and author.

There are also several XML formats I'd like to parse for license data,
particularly SVG and SMIL.  Would this be do-able, and if so, how should
I go about it?  Write new extractors for each format or is this too much
overhead?  These could use GMarkupParse, rather than bringing in libxml2
like the HTML parser.

Yes, totally doable. Look at how Inkscape currently handles SVG metadata,
figure out the best way to deal with the current Inkscape method, and
harmonize it with our XMP approach.

Cheers!

Jon

Cheers,
Jason

plain text document attachment (tracker-imagemagick-extract-xmp.patch)
Index: src/tracker-extract/tracker-extract-imagemagick.c
===================================================================
--- src/tracker-extract/tracker-extract-imagemagick.c (revision 598)
+++ src/tracker-extract/tracker-extract-imagemagick.c (working copy)
@@ -35,7 +35,7 @@
      gint           exit_status;
 
      /* imagemagick crashes trying to extract from xcf files */
-     if (g_str_has_suffix (filename, '.xcf')) {
+     if (g_str_has_suffix (filename, ".xcf")) {
              return;
      }
 
@@ -60,5 +60,16 @@
                      g_hash_table_insert (metadata, g_strdup ("Image:Comments"), g_strdup (g_strescape 
(lines[2], "")));
              }
      }
+
+     gchar         *xmp;
+     argv[0] = g_strdup ("convert");
+     argv[1] = g_strdup (filename);
+     argv[2] = g_strdup ("xmp:-");
+     argv[3] = NULL;
+
+     if (tracker_spawn (argv, 10, &xmp, &exit_status)) {
+             if (exit_status == EXIT_SUCCESS) {
+                     tracker_read_xmp(xmp,strlen(xmp),metadata);
+             }
+     }
 }
-
plain text document attachment
(tracker-msoffice-extract-license.patch)
Index: src/tracker-extract/tracker-extract-msoffice.c
===================================================================
--- src/tracker-extract/tracker-extract-msoffice.c    (revision 598)
+++ src/tracker-extract/tracker-extract-msoffice.c    (working copy)
@@ -118,7 +118,26 @@
      }
 }
 
+static void
+doc_metadata_cb (gpointer key, gpointer value, gpointer user_data)
+{
+     gchar           *name;
+     GsfDocProp      *property;
+     GHashTable      *metadata;
+     GValue const    *val;
 
+     name = (gchar *) key;
+     property = (GsfDocProp *) value;
+     metadata = (GHashTable *) user_data;
+
+     val = gsf_doc_prop_get_val (property);
+
+     if (strcmp (name, "CreativeCommons_LicenseURL") == 0) {
+             add_gvalue_in_hash_table (metadata, "File:License", val);
+     }
+}
+
+
 void
 tracker_extract_msoffice (gchar *filename, GHashTable *metadata)
 {
@@ -145,25 +164,37 @@
      }
 
      stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
-     g_object_unref (G_OBJECT (infile));
-
-     if (!stream) {
-             gsf_shutdown ();
-             return;
+     if (stream) {
+             md = gsf_doc_meta_data_new ();
+     
+             if (gsf_msole_metadata_read (stream, md)) {
+                     gsf_shutdown ();
+                     return;
+             }
+     
+             gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
+     
+             g_object_unref (G_OBJECT (md));
+             g_object_unref (G_OBJECT (stream));
      }
 
-     md = gsf_doc_meta_data_new ();
-
-     if (gsf_msole_metadata_read (stream, md)) {
-             gsf_shutdown ();
-             return;
+     stream = gsf_infile_child_by_name (infile, "\05DocumentSummaryInformation");
+     if (stream) {
+             md = gsf_doc_meta_data_new ();
+     
+             if (gsf_msole_metadata_read (stream, md)) {
+                     gsf_shutdown ();
+                     return;
+             }
+     
+             gsf_doc_meta_data_foreach (md, doc_metadata_cb, metadata);
+     
+             g_object_unref (G_OBJECT (md));
+             g_object_unref (G_OBJECT (stream));
      }
 
-     gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
+     g_object_unref (G_OBJECT (infile));
 
-     g_object_unref (G_OBJECT (md));
-     g_object_unref (G_OBJECT (stream));
-
      gsf_shutdown ();
 }
 
plain text document attachment (tracker-pdf-extract-xmp.patch)
Index: src/tracker-extract/tracker-extract-pdf.c
===================================================================
--- src/tracker-extract/tracker-extract-pdf.c (revision 598)
+++ src/tracker-extract/tracker-extract-pdf.c (working copy)
@@ -26,6 +26,8 @@
 #include <string.h>
 #include <glib.h>
 
+#include "tracker-extract.h"
+
 void tracker_extract_pdf (gchar *filename, GHashTable *metadata)
 {
      PopplerDocument *document;
@@ -34,6 +36,7 @@
      gchar           *author;
      gchar           *subject;
      gchar           *keywords;
+     gchar           *metadata_xml;
      GTime            creation_date;
      GError          *error = NULL;
 
@@ -50,6 +53,7 @@
              "subject", &subject,
              "keywords", &keywords,
              "creation-date", &creation_date,
+             "metadata", &metadata_xml,
              NULL);
 
      if (title && strlen (title))
@@ -71,10 +75,15 @@
      g_hash_table_insert (metadata, g_strdup ("Doc:PageCount"),
              g_strdup_printf ("%d", poppler_document_get_n_pages (document)));
 
+     if ( metadata_xml ) {
+             tracker_read_xmp (metadata_xml,strlen(metadata_xml),metadata);
+     }
+
      g_free (title);
      g_free (author);
      g_free (subject);
      g_free (keywords);
+     g_free (metadata_xml);
      g_object_unref (document);
 }
 
plain text document attachment (tracker-png-extract-xmp.patch)
Index: src/tracker-extract/tracker-extract-png.c
===================================================================
--- src/tracker-extract/tracker-extract-png.c (revision 598)
+++ src/tracker-extract/tracker-extract-png.c (working copy)
@@ -20,6 +20,8 @@
 
 #include "config.h"
 
+#include "tracker-extract.h"
+
 #include <stdio.h>
 #include <glib.h>
 #include <png.h>
@@ -79,15 +81,23 @@
                                           g_strdup_printf ("%ld", height));
              }
 
-
              if (png_get_text (png_ptr, info_ptr, &text_ptr, &num_text) > 0) {
-                     for (i = 0; i < num_text; i++) {
-                             for (j=0; tagmap[j].type; j++) {
-                                     if (strcasecmp (tagmap[j].name,  text_ptr[i].key) == 0) {
-                                             if (text_ptr[i].text && strlen (text_ptr[i].text) > 0) {
-                                                     g_hash_table_insert (metadata, g_strdup 
(tagmap[j].type), g_strdup (text_ptr[i].text));
+                     for (i = 0; i < num_text; i++) {
+                             if ( text_ptr[i].key != NULL ) {
+                                     #if defined(HAVE_EXEMPI) && defined(PNG_iTXt_SUPPORTED)
+                                     if (strcmp("XML:com.adobe.xmp",text_ptr[i].key) == 0) {
+                                             
tracker_read_xmp(text_ptr[i].text,text_ptr[i].itxt_length,metadata);
+                                             continue;
+                                     }
+                                     #endif
+     
+                                     for (j=0; tagmap[j].type; j++) {
+                                             if (strcasecmp (tagmap[j].name,  text_ptr[i].key) == 0) {
+                                                     if (text_ptr[i].text && strlen (text_ptr[i].text) > 
0) {
+                                                             g_hash_table_insert (metadata, g_strdup 
(tagmap[j].type), g_strdup (text_ptr[i].text));
+                                                     }
+                                                     break;
                                              }
-                                             break;
                                      }
                              }
                      }
plain text document attachment (tracker-extract-html.patch)
Index: src/tracker-extract/tracker-extract.c
===================================================================
--- src/tracker-extract/tracker-extract.c     (revision 598)
+++ src/tracker-extract/tracker-extract.c     (working copy)
@@ -52,6 +52,9 @@
 void tracker_extract_totem   (gchar *, GHashTable *);
 void tracker_extract_oasis   (gchar *, GHashTable *);
 void tracker_extract_ps              (gchar *, GHashTable *);
+#ifdef HAVE_LIBXML2
+void tracker_extract_html            (gchar *, GHashTable *);
+#endif
 #ifdef HAVE_POPPLER
 void tracker_extract_pdf     (gchar *, GHashTable *);
 #endif
@@ -84,6 +87,10 @@
      /* Document extractors */
      { "application/vnd.oasis.opendocument.*",       tracker_extract_oasis           },
      { "application/postscript",                     tracker_extract_ps              },
+#ifdef HAVE_LIBXML2
+     { "text/html",                                                  tracker_extract_html    },
+     { "application/xhtml+xml",      tracker_extract_html    },
+#endif
 #ifdef HAVE_POPPLER
      { "application/pdf",                            tracker_extract_pdf             },
 #endif
Index: src/tracker-extract/Makefile.am
===================================================================
--- src/tracker-extract/Makefile.am   (revision 598)
+++ src/tracker-extract/Makefile.am   (working copy)
@@ -5,6 +5,7 @@
      $(LIBGSF_CFLAGS)                        \
      $(LIBGSF_CFLAGS)                        \
      $(GSTREAMER_CFLAGS)                     \
+     $(LIBXML2_CFLAGS)                       \
      $(XINE_CFLAGS)
 
 bin_PROGRAMS = tracker-extract
@@ -33,6 +34,7 @@
      tracker-extract-imagemagick.c           \
      tracker-extract-mplayer.c               \
      tracker-extract-totem.c                 \
+     tracker-extract-html.c                  \
      $(video_sources)
 
 tracker_extract_LDADD = $(GLIB2_LIBS)                \
@@ -41,4 +43,5 @@
      $(LIBEXIF_LIBS)                         \
      $(LIBGSF_LIBS)                          \
      $(GSTREAMER_LIBS)                       \
+     $(LIBXML2_LIBS)                         \
      $(XINE_LIBS)
Index: configure.ac
===================================================================
--- configure.ac      (revision 598)
+++ configure.ac      (working copy)
@@ -605,8 +605,25 @@
    [ AC_DEFINE(IOPRIO_SUPPORT,[],[Define ioprio support]) ioprio_support=yes ])
 AC_MSG_RESULT([$ioprio_support])
 
-#####################################################
+##################################################################
+# check for libxml2
+##################################################################
 
+LIBXML2_REQUIRED=0.6
+
+AC_ARG_ENABLE(libxml2, AC_HELP_STRING([--disable-libxml2],[Disable HTML/XML extractors (full-text will 
still be available)]),,[enable_libxml2=yes])
+if test "x$enable_libxml2" = "xyes"; then
+     PKG_CHECK_MODULES(LIBXML2, [
+             libxml-2.0 >= $LIBXML2_REQUIRED],
+             [have_libxml2=yes] , [have_libxml2=no])
+     AC_SUBST(LIBXML2_CFLAGS)
+     AC_SUBST(LIBXML2_LIBS)
+else
+     have_libxml2="no (disabled)"
+fi
+AM_CONDITIONAL(HAVE_LIBXML2, test "$have_libxml2" = "yes")
+test "$have_libxml2" = "yes" && AC_DEFINE(HAVE_LIBXML2, [], [Define if we have libxml2])
+
 AC_CONFIG_FILES([
      Makefile
      tracker.pc
@@ -675,6 +692,7 @@
      exif (jpeg):                            $have_libexif
      gsf:                                    $have_libgsf
      video files:                            $videos_are_handled ($videos_handler)
+     xml/html formats:                       $have_libxml2
 
 "
 if test "x$enable_external_sqlite" = "xyes"; then
Index: src/tracker-extract/tracker-extract-html.c
===================================================================
--- src/tracker-extract/tracker-extract-html.c        (revision 0)
+++ src/tracker-extract/tracker-extract-html.c        (revision 0)
@@ -0,0 +1,161 @@
+/* Tracker Extract - extracts embedded metadata from files
+ * Copyright (C) 2007, Jason Kivlighn (jkivlighn gmail com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#ifdef HAVE_LIBXML2
+
+#include <string.h>
+#include <glib.h>
+#include <libxml/HTMLparser.h>
+
+typedef enum {
+             READ_TITLE,
+     } tag_type;
+
+typedef struct {
+     GHashTable *metadata;
+     tag_type current;
+} HTMLParseInfo;
+
+gboolean
+has_attribute( const xmlChar ** atts, const char *attr, const char*val )
+{
+     int i;
+     for ( i = 0; atts[i]; i+=2 )
+     {
+             if ( strcmp((char*)atts[i],attr) == 0 ) {
+                     if ( !val || strcmp((char*)atts[i+1],val) == 0 ) {
+                             return TRUE;
+                     }
+             }
+     }
+     return FALSE;
+}
+
+const xmlChar *
+lookup_attribute( const xmlChar **atts, const char *attr )
+{
+     int i;
+     for ( i = 0; atts[i]; i+=2 )
+     {
+             if ( strcmp((char*)atts[i],attr) == 0 ) {
+                     return atts[i+1];
+             }
+     }
+
+     return NULL;
+}
+
+void
+startElement (void * info, const xmlChar * name, const xmlChar ** atts)
+{
+     /* Look for RDFa triple describing the license */
+     if ( strcmp((char*)name,"a") == 0 ) {
+             /* This tag is a license.  Ignore, however, if it is referring to another document */
+             if ( has_attribute(atts,"rel","license") && !has_attribute(atts,"about",NULL) ) {
+                     const xmlChar *href = lookup_attribute(atts,"href");
+                     if ( href ) {
+                             g_hash_table_insert (((HTMLParseInfo *)info)->metadata, g_strdup 
("File:License"),
+                                                  g_strdup( (char*)href ));
+                     }
+             }
+     } else if ( strcmp((char*)name,"title") == 0 ) {
+             ((HTMLParseInfo *)info)->current = READ_TITLE;
+     } else if ( strcmp((char*)name,"meta") == 0 ) {
+             if ( has_attribute(atts,"name","Author") ) {
+                     const xmlChar *author = lookup_attribute(atts,"content");
+                     if ( author ) {
+                             g_hash_table_insert (((HTMLParseInfo *)info)->metadata, g_strdup 
("Doc:Author"),
+                                                  g_strdup( (char*)author ));
+                     }
+             }
+             if ( has_attribute(atts,"name","DC.Description") ) {
+                     const xmlChar *desc = lookup_attribute(atts,"content");
+                     if ( desc ) {
+                             g_hash_table_insert (((HTMLParseInfo *)info)->metadata, g_strdup 
("Doc:Comments"),
+                                                  g_strdup( (char*)desc ));
+                     }
+             }
+     }
+}
+
+void
+characters(void * info, const xmlChar * ch, int len)
+{
+     switch(((HTMLParseInfo *)info)->current) {
+             case READ_TITLE:
+                             g_hash_table_insert (((HTMLParseInfo *)info)->metadata, g_strdup 
("Doc:Title"),
+                                                  g_strdup( (char*)ch ));
+                             break;
+             default: break;
+     }
+
+     ((HTMLParseInfo *)info)->current = -1;
+}
+
+void tracker_extract_html (gchar* filename, GHashTable *metadata)
+{
+     xmlSAXHandler SAXHandlerStruct = {
+                     NULL, /* internalSubset */
+                     NULL, /* isStandalone */
+                     NULL, /* hasInternalSubset */
+                     NULL, /* hasExternalSubset */
+                     NULL, /* resolveEntity */
+                     NULL, /* getEntity */
+                     NULL, /* entityDecl */
+                     NULL, /* notationDecl */
+                     NULL, /* attributeDecl */
+                     NULL, /* elementDecl */
+                     NULL, /* unparsedEntityDecl */
+                     NULL, /* setDocumentLocator */
+                     NULL, /* startDocument */
+                     NULL, /* endDocument */
+                     startElement, /* startElement */
+                     NULL, /* endElement */
+                     NULL, /* reference */
+                     characters, /* characters */
+                     NULL, /* ignorableWhitespace */
+                     NULL, /* processingInstruction */
+                     NULL, /* comment */
+                     NULL, /* xmlParserWarning */
+                     NULL, /* xmlParserError */
+                     NULL, /* xmlParserError */
+                     NULL, /* getParameterEntity */
+                     NULL, /* cdataBlock */
+                     NULL, /* externalSubset */
+                     1,    /* initialized */
+                     NULL, /* private */
+                     NULL, /* startElementNsSAX2Func */
+                     NULL, /* endElementNsSAX2Func */
+                     NULL  /* xmlStructuredErrorFunc */
+     };
+
+     HTMLParseInfo   info = { metadata, -1 };
+
+     htmlDocPtr doc;
+     doc = htmlSAXParseFile(filename, NULL, &SAXHandlerStruct, &info);
+     if ( doc ) {
+             xmlFreeDoc(doc);
+     }
+}
+
+#else
+#warning "Not building HTML metadata extractor."
+#endif
_______________________________________________
tracker-list mailing list
tracker-list gnome org
http://mail.gnome.org/mailman/listinfo/tracker-list
-- 
Jon Phillips

San Francisco, CA
USA PH 510.499.0894
jon rejon org
http://www.rejon.org

MSN, AIM, Yahoo Chat: kidproto
Jabber Chat: rejon gristle org
IRC: rejon irc freenode net




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]