[tracker/wip/sam/resource: 26/27] Add support to extractors for outputting metadata as JSON-LD



commit e4890ac7b54aaef42b08bea7434d64aafab5f3e8
Author: Sam Thursfield <sam afuera me uk>
Date:   Thu Apr 7 17:30:56 2016 +0100

    Add support to extractors for outputting metadata as JSON-LD

 configure.ac                             |    6 +-
 src/libtracker-common/tracker-enums.h    |    6 ++
 src/libtracker-sparql/tracker-resource.c |  144 ++++++++++++++++++++++++++++++
 src/libtracker-sparql/tracker-resource.h |    2 +
 src/tracker-extract/tracker-extract.c    |   24 +++++-
 src/tracker-extract/tracker-extract.h    |    7 +-
 src/tracker-extract/tracker-main.c       |   18 ++++-
 src/tracker/tracker-extract.c            |   14 +++-
 8 files changed, 208 insertions(+), 13 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 541618e..e9a6454 100644
--- a/configure.ac
+++ b/configure.ac
@@ -237,6 +237,7 @@ GSTREAMER_REQUIRED=0.10.31
 GUPNP_DLNA_REQUIRED=0.9.4
 LIBPNG_REQUIRED=0.89
 LIBMEDIAART_REQUIRED=1.9.0
+JSON_GLIB_REQUIRED=1.0.4
 
 # 3.6.11 for sqlite_backup API
 # 3.6.16 to fix test failures
@@ -322,8 +323,9 @@ LIBTRACKER_CONTROL_REQUIRED="glib-2.0        >= $GLIB_REQUIRED
 PKG_CHECK_MODULES(LIBTRACKER_CONTROL, [$LIBTRACKER_CONTROL_REQUIRED])
 
 # Check requirements for libtracker-sparql
-LIBTRACKER_SPARQL_REQUIRED="glib-2.0     >= $GLIB_REQUIRED
-                            gio-unix-2.0 >= $GLIB_REQUIRED
+LIBTRACKER_SPARQL_REQUIRED="glib-2.0      >= $GLIB_REQUIRED
+                            gio-unix-2.0  >= $GLIB_REQUIRED
+                            json-glib-1.0 >= $JSON_GLIB_REQUIRED
                             uuid"
 
 PKG_CHECK_MODULES(LIBTRACKER_SPARQL, [$LIBTRACKER_SPARQL_REQUIRED])
diff --git a/src/libtracker-common/tracker-enums.h b/src/libtracker-common/tracker-enums.h
index 2be97c1..f3e2bbd 100644
--- a/src/libtracker-common/tracker-enums.h
+++ b/src/libtracker-common/tracker-enums.h
@@ -38,6 +38,12 @@ typedef enum {
 typedef enum {
        TRACKER_SERIALIZATION_FORMAT_SPARQL,
        TRACKER_SERIALIZATION_FORMAT_TURTLE,
+       /* JSON and JSON_LD are treated as the same thing right now, but we could
+        * treat them differently if we wanted. Also, it's nice to be able to pass
+        * both 'json' and 'json-ld' to `tracker extract --output-format=`.
+        */
+       TRACKER_SERIALIZATION_FORMAT_JSON,
+       TRACKER_SERIALIZATION_FORMAT_JSON_LD,
 } TrackerSerializationFormat;
 
 G_END_DECLS
diff --git a/src/libtracker-sparql/tracker-resource.c b/src/libtracker-sparql/tracker-resource.c
index 759911c..7e737a1 100644
--- a/src/libtracker-sparql/tracker-resource.c
+++ b/src/libtracker-sparql/tracker-resource.c
@@ -18,6 +18,7 @@
  */
 
 #include <glib.h>
+#include <json-glib/json-glib.h>
 
 #include <string.h>
 
@@ -979,3 +980,146 @@ tracker_resource_generate_sparql_update (TrackerResource *resource,
 
        g_list_free (done_list);
 }
+
+
+static void generate_jsonld_foreach (gpointer key, gpointer value_ptr, gpointer user_data);
+
+/* Build the JSON object for @self: an "@id" member (omitted for blank nodes)
+ * followed by one member per property. @error is accepted but never set here.
+ * FIXME: this could hit an infinite loop if there are circular resource
+ * relationships, make sure those are tested & detected.
+ *
+ * This is not exposed publicly right now because then everything including
+ * tracker-resource.h would need to pull in the json-glib dependency ...
+ */
+static JsonNode *
+tracker_resource_generate_jsonld (TrackerResource *self,
+                                   GError **error)
+{
+       /* FIXME: generate a JSON-LD context ! */
+       TrackerResourcePrivate *priv = GET_PRIVATE (self);
+       JsonBuilder *builder;
+       JsonNode *result;
+
+       builder = json_builder_new ();
+       json_builder_begin_object (builder);
+
+       /* The JSON-LD spec says it is "important that nodes have an identifier", but
+        * doesn't mandate one. I think it's better to omit the ID for blank nodes
+        * (where the caller passed NULL as an identifier) than to emit something
+        * SPARQL-specific like '_:123'.
+        */
+       if (strncmp (priv->identifier, "_:", 2) != 0) {
+               json_builder_set_member_name (builder, "@id");
+               json_builder_add_string_value (builder, priv->identifier);
+       }
+
+       g_hash_table_foreach (priv->properties, generate_jsonld_foreach, builder);
+       json_builder_end_object (builder);
+
+       result = json_builder_get_root (builder);
+       g_object_unref (builder);
+       return result;
+}
+
+/* Append @value to @builder: nested resources become JSON objects, URIs become
+ * plain strings, and anything else is stored via json_node_set_value().
+ */
+static void
+append_value_to_json_builder (const GValue *value,
+                              JsonBuilder *builder)
+{
+       JsonNode *node;
+
+       if (G_VALUE_HOLDS (value, TRACKER_TYPE_RESOURCE)) {
+               TrackerResource *resource = TRACKER_RESOURCE (g_value_get_object (value));
+               GError *error = NULL;
+
+               node = tracker_resource_generate_jsonld (resource, &error);
+               if (node) {
+                       json_builder_add_value (builder, node);
+               } else {
+                       /* @error may be unset on failure; never dereference it blindly. */
+                       g_warning ("Unable to serialize value: %s", error ? error->message : "unknown error");
+                       g_clear_error (&error);
+               }
+       } else if (G_VALUE_HOLDS (value, TRACKER_TYPE_URI)) {
+               /* URIs can be treated the same as strings in JSON-LD provided the @context
+                * sets the type of that property correctly. json_node_set_value() rejects a
+                * GValue holding TRACKER_TYPE_URI, so extract the string manually here.
+                */
+               node = json_node_new (JSON_NODE_VALUE);
+               json_node_set_string (node, g_value_get_string (value));
+               json_builder_add_value (builder, node);
+       } else {
+               node = json_node_new (JSON_NODE_VALUE);
+               json_node_set_value (node, value);
+               json_builder_add_value (builder, node);
+       }
+}
+
+/* GHFunc for the properties table: emits one JSON member per property, using
+ * a JSON array when the property value holds a GPtrArray of values. */
+static void
+generate_jsonld_foreach (gpointer key,
+                         gpointer value_ptr,
+                         gpointer user_data)
+{
+       JsonBuilder *json = JSON_BUILDER (user_data);
+       const GValue *prop_value = value_ptr;
+       const char *member;
+
+       /* FIXME: shouldn't hardcode the unexpanded prefix here!!! */
+       member = (strcmp (key, "rdf:type") == 0) ? "@type" : (const char *) key;
+       json_builder_set_member_name (json, member);
+
+       if (!G_VALUE_HOLDS (prop_value, G_TYPE_PTR_ARRAY)) {
+               append_value_to_json_builder (prop_value, json);
+               return;
+       }
+       json_builder_begin_array (json);
+       g_ptr_array_foreach (g_value_get_boxed (prop_value), (GFunc) append_value_to_json_builder, json);
+       json_builder_end_array (json);
+}
+
+/**
+ * tracker_resource_print_jsonld:
+ * @resource: a #TrackerResource
+ * @error: return location for a #GError, or %NULL
+ *
+ * Serializes @resource, including any resources nested inside it, into a
+ * pretty-printed JSON-LD document.
+ *
+ * See <https://json-ld.org/> for details of the JSON-LD format.
+ *
+ * Returns: a newly-allocated string that the caller must free with g_free(),
+ * or %NULL if the JSON-LD tree could not be built
+ *
+ * Since: 1.10
+ */
+char *
+tracker_resource_print_jsonld (TrackerResource *resource,
+                               GError **error)
+{
+       GError *inner_error = NULL;
+       JsonGenerator *writer;
+       JsonNode *root;
+       char *text;
+
+       root = tracker_resource_generate_jsonld (resource, &inner_error);
+       if (!root) {
+               g_propagate_error (error, inner_error);
+               return NULL;
+       }
+
+       writer = json_generator_new ();
+       json_generator_set_pretty (writer, TRUE);
+       json_generator_set_root (writer, root);
+
+       text = json_generator_to_data (writer, NULL);
+
+       g_object_unref (writer);
+       json_node_free (root);
+
+       return text;
+}
diff --git a/src/libtracker-sparql/tracker-resource.h b/src/libtracker-sparql/tracker-resource.h
index fe67b57..f225a64 100644
--- a/src/libtracker-sparql/tracker-resource.h
+++ b/src/libtracker-sparql/tracker-resource.h
@@ -75,6 +75,8 @@ gint tracker_resource_identifier_compare_func (TrackerResource *resource, const
 
 char *tracker_resource_print_turtle(TrackerResource *self, TrackerNamespaceManager *namespaces);
 
+char *tracker_resource_print_jsonld (TrackerResource *self, GError **error);
+
 void tracker_resource_generate_sparql_update (TrackerResource *self, TrackerSparqlBuilder *builder, TrackerNamespaceManager *namespaces, const char *graph_id, GError **error);
 
 G_END_DECLS
diff --git a/src/tracker-extract/tracker-extract.c b/src/tracker-extract/tracker-extract.c
index b4fe6e5..cdee66c 100644
--- a/src/tracker-extract/tracker-extract.c
+++ b/src/tracker-extract/tracker-extract.c
@@ -739,9 +739,10 @@ tracker_extract_get_media_art_process (TrackerExtract *extract)
 #endif
 
 void
-tracker_extract_get_metadata_by_cmdline (TrackerExtract *object,
-                                         const gchar    *uri,
-                                         const gchar    *mime)
+tracker_extract_get_metadata_by_cmdline (TrackerExtract             *object,
+                                         const gchar                *uri,
+                                         const gchar                *mime,
+                                         TrackerSerializationFormat  output_format)
 {
        GError *error = NULL;
        TrackerExtractPrivate *priv;
@@ -817,6 +818,23 @@ tracker_extract_get_metadata_by_cmdline (TrackerExtract *object,
                                        g_printerr ("%s\n", error->message);
                                        g_error_free (error);
                                }
+                       } else {
+                               /* JSON-LD extraction */
+                               char *json;
+
+                               /* If this was going into the tracker-store we'd generate a unique ID
+                                * here, so that the data persisted across file renames.
+                                */
+                               tracker_resource_set_identifier (resource, uri);
+
+                               json = tracker_resource_print_jsonld (resource, &error);
+                               if (json) {
+                                       g_print ("%s\n", json);
+                                       g_free (json);
+                               } else {
+                                       g_printerr ("%s\n", error->message);
+                                       g_error_free (error);
+                               }
                        }
 
                        tracker_extract_info_unref (info);
diff --git a/src/tracker-extract/tracker-extract.h b/src/tracker-extract/tracker-extract.h
index 50fa8c3..882c601 100644
--- a/src/tracker-extract/tracker-extract.h
+++ b/src/tracker-extract/tracker-extract.h
@@ -79,9 +79,10 @@ void            tracker_extract_dbus_start              (TrackerExtract
 void            tracker_extract_dbus_stop               (TrackerExtract         *extract);
 
 /* Not DBus API */
-void            tracker_extract_get_metadata_by_cmdline (TrackerExtract         *object,
-                                                         const gchar            *path,
-                                                         const gchar            *mime);
+void            tracker_extract_get_metadata_by_cmdline (TrackerExtract             *object,
+                                                         const gchar                *path,
+                                                         const gchar                *mime,
+                                                         TrackerSerializationFormat  output_format);
 
 G_END_DECLS
 
diff --git a/src/tracker-extract/tracker-main.c b/src/tracker-extract/tracker-main.c
index cc02fdd..db8d081 100644
--- a/src/tracker-extract/tracker-main.c
+++ b/src/tracker-extract/tracker-main.c
@@ -72,6 +72,7 @@ static gint verbosity = -1;
 static gchar *filename;
 static gchar *mime_type;
 static gchar *force_module;
+static gchar *output_format_name;
 static gboolean version;
 
 static TrackerConfig *config;
@@ -95,7 +96,7 @@ static GOptionEntry entries[] = {
          N_("Force a module to be used for extraction (e.g. \"foo\" for \"foo.so\")"),
          N_("MODULE") },
        { "output-format", 'o', 0, G_OPTION_ARG_STRING, &output_format_name,
-         N_("Output results format: 'sparql', or 'turtle'"),
+         N_("Output results format: 'sparql', 'turtle' or 'json'"),
          N_("FORMAT") },
        { "version", 'V', 0,
          G_OPTION_ARG_NONE, &version,
@@ -244,6 +245,9 @@ run_standalone (TrackerConfig *config)
        TrackerExtract *object;
        GFile *file;
        gchar *uri;
+       GEnumClass *enum_class;
+       GEnumValue *enum_value;
+       TrackerSerializationFormat output_format;
 
        /* Set log handler for library messages */
        g_log_set_default_handler (log_handler, NULL);
@@ -253,6 +257,16 @@ run_standalone (TrackerConfig *config)
                verbosity = 3;
        }
 
+       /* Look up the output format by name */
+       enum_class = g_type_class_ref (TRACKER_TYPE_SERIALIZATION_FORMAT);
+       enum_value = g_enum_get_value_by_nick (enum_class, output_format_name);
+       g_type_class_unref (enum_class);
+       if (!enum_value) {
+               g_printerr (N_("Unsupported serialization format '%s'\n"), output_format_name);
+               return EXIT_FAILURE;
+       }
+       output_format = enum_value->value;
+
        tracker_locale_init ();
 
        /* This makes sure we don't steal all the system's resources */
@@ -271,7 +285,7 @@ run_standalone (TrackerConfig *config)
                return EXIT_FAILURE;
        }
 
-       tracker_extract_get_metadata_by_cmdline (object, uri, mime_type);
+       tracker_extract_get_metadata_by_cmdline (object, uri, mime_type, output_format);
 
        g_object_unref (object);
        g_object_unref (file);
diff --git a/src/tracker/tracker-extract.c b/src/tracker/tracker-extract.c
index d4979f3..af219a5 100644
--- a/src/tracker/tracker-extract.c
+++ b/src/tracker/tracker-extract.c
@@ -31,6 +31,7 @@
 #include "tracker-extract.h"
 
 static gchar *verbosity;
+static gchar *output_format = "turtle";
 static gchar **filenames;
 
 #define EXTRACT_OPTIONS_ENABLED()        \
@@ -40,6 +41,9 @@ static GOptionEntry entries[] = {
        { "verbosity", 'v', 0, G_OPTION_ARG_STRING, &verbosity,
          N_("Sets the logging verbosity to LEVEL ('debug', 'detailed', 'minimal', 'errors') for all processes"),
          N_("LEVEL") },
+       { "output-format", 'o', 0, G_OPTION_ARG_STRING, &output_format,
+         N_("Output results format: 'sparql', 'turtle' or 'json-ld'"),
+         N_("FORMAT") },
        { G_OPTION_REMAINING, 0, 0, G_OPTION_ARG_FILENAME_ARRAY, &filenames,
          N_("FILE"),
          N_("FILE") },
@@ -48,7 +52,8 @@ static GOptionEntry entries[] = {
 
 
 static gint
-extract_files (TrackerVerbosity verbosity)
+extract_files (TrackerVerbosity verbosity,
+               char *output_format)
 {
        char **p;
        char *tracker_extract_path;
@@ -60,7 +65,10 @@ extract_files (TrackerVerbosity verbosity)
        tracker_extract_path = g_build_filename(LIBEXECDIR, "tracker-extract", NULL);
 
        for (p = filenames; *p; p++) {
-               char *argv[] = {tracker_extract_path, "--verbosity", verbosity_str, "--file", *p, NULL};
+               char *argv[] = {tracker_extract_path,
+                               "--output-format", output_format,
+                               "--verbosity", verbosity_str,
+                               "--file", *p, NULL };
 
                g_spawn_sync(NULL, argv, NULL, G_SPAWN_DEFAULT, NULL, NULL, NULL, NULL, NULL, &error);
 
@@ -99,7 +107,7 @@ extract_run (void)
                }
        }
 
-       return extract_files (verbosity_level);
+       return extract_files (verbosity_level, output_format);
 }
 
 static int


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]