[tracker/wip/carlosg/ttl-parser: 1/2] libtracker-data: Rewrite TrackerTurtleReader to reuse parser grammar



commit eec2b522a384cc3719ac29cbf824c1a36b14f938
Author: Carlos Garnacho <carlosg gnome org>
Date:   Sun Mar 8 22:35:49 2020 +0100

    libtracker-data: Rewrite TrackerTurtleReader to reuse parser grammar
    
    Instead of tapping on the old TrackerSparqlScanner, use the grammar
    definitions from the new parser. Also rewrite it in C.
    
    This allows us to drop a whole bunch of the older parser code, which
    only stood there to parse TTL files efficiently. The new SPARQL parser
    code is designed to generate an intermediate expression tree, which is
    great to increase liberty at the time of interpreting it, but not so great
    at the time of deserializing randomly sized blobs of TTL data.
    
    Even though this new TTL parser doesn't 100% use the infrastructure of
    the new SPARQL parser, it taps on it for the essentials (parsing the
    subject/predicate/object terminals), and also allows incremental TTL
    loading without memory peaks. The advantages of the old TTL parser
    (and the only reason why it stuck) are now moot.

 src/libtracker-data/.gitignore              |   1 -
 src/libtracker-data/meson.build             |   2 +-
 src/libtracker-data/tracker-data-manager.c  |  44 +-
 src/libtracker-data/tracker-data-update.c   |  24 +-
 src/libtracker-data/tracker-turtle-reader.c | 678 ++++++++++++++++++++++++++++
 src/libtracker-data/tracker-turtle-reader.h |  44 ++
 utils/ontology/data-validator.c             |  23 +-
 utils/ontology/ontology-validator.c         |  12 +-
 8 files changed, 776 insertions(+), 52 deletions(-)
---
diff --git a/src/libtracker-data/.gitignore b/src/libtracker-data/.gitignore
index 5d33cf879..85438fb65 100644
--- a/src/libtracker-data/.gitignore
+++ b/src/libtracker-data/.gitignore
@@ -4,6 +4,5 @@ tracker-sparql-pattern.c
 tracker-sparql-query.[ch]
 tracker-sparql-query.vapi
 tracker-sparql-scanner.c
-tracker-turtle-reader.c
 *.valid
 *.cfg.5
diff --git a/src/libtracker-data/meson.build b/src/libtracker-data/meson.build
index 58e38f607..4c04f7a31 100644
--- a/src/libtracker-data/meson.build
+++ b/src/libtracker-data/meson.build
@@ -4,7 +4,6 @@
 libtracker_data_vala = static_library('tracker-sparql-query',
     'tracker-vala-namespace.vala',
     'tracker-sparql-scanner.vala',
-    'tracker-turtle-reader.vala',
     '../libtracker-common/libtracker-common.vapi',
     'libtracker-data.vapi',
     tracker_sparql_vapi,
@@ -54,6 +53,7 @@ libtracker_data = library('tracker-data',
     'tracker-sparql-parser.c',
     'tracker-sparql-types.c',
     'tracker-sparql.c',
+    'tracker-turtle-reader.c',
     'tracker-uuid.c',
     'tracker-vtab-service.c',
     'tracker-vtab-triples.c',
diff --git a/src/libtracker-data/tracker-data-manager.c b/src/libtracker-data/tracker-data-manager.c
index a0bc8e863..109d94923 100644
--- a/src/libtracker-data/tracker-data-manager.c
+++ b/src/libtracker-data/tracker-data-manager.c
@@ -46,6 +46,7 @@
 #include "tracker-sparql-query.h"
 #include "tracker-data-query.h"
 #include "tracker-sparql-parser.h"
+#include "tracker-turtle-reader.h"
 
 #define RDF_PROPERTY                    TRACKER_PREFIX_RDF "Property"
 #define RDF_TYPE                        TRACKER_PREFIX_RDF "type"
@@ -1817,10 +1818,11 @@ load_ontology_file (TrackerDataManager  *manager,
                     GError             **error)
 {
        TrackerTurtleReader *reader;
-       GError              *ttl_error = NULL;
-       gchar               *ontology_uri;
+       GError *ttl_error = NULL;
+       gchar *ontology_uri;
+       const gchar *subject, *predicate, *object;
 
-       reader = tracker_turtle_reader_new (file, &ttl_error);
+       reader = tracker_turtle_reader_new_for_file (file, &ttl_error);
 
        if (ttl_error) {
                g_propagate_error (error, ttl_error);
@@ -1832,14 +1834,11 @@ load_ontology_file (TrackerDataManager  *manager,
        /* Post checks are only needed for ontology updates, not the initial
         * ontology */
 
-       while (ttl_error == NULL && tracker_turtle_reader_next (reader, &ttl_error)) {
-               const gchar *subject, *predicate, *object;
+       while (tracker_turtle_reader_next (reader,
+                                          &subject, &predicate, &object,
+                                          NULL, &ttl_error)) {
                GError *ontology_error = NULL;
 
-               subject = tracker_turtle_reader_get_subject (reader);
-               predicate = tracker_turtle_reader_get_predicate (reader);
-               object = tracker_turtle_reader_get_object (reader);
-
                tracker_data_ontology_load_statement (manager, ontology_uri,
                                                      subject, predicate, object,
                                                      max_id, in_update, NULL, NULL,
@@ -1864,12 +1863,13 @@ static TrackerOntology*
 get_ontology_from_file (TrackerDataManager *manager,
                         GFile              *file)
 {
+       const gchar *subject, *predicate, *object;
        TrackerTurtleReader *reader;
        GError *error = NULL;
        GHashTable *ontology_uris;
        TrackerOntology *ret = NULL;
 
-       reader = tracker_turtle_reader_new (file, &error);
+       reader = tracker_turtle_reader_new_for_file (file, &error);
 
        if (error) {
                g_critical ("Turtle parse error: %s", error->message);
@@ -1882,13 +1882,9 @@ get_ontology_from_file (TrackerDataManager *manager,
                                               g_free,
                                               g_object_unref);
 
-       while (error == NULL && tracker_turtle_reader_next (reader, &error)) {
-               const gchar *subject, *predicate, *object;
-
-               subject = tracker_turtle_reader_get_subject (reader);
-               predicate = tracker_turtle_reader_get_predicate (reader);
-               object = tracker_turtle_reader_get_object (reader);
-
+       while (tracker_turtle_reader_next (reader,
+                                          &subject, &predicate, &object,
+                                          NULL, &error)) {
                if (g_strcmp0 (predicate, RDF_TYPE) == 0) {
                        if (g_strcmp0 (object, TRACKER_PREFIX_TRACKER "Ontology") == 0) {
                                TrackerOntology *ontology;
@@ -2050,10 +2046,12 @@ import_ontology_file (TrackerDataManager *manager,
                       GFile              *file,
                       gboolean            in_update)
 {
+       const gchar *subject, *predicate, *object;
+       gboolean object_is_uri;
        GError *error = NULL;
        TrackerTurtleReader* reader;
 
-       reader = tracker_turtle_reader_new (file, &error);
+       reader = tracker_turtle_reader_new_for_file (file, &error);
 
        if (error != NULL) {
                g_critical ("%s", error->message);
@@ -2061,14 +2059,12 @@ import_ontology_file (TrackerDataManager *manager,
                return;
        }
 
-       while (tracker_turtle_reader_next (reader, &error)) {
-               const gchar *subject = tracker_turtle_reader_get_subject (reader);
-               const gchar *predicate = tracker_turtle_reader_get_predicate (reader);
-               const gchar *object  = tracker_turtle_reader_get_object (reader);
-
+       while (tracker_turtle_reader_next (reader,
+                                          &subject, &predicate, &object,
+                                          &object_is_uri, &error)) {
                tracker_data_ontology_process_statement (manager,
                                                         subject, predicate, object,
-                                                        tracker_turtle_reader_get_object_is_uri (reader),
+                                                        object_is_uri,
                                                         in_update);
        }
 
diff --git a/src/libtracker-data/tracker-data-update.c b/src/libtracker-data/tracker-data-update.c
index 64ebee694..66a094dca 100644
--- a/src/libtracker-data/tracker-data-update.c
+++ b/src/libtracker-data/tracker-data-update.c
@@ -38,6 +38,7 @@
 #include "tracker-property.h"
 #include "tracker-sparql-query.h"
 #include "tracker-sparql.h"
+#include "tracker-turtle-reader.h"
 
 typedef struct _TrackerDataUpdateBuffer TrackerDataUpdateBuffer;
 typedef struct _TrackerDataUpdateBufferGraph TrackerDataUpdateBufferGraph;
@@ -3074,34 +3075,35 @@ tracker_data_load_turtle_file (TrackerData  *data,
        TrackerTurtleReader *reader = NULL;
        GError *inner_error = NULL;
        gboolean in_transaction = FALSE;
+       const gchar *subject, *predicate, *object_str;
+       gboolean object_is_uri;
 
        tracker_data_begin_transaction (data, &inner_error);
        if (inner_error)
                goto failed;
 
        in_transaction = TRUE;
-       reader = tracker_turtle_reader_new (file, &inner_error);
+       reader = tracker_turtle_reader_new_for_file (file, &inner_error);
        if (inner_error)
                goto failed;
 
-       while (tracker_turtle_reader_next (reader, &inner_error)) {
-               const gchar *object_str;
+       while (tracker_turtle_reader_next (reader,
+                                          &subject,
+                                          &predicate,
+                                          &object_str,
+                                          &object_is_uri,
+                                          &inner_error)) {
                GBytes *object;
 
-               object_str = tracker_turtle_reader_get_object (reader);
                object = g_bytes_new (object_str, strlen (object_str) + 1);
 
-               if (tracker_turtle_reader_get_object_is_uri (reader)) {
+               if (object_is_uri) {
                        tracker_data_insert_statement_with_uri (data, graph,
-                                                               tracker_turtle_reader_get_subject (reader),
-                                                               tracker_turtle_reader_get_predicate (reader),
-                                                               object,
+                                                               subject, predicate, object,
                                                                &inner_error);
                } else {
                        tracker_data_insert_statement_with_string (data, graph,
-                                                                  tracker_turtle_reader_get_subject (reader),
-                                                                  tracker_turtle_reader_get_predicate (reader),
-                                                                  object,
+                                                                  subject, predicate, object,
                                                                   &inner_error);
                }
 
diff --git a/src/libtracker-data/tracker-turtle-reader.c b/src/libtracker-data/tracker-turtle-reader.c
new file mode 100644
index 000000000..a072227ae
--- /dev/null
+++ b/src/libtracker-data/tracker-turtle-reader.c
@@ -0,0 +1,678 @@
+/*
+ * Copyright (C) 2020, Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+ * Author: Carlos Garnacho <carlosg gnome org>
+ */
+#include "config.h"
+
+#include "tracker-turtle-reader.h"
+#include "tracker-sparql-grammar.h"
+#include "tracker-uuid.h"
+
+#include <libtracker-sparql/tracker-connection.h>
+
+#define BUF_SIZE 1024
+/* Full IRI of rdf:type, used as the predicate when the "a" keyword is parsed.
+ * The definition must not end in a semicolon: the macro is expanded inside
+ * expressions such as g_strdup (RDF_TYPE).
+ */
+#define RDF_TYPE "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+
+/* Which part of a statement the reader expects to parse next. */
+typedef enum
+{
+       /* Start-up state; the iterator switches it to STATE_SUBJECT */
+       STATE_INITIAL,
+       /* Expecting a subject, or an @prefix / @base directive */
+       STATE_SUBJECT,
+       /* Expecting a predicate (including the "a" shorthand) */
+       STATE_PREDICATE,
+       /* Expecting an object; a complete triple is available once parsed */
+       STATE_OBJECT,
+       /* An object was just produced; expecting punctuation such as "]", "," or ";" */
+       STATE_STEP,
+} ParserState;
+
+/* Snapshot of the reader saved by push_stack() when a "[" opens an anonymous
+ * blank node, and restored by pop_stack() on the matching "]".
+ */
+typedef struct {
+       gchar *subject;   /* copy of the subject at push time (may be NULL) */
+       gchar *predicate; /* copy of the predicate at push time (may be NULL) */
+       ParserState state; /* state the reader was in when "[" was found */
+} StateStack;
+
+/* Instance data of the Turtle reader. */
+struct _TrackerTurtleReader {
+       GObject parent_instance;
+       /* Stream given through the "stream" construct property */
+       GInputStream *stream;
+       /* Buffered wrapper around 'stream'; all parsing peeks/skips on it */
+       GBufferedInputStream *buffered_stream;
+       /* Blank node label -> generated identifier (both owned by the table) */
+       GHashTable *blank_nodes;
+       /* Prefix name as parsed from @prefix -> IRI (both owned by the table) */
+       GHashTable *prefixes;
+       /* Stack of StateStack entries, one per open "[" */
+       GArray *parser_state;
+       /* IRI set by the last @base directive, or NULL */
+       gchar *base;
+       /* Components of the current triple, owned by the reader */
+       gchar *subject;
+       gchar *predicate;
+       gchar *object;
+       /* TRUE when 'object' is an IRI or blank node rather than a literal */
+       gboolean object_is_uri;
+       /* What the parser expects next */
+       ParserState state;
+};
+
+/* GObject property identifiers; ids start at 1, slot 0 of props[] stays unused. */
+enum {
+       PROP_STREAM = 1,
+       N_PROPS
+};
+
+/* Param specs, filled in by tracker_turtle_reader_class_init() */
+static GParamSpec *props[N_PROPS] = { 0 };
+
+G_DEFINE_TYPE (TrackerTurtleReader,
+               tracker_turtle_reader,
+               G_TYPE_OBJECT)
+
+/* GObject finalize handler: closes both input streams, releases everything
+ * the reader owns and chains up to the parent class.
+ */
+static void
+tracker_turtle_reader_finalize (GObject *object)
+{
+       TrackerTurtleReader *reader = TRACKER_TURTLE_READER (object);
+
+       /* Either stream may be unset if the object was created without a
+        * valid "stream" property; do not hand NULL to GIO in that case.
+        */
+       if (reader->buffered_stream)
+               g_input_stream_close (G_INPUT_STREAM (reader->buffered_stream), NULL, NULL);
+       if (reader->stream)
+               g_input_stream_close (reader->stream, NULL, NULL);
+       g_clear_object (&reader->buffered_stream);
+       g_clear_object (&reader->stream);
+
+       /* Entries still on the stack (e.g. parsing stopped inside a "[ ... ]"
+        * block) own the string copies made by push_stack(). The array has
+        * no clear function, so they have to be released by hand.
+        */
+       if (reader->parser_state) {
+               guint i;
+
+               for (i = 0; i < reader->parser_state->len; i++) {
+                       StateStack *state;
+
+                       state = &g_array_index (reader->parser_state, StateStack, i);
+                       g_free (state->subject);
+                       g_free (state->predicate);
+               }
+       }
+
+       g_clear_pointer (&reader->blank_nodes, g_hash_table_unref);
+       g_clear_pointer (&reader->prefixes, g_hash_table_unref);
+       g_clear_pointer (&reader->parser_state, g_array_unref);
+       g_clear_pointer (&reader->subject, g_free);
+       g_clear_pointer (&reader->predicate, g_free);
+       g_clear_pointer (&reader->object, g_free);
+       g_clear_pointer (&reader->base, g_free);
+
+       G_OBJECT_CLASS (tracker_turtle_reader_parent_class)->finalize (object);
+}
+
+/* GObject constructed handler: wraps the stream received at construction
+ * time in a buffered stream, then chains up.
+ */
+static void
+tracker_turtle_reader_constructed (GObject *object)
+{
+       TrackerTurtleReader *self = TRACKER_TURTLE_READER (object);
+       GInputStream *buffered;
+
+       buffered = g_buffered_input_stream_new (self->stream);
+       self->buffered_stream = G_BUFFERED_INPUT_STREAM (buffered);
+
+       G_OBJECT_CLASS (tracker_turtle_reader_parent_class)->constructed (object);
+}
+
+/* GObject set_property handler. Only "stream" is writable; the reader takes
+ * its own reference on the object carried by @value.
+ */
+static void
+tracker_turtle_reader_set_property (GObject      *object,
+                                    guint         prop_id,
+                                    const GValue *value,
+                                    GParamSpec   *pspec)
+{
+       TrackerTurtleReader *self = TRACKER_TURTLE_READER (object);
+
+       if (prop_id == PROP_STREAM) {
+               self->stream = g_value_dup_object (value);
+               return;
+       }
+
+       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
+}
+
+/* GObject get_property handler. Only "stream" is readable; it is stored in
+ * @value with g_value_set_object().
+ */
+static void
+tracker_turtle_reader_get_property (GObject    *object,
+                                    guint       prop_id,
+                                    GValue     *value,
+                                    GParamSpec *pspec)
+{
+       TrackerTurtleReader *self = TRACKER_TURTLE_READER (object);
+
+       if (prop_id == PROP_STREAM) {
+               g_value_set_object (value, self->stream);
+               return;
+       }
+
+       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
+}
+
+/* Class initializer: installs the GObject virtual methods and registers the
+ * construct-only "stream" property.
+ */
+static void
+tracker_turtle_reader_class_init (TrackerTurtleReaderClass *klass)
+{
+       GObjectClass *gobject_class = G_OBJECT_CLASS (klass);
+       GParamFlags stream_flags = G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY;
+
+       gobject_class->set_property = tracker_turtle_reader_set_property;
+       gobject_class->get_property = tracker_turtle_reader_get_property;
+       gobject_class->constructed = tracker_turtle_reader_constructed;
+       gobject_class->finalize = tracker_turtle_reader_finalize;
+
+       props[PROP_STREAM] = g_param_spec_object ("stream",
+                                                 "Stream",
+                                                 "Stream",
+                                                 G_TYPE_INPUT_STREAM,
+                                                 stream_flags);
+
+       g_object_class_install_properties (gobject_class, N_PROPS, props);
+}
+
+/* Instance initializer: creates the (initially empty) state stack and the
+ * two string-keyed lookup tables, which own both their keys and values.
+ */
+static void
+tracker_turtle_reader_init (TrackerTurtleReader *reader)
+{
+       reader->parser_state = g_array_new (FALSE, FALSE, sizeof (StateStack));
+       reader->prefixes = g_hash_table_new_full (g_str_hash, g_str_equal,
+                                                 g_free, g_free);
+       reader->blank_nodes = g_hash_table_new_full (g_str_hash, g_str_equal,
+                                                    g_free, g_free);
+}
+
+/* Creates a reader that parses Turtle data from @istream.
+ *
+ * Returns NULL (after a failed precondition check) if @istream is not a
+ * GInputStream.
+ */
+TrackerTurtleReader *
+tracker_turtle_reader_new (GInputStream *istream)
+{
+       TrackerTurtleReader *reader;
+
+       g_return_val_if_fail (G_IS_INPUT_STREAM (istream), NULL);
+
+       reader = g_object_new (TRACKER_TYPE_TURTLE_READER,
+                              "stream", istream,
+                              NULL);
+       return reader;
+}
+
+/* Opens @file for reading and creates a reader on top of it.
+ *
+ * Returns NULL with @error set if the file cannot be opened. The local
+ * reference on the file stream is dropped before returning.
+ */
+TrackerTurtleReader *
+tracker_turtle_reader_new_for_file (GFile   *file,
+                                    GError **error)
+{
+       TrackerTurtleReader *result;
+       GInputStream *file_stream;
+
+       g_return_val_if_fail (G_IS_FILE (file), NULL);
+       g_return_val_if_fail (!error || !*error, NULL);
+
+       file_stream = G_INPUT_STREAM (g_file_read (file, NULL, error));
+
+       if (file_stream == NULL)
+               return NULL;
+
+       result = tracker_turtle_reader_new (file_stream);
+       g_object_unref (file_stream);
+
+       return result;
+}
+
+/* Saves copies of the current subject and predicate, plus the parser state,
+ * on top of the state stack. Called when a "[" opens an anonymous blank node.
+ */
+static void
+push_stack (TrackerTurtleReader *reader)
+{
+       StateStack snapshot;
+
+       snapshot.state = reader->state;
+       snapshot.subject = g_strdup (reader->subject);
+       snapshot.predicate = g_strdup (reader->predicate);
+
+       g_array_append_val (reader->parser_state, snapshot);
+}
+
+/* Restores the subject/predicate/state saved by the most recent push_stack()
+ * and removes that entry from the stack. Called when a "]" closes an
+ * anonymous blank node.
+ *
+ * The subject in use before the call (the blank node identifier) is reused:
+ * it becomes the current object if the "[" was found in object position, or
+ * stays the current subject if it was found in subject position.
+ *
+ * The stack must not be empty; this function does not check for it.
+ */
+static void
+pop_stack (TrackerTurtleReader *reader)
+{
+       StateStack *state;
+       gchar *s, *p, *o;
+
+       /* Detach the current triple so its strings can be reused or freed */
+       s = reader->subject;
+       p = reader->predicate;
+       o = reader->object;
+       reader->subject = reader->predicate = reader->object = NULL;
+
+       /* Ownership of the saved strings moves from the stack entry to the reader */
+       state = &g_array_index (reader->parser_state, StateStack, reader->parser_state->len - 1);
+       reader->subject = state->subject;
+       reader->predicate = state->predicate;
+       reader->state = state->state;
+
+       if (reader->state == STATE_OBJECT) {
+               /* Restore the old subject as current object */
+               reader->object = s;
+               reader->object_is_uri = TRUE;
+               s = NULL;
+       } else if (reader->state == STATE_SUBJECT) {
+               /* The blank node stays as subject; drop the restored one */
+               g_clear_pointer (&reader->subject, g_free);
+               reader->subject = s;
+               s = NULL;
+       }
+
+       /* s is NULL here if it was handed over above */
+       g_free (s);
+       g_free (p);
+       g_free (o);
+       g_array_remove_index (reader->parser_state, reader->parser_state->len - 1);
+}
+
+/* Tries to consume the literal @token (compared case-insensitively) from the
+ * start of the buffered data.
+ *
+ * Returns TRUE if the token was found and skipped. Returns FALSE, consuming
+ * nothing, if fewer than strlen (token) bytes are buffered, if the data does
+ * not match, or if skipping fails.
+ */
+static gboolean
+parse_token (TrackerTurtleReader *reader,
+             const gchar         *token)
+{
+       gsize len = strlen (token);
+       const gchar *buffer;
+       gsize size;
+
+       buffer = g_buffered_input_stream_peek_buffer (reader->buffered_stream,
+                                                     &size);
+       /* Never compare past the end of the peeked data */
+       if (size == 0 || size < len)
+               return FALSE;
+       if (strncasecmp (buffer, token, len) != 0)
+               return FALSE;
+       /* The skip call returns -1 on error; treat that as a failure too */
+       if (g_input_stream_skip (G_INPUT_STREAM (reader->buffered_stream),
+                                len, NULL, NULL) <= 0)
+               return FALSE;
+
+       return TRUE;
+}
+
+/* Tries to match a grammar terminal at the start of the buffered data and
+ * consume it.
+ *
+ * @terminal_func: matcher for the terminal; it receives the start and end of
+ *   the buffered data and reports where the match ends.
+ * @padding: number of delimiter characters to strip from each end of the
+ *   match (e.g. 1 for <iri> or "string").
+ * @out: (optional) location for a newly allocated copy of the match with
+ *   padding removed; the caller frees it. May be NULL to discard the text.
+ *
+ * Returns TRUE if the terminal matched and was skipped in the stream.
+ */
+static gboolean
+parse_terminal (TrackerTurtleReader  *reader,
+                TrackerTerminalFunc   terminal_func,
+                guint                 padding,
+                gchar               **out)
+{
+       const gchar *end, *buffer;
+       gchar *str;
+       gsize size;
+
+       buffer = g_buffered_input_stream_peek_buffer (reader->buffered_stream,
+                                                     &size);
+       if (size == 0)
+               return FALSE;
+
+       if (!terminal_func (buffer, &buffer[size], &end))
+               return FALSE;
+
+       /* The match must be long enough to hold both delimiters */
+       if (end - buffer < 2 * padding)
+               return FALSE;
+
+       str = g_strndup (&buffer[padding], end - buffer - (2 * padding));
+
+       /* The skip call returns -1 on error; treat that as a failure too */
+       if (g_input_stream_skip (G_INPUT_STREAM (reader->buffered_stream),
+                                end - buffer, NULL, NULL) <= 0) {
+               g_free (str);
+               return FALSE;
+       }
+
+       /* Hand the copy to the caller, or drop it if it was not asked for */
+       if (out)
+               *out = str;
+       else
+               g_free (str);
+
+       return TRUE;
+}
+
+/* Returns a newly allocated identifier for a blank node.
+ *
+ * With a NULL @label a fresh identifier is generated on every call. With a
+ * label, the identifier is generated once, remembered in the blank node
+ * table, and a copy of it is returned for that label from then on.
+ */
+static gchar *
+generate_bnode (TrackerTurtleReader *reader,
+                const gchar         *label)
+{
+       const gchar *known;
+       gchar *fresh;
+
+       if (label == NULL)
+               return tracker_generate_uuid ("urn:uuid");
+
+       known = g_hash_table_lookup (reader->blank_nodes, label);
+       if (known != NULL)
+               return g_strdup (known);
+
+       /* First time this label is seen: the table takes ownership */
+       fresh = tracker_generate_uuid ("urn:uuid");
+       g_hash_table_insert (reader->blank_nodes, g_strdup (label), fresh);
+
+       return g_strdup (fresh);
+}
+
+/* Expands a prefixed name using the prefixes registered so far.
+ *
+ * The first table entry whose key is a leading substring of @shortname is
+ * used: the key is replaced with the IRI stored for it.
+ *
+ * Returns a newly allocated string, or NULL if no registered prefix matches.
+ */
+static gchar *
+expand_prefix (TrackerTurtleReader *reader,
+               const gchar         *shortname)
+{
+       GHashTableIter iter;
+       gpointer key, value;
+       GString *expanded;
+
+       g_hash_table_iter_init (&iter, reader->prefixes);
+
+       while (g_hash_table_iter_next (&iter, &key, &value)) {
+               if (!g_str_has_prefix (shortname, key))
+                       continue;
+
+               expanded = g_string_new (value);
+               g_string_append (expanded, shortname + strlen (key));
+               return g_string_free (expanded, FALSE);
+       }
+
+       return NULL;
+}
+
+/* Prepends the current base IRI, if any, to @suffix.
+ *
+ * Takes ownership of @suffix: it is either returned as is (no base set) or
+ * freed after being copied into the newly allocated result.
+ */
+static gchar *
+expand_base (TrackerTurtleReader *reader,
+             gchar               *suffix)
+{
+       gchar *full;
+
+       if (!reader->base)
+               return suffix;
+
+       full = g_strdup_printf ("%s%s", reader->base, suffix);
+       g_free (suffix);
+
+       return full;
+}
+
+/* Skips buffered characters one at a time for as long as they satisfy the
+ * grammar's WS test. Stops at the first other character, when no data is
+ * buffered, or when skipping fails.
+ */
+static void
+advance_whitespace (TrackerTurtleReader *reader)
+{
+       GInputStream *input = G_INPUT_STREAM (reader->buffered_stream);
+       const gchar *data;
+       gsize size;
+       gchar ch;
+
+       for (;;) {
+               data = g_buffered_input_stream_peek_buffer (reader->buffered_stream, &size);
+               if (size == 0)
+                       return;
+
+               /* WS is evaluated against the local variable 'ch' */
+               ch = data[0];
+               if (!(WS))
+                       return;
+
+               if (!g_input_stream_skip (input, 1, NULL, NULL))
+                       return;
+       }
+}
+
+/* Parses the remainder of an "@prefix" directive (prefix name, IRI and the
+ * closing "."), the keyword itself having been consumed by the caller, and
+ * registers the prefix.
+ *
+ * Returns FALSE with @error set if any of the three parts is missing.
+ */
+static gboolean
+handle_prefix (TrackerTurtleReader  *reader,
+               GError              **error)
+{
+       gchar *prefix = NULL, *uri = NULL;
+       gboolean ok;
+
+       advance_whitespace (reader);
+       ok = parse_terminal (reader, terminal_PNAME_NS, 0, &prefix);
+
+       if (ok) {
+               advance_whitespace (reader);
+               ok = parse_terminal (reader, terminal_IRIREF, 1, &uri);
+       }
+
+       if (ok) {
+               advance_whitespace (reader);
+               ok = parse_token (reader, ".");
+       }
+
+       if (ok) {
+               /* The table takes ownership of both strings */
+               g_hash_table_insert (reader->prefixes, prefix, uri);
+               return TRUE;
+       }
+
+       g_free (prefix);
+       g_free (uri);
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Could not parse @prefix");
+       return FALSE;
+}
+
+/* Parses the remainder of an "@base" directive (the IRI and the closing "."),
+ * the keyword itself having been consumed by the caller, and stores the IRI
+ * as the new base, replacing any previous one.
+ *
+ * Returns FALSE with @error set if the IRI or the "." is missing.
+ */
+static gboolean
+handle_base (TrackerTurtleReader  *reader,
+             GError              **error)
+{
+       gchar *base = NULL;
+
+       advance_whitespace (reader);
+       /* Padding of 1 strips the enclosing '<' and '>' so that only the IRI
+        * itself is stored; expand_base() prepends it verbatim.
+        */
+       if (!parse_terminal (reader, terminal_IRIREF, 1, &base))
+               goto error;
+
+       advance_whitespace (reader);
+       if (!parse_token (reader, "."))
+               goto error;
+
+       g_clear_pointer (&reader->base, g_free);
+       reader->base = base;
+       return TRUE;
+error:
+       g_free (base);
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Could not parse @base");
+       return FALSE;
+}
+
+/* Consumes an optional "^^<datatype>" suffix after a string literal.
+ *
+ * The datatype is parsed but its value is discarded. Returns TRUE if there
+ * is no "^^" or if a datatype follows it; FALSE with @error set if "^^" is
+ * not followed by an IRI or prefixed name.
+ */
+static gboolean
+handle_type_cast (TrackerTurtleReader  *reader,
+                  GError              **error)
+{
+       gboolean has_type;
+
+       if (!parse_token (reader, "^^"))
+               return TRUE;
+
+       /* Same alternatives, tried in the same order, stopping at the first match */
+       has_type = parse_terminal (reader, terminal_IRIREF, 1, NULL);
+       if (!has_type)
+               has_type = parse_terminal (reader, terminal_PNAME_LN, 0, NULL);
+       if (!has_type)
+               has_type = parse_terminal (reader, terminal_PNAME_NS, 0, NULL);
+
+       if (has_type)
+               return TRUE;
+
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Error parsing type cast");
+       return FALSE;
+}
+
+/* Skips consecutive "#" comment lines at the current position, together
+ * with the whitespace that follows each of them.
+ *
+ * Stops when the buffered data does not start with '#', when nothing is
+ * buffered, or when the end of the comment line is not within the buffered
+ * data (the comment is then left unconsumed).
+ */
+static void
+skip_comments (TrackerTurtleReader *reader)
+{
+       const gchar *buffer, *str;
+       gsize size;
+
+       while (TRUE) {
+               buffer = g_buffered_input_stream_peek_buffer (reader->buffered_stream,
+                                                             &size);
+               if (size == 0)
+                       break;
+               if (buffer[0] != '#')
+                       break;
+
+               /* The peeked data is a raw byte range, not a NUL-terminated
+                * string: bound the search to the bytes that are available.
+                */
+               str = memchr (buffer, '\n', size);
+               if (!str)
+                       break;
+
+               if (!g_input_stream_skip (G_INPUT_STREAM (reader->buffered_stream),
+                                         str + 1 - buffer, NULL, NULL))
+                       break;
+
+               advance_whitespace (reader);
+       }
+}
+
+static gboolean
+tracker_turtle_reader_iterate_next (TrackerTurtleReader  *reader,
+                                    GError              **error)
+{
+       while (TRUE) {
+               gchar *str;
+
+               advance_whitespace (reader);
+
+               if (g_buffered_input_stream_fill (reader->buffered_stream, -1, NULL, error) < 0)
+                       return FALSE;
+
+               switch (reader->state) {
+               case STATE_INITIAL:
+                       reader->state = STATE_SUBJECT;
+                       break;
+               case STATE_SUBJECT:
+                       skip_comments (reader);
+
+                       if (g_buffered_input_stream_get_available (reader->buffered_stream) == 0)
+                               return FALSE;
+
+                       if (parse_token (reader, "@prefix")) {
+                               if (!handle_prefix (reader, error))
+                                       return FALSE;
+                               break;
+                       } else if (parse_token (reader, "@base")) {
+                               if (!handle_base (reader, error))
+                                       return FALSE;
+                               break;
+                       }
+
+                       g_clear_pointer (&reader->subject, g_free);
+
+                       if (parse_token (reader, "[")) {
+                               /* Anonymous blank node */
+                               push_stack (reader);
+                               reader->subject = generate_bnode (reader, NULL);
+                               reader->state = STATE_PREDICATE;
+                               continue;
+                       }
+
+                       if (parse_terminal (reader, terminal_IRIREF, 1, &str)) {
+                               reader->subject = expand_base (reader, str);
+                       } else if (parse_terminal (reader, terminal_PNAME_LN, 0, &str) ||
+                                  parse_terminal (reader, terminal_PNAME_NS, 0, &str)) {
+                               reader->subject = expand_prefix (reader, str);
+                               g_free (str);
+                       } else if (parse_terminal (reader, terminal_BLANK_NODE_LABEL, 0, &str)) {
+                               reader->subject = generate_bnode (reader, str);
+                               g_free (str);
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Wrong subject token");
+                               return FALSE;
+                       }
+
+                       reader->state = STATE_PREDICATE;
+                       break;
+               case STATE_PREDICATE:
+                       g_clear_pointer (&reader->predicate, g_free);
+
+                       if (parse_token (reader, "a")) {
+                               reader->predicate = g_strdup (RDF_TYPE);
+                       } else if (parse_terminal (reader, terminal_IRIREF, 1, &str)) {
+                               reader->predicate = expand_base (reader, str);
+                       } else if (parse_terminal (reader, terminal_PNAME_LN, 0, &str) ||
+                                  parse_terminal (reader, terminal_PNAME_NS, 0, &str)) {
+                               reader->predicate = expand_prefix (reader, str);
+                               g_free (str);
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Wrong predicate token");
+                               return FALSE;
+                       }
+
+                       reader->state = STATE_OBJECT;
+                       break;
+               case STATE_OBJECT:
+                       g_clear_pointer (&reader->object, g_free);
+                       reader->object_is_uri = FALSE;
+
+                       if (parse_token (reader, "[")) {
+                               /* Anonymous blank node */
+                               push_stack (reader);
+                               reader->subject = generate_bnode (reader, NULL);
+                               reader->state = STATE_PREDICATE;
+                               continue;
+                       }
+
+                       if (parse_terminal (reader, terminal_IRIREF, 1, &str)) {
+                               reader->object = expand_base (reader, str);
+                               reader->object_is_uri = TRUE;
+                       } else if (parse_terminal (reader, terminal_PNAME_LN, 0, &str) ||
+                                  parse_terminal (reader, terminal_PNAME_NS, 0, &str)) {
+                               reader->object = expand_prefix (reader, str);
+                               reader->object_is_uri = TRUE;
+                               g_free (str);
+                       } else if (parse_terminal (reader, terminal_BLANK_NODE_LABEL, 0, &str)) {
+                               reader->object = generate_bnode (reader, str);
+                               reader->object_is_uri = TRUE;
+                               g_free (str);
+                       } else if (parse_terminal (reader, terminal_STRING_LITERAL1, 1, &str) ||
+                                  parse_terminal (reader, terminal_STRING_LITERAL2, 1, &str)) {
+                               reader->object = str;
+                               if (!handle_type_cast (reader, error))
+                                       return FALSE;
+                       } else if (parse_terminal (reader, terminal_STRING_LITERAL_LONG1, 3, &str) ||
+                                  parse_terminal (reader, terminal_STRING_LITERAL_LONG2, 3, &str)) {
+                               reader->object = str;
+                               if (!handle_type_cast (reader, error))
+                                       return FALSE;
+                       } else if (parse_terminal (reader, terminal_DOUBLE, 0, &str) ||
+                                  parse_terminal (reader, terminal_INTEGER, 0, &str)) {
+                               reader->object = str;
+                       } else if (parse_token (reader, "true")) {
+                               reader->object = g_strdup ("true");
+                       } else if (parse_token (reader, "false")) {
+                               reader->object = g_strdup ("false");
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Wrong object token");
+                               return FALSE;
+                       }
+
+                       reader->state = STATE_STEP;
+
+                       /* This is where next() stops, on lack of errors */
+                       return TRUE;
+                       break;
+               case STATE_STEP:
+                       if (reader->parser_state->len > 0 && parse_token (reader, "]")) {
+                               pop_stack (reader);
+                               if (reader->state == STATE_SUBJECT) {
+                                       reader->state = STATE_PREDICATE;
+                                       continue;
+                               } else if (reader->state == STATE_OBJECT) {
+                                       reader->state = STATE_STEP;
+                                       return TRUE;
+                               }
+                       }
+
+                       if (parse_token (reader, ",")) {
+                               reader->state = STATE_OBJECT;
+                       } else if (parse_token (reader, ";")) {
+                               /* Dot is allowed after semicolon */
+                               advance_whitespace (reader);
+                               if (parse_token (reader, "."))
+                                       reader->state = STATE_SUBJECT;
+                               else
+                                       reader->state = STATE_PREDICATE;
+                       } else if (parse_token (reader, ".")) {
+                               reader->state = STATE_SUBJECT;
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Expected comma, semicolon, or dot");
+                               return FALSE;
+                       }
+
+                       break;
+               }
+       }
+}
+
+/*
+ * tracker_turtle_reader_next:
+ *
+ * Asks the reader for its next triple and, when one is available, exposes
+ * it through @subject, @predicate and @object. @object_is_uri is optional
+ * (may be NULL) and receives the reader's flag telling whether the object
+ * is a resource rather than a literal.
+ *
+ * The returned strings are the reader's own fields, not copies: callers
+ * must not free them. The object string is released when the reader starts
+ * parsing the following object, so copy anything that has to outlive the
+ * next call.
+ *
+ * Returns TRUE if a triple was stored in the output arguments. Returns
+ * FALSE otherwise, leaving the output arguments untouched; parse failures
+ * are reported through @error.
+ */
+gboolean
+tracker_turtle_reader_next (TrackerTurtleReader  *reader,
+                            const gchar         **subject,
+                            const gchar         **predicate,
+                            const gchar         **object,
+                            gboolean             *object_is_uri,
+                            GError              **error)
+{
+       g_return_val_if_fail (TRACKER_IS_TURTLE_READER (reader), FALSE);
+       g_return_val_if_fail (subject, FALSE);
+       g_return_val_if_fail (predicate, FALSE);
+       g_return_val_if_fail (object, FALSE);
+       g_return_val_if_fail (!error || !*error, FALSE);
+
+       if (tracker_turtle_reader_iterate_next (reader, error)) {
+               /* Hand out borrowed pointers to the freshly parsed triple */
+               if (object_is_uri != NULL)
+                       *object_is_uri = reader->object_is_uri;
+
+               *object = reader->object;
+               *predicate = reader->predicate;
+               *subject = reader->subject;
+
+               return TRUE;
+       }
+
+       return FALSE;
+}
diff --git a/src/libtracker-data/tracker-turtle-reader.h b/src/libtracker-data/tracker-turtle-reader.h
new file mode 100644
index 000000000..d7c8c841b
--- /dev/null
+++ b/src/libtracker-data/tracker-turtle-reader.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2020, Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+ * Author: Carlos Garnacho <carlosg gnome org>
+ */
+
+#ifndef __TRACKER_TURTLE_READER_H__
+#define __TRACKER_TURTLE_READER_H__
+
+/* Kept inside the include guard, like the rest of the header contents */
+#include <gio/gio.h>
+
+/* TrackerTurtleReader is a final GObject type that hands out the contents
+ * of a Turtle (TTL) document one triple at a time. Instances are released
+ * with g_object_unref().
+ */
+#define TRACKER_TYPE_TURTLE_READER (tracker_turtle_reader_get_type ())
+G_DECLARE_FINAL_TYPE (TrackerTurtleReader,
+                      tracker_turtle_reader,
+                      TRACKER, TURTLE_READER,
+                      GObject);
+
+/* Constructors. tracker_turtle_reader_new() reads from @stream, while
+ * tracker_turtle_reader_new_for_file() reads from @file; callers keep
+ * their own reference on @file and may drop it right after the call.
+ * NOTE(review): the failure behavior of the _for_file() variant (what is
+ * returned when @error is set) is not documented here, confirm against
+ * the implementation before relying on it.
+ */
+TrackerTurtleReader * tracker_turtle_reader_new (GInputStream *stream);
+TrackerTurtleReader * tracker_turtle_reader_new_for_file (GFile   *file,
+                                                          GError **error);
+
+/* Fetches the next triple. On TRUE, @subject, @predicate and @object point
+ * to strings owned by @reader (do not free them; copy them if they must
+ * outlive the next call), and @object_is_uri, which may be NULL, tells
+ * whether the object is a resource instead of a literal. On FALSE the
+ * output arguments are left untouched; parse failures are reported
+ * through @error, so a FALSE return with @error unset presumably means
+ * the input is exhausted.
+ */
+gboolean tracker_turtle_reader_next (TrackerTurtleReader  *reader,
+                                     const gchar         **subject,
+                                     const gchar         **predicate,
+                                     const gchar         **object,
+                                     gboolean             *object_is_uri,
+                                     GError              **error);
+
+#endif /* __TRACKER_TURTLE_READER_H__ */
diff --git a/utils/ontology/data-validator.c b/utils/ontology/data-validator.c
index 38dbd90ac..f235a27ee 100644
--- a/utils/ontology/data-validator.c
+++ b/utils/ontology/data-validator.c
@@ -26,6 +26,7 @@
 #include <gio/gio.h>
 
 #include <libtracker-data/tracker-data.h>
+#include <libtracker-data/tracker-turtle-reader.h>
 
 static gchar         *ontology_dir = NULL;
 static gchar         *ttl_file = NULL;
@@ -118,6 +119,7 @@ load_ontology_files (const gchar *services_dir)
        conf_file = g_dir_read_name (services);
 
        while (conf_file) {
+               const gchar *subject, *predicate, *object;
                TrackerTurtleReader *reader;
                GError *error = NULL;
                GFile *file;
@@ -130,13 +132,13 @@ load_ontology_files (const gchar *services_dir)
                fullpath = g_build_filename (dir_uri, conf_file, NULL);
                file = g_file_new_for_path (fullpath);
 
-               reader = tracker_turtle_reader_new (file, NULL);
+               reader = tracker_turtle_reader_new_for_file (file, NULL);
                g_object_unref (file);
 
-               while (error == NULL && tracker_turtle_reader_next (reader, &error)) {
-                       turtle_load_ontology (tracker_turtle_reader_get_subject (reader),
-                                             tracker_turtle_reader_get_predicate (reader),
-                                             tracker_turtle_reader_get_object (reader));
+               while (tracker_turtle_reader_next (reader,
+                                                  &subject, &predicate, &object,
+                                                  NULL, &error)) {
+                       turtle_load_ontology (subject, predicate, object);
                }
 
                g_object_unref (reader);
@@ -162,6 +164,7 @@ load_ontology_files (const gchar *services_dir)
 gint
 main (gint argc, gchar **argv)
 {
+       const gchar *subject, *predicate, *object;
        GOptionContext *context;
        TrackerTurtleReader *reader;
        GError *error = NULL;
@@ -194,13 +197,13 @@ main (gint argc, gchar **argv)
        load_ontology_files (ontology_dir);
 
        file = g_file_new_for_commandline_arg (ttl_file);
-       reader = tracker_turtle_reader_new (file, NULL);
+       reader = tracker_turtle_reader_new_for_file (file, NULL);
        g_object_unref (file);
 
-       while (error == NULL && tracker_turtle_reader_next (reader, &error)) {
-               turtle_statement_handler (tracker_turtle_reader_get_subject (reader),
-                                         tracker_turtle_reader_get_predicate (reader),
-                                         tracker_turtle_reader_get_object (reader));
+       while (tracker_turtle_reader_next (reader,
+                                          &subject, &predicate, &object,
+                                          NULL, &error)) {
+               turtle_statement_handler (subject, predicate, object);
        }
 
        g_object_unref (reader);
diff --git a/utils/ontology/ontology-validator.c b/utils/ontology/ontology-validator.c
index 8d934337d..fd3123134 100644
--- a/utils/ontology/ontology-validator.c
+++ b/utils/ontology/ontology-validator.c
@@ -26,6 +26,7 @@
 #include <gio/gio.h>
 
 #include <libtracker-data/tracker-data.h>
+#include <libtracker-data/tracker-turtle-reader.h>
 
 static gchar         *ontology_dir = NULL;
 
@@ -178,18 +179,19 @@ turtle_load_ontology (const gchar *turtle_subject,
 static void
 process_file (const gchar *ttl_path)
 {
+       const gchar *subject, *predicate, *object;
        TrackerTurtleReader *reader;
        GError *error = NULL;
        GFile *ttl_file = g_file_new_for_path (ttl_path);
 
        g_print ("Processing %s\n", ttl_path);
 
-       reader = tracker_turtle_reader_new (ttl_file, NULL);
+       reader = tracker_turtle_reader_new_for_file (ttl_file, NULL);
 
-       while (error == NULL && tracker_turtle_reader_next (reader, &error)) {
-               turtle_load_ontology (tracker_turtle_reader_get_subject (reader),
-                                     tracker_turtle_reader_get_predicate (reader),
-                                     tracker_turtle_reader_get_object (reader));
+       while (tracker_turtle_reader_next (reader,
+                                          &subject, &predicate, &object,
+                                          NULL, &error)) {
+               turtle_load_ontology (subject, predicate, object);
        }
 
        g_object_unref (reader);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]