[tracker/wip/carlosg/deserializers: 11/12] libtracker-sparql: Make the turtle reader a deserializer




commit 65c2e33c83fc22287f65cd04b0d83023fa15c990
Author: Carlos Garnacho <carlosg gnome org>
Date:   Sun Feb 13 12:37:11 2022 +0100

    libtracker-sparql: Make the turtle reader a deserializer
    
    This was an ad-hoc internal object; make it a bit less ad-hoc by
    integrating it into this new machinery. All users have been updated
    to use TrackerDeserializer now for iterating over turtle file
    triple contents.

 src/libtracker-sparql/core/meson.build             |   1 -
 src/libtracker-sparql/core/tracker-data-manager.c  |  88 ++-
 src/libtracker-sparql/core/tracker-data-update.c   |  57 +-
 src/libtracker-sparql/core/tracker-turtle-reader.c | 777 --------------------
 src/libtracker-sparql/core/tracker-turtle-reader.h |  49 --
 src/libtracker-sparql/meson.build                  |   1 +
 .../tracker-deserializer-turtle.c                  | 793 +++++++++++++++++++++
 .../tracker-deserializer-turtle.h                  |  38 +
 src/libtracker-sparql/tracker-deserializer.c       |   5 +-
 9 files changed, 926 insertions(+), 883 deletions(-)
---
diff --git a/src/libtracker-sparql/core/meson.build b/src/libtracker-sparql/core/meson.build
index 869cb9790..c409f2684 100644
--- a/src/libtracker-sparql/core/meson.build
+++ b/src/libtracker-sparql/core/meson.build
@@ -28,7 +28,6 @@ core_files = files(
     'tracker-sparql-parser.c',
     'tracker-sparql-types.c',
     'tracker-sparql.c',
-    'tracker-turtle-reader.c',
     'tracker-uuid.c',
     'tracker-vtab-service.c',
     'tracker-vtab-triples.c',
diff --git a/src/libtracker-sparql/core/tracker-data-manager.c b/src/libtracker-sparql/core/tracker-data-manager.c
index c9aca121e..2a29008b8 100644
--- a/src/libtracker-sparql/core/tracker-data-manager.c
+++ b/src/libtracker-sparql/core/tracker-data-manager.c
@@ -27,6 +27,8 @@
 #include <libtracker-common/tracker-debug.h>
 #include <libtracker-common/tracker-locale.h>
 
+#include <libtracker-sparql/tracker-deserializer-rdf.h>
+
 #include "tracker-class.h"
 #include "tracker-data-manager.h"
 #include "tracker-data-update.h"
@@ -39,7 +41,6 @@
 #include "tracker-property.h"
 #include "tracker-data-query.h"
 #include "tracker-sparql-parser.h"
-#include "tracker-turtle-reader.h"
 
 #define RDF_PROPERTY                    TRACKER_PREFIX_RDF "Property"
 #define RDF_TYPE                        TRACKER_PREFIX_RDF "type"
@@ -2055,16 +2056,16 @@ load_ontology_file (TrackerDataManager  *manager,
                     guint               *num_parsing_errors,
                     GError             **error)
 {
-       TrackerTurtleReader *reader;
+       TrackerSparqlCursor *deserializer;
        GError *ttl_error = NULL;
        gchar *ontology_uri = g_file_get_uri (file);
        const gchar *subject, *predicate, *object;
-       goffset object_line_no, object_column_no;
+       goffset object_line_no = 0, object_column_no = 0;
 
        if (num_parsing_errors)
                *num_parsing_errors = 0;
 
-       reader = tracker_turtle_reader_new_for_file (file, &ttl_error);
+       deserializer = tracker_deserializer_new_for_file (file, NULL, &ttl_error);
 
        if (ttl_error) {
                g_propagate_prefixed_error (error, ttl_error, "%s: ", ontology_uri);
@@ -2075,13 +2076,24 @@ load_ontology_file (TrackerDataManager  *manager,
        /* Post checks are only needed for ontology updates, not the initial
         * ontology */
 
-       while (tracker_turtle_reader_next (reader,
-                                          &subject, &predicate, &object,
-                                          NULL, NULL, &object_line_no,
-                                          &object_column_no, &ttl_error)) {
+       while (tracker_sparql_cursor_next (deserializer, NULL, &ttl_error)) {
                GError *ontology_error = NULL;
                gboolean loaded_successfully;
 
+               subject = tracker_sparql_cursor_get_string (deserializer,
+                                                           TRACKER_RDF_COL_SUBJECT,
+                                                           NULL);
+               predicate = tracker_sparql_cursor_get_string (deserializer,
+                                                             TRACKER_RDF_COL_PREDICATE,
+                                                             NULL);
+               object = tracker_sparql_cursor_get_string (deserializer,
+                                                          TRACKER_RDF_COL_OBJECT,
+                                                          NULL);
+
+               tracker_deserializer_get_parser_location (TRACKER_DESERIALIZER (deserializer),
+                                                         &object_line_no,
+                                                         &object_column_no);
+
                tracker_data_ontology_load_statement (manager, ontology_uri,
                                                      subject, predicate, object,
                                                      object_line_no, object_column_no, in_update,
@@ -2098,13 +2110,16 @@ load_ontology_file (TrackerDataManager  *manager,
        }
 
        if (ttl_error) {
+               tracker_deserializer_get_parser_location (TRACKER_DESERIALIZER (deserializer),
+                                                         &object_line_no,
+                                                         &object_column_no);
                g_propagate_prefixed_error (error, ttl_error,
                                            "%s:%" G_GOFFSET_FORMAT ":%" G_GOFFSET_FORMAT ": ",
                                            ontology_uri, object_line_no, object_column_no);
        }
 
        g_free (ontology_uri);
-       g_object_unref (reader);
+       g_object_unref (deserializer);
 }
 
 
@@ -2114,14 +2129,14 @@ get_ontology_from_file (TrackerDataManager *manager,
                         GError            **error)
 {
        const gchar *subject, *predicate, *object;
-       TrackerTurtleReader *reader;
+       TrackerSparqlCursor *deserializer;
        GError *internal_error = NULL;
        GHashTable *ontology_uris;
        TrackerOntology *ret = NULL;
-       goffset object_line_no, object_column_no;
+       goffset object_line_no = 0, object_column_no = 0;
        gchar *ontology_uri = g_file_get_uri (file);
 
-       reader = tracker_turtle_reader_new_for_file (file, &internal_error);
+       deserializer = tracker_deserializer_new_for_file (file, NULL, &internal_error);
 
        if (internal_error) {
                g_propagate_prefixed_error (error, internal_error, "%s: ", ontology_uri);
@@ -2133,10 +2148,21 @@ get_ontology_from_file (TrackerDataManager *manager,
                                               g_free,
                                               g_object_unref);
 
-       while (tracker_turtle_reader_next (reader,
-                                          &subject, &predicate, &object,
-                                          NULL, NULL, &object_line_no,
-                                          &object_column_no, &internal_error)) {
+       while (tracker_sparql_cursor_next (deserializer, NULL, &internal_error)) {
+               subject = tracker_sparql_cursor_get_string (deserializer,
+                                                           TRACKER_RDF_COL_SUBJECT,
+                                                           NULL);
+               predicate = tracker_sparql_cursor_get_string (deserializer,
+                                                             TRACKER_RDF_COL_PREDICATE,
+                                                             NULL);
+               object = tracker_sparql_cursor_get_string (deserializer,
+                                                          TRACKER_RDF_COL_OBJECT,
+                                                          NULL);
+
+               tracker_deserializer_get_parser_location (TRACKER_DESERIALIZER (deserializer),
+                                                         &object_line_no,
+                                                         &object_column_no);
+
                if (g_strcmp0 (predicate, RDF_TYPE) == 0) {
                        if (g_strcmp0 (object, TRACKER_PREFIX_NRL "Ontology") == 0) {
                                TrackerOntology *ontology;
@@ -2182,7 +2208,7 @@ get_ontology_from_file (TrackerDataManager *manager,
        }
 
        g_hash_table_unref (ontology_uris);
-       g_object_unref (reader);
+       g_object_unref (deserializer);
 
        if (internal_error) {
                g_propagate_prefixed_error (error, internal_error,
@@ -2333,22 +2359,32 @@ import_ontology_file (TrackerDataManager  *manager,
                       GError             **error)
 {
        const gchar *subject, *predicate, *object;
-       TrackerTurtleReader* reader;
-       goffset object_line_no, object_column_no;
+       TrackerSparqlCursor *deserializer;
+       goffset object_line_no = 0, object_column_no = 0;
        gchar *ontology_uri = g_file_get_uri (file);
 
-       reader = tracker_turtle_reader_new_for_file (file, error);
+       deserializer = tracker_deserializer_new_for_file (file, NULL, error);
 
-       if (!reader) {
+       if (!deserializer) {
                g_prefix_error (error, "%s:", ontology_uri);
                goto out;
        }
 
-       while (tracker_turtle_reader_next (reader,
-                                          &subject, &predicate, &object,
-                                          NULL, NULL, &object_line_no,
-                                          &object_column_no, error)) {
+       while (tracker_sparql_cursor_next (deserializer, NULL, error)) {
                GError *internal_error = NULL;
+               subject = tracker_sparql_cursor_get_string (deserializer,
+                                                           TRACKER_RDF_COL_SUBJECT,
+                                                           NULL);
+               predicate = tracker_sparql_cursor_get_string (deserializer,
+                                                             TRACKER_RDF_COL_PREDICATE,
+                                                             NULL);
+               object = tracker_sparql_cursor_get_string (deserializer,
+                                                          TRACKER_RDF_COL_OBJECT,
+                                                          NULL);
+
+               tracker_deserializer_get_parser_location (TRACKER_DESERIALIZER (deserializer),
+                                                         &object_line_no,
+                                                         &object_column_no);
 
                tracker_data_ontology_process_statement (manager,
                                                         subject, predicate, object,
@@ -2367,7 +2403,7 @@ import_ontology_file (TrackerDataManager  *manager,
                                ontology_uri, object_line_no, object_column_no);
        }
 
-       g_object_unref (reader);
+       g_object_unref (deserializer);
 
 out:
        g_free (ontology_uri);
diff --git a/src/libtracker-sparql/core/tracker-data-update.c b/src/libtracker-sparql/core/tracker-data-update.c
index 1934a22e3..b6405673b 100644
--- a/src/libtracker-sparql/core/tracker-data-update.c
+++ b/src/libtracker-sparql/core/tracker-data-update.c
@@ -27,6 +27,8 @@
 
 #include <libtracker-common/tracker-common.h>
 
+#include <libtracker-sparql/tracker-deserializer-rdf.h>
+
 #include "tracker-class.h"
 #include "tracker-data-manager.h"
 #include "tracker-data-update.h"
@@ -36,7 +38,6 @@
 #include "tracker-ontologies.h"
 #include "tracker-property.h"
 #include "tracker-sparql.h"
-#include "tracker-turtle-reader.h"
 #include "tracker-uuid.h"
 
 typedef struct _TrackerDataUpdateBuffer TrackerDataUpdateBuffer;
@@ -2856,33 +2857,38 @@ tracker_data_load_turtle_file (TrackerData  *data,
                                const gchar  *graph,
                                GError      **error)
 {
-       TrackerTurtleReader *reader = NULL;
+       TrackerSparqlCursor *deserializer;
        TrackerOntologies *ontologies;
        GError *inner_error = NULL;
-       const gchar *subject_str, *predicate_str, *object_str, *langtag;
-       gboolean object_is_uri;
-       goffset last_parsed_line_no, last_parsed_column_no;
+       const gchar *subject_str, *predicate_str, *object_str;
+       goffset last_parsed_line_no = 0, last_parsed_column_no = 0;
        gchar *ontology_uri;
 
-       reader = tracker_turtle_reader_new_for_file (file, error);
-       if (!reader)
+       deserializer = tracker_deserializer_new_for_file (file, NULL, error);
+       if (!deserializer)
                return;
 
        ontologies = tracker_data_manager_get_ontologies (data->manager);
 
-       while (tracker_turtle_reader_next (reader,
-                                          &subject_str,
-                                          &predicate_str,
-                                          &object_str,
-                                          &langtag,
-                                          &object_is_uri,
-                                          &last_parsed_line_no,
-                                          &last_parsed_column_no,
-                                          &inner_error)) {
+       while (tracker_sparql_cursor_next (deserializer, NULL, &inner_error)) {
                TrackerProperty *predicate;
                GValue object = G_VALUE_INIT;
                TrackerRowid subject;
 
+               subject_str = tracker_sparql_cursor_get_string (deserializer,
+                                                               TRACKER_RDF_COL_SUBJECT,
+                                                               NULL);
+               predicate_str = tracker_sparql_cursor_get_string (deserializer,
+                                                                 TRACKER_RDF_COL_PREDICATE,
+                                                                 NULL);
+               object_str = tracker_sparql_cursor_get_string (deserializer,
+                                                              TRACKER_RDF_COL_OBJECT,
+                                                              NULL);
+
+               tracker_deserializer_get_parser_location (TRACKER_DESERIALIZER (deserializer),
+                                                         &last_parsed_line_no,
+                                                         &last_parsed_column_no);
+
                predicate = tracker_ontologies_get_property_by_uri (ontologies, predicate_str);
                if (predicate == NULL) {
                        g_set_error (&inner_error, TRACKER_SPARQL_ERROR,
@@ -2901,7 +2907,7 @@ tracker_data_load_turtle_file (TrackerData  *data,
 
                if (!tracker_data_query_string_to_value (data->manager,
                                                         object_str,
-                                                        langtag,
+                                                        NULL, /* FIXME: Missing langtag */
                                                         tracker_property_get_data_type (predicate),
                                                         &object,
                                                         &inner_error))
@@ -2913,16 +2919,9 @@ tracker_data_load_turtle_file (TrackerData  *data,
                if (inner_error)
                        goto failed;
 
-               if (object_is_uri) {
-                       tracker_data_insert_statement_with_uri (data, graph,
-                                                               subject, predicate, &object,
-                                                               &inner_error);
-               } else {
-                       tracker_data_insert_statement_with_string (data, graph,
-                                                                  subject, predicate, &object,
-                                                                  &inner_error);
-               }
-
+               tracker_data_insert_statement (data, graph,
+                                              subject, predicate, &object,
+                                              &inner_error);
                g_value_unset (&object);
 
                if (inner_error)
@@ -2934,12 +2933,12 @@ tracker_data_load_turtle_file (TrackerData  *data,
                        goto failed;
        }
 
-       g_clear_object (&reader);
+       g_clear_object (&deserializer);
 
        return;
 
 failed:
-       g_clear_object (&reader);
+       g_clear_object (&deserializer);
 
        ontology_uri = g_file_get_uri (file);
        g_propagate_prefixed_error (error, inner_error,
diff --git a/src/libtracker-sparql/meson.build b/src/libtracker-sparql/meson.build
index 9644635db..9c708becb 100644
--- a/src/libtracker-sparql/meson.build
+++ b/src/libtracker-sparql/meson.build
@@ -27,6 +27,7 @@ libtracker_sparql_c_sources = files(
     'tracker-cursor.c',
     'tracker-deserializer.c',
     'tracker-deserializer-rdf.c',
+    'tracker-deserializer-turtle.c',
     'tracker-endpoint.c',
     'tracker-endpoint-dbus.c',
     'tracker-endpoint-http.c',
diff --git a/src/libtracker-sparql/tracker-deserializer-turtle.c b/src/libtracker-sparql/tracker-deserializer-turtle.c
new file mode 100644
index 000000000..1c3874622
--- /dev/null
+++ b/src/libtracker-sparql/tracker-deserializer-turtle.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (C) 2020, Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+ * Author: Carlos Garnacho <carlosg gnome org>
+ */
+
+/* Deserialization to cursors for the turtle format defined at:
+ *  https://www.w3.org/TR/turtle/
+ */
+
+#include "config.h"
+
+#include "tracker-deserializer-turtle.h"
+
+#include <libtracker-sparql/core/tracker-sparql-grammar.h>
+#include <libtracker-sparql/core/tracker-uuid.h>
+#include <libtracker-sparql/tracker-private.h>
+
+#include <strings.h>
+
+#define BUF_SIZE 1024
+/* IRI of rdf:type. Deliberately defined without a trailing semicolon, so
+ * the macro expands to a plain string literal that can be used inside
+ * expressions and function-call arguments.
+ */
+#define RDF_TYPE "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+
+/* States of the turtle parser. The current one is kept in
+ * TrackerDeserializerTurtle.state, and push_stack() saves it into a
+ * StateStack entry.
+ */
+typedef enum
+{
+       STATE_INITIAL, /* Starting state; the iteration loop switches it to STATE_SUBJECT */
+       STATE_SUBJECT, /* @prefix/@base directives are checked for while in this state */
+       STATE_PREDICATE,
+       STATE_OBJECT, /* When pop_stack() restores this state, the nested subject becomes the object */
+       STATE_STEP,
+} ParserState;
+
+/* Snapshot of the parser position, as saved by push_stack() and restored
+ * by pop_stack(). The strings are copies made with g_strdup(); pop_stack()
+ * hands them back to the deserializer.
+ */
+typedef struct {
+       gchar *subject;
+       gchar *predicate;
+       ParserState state;
+} StateStack;
+
+/* Instance data of the turtle deserializer */
+struct _TrackerDeserializerTurtle {
+       GObject parent_instance;
+       /* Buffered wrapper around the deserializer stream, created in constructed() */
+       GBufferedInputStream *buffered_stream;
+       /* Maps blank node labels to the urn:uuid strings generated for them */
+       GHashTable *blank_nodes;
+       /* Stack of StateStack entries, see push_stack()/pop_stack() */
+       GArray *parser_state;
+       /* Set by handle_base(); prepended to suffixes by expand_base() */
+       gchar *base;
+       /* Strings owned by the deserializer -- presumably the triple being parsed */
+       gchar *subject;
+       gchar *predicate;
+       gchar *object;
+       /* Owned string, presumably the language tag of the object; cleared by pop_stack() */
+       gchar *object_lang;
+       gboolean object_is_uri;
+       ParserState state;
+       /* Position in the input, both start at 1 and are advanced by seek_input() */
+       goffset line_no;
+       goffset column_no;
+};
+
+G_DEFINE_TYPE (TrackerDeserializerTurtle,
+               tracker_deserializer_turtle,
+               TRACKER_TYPE_DESERIALIZER_RDF)
+
+/* Forward declaration: handle_prefix() and handle_base() call this
+ * function before its definition appears in the file.
+ */
+static void advance_whitespace_and_comments (TrackerDeserializerTurtle *deserializer);
+
+/* GObject::finalize implementation: releases the buffered stream, the
+ * lookup tables and every string owned by the deserializer, then chains
+ * up to the parent class.
+ */
+static void
+tracker_deserializer_turtle_finalize (GObject *object)
+{
+       TrackerDeserializerTurtle *deserializer = TRACKER_DESERIALIZER_TURTLE (object);
+
+       g_clear_object (&deserializer->buffered_stream);
+       g_clear_pointer (&deserializer->blank_nodes, g_hash_table_unref);
+       g_clear_pointer (&deserializer->parser_state, g_array_unref);
+       g_clear_pointer (&deserializer->subject, g_free);
+       g_clear_pointer (&deserializer->predicate, g_free);
+       g_clear_pointer (&deserializer->object, g_free);
+       /* The language tag is an owned string too (pop_stack() frees it
+        * with g_free), so it must not outlive the object.
+        */
+       g_clear_pointer (&deserializer->object_lang, g_free);
+       g_clear_pointer (&deserializer->base, g_free);
+
+       G_OBJECT_CLASS (tracker_deserializer_turtle_parent_class)->finalize (object);
+}
+
+/* GObject::constructed implementation: chains up, wraps the stream of the
+ * deserializer in a GBufferedInputStream and sets the parser position to
+ * line 1, column 1.
+ */
+static void
+tracker_deserializer_turtle_constructed (GObject *object)
+{
+       TrackerDeserializerTurtle *self = TRACKER_DESERIALIZER_TURTLE (object);
+       TrackerDeserializer *parent = TRACKER_DESERIALIZER (object);
+       GInputStream *buffered;
+
+       G_OBJECT_CLASS (tracker_deserializer_turtle_parent_class)->constructed (object);
+
+       buffered = g_buffered_input_stream_new (tracker_deserializer_get_stream (parent));
+       self->buffered_stream = G_BUFFERED_INPUT_STREAM (buffered);
+
+       self->line_no = 1;
+       self->column_no = 1;
+}
+
+/* Appends a snapshot of the current subject, predicate and parser state to
+ * the parser_state stack. The strings are duplicated, the deserializer
+ * keeps its own copies.
+ */
+static void
+push_stack (TrackerDeserializerTurtle *deserializer)
+{
+       StateStack snapshot = {
+               .subject = g_strdup (deserializer->subject),
+               .predicate = g_strdup (deserializer->predicate),
+               .state = deserializer->state,
+       };
+
+       g_array_append_val (deserializer->parser_state, snapshot);
+}
+
+/* Restores the most recently pushed StateStack entry and removes it from
+ * the stack.
+ *
+ * The subject/predicate/object held before the call are detached first.
+ * Depending on the restored state, the detached subject is reused:
+ *  - STATE_OBJECT: it becomes the current object (flagged as a URI, with
+ *    any language tag dropped).
+ *  - STATE_SUBJECT: it replaces the subject restored from the stack.
+ * Whatever is not reused is freed.
+ *
+ * NOTE(review): the stack is indexed at len - 1 without checking for an
+ * empty array; callers presumably guarantee a matching push_stack().
+ */
+static void
+pop_stack (TrackerDeserializerTurtle *deserializer)
+{
+       StateStack *state;
+       gchar *s, *p, *o;
+
+       /* Detach the current strings, their fate is decided further below */
+       s = deserializer->subject;
+       p = deserializer->predicate;
+       o = deserializer->object;
+       deserializer->subject = deserializer->predicate = deserializer->object = NULL;
+
+       /* Take over the strings stored in the topmost stack entry */
+       state = &g_array_index (deserializer->parser_state, StateStack, deserializer->parser_state->len - 1);
+       deserializer->subject = state->subject;
+       deserializer->predicate = state->predicate;
+       deserializer->state = state->state;
+
+       if (deserializer->state == STATE_OBJECT) {
+               /* Restore the old subject as current object */
+               deserializer->object = s;
+               deserializer->object_is_uri = TRUE;
+               g_clear_pointer (&deserializer->object_lang, g_free);
+               s = NULL;
+       } else if (deserializer->state == STATE_SUBJECT) {
+               /* Drop the subject restored from the stack, the old one stays current */
+               g_clear_pointer (&deserializer->subject, g_free);
+               deserializer->subject = s;
+               s = NULL;
+       }
+
+       /* s is NULL here if it was reused above */
+       g_free (s);
+       g_free (p);
+       g_free (o);
+       g_array_remove_index (deserializer->parser_state, deserializer->parser_state->len - 1);
+}
+
+/* Scans @count bytes from @start and reports how the parser position
+ * changes when they are consumed.
+ *
+ * @num_lines receives the number of '\n' characters found. @num_columns
+ * receives the number of bytes seen if there was no newline; otherwise it
+ * is 1 plus the number of bytes following the last newline.
+ */
+static void
+calculate_num_lines_and_columns (const gchar     *start,
+                                 gsize            count,
+                                 goffset         *num_lines,
+                                 goffset         *num_columns)
+{
+       const gchar *cursor = start;
+       goffset lines = 0, columns = 0;
+       gsize remaining;
+
+       for (remaining = count; remaining > 0; remaining--, cursor++) {
+               if (*cursor != '\n') {
+                       columns++;
+                       continue;
+               }
+
+               /* A newline restarts the column count */
+               lines++;
+               columns = 1;
+       }
+
+       *num_lines = lines;
+       *num_columns = columns;
+}
+
+/* Consumes up to @count bytes of the buffered input, updating line_no and
+ * column_no accordingly. @count is clamped to the amount of data that is
+ * currently buffered.
+ *
+ * Returns 0 if there was nothing to consume, otherwise the result of
+ * g_input_stream_skip().
+ */
+static gsize
+seek_input (TrackerDeserializerTurtle *deserializer,
+            gsize                      count)
+{
+       GBufferedInputStream *stream = deserializer->buffered_stream;
+       goffset lines, columns;
+       const gchar *data;
+       gsize available;
+
+       data = g_buffered_input_stream_peek_buffer (stream, &available);
+       if (count > available)
+               count = available;
+       if (count == 0)
+               return 0;
+
+       calculate_num_lines_and_columns (data, count, &lines, &columns);
+
+       /* After a newline the column is absolute, otherwise it is relative
+        * to the previous position.
+        */
+       deserializer->line_no += lines;
+       deserializer->column_no = (lines > 0) ?
+               columns : deserializer->column_no + columns;
+
+       return g_input_stream_skip (G_INPUT_STREAM (stream), count, NULL, NULL);
+}
+
+/* Consumes @token if the buffered input starts with it, compared
+ * case-insensitively.
+ *
+ * Returns TRUE if the token was found and consumed, FALSE otherwise; the
+ * input is left untouched when the token does not match.
+ */
+static gboolean
+parse_token (TrackerDeserializerTurtle *deserializer,
+             const gchar               *token)
+{
+       gsize len = strlen (token);
+       const gchar *buffer;
+       gsize size;
+
+       buffer = g_buffered_input_stream_peek_buffer (deserializer->buffered_stream,
+                                                     &size);
+       /* The peeked data is only valid for size bytes, never compare
+        * past the end of it.
+        */
+       if (size == 0 || size < len)
+               return FALSE;
+       if (strncasecmp (buffer, token, len) != 0)
+               return FALSE;
+       if (!seek_input (deserializer, len))
+               return FALSE;
+
+       return TRUE;
+}
+
+/* Matches the buffered input against @terminal_func and consumes the
+ * matched text.
+ *
+ * @padding is the number of bytes stripped from both ends of the match
+ * before it is returned. If @out is non-NULL it receives a newly allocated
+ * copy of the stripped match, to be freed by the caller.
+ *
+ * Returns TRUE if the terminal matched and was consumed.
+ */
+static gboolean
+parse_terminal (TrackerDeserializerTurtle  *deserializer,
+                TrackerTerminalFunc         terminal_func,
+                guint                       padding,
+                gchar                     **out)
+{
+       const gchar *data, *match_end;
+       gsize available;
+       gssize matched;
+       gchar *text;
+
+       data = g_buffered_input_stream_peek_buffer (deserializer->buffered_stream,
+                                                   &available);
+       if (available == 0)
+               return FALSE;
+
+       if (!terminal_func (data, data + available, &match_end))
+               return FALSE;
+
+       matched = match_end - data;
+       if (matched < 2 * padding)
+               return FALSE;
+
+       /* Copy the text before seeking, the peeked data is consumed by it */
+       text = g_strndup (data + padding, matched - (2 * padding));
+
+       if (!seek_input (deserializer, matched)) {
+               g_free (text);
+               return FALSE;
+       }
+
+       if (out == NULL)
+               g_free (text);
+       else
+               *out = text;
+
+       return TRUE;
+}
+
+/* Returns a newly allocated urn:uuid string for a blank node.
+ *
+ * Without a @label a fresh identifier is generated on every call. With a
+ * @label, the identifier is generated once and remembered in the
+ * blank_nodes table, so the same label always yields the same string.
+ */
+static gchar *
+generate_bnode (TrackerDeserializerTurtle *deserializer,
+                const gchar               *label)
+{
+       gchar *known;
+
+       if (label == NULL)
+               return tracker_generate_uuid ("urn:uuid");
+
+       known = g_hash_table_lookup (deserializer->blank_nodes, label);
+       if (known != NULL)
+               return g_strdup (known);
+
+       /* First time this label is seen, the table takes ownership */
+       known = tracker_generate_uuid ("urn:uuid");
+       g_hash_table_insert (deserializer->blank_nodes, g_strdup (label), known);
+
+       return g_strdup (known);
+}
+
+/* Expands a prefixed name through the namespace manager of the
+ * deserializer.
+ *
+ * Returns a newly allocated string, or NULL with @error set if the
+ * expansion left @shortname unchanged, which is treated as an unknown
+ * prefix.
+ */
+static gchar *
+expand_prefix (TrackerDeserializerTurtle  *deserializer,
+               const gchar                *shortname,
+               GError                    **error)
+{
+       TrackerNamespaceManager *manager;
+       gchar *result;
+
+       manager = tracker_deserializer_get_namespaces (TRACKER_DESERIALIZER (deserializer));
+       result = tracker_namespace_manager_expand_uri (manager, shortname);
+
+       if (g_strcmp0 (result, shortname) != 0)
+               return result;
+
+       g_free (result);
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Unknown prefix %s at line %" G_GOFFSET_FORMAT ", column %" G_GOFFSET_FORMAT,
+                    shortname, deserializer->line_no, deserializer->column_no - strlen(shortname));
+
+       return NULL;
+}
+
+/* Prepends the current base, if any, to @suffix.
+ *
+ * Takes ownership of @suffix: it is either returned as is (no base set),
+ * or freed and replaced by a newly allocated concatenation.
+ */
+static gchar *
+expand_base (TrackerDeserializerTurtle *deserializer,
+             gchar                     *suffix)
+{
+       gchar *joined;
+
+       if (!deserializer->base)
+               return suffix;
+
+       joined = g_strdup_printf ("%s%s", deserializer->base, suffix);
+       g_free (suffix);
+
+       return joined;
+}
+
+/* Consumes whitespace one byte at a time, stopping at the first
+ * non-whitespace byte or when no buffered data is left.
+ */
+static void
+advance_whitespace (TrackerDeserializerTurtle *deserializer)
+{
+       for (;;) {
+               const gchar *peeked;
+               gsize available;
+               /* Must keep this name: the WS macro takes no arguments and
+                * apparently tests the local variable called ch.
+                */
+               gchar ch;
+
+               peeked = g_buffered_input_stream_peek_buffer (deserializer->buffered_stream, &available);
+               if (available == 0)
+                       return;
+
+               ch = peeked[0];
+               if (!(WS) || !seek_input (deserializer, 1))
+                       return;
+       }
+}
+
+/* Registers @prefix as expanding to @uri in the namespace manager of the
+ * deserializer.
+ *
+ * A prefix that is already known is accepted only if it expands to the
+ * very same @uri; otherwise @error is set and FALSE is returned.
+ */
+static gboolean
+maybe_add_prefix (TrackerDeserializerTurtle  *deserializer,
+                  const gchar                *prefix,
+                  const gchar                *uri,
+                  GError                    **error)
+{
+       TrackerNamespaceManager *manager;
+       const gchar *current;
+
+       manager = tracker_deserializer_get_namespaces (TRACKER_DESERIALIZER (deserializer));
+       current = tracker_namespace_manager_lookup_prefix (manager, prefix);
+
+       if (!current) {
+               tracker_namespace_manager_add_prefix (manager, prefix, uri);
+               return TRUE;
+       }
+
+       /* Redefining a prefix with the same expansion is harmless */
+       if (g_strcmp0 (current, uri) == 0)
+               return TRUE;
+
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Prefix '%s' already expands to '%s'",
+                    prefix, current);
+       return FALSE;
+}
+
+/* Parses the remainder of a @prefix directive (the "@prefix" token itself
+ * has been consumed by the caller): a namespace prefix, an IRI reference
+ * and the closing '.', then registers the prefix.
+ *
+ * Returns FALSE with @error set on syntax errors or if the prefix cannot
+ * be registered.
+ */
+static gboolean
+handle_prefix (TrackerDeserializerTurtle  *deserializer,
+               GError                    **error)
+{
+       gchar *ns = NULL, *iri = NULL;
+       gboolean ok;
+
+       advance_whitespace_and_comments (deserializer);
+       ok = parse_terminal (deserializer, terminal_PNAME_NS, 0, &ns);
+
+       if (ok) {
+               advance_whitespace_and_comments (deserializer);
+               ok = parse_terminal (deserializer, terminal_IRIREF, 1, &iri);
+       }
+
+       if (ok) {
+               advance_whitespace_and_comments (deserializer);
+               ok = parse_token (deserializer, ".");
+       }
+
+       if (ok) {
+               /* Remove the trailing ':' in prefix */
+               ns[strlen(ns) - 1] = '\0';
+               ok = maybe_add_prefix (deserializer, ns, iri, error);
+       } else {
+               g_set_error (error,
+                            TRACKER_SPARQL_ERROR,
+                            TRACKER_SPARQL_ERROR_PARSE,
+                            "Could not parse @prefix");
+       }
+
+       g_free (ns);
+       g_free (iri);
+
+       return ok;
+}
+
+/* Parses the remainder of a @base directive (the "@base" token itself has
+ * been consumed by the caller): an IRI reference and the closing '.'.
+ *
+ * On success the previous base is replaced and TRUE is returned. On
+ * syntax errors @error is set, the previous base is kept and FALSE is
+ * returned.
+ */
+static gboolean
+handle_base (TrackerDeserializerTurtle  *deserializer,
+             GError                    **error)
+{
+       gchar *base = NULL;
+
+       advance_whitespace_and_comments (deserializer);
+       /* Strip the enclosing '<' and '>' (padding of 1, as handle_prefix()
+        * does), expand_base() prepends the stored string verbatim.
+        */
+       if (!parse_terminal (deserializer, terminal_IRIREF, 1, &base))
+               goto error;
+
+       advance_whitespace_and_comments (deserializer);
+       if (!parse_token (deserializer, "."))
+               goto error;
+
+       /* The deserializer takes ownership of the parsed string */
+       g_clear_pointer (&deserializer->base, g_free);
+       deserializer->base = base;
+       return TRUE;
+error:
+       g_free (base);
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Could not parse @base");
+       return FALSE;
+}
+
+/* Consumes an optional "^^" datatype annotation at the current position.
+ *
+ * The datatype itself is parsed and discarded. Returns TRUE if there was
+ * no annotation or it was consumed, FALSE with @error set if "^^" is not
+ * followed by an IRI reference or a prefixed name.
+ */
+static gboolean
+handle_type_cast (TrackerDeserializerTurtle  *deserializer,
+                  GError                    **error)
+{
+       if (!parse_token (deserializer, "^^"))
+               return TRUE;
+
+       /* The parsed datatype is thrown away (NULL output argument) */
+       if (parse_terminal (deserializer, terminal_IRIREF, 1, NULL) ||
+           parse_terminal (deserializer, terminal_PNAME_LN, 0, NULL) ||
+           parse_terminal (deserializer, terminal_PNAME_NS, 0, NULL))
+               return TRUE;
+
+       g_set_error (error,
+                    TRACKER_SPARQL_ERROR,
+                    TRACKER_SPARQL_ERROR_PARSE,
+                    "Error parsing type cast");
+       return FALSE;
+}
+
+/* Skips any run of whitespace and '#' comments at the current position.
+ *
+ * A comment is consumed up to and including its terminating newline. If
+ * the buffered data holds no newline for the comment, the comment is left
+ * in place.
+ */
+static void
+advance_whitespace_and_comments (TrackerDeserializerTurtle *deserializer)
+{
+       const gchar *buffer, *str;
+       gsize size;
+
+       while (TRUE) {
+               advance_whitespace (deserializer);
+               buffer = g_buffered_input_stream_peek_buffer (deserializer->buffered_stream,
+                                                             &size);
+               if (size == 0)
+                       break;
+               if (buffer[0] != '#')
+                       break;
+
+               /* The peeked buffer is not guaranteed to be NUL-terminated,
+                * so the search is bounded by the number of valid bytes.
+                */
+               str = memchr (buffer, '\n', size);
+               if (!str)
+                       break;
+
+               if (!seek_input (deserializer, str + 1 - buffer))
+                       break;
+       }
+}
+
+/* Runs the parser state machine until the next complete triple has been
+ * read into deserializer->subject, ->predicate and ->object.
+ *
+ * Returns TRUE when a triple is available. Returns FALSE either at end of
+ * input (no bytes available while a subject is expected; @error is left
+ * untouched) or on a read/parse failure (@error is set).
+ *
+ * @error may be NULL, as usual for GError out-parameters; failures from
+ * prefix expansion are detected through a local GError rather than by
+ * dereferencing @error.
+ */
+static gboolean
+tracker_deserializer_turtle_iterate_next (TrackerDeserializerTurtle  *deserializer,
+                                          GError                    **error)
+{
+       while (TRUE) {
+               GError *inner_error = NULL;
+               gchar *str = NULL, *lang = NULL;
+
+               advance_whitespace_and_comments (deserializer);
+
+               if (g_buffered_input_stream_fill (deserializer->buffered_stream, -1, NULL, error) < 0)
+                       return FALSE;
+
+               switch (deserializer->state) {
+               case STATE_INITIAL:
+                       deserializer->state = STATE_SUBJECT;
+                       break;
+               case STATE_SUBJECT:
+                       /* Running out of data between statements is a regular end of input */
+                       if (g_buffered_input_stream_get_available (deserializer->buffered_stream) == 0)
+                               return FALSE;
+
+                       /* Directives are handled in place, the state stays at STATE_SUBJECT */
+                       if (parse_token (deserializer, "@prefix")) {
+                               if (!handle_prefix (deserializer, error))
+                                       return FALSE;
+                               break;
+                       } else if (parse_token (deserializer, "@base")) {
+                               if (!handle_base (deserializer, error))
+                                       return FALSE;
+                               break;
+                       }
+
+                       g_clear_pointer (&deserializer->subject, g_free);
+
+                       if (parse_token (deserializer, "[")) {
+                               /* Anonymous blank node */
+                               push_stack (deserializer);
+                               deserializer->subject = generate_bnode (deserializer, NULL);
+                               deserializer->state = STATE_PREDICATE;
+                               continue;
+                       }
+
+                       if (parse_terminal (deserializer, terminal_IRIREF, 1, &str)) {
+                               deserializer->subject = expand_base (deserializer, str);
+                       } else if (parse_terminal (deserializer, terminal_PNAME_LN, 0, &str) ||
+                                  parse_terminal (deserializer, terminal_PNAME_NS, 0, &str)) {
+                               deserializer->subject = expand_prefix (deserializer, str, &inner_error);
+                               g_free (str);
+
+                               if (inner_error) {
+                                       g_propagate_error (error, inner_error);
+                                       return FALSE;
+                               }
+                       } else if (parse_terminal (deserializer, terminal_BLANK_NODE_LABEL, 0, &str)) {
+                               deserializer->subject = generate_bnode (deserializer, str);
+                               g_free (str);
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Wrong subject token");
+                               return FALSE;
+                       }
+
+                       deserializer->state = STATE_PREDICATE;
+                       break;
+               case STATE_PREDICATE:
+                       g_clear_pointer (&deserializer->predicate, g_free);
+
+                       if (parse_token (deserializer, "a")) {
+                               /* "a" is the Turtle shorthand for rdf:type */
+                               deserializer->predicate = g_strdup (RDF_TYPE);
+                       } else if (parse_terminal (deserializer, terminal_IRIREF, 1, &str)) {
+                               deserializer->predicate = expand_base (deserializer, str);
+                       } else if (parse_terminal (deserializer, terminal_PNAME_LN, 0, &str) ||
+                                  parse_terminal (deserializer, terminal_PNAME_NS, 0, &str)) {
+                               deserializer->predicate = expand_prefix (deserializer, str, &inner_error);
+                               g_free (str);
+
+                               if (inner_error) {
+                                       g_propagate_error (error, inner_error);
+                                       return FALSE;
+                               }
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Wrong predicate token");
+                               return FALSE;
+                       }
+
+                       deserializer->state = STATE_OBJECT;
+                       break;
+               case STATE_OBJECT:
+                       g_clear_pointer (&deserializer->object, g_free);
+                       g_clear_pointer (&deserializer->object_lang, g_free);
+                       deserializer->object_is_uri = FALSE;
+
+                       if (parse_token (deserializer, "[")) {
+                               /* Anonymous blank node */
+                               push_stack (deserializer);
+                               deserializer->subject = generate_bnode (deserializer, NULL);
+                               deserializer->state = STATE_PREDICATE;
+                               continue;
+                       }
+
+                       if (parse_terminal (deserializer, terminal_IRIREF, 1, &str)) {
+                               deserializer->object = expand_base (deserializer, str);
+                               deserializer->object_is_uri = TRUE;
+                       } else if (parse_terminal (deserializer, terminal_PNAME_LN, 0, &str) ||
+                                  parse_terminal (deserializer, terminal_PNAME_NS, 0, &str)) {
+                               deserializer->object = expand_prefix (deserializer, str, &inner_error);
+                               deserializer->object_is_uri = TRUE;
+                               g_free (str);
+
+                               if (inner_error) {
+                                       g_propagate_error (error, inner_error);
+                                       return FALSE;
+                               }
+                       } else if (parse_terminal (deserializer, terminal_BLANK_NODE_LABEL, 0, &str)) {
+                               deserializer->object = generate_bnode (deserializer, str);
+                               deserializer->object_is_uri = TRUE;
+                               g_free (str);
+                       } else if (parse_terminal (deserializer, terminal_STRING_LITERAL_LONG1, 3, &str) ||
+                                  parse_terminal (deserializer, terminal_STRING_LITERAL_LONG2, 3, &str)) {
+                               deserializer->object = g_strcompress (str);
+                               g_free (str);
+                               /* A literal is followed by either a language tag or an optional type cast */
+                               if (parse_terminal (deserializer, terminal_LANGTAG, 0, &lang)) {
+                                       deserializer->object_lang = lang;
+                               } else if (!handle_type_cast (deserializer, error)) {
+                                       return FALSE;
+                               }
+                       } else if (parse_terminal (deserializer, terminal_STRING_LITERAL1, 1, &str) ||
+                                  parse_terminal (deserializer, terminal_STRING_LITERAL2, 1, &str)) {
+                               deserializer->object = g_strcompress (str);
+                               g_free (str);
+                               if (parse_terminal (deserializer, terminal_LANGTAG, 0, &lang)) {
+                                       deserializer->object_lang = lang;
+                               } else if (!handle_type_cast (deserializer, error)) {
+                                       return FALSE;
+                               }
+                       } else if (parse_terminal (deserializer, terminal_DOUBLE, 0, &str) ||
+                                  parse_terminal (deserializer, terminal_INTEGER, 0, &str)) {
+                               /* Numeric literals are stored as their textual form */
+                               deserializer->object = str;
+                       } else if (parse_token (deserializer, "true")) {
+                               deserializer->object = g_strdup ("true");
+                       } else if (parse_token (deserializer, "false")) {
+                               deserializer->object = g_strdup ("false");
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Wrong object token");
+                               return FALSE;
+                       }
+
+                       deserializer->state = STATE_STEP;
+
+                       /* This is where next() stops, on lack of errors */
+                       return TRUE;
+               case STATE_STEP:
+                       /* A "]" only closes a blank node if one is open on the stack.
+                        * NOTE(review): pop_stack() presumably restores the state saved
+                        * by the matching push_stack(); the checks below rely on that.
+                        */
+                       if (deserializer->parser_state->len > 0 && parse_token (deserializer, "]")) {
+                               pop_stack (deserializer);
+                               if (deserializer->state == STATE_SUBJECT) {
+                                       deserializer->state = STATE_PREDICATE;
+                                       continue;
+                               } else if (deserializer->state == STATE_OBJECT) {
+                                       deserializer->state = STATE_STEP;
+                                       return TRUE;
+                               }
+                       }
+
+                       if (parse_token (deserializer, ",")) {
+                               deserializer->state = STATE_OBJECT;
+                       } else if (parse_token (deserializer, ";")) {
+                               /* Dot is allowed after semicolon */
+                               advance_whitespace_and_comments (deserializer);
+                               if (parse_token (deserializer, "."))
+                                       deserializer->state = STATE_SUBJECT;
+                               else
+                                       deserializer->state = STATE_PREDICATE;
+                       } else if (parse_token (deserializer, ".")) {
+                               deserializer->state = STATE_SUBJECT;
+                       } else {
+                               g_set_error (error,
+                                            TRACKER_SPARQL_ERROR,
+                                            TRACKER_SPARQL_ERROR_PARSE,
+                                            "Expected comma, semicolon, or dot");
+                               return FALSE;
+                       }
+
+                       break;
+               }
+       }
+}
+
+/* TrackerSparqlCursor::get_value_type implementation.
+ *
+ * Reports the type of the given column of the current triple: subjects
+ * are blank nodes when they start with "_:" and URIs otherwise,
+ * predicates are always URIs, objects are URIs or strings depending on
+ * how they were parsed. Any other column is unbound.
+ */
+TrackerSparqlValueType
+tracker_deserializer_turtle_get_value_type (TrackerSparqlCursor *cursor,
+                                            gint                 column)
+{
+       TrackerDeserializerTurtle *self = TRACKER_DESERIALIZER_TURTLE (cursor);
+
+       if (column == TRACKER_RDF_COL_SUBJECT) {
+               gboolean is_bnode = g_str_has_prefix (self->subject, "_:");
+
+               return is_bnode ?
+                       TRACKER_SPARQL_VALUE_TYPE_BLANK_NODE :
+                       TRACKER_SPARQL_VALUE_TYPE_URI;
+       }
+
+       if (column == TRACKER_RDF_COL_PREDICATE)
+               return TRACKER_SPARQL_VALUE_TYPE_URI;
+
+       if (column == TRACKER_RDF_COL_OBJECT) {
+               return self->object_is_uri ?
+                       TRACKER_SPARQL_VALUE_TYPE_URI :
+                       TRACKER_SPARQL_VALUE_TYPE_STRING;
+       }
+
+       return TRACKER_SPARQL_VALUE_TYPE_UNBOUND;
+}
+
+/* TrackerSparqlCursor::get_string implementation.
+ *
+ * Returns the subject, predicate or object of the current triple for the
+ * matching column, or NULL for any other column. The returned string is
+ * owned by the deserializer.
+ *
+ * If @length is not NULL it receives the length in bytes of the returned
+ * string, or 0 when NULL is returned.
+ */
+const gchar *
+tracker_deserializer_turtle_get_string (TrackerSparqlCursor *cursor,
+                                        gint                 column,
+                                        glong               *length)
+{
+       TrackerDeserializerTurtle *deserializer = TRACKER_DESERIALIZER_TURTLE (cursor);
+       const gchar *value;
+
+       switch (column) {
+       case TRACKER_RDF_COL_SUBJECT:
+               value = deserializer->subject;
+               break;
+       case TRACKER_RDF_COL_PREDICATE:
+               value = deserializer->predicate;
+               break;
+       case TRACKER_RDF_COL_OBJECT:
+               value = deserializer->object;
+               break;
+       default:
+               value = NULL;
+               break;
+       }
+
+       /* Never leave the out-parameter uninitialised for callers that ask for it */
+       if (length)
+               *length = value ? (glong) strlen (value) : 0;
+
+       return value;
+}
+
+/* TrackerSparqlCursor::next implementation.
+ *
+ * Advances to the next triple in the Turtle input. Returns TRUE if a
+ * triple is available, FALSE at end of input or on failure (in which
+ * case @error is set).
+ *
+ * If @cancellable has already been cancelled, no parsing is attempted:
+ * @error is set to G_IO_ERROR_CANCELLED and FALSE is returned.
+ */
+gboolean
+tracker_deserializer_turtle_next (TrackerSparqlCursor  *cursor,
+                                  GCancellable         *cancellable,
+                                  GError              **error)
+{
+       TrackerDeserializerTurtle *deserializer = TRACKER_DESERIALIZER_TURTLE (cursor);
+
+       /* Accepts a NULL cancellable, in which case it is a no-op */
+       if (g_cancellable_set_error_if_cancelled (cancellable, error))
+               return FALSE;
+
+       return tracker_deserializer_turtle_iterate_next (deserializer, error);
+}
+
+/* TrackerSparqlCursor::rewind implementation.
+ *
+ * Seeks the buffered stream back to its start and resets the parser
+ * state and the line/column counters. The result of the seek is not
+ * checked.
+ */
+void
+tracker_deserializer_turtle_rewind (TrackerSparqlCursor *cursor)
+{
+       TrackerDeserializerTurtle *self = TRACKER_DESERIALIZER_TURTLE (cursor);
+       GSeekable *seekable = G_SEEKABLE (self->buffered_stream);
+
+       /* Location bookkeeping restarts from scratch */
+       self->column_no = 0;
+       self->line_no = 0;
+       self->state = STATE_INITIAL;
+
+       g_seekable_seek (seekable, 0, G_SEEK_SET, NULL, NULL);
+}
+
+/* TrackerSparqlCursor::close implementation.
+ *
+ * Closes the buffered input stream (ignoring any error from doing so)
+ * and then chains up to the parent class.
+ */
+void
+tracker_deserializer_turtle_close (TrackerSparqlCursor *cursor)
+{
+       TrackerDeserializerTurtle *self = TRACKER_DESERIALIZER_TURTLE (cursor);
+       GInputStream *stream = G_INPUT_STREAM (self->buffered_stream);
+       TrackerSparqlCursorClass *parent_class;
+
+       g_input_stream_close (stream, NULL, NULL);
+
+       parent_class = TRACKER_SPARQL_CURSOR_CLASS (tracker_deserializer_turtle_parent_class);
+       parent_class->close (cursor);
+}
+
+/* TrackerDeserializer::get_parser_location implementation.
+ *
+ * Stores the current line and column of the parser in @line_no and
+ * @column_no and returns TRUE. While the parser is still in its initial
+ * state both are set to 0 and FALSE is returned.
+ */
+gboolean
+tracker_deserializer_turtle_get_parser_location (TrackerDeserializer *deserializer,
+                                                 goffset             *line_no,
+                                                 goffset             *column_no)
+{
+       TrackerDeserializerTurtle *ttl = TRACKER_DESERIALIZER_TURTLE (deserializer);
+
+       if (ttl->state != STATE_INITIAL) {
+               *line_no = ttl->line_no;
+               *column_no = ttl->column_no;
+               return TRUE;
+       }
+
+       /* No location to report yet */
+       *line_no = 0;
+       *column_no = 0;
+       return FALSE;
+}
+
+/* Class initializer: installs the GObject, TrackerSparqlCursor and
+ * TrackerDeserializer virtual method implementations of this type.
+ */
+static void
+tracker_deserializer_turtle_class_init (TrackerDeserializerTurtleClass *klass)
+{
+       TrackerDeserializerClass *deserializer_class = TRACKER_DESERIALIZER_CLASS (klass);
+       TrackerSparqlCursorClass *cursor_class = TRACKER_SPARQL_CURSOR_CLASS (klass);
+       GObjectClass *gobject_class = G_OBJECT_CLASS (klass);
+
+       /* GObject lifecycle */
+       gobject_class->constructed = tracker_deserializer_turtle_constructed;
+       gobject_class->finalize = tracker_deserializer_turtle_finalize;
+
+       /* Cursor iteration and value access */
+       cursor_class->next = tracker_deserializer_turtle_next;
+       cursor_class->rewind = tracker_deserializer_turtle_rewind;
+       cursor_class->close = tracker_deserializer_turtle_close;
+       cursor_class->get_string = tracker_deserializer_turtle_get_string;
+       cursor_class->get_value_type = tracker_deserializer_turtle_get_value_type;
+
+       /* Parser location reporting */
+       deserializer_class->get_parser_location = tracker_deserializer_turtle_get_parser_location;
+}
+
+/* Instance initializer: creates the empty blank node table (string keys
+ * and values, both freed with g_free) and the empty parser state stack.
+ */
+static void
+tracker_deserializer_turtle_init (TrackerDeserializerTurtle *deserializer)
+{
+       GHashTable *bnodes;
+       GArray *stack;
+
+       bnodes = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+       stack = g_array_new (FALSE, FALSE, sizeof (StateStack));
+
+       deserializer->blank_nodes = bnodes;
+       deserializer->parser_state = stack;
+}
+
+/* Creates a Turtle deserializer reading from @istream, using @namespaces
+ * as its namespace manager. The "has-graph" property is set to FALSE.
+ *
+ * Returns NULL (after emitting a critical warning) if @istream is not a
+ * GInputStream.
+ */
+TrackerSparqlCursor *
+tracker_deserializer_turtle_new (GInputStream            *istream,
+                                 TrackerNamespaceManager *namespaces)
+{
+       TrackerSparqlCursor *cursor;
+
+       g_return_val_if_fail (G_IS_INPUT_STREAM (istream), NULL);
+
+       cursor = g_object_new (TRACKER_TYPE_DESERIALIZER_TURTLE,
+                              "stream", istream,
+                              "namespace-manager", namespaces,
+                              "has-graph", FALSE,
+                              NULL);
+
+       return cursor;
+}
diff --git a/src/libtracker-sparql/tracker-deserializer-turtle.h 
b/src/libtracker-sparql/tracker-deserializer-turtle.h
new file mode 100644
index 000000000..a7b3c5f7e
--- /dev/null
+++ b/src/libtracker-sparql/tracker-deserializer-turtle.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2020, Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+ * Author: Carlos Garnacho <carlosg gnome org>
+ */
+
+#include "tracker-deserializer-rdf.h"
+
+#include <gio/gio.h>
+
+#ifndef __TRACKER_DESERIALIZER_TURTLE_H__
+#define __TRACKER_DESERIALIZER_TURTLE_H__
+
+/* GType of the Turtle deserializer */
+#define TRACKER_TYPE_DESERIALIZER_TURTLE (tracker_deserializer_turtle_get_type ())
+/* Final type deriving from TrackerDeserializerRdf */
+G_DECLARE_FINAL_TYPE (TrackerDeserializerTurtle,
+                      tracker_deserializer_turtle,
+                      TRACKER, DESERIALIZER_TURTLE,
+                      TrackerDeserializerRdf)
+
+/* Creates a deserializer that reads Turtle data from @stream, using
+ * @manager as its namespace manager. The result is handed back as a
+ * TrackerSparqlCursor.
+ */
+TrackerSparqlCursor * tracker_deserializer_turtle_new (GInputStream            *stream,
+                                                       TrackerNamespaceManager *manager);
+
+#endif /* __TRACKER_DESERIALIZER_TURTLE_H__ */
diff --git a/src/libtracker-sparql/tracker-deserializer.c b/src/libtracker-sparql/tracker-deserializer.c
index 8d3f5b2ec..c4eb1af50 100644
--- a/src/libtracker-sparql/tracker-deserializer.c
+++ b/src/libtracker-sparql/tracker-deserializer.c
@@ -22,6 +22,7 @@
 #include "config.h"
 
 #include "tracker-deserializer.h"
+#include "tracker-deserializer-turtle.h"
 
 #include "tracker-private.h"
 
@@ -169,6 +170,8 @@ tracker_deserializer_new (GInputStream            *stream,
        g_return_val_if_fail (G_IS_INPUT_STREAM (stream), NULL);
 
        switch (format) {
+       case TRACKER_SERIALIZER_FORMAT_TTL:
+               return tracker_deserializer_turtle_new (stream, namespaces);
        default:
                g_warn_if_reached ();
                return NULL;
@@ -178,7 +181,7 @@ tracker_deserializer_new (GInputStream            *stream,
 static TrackerSerializerFormat
 pick_format_for_file (GFile *file)
 {
-       return TRACKER_RDF_FORMAT_TURTLE;
+       return TRACKER_SERIALIZER_FORMAT_TTL;
 }
 
 TrackerSparqlCursor *


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]