[tracker-miners: 2/4] tracker-extract: Use libz to process ps.gz files
- From: Sam Thursfield <sthursfield src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker-miners: 2/4] tracker-extract: Use libz to process ps.gz files
- Date: Wed, 24 Apr 2019 13:13:58 +0000 (UTC)
commit 3752c094f108ad7bca0fd429505126c52bc60a4b
Author: Andrea Azzarone <andrea azzarone canonical com>
Date: Tue Apr 23 12:01:54 2019 +0100
tracker-extract: Use libz to process ps.gz files
Process ps.gz files using GZlibDecompressor instead of spawning gunzip. Because
tracker-extract runs the file parsers inside a seccomp sandbox, spawning an
external process during parsing is not a good idea because it leaves us with
little control over which syscalls are used.
Closes: https://gitlab.gnome.org/GNOME/tracker-miners/issues/61
src/tracker-extract/tracker-extract-ps.c | 203 ++++++++-----------------------
1 file changed, 51 insertions(+), 152 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-ps.c b/src/tracker-extract/tracker-extract-ps.c
index 6d59e3d2f..207a32a89 100644
--- a/src/tracker-extract/tracker-extract-ps.c
+++ b/src/tracker-extract/tracker-extract-ps.c
@@ -18,13 +18,7 @@
* Boston, MA 02110-1301, USA.
*/
-#include "config.h"
-
-#include <fcntl.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
+#include <config.h>
#include <glib.h>
#include <glib/gstdio.h>
@@ -101,43 +95,30 @@ date_to_iso8601 (const gchar *date)
}
static TrackerResource *
-extract_ps_from_filestream (FILE *f)
+extract_ps_from_inputstream (GInputStream *stream)
{
TrackerResource *metadata;
+ g_autoptr(GDataInputStream) data_stream = NULL;
gchar *line;
- gsize length;
- gssize read_char;
- gsize accum;
- gsize max_bytes;
-
- line = NULL;
- length = 0;
+ gsize length, accum, max_bytes;
+ g_autoptr(GError) error = NULL;
metadata = tracker_resource_new (NULL);
tracker_resource_add_uri (metadata, "rdf:type", "nfo:PaginatedTextDocument");
+ data_stream = g_data_input_stream_new (stream);
+
/* 20 MiB should be enough! (original safe limit) */
accum = 0;
max_bytes = 20u << 20;
- /* Reuse the same buffer for all lines. Must be dynamically allocated with
- * malloc family methods as getline() may re-size it with realloc() */
- length = 1024;
- line = g_malloc (length);
-
- /* Halt the whole when one of these conditions is met:
- * a) Reached max bytes to read
- * b) No more lines to read
- */
while ((accum < max_bytes) &&
- (read_char = tracker_getline (&line, &length, f)) != -1) {
+ (line = g_data_input_stream_read_line (data_stream, &length, NULL, &error)) != NULL) {
gboolean pageno_atend = FALSE;
gboolean header_finished = FALSE;
/* Update accumulated bytes read */
- accum += read_char;
-
- line[read_char - 1] = '\0'; /* overwrite '\n' char */
+ accum += length;
if (!header_finished && strncmp (line, "%%Copyright:", 12) == 0) {
tracker_resource_set_string (metadata, "nie:copyright", line + 13);
@@ -148,13 +129,11 @@ extract_ps_from_filestream (FILE *f)
tracker_resource_set_relation (metadata, "nco:creator", creator);
g_object_unref (creator);
} else if (!header_finished && strncmp (line, "%%CreationDate:", 15) == 0) {
- gchar *date;
+ g_autofree gchar *date = NULL;
date = date_to_iso8601 (line + 16);
- if (date) {
+ if (date)
tracker_resource_set_string (metadata, "nie:contentCreated", date);
- g_free (date);
- }
} else if (strncmp (line, "%%Pages:", 8) == 0) {
if (strcmp (line + 9, "(atend)") == 0) {
pageno_atend = TRUE;
@@ -168,141 +147,66 @@ extract_ps_from_filestream (FILE *f)
header_finished = TRUE;
if (!pageno_atend) {
+ g_free (line);
break;
}
}
- }
- /* Deallocate the buffer */
- if (line) {
g_free (line);
}
+ if (error != NULL)
+ g_warning ("Unexpected lack of content trying to read a line: %s", error->message);
+
return metadata;
}
-
-
static TrackerResource *
-extract_ps (const gchar *uri)
+extract_ps (const gchar *uri)
{
- TrackerResource *metadata;
- FILE *f;
- gchar *filename;
+ g_autoptr(GFile) file = NULL;
+ g_autoptr(GInputStream) stream = NULL;
+ g_autoptr(GError) error = NULL;
- filename = g_filename_from_uri (uri, NULL, NULL);
- f = tracker_file_open (filename);
- g_free (filename);
+ g_debug ("Extracting PS '%s'...", uri);
- if (!f) {
+ file = g_file_new_for_uri (uri);
+
+ stream = G_INPUT_STREAM (g_file_read (file, NULL, &error));
+ if (stream == NULL) {
+ g_warning ("Could't not read file %s: %s", uri, error->message);
return NULL;
}
- /* Extract from filestream! */
- g_debug ("Extracting PS '%s'...", uri);
- metadata = extract_ps_from_filestream (f);
-
- tracker_file_close (f, FALSE);
-
- return metadata;
+ return extract_ps_from_inputstream (stream);
}
#ifdef USING_UNZIPPSFILES
-#include <errno.h>
-#include <sys/time.h>
-#include <sys/resource.h>
+#include <zlib.h>
-static void
-spawn_child_func (gpointer user_data)
+static TrackerResource *
+extract_ps_gz (const gchar *uri)
{
- struct rlimit cpu_limit;
- gint timeout = GPOINTER_TO_INT (user_data);
+ g_autoptr(GFile) file = NULL;
+ g_autoptr(GInputStream) stream, cstream = NULL;
+ g_autoptr(GConverter) converter = NULL;
+ g_autoptr(GError) error = NULL;
- if (timeout > 0) {
- /* set cpu limit */
- getrlimit (RLIMIT_CPU, &cpu_limit);
- cpu_limit.rlim_cur = timeout;
- cpu_limit.rlim_max = timeout + 1;
-
- if (setrlimit (RLIMIT_CPU, &cpu_limit) != 0) {
- g_critical ("Failed to set resource limit for CPU");
- }
-
- /* Have this as a precaution in cases where cpu limit has not
- * been reached due to spawned app sleeping.
- */
- alarm (timeout + 2);
- }
-
- /* Set child's niceness to 19 */
- errno = 0;
+ g_debug ("Extracting PS '%s'...", uri);
- /* nice() uses attribute "warn_unused_result" and so complains
- * if we do not check its returned value. But it seems that
- * since glibc 2.2.4, nice() can return -1 on a successful call
- * so we have to check value of errno too. Stupid...
- */
- if (nice (19) == -1 && errno) {
- g_warning ("Failed to set nice value");
- }
-}
+ file = g_file_new_for_uri (uri);
-static TrackerResource *
-extract_ps_gz (const gchar *uri)
-{
- TrackerResource *metadata = NULL;
- FILE *fz;
- gint fdz;
- const gchar *argv[4];
- gchar *filename;
- GError *error = NULL;
-
- filename = g_filename_from_uri (uri, NULL, NULL);
-
- /* TODO: we should be using libz for this instead */
-
- argv[0] = "gunzip";
- argv[1] = "-c";
- argv[2] = filename;
- argv[3] = NULL;
-
- /* Fork & spawn to gunzip the file */
- if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
- (gchar **) argv,
- NULL,
- G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
- spawn_child_func,
- GINT_TO_POINTER (10),
- NULL,
- NULL,
- &fdz,
- NULL,
- &error)) {
- g_warning ("Couldn't fork & spawn to gunzip '%s': %s",
- uri, error ? error->message : NULL);
- g_clear_error (&error);
- }
- /* Get FILE from FD */
- else if ((fz = fdopen (fdz, "r")) == NULL) {
- g_warning ("Couldn't open FILE from FD (%s)...", uri);
- close (fdz);
- }
- /* Extract from filestream! */
- else
- {
- g_debug ("Extracting compressed PS '%s'...", uri);
- metadata = extract_ps_from_filestream (fz);
-#ifdef HAVE_POSIX_FADVISE
- if (posix_fadvise (fdz, 0, 0, POSIX_FADV_DONTNEED) != 0)
- g_warning ("posix_fadvise() call failed: %m");
-#endif /* HAVE_POSIX_FADVISE */
- fclose (fz);
+ stream = G_INPUT_STREAM (g_file_read (file, NULL, &error));
+ if (stream == NULL) {
+ g_warning ("Could't not read file %s: %s", uri, error->message);
+ return NULL;
}
- g_free (filename);
+ converter = G_CONVERTER (g_zlib_decompressor_new (G_ZLIB_COMPRESSOR_FORMAT_GZIP));
+ cstream = g_converter_input_stream_new (stream, converter);
- return metadata;
+ return extract_ps_from_inputstream (cstream);
}
#endif /* USING_UNZIPPSFILES */
@@ -312,28 +216,23 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
{
TrackerResource *metadata;
GFile *file;
- gchar *uri;
+ g_autofree gchar *uri = NULL;
+ const char *mimetype;
file = tracker_extract_info_get_file (info);
uri = g_file_get_uri (file);
+ mimetype = tracker_extract_info_get_mimetype (info);
- {
+ if (strcmp (mimetype, "application/x-gzpostscript") == 0) {
#ifdef USING_UNZIPPSFILES
- const char *mimetype;
-
- mimetype = tracker_extract_info_get_mimetype (info);
-
- if (strcmp (mimetype, "application/x-gzpostscript") == 0) {
- metadata = extract_ps_gz (uri);
- } else
+ metadata = extract_ps_gz (uri);
+#else
+ metadata = NULL;
#endif /* USING_UNZIPPSFILES */
- {
- metadata = extract_ps (uri);
- }
+ } else {
+ metadata = extract_ps (uri);
}
- g_free (uri);
-
if (metadata) {
tracker_extract_info_set_resource (info, metadata);
g_object_unref (metadata);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]