[tracker] tracker-extract: Added FTS support for text files

From: Martyn James Russell <mr src gnome org>
To: svn-commits-list gnome org
Cc:
Subject: [tracker] tracker-extract: Added FTS support for text files
Date: Tue, 8 Sep 2009 11:39:00 +0000 (UTC)
commit c53a2e5f7a329d937cbc2e43217c131b82dc4ab5
Author: Martyn Russell <martyn lanedo com>
Date:   Tue Sep 8 12:37:40 2009 +0100

    tracker-extract: Added FTS support for text files

 src/tracker-extract/Makefile.am            |    8 +-
 src/tracker-extract/tracker-extract-text.c |  326 ++++++++++++++++++++++++++++
 2 files changed, 333 insertions(+), 1 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 5e6f457..792e124 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -41,7 +41,8 @@ modules_LTLIBRARIES = 							\
 	libextract-mp3.la				 		\
 	libextract-oasis.la 						\
 	libextract-png.la 						\
-	libextract-ps.la 						
+	libextract-ps.la 						\
+	libextract-text.la
 
 if HAVE_LIBVORBIS
 modules_LTLIBRARIES += libextract-vorbis.la
@@ -230,6 +231,11 @@ libextract_playlist_la_LDFLAGS = $(module_flags)
 libextract_playlist_la_LIBADD = $(GLIB2_LIBS) $(TOTEM_PL_PARSER_LIBS) $(GCOV_LIBS) \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la
 
+# Text
+libextract_text_la_SOURCES = tracker-extract-text.c
+libextract_text_la_LDFLAGS = $(module_flags)
+libextract_text_la_LIBADD = $(GLIB2_LIBS) $(GIO_LIBS) $(GCOV_LIBS) \
+	$(top_builddir)/src/libtracker-common/libtracker-common.la
 
 #
 # Binaries
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
new file mode 100644
index 0000000..95689a9
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -0,0 +1,326 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <libtracker-common/tracker-statement-list.h>
+
+#include "tracker-main.h"
+
+#undef  TRY_LOCALE_TO_UTF8_CONVERSION
+
+#define TEXT_MAX_SIZE   1048576  /* bytes */
+#define TEXT_CHECK_SIZE 65535    /* bytes */
+
+#if 0
+
+/* Compiled out (#if 0). The only visible user of this type is the
+ * equally compiled-out main loop based code path in extract_text().
+ * NOTE(review): presumably state for an asynchronous read that was
+ * never finished - confirm before enabling.
+ */
+typedef struct {
+        GMainLoop            *main_loop;
+        GString              *data;
+	gchar                *uri;
+	TrackerSparqlBuilder *metadata;
+} ContentData;
+
+#endif
+
+/* Forward declaration, needed by the table below; the definition is
+ * further down in this file.
+ */
+static void extract_text (const gchar          *uri,
+			  TrackerSparqlBuilder *metadata);
+
+/* Table handed out by tracker_get_extract_data(): one entry per MIME
+ * type string, every one of them mapped to extract_text(). The
+ * { NULL, NULL } entry terminates the list.
+ */
+static TrackerExtractData data[] = {
+	{ "text/plain",       extract_text },
+	{ "text/x-authors",   extract_text },
+	{ "text/x-changelog", extract_text },
+	{ "text/x-copying",   extract_text },
+	{ "text/x-credits",   extract_text },
+	{ "text/x-install",   extract_text },
+	{ "text/x-readme",    extract_text },
+	{ NULL, NULL }
+};
+
+/*
+ * Check whether the data in @s is UTF-8, tolerating a final character
+ * that was cut off because the file was only read partially.
+ *
+ * @s:           the data read so far.
+ * @bytes_valid: set to the length in bytes of the leading part of @s
+ *               that is valid UTF-8 (s->len when everything is valid).
+ *
+ * Returns TRUE when @s is entirely valid, or valid except for a
+ * trailing sequence short enough to be a truncated character; FALSE
+ * otherwise.
+ */
+static gboolean
+get_file_is_utf8 (GString *s,
+		  gssize  *bytes_valid)
+{
+	const gchar *end;
+	gssize	     bytes_left;
+
+	/* Check for UTF-8 validity, since we may
+	 * have cut off the end.
+	 */
+	if (g_utf8_validate (s->str, s->len, &end)) {
+		*bytes_valid = (gssize) s->len;
+		return TRUE;
+	}
+
+	*bytes_valid = end - s->str;
+
+	/* What matters from here on is how much data follows the valid
+	 * part, not how long the valid part is.
+	 */
+	bytes_left = (gssize) s->len - *bytes_valid;
+
+	/* 4 is the maximum bytes for a UTF-8 character, so a character
+	 * that was cut off has at most 3 bytes present. Anything longer
+	 * is invalid data in the middle of the file.
+	 */
+	if (bytes_left >= 4) {
+		return FALSE;
+	}
+
+	/* Only look at the bytes which really are in the buffer. */
+	if (g_utf8_get_char_validated (end, bytes_left) == (gunichar) -1) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
+
+/*
+ * Convert the contents of @s from the encoding of the current locale
+ * to UTF-8, in place.
+ *
+ * On success the contents of @s are replaced by the converted text.
+ * On failure a message is logged and @s is left untouched.
+ *
+ * Returns @s in both cases.
+ */
+static GString *
+get_file_in_locale (GString *s)
+{
+	GError *error = NULL;
+	gchar  *str;
+	gsize	bytes_read = 0;
+	gsize	bytes_written = 0;
+
+	str = g_locale_to_utf8 (s->str,
+				s->len,
+				&bytes_read,
+				&bytes_written,
+				&error);
+	if (error) {
+		/* gsize needs G_GSIZE_FORMAT, "%d" is wrong wherever
+		 * gsize is wider than int.
+		 */
+		g_debug ("  Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
+			 bytes_read,
+			 bytes_written);
+		g_message ("Could not convert file from locale to UTF-8, %s",
+			   error->message);
+		g_error_free (error);
+	} else {
+		g_string_assign (s, str);
+	}
+
+	/* g_free() accepts NULL, so this covers both branches. */
+	g_free (str);
+
+	return s;
+}
+
+#endif /* TRY_LOCALE_TO_UTF8_CONVERSION */
+
+/*
+ * Read the text content of the file at @uri.
+ *
+ * The file is read in chunks of TEXT_CHECK_SIZE - 1 bytes until the
+ * end of the file or until at least TEXT_MAX_SIZE bytes were read
+ * (the limit is checked after each chunk, so it may be exceeded by
+ * up to one chunk). The result is then truncated to its leading
+ * valid UTF-8 part, or, when TRY_LOCALE_TO_UTF8_CONVERSION is
+ * defined and the data is not UTF-8, converted from the locale
+ * encoding.
+ *
+ * Returns a newly allocated NUL terminated string which the caller
+ * frees with g_free(), or NULL when:
+ *  - the file could not be opened or read,
+ *  - the first chunk was completely filled without containing '\n',
+ *  - the file has less than 3 bytes,
+ *  - nothing is left after truncation.
+ */
+static gchar *
+get_file_content (const gchar *uri)
+{
+	GFile		 *file;
+	GFileInputStream *stream;
+	GError		 *error = NULL;
+	GString		 *s;
+	gssize		  bytes_valid;
+	gssize		  bytes_read_total;
+	gssize		  buf_size;
+	gchar		  buf[TEXT_CHECK_SIZE];
+	gboolean	  has_more_data;
+	gboolean	  has_reached_max;
+	gboolean	  is_utf8;
+
+	file = g_file_new_for_uri (uri);
+	stream = g_file_read (file, NULL, &error);
+
+	if (error) {
+		g_message ("Could not get read file:'%s', %s",
+			   uri,
+			   error->message);
+		g_error_free (error);
+		g_object_unref (file);
+
+		return NULL;
+	}
+
+	s = g_string_new ("");
+	has_reached_max = FALSE;
+	has_more_data = TRUE;
+	bytes_read_total = 0;
+
+	/* Leave space for the NUL termination which strchr() needs. */
+	buf_size = TEXT_CHECK_SIZE - 1;
+
+	g_debug ("  Starting read...");
+
+	while (has_more_data && !has_reached_max) {
+		gssize bytes_read = 0;
+		gssize bytes_remaining = buf_size;
+
+		/* Fill the buffer. One call may deliver less than what
+		 * was asked for, so keep appending behind what we
+		 * already have until the buffer is full, the stream
+		 * ends (0) or an error occurs (-1, error is set).
+		 */
+		while (bytes_remaining > 0) {
+			gssize bytes;
+
+			bytes = g_input_stream_read (G_INPUT_STREAM (stream),
+						     buf + bytes_read,
+						     bytes_remaining,
+						     NULL,
+						     &error);
+
+			g_debug ("  Read %" G_GSSIZE_FORMAT " bytes", bytes);
+
+			if (bytes <= 0) {
+				break;
+			}
+
+			bytes_read += bytes;
+			bytes_remaining -= bytes;
+		}
+
+		if (error) {
+			break;
+		}
+
+		/* Set the NUL termination after the last byte read */
+		buf[bytes_read] = '\0';
+
+		/* First of all, check if this is the first time we
+		 * have tried to read the file up to the TEXT_CHECK_SIZE
+		 * limit. Then make sure that we read the maximum size
+		 * of the buffer. If we don't do this, there is the
+		 * case where we read 10 bytes in and it is just one
+		 * line with no '\n'. Once we have confirmed this we
+		 * check that the buffer has a '\n' to make sure the
+		 * file is worth indexing. Similarly if the file has
+		 * less than 3 bytes then we drop it.
+		 */
+		if (bytes_read_total == 0) {
+			if (bytes_read == buf_size &&
+			    strchr (buf, '\n') == NULL) {
+				g_debug ("  No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, not indexing file",
+					 buf_size);
+				break;
+			} else if (bytes_read <= 2) {
+				g_debug ("  File has less than 3 characters in it, not indexing file");
+				break;
+			}
+		}
+
+		/* Here we increment the bytes read total to evaluate
+		 * the next states. We don't do this before the
+		 * previous condition so we can know when we have
+		 * iterated > 1.
+		 */
+		bytes_read_total += bytes_read;
+
+		/* A chunk which was not filled means the stream ended. */
+		if (bytes_read != buf_size) {
+			has_more_data = FALSE;
+		}
+
+		if (bytes_read_total >= TEXT_MAX_SIZE) {
+			has_reached_max = TRUE;
+		}
+
+		g_debug ("  Read "
+			 "%" G_GSSIZE_FORMAT " bytes total, "
+			 "%" G_GSSIZE_FORMAT " bytes this time, "
+			 "more data:%s, reached max:%s",
+			 bytes_read_total,
+			 bytes_read,
+			 has_more_data ? "yes" : "no",
+			 has_reached_max ? "yes" : "no");
+
+		/* Append the data only. GString keeps its own NUL
+		 * termination; copying ours as well would put a NUL
+		 * byte in the middle of the content, and the UTF-8
+		 * validation below stops at the first NUL byte.
+		 */
+		s = g_string_append_len (s, buf, bytes_read);
+	}
+
+	if (has_reached_max) {
+		g_debug ("  Maximum indexable limit reached");
+	}
+
+	if (error) {
+		g_message ("Could not read input stream for:'%s', %s",
+			   uri,
+			   error->message);
+		g_error_free (error);
+		g_string_free (s, TRUE);
+		g_object_unref (stream);
+		g_object_unref (file);
+
+		return NULL;
+	}
+
+	/* Check for UTF-8 Validity, if not try to convert it to the
+	 * locale we are in.
+	 */
+	is_utf8 = get_file_is_utf8 (s, &bytes_valid);
+
+	/* Make sure the string is NUL terminated and in the case
+	 * where the string is valid UTF-8 up to the last character
+	 * which was cut off, NUL terminate to the last most valid
+	 * character.
+	 */
+#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
+	if (!is_utf8) {
+		s = get_file_in_locale (s);
+	} else {
+		g_debug ("  Truncating to last valid UTF-8 character (%" G_GSSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)",
+			 bytes_valid,
+			 s->len);
+		s = g_string_truncate (s, bytes_valid);
+	}
+#else	/* TRY_LOCALE_TO_UTF8_CONVERSION */
+	g_debug ("  Truncating to last valid UTF-8 character (%" G_GSSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)",
+		 bytes_valid,
+		 s->len);
+	s = g_string_truncate (s, bytes_valid);
+#endif	/* TRY_LOCALE_TO_UTF8_CONVERSION */
+
+	g_object_unref (stream);
+	g_object_unref (file);
+
+	/* Nothing usable left, do not hand out an empty string. */
+	if (s->len < 1) {
+		g_string_free (s, TRUE);
+		return NULL;
+	}
+
+	/* Hand the character data over to the caller. */
+	return g_string_free (s, FALSE);
+}
+
+/*
+ * Extractor entry point for the MIME types listed in data[].
+ *
+ * @uri:      URI of the file to extract from; also used as the
+ *            subject of the statements.
+ * @metadata: builder which receives the statements.
+ *
+ * Sets @uri as subject and, when content could be read, adds it as
+ * nie:plainTextContent. When get_file_content() returns NULL no
+ * predicate is added at all, so the builder is never left with a
+ * predicate lacking its object or with a NULL object.
+ */
+static void
+extract_text (const gchar          *uri,
+	      TrackerSparqlBuilder *metadata)
+{
+	gchar *content;
+
+	g_type_init ();
+
+	/* NOTE(review): compiled-out scaffolding. Nothing visible here
+	 * would ever quit the main loop - confirm before enabling.
+	 */
+#if 0
+	ContentData *cd;
+
+	cd = g_slice_new0 (ContentData);
+
+	cd->main_loop = g_main_loop_new (NULL, FALSE);
+	cd->data = g_string_new (NULL);
+	cd->uri = g_strdup (uri);
+	cd->metadata = g_object_ref (metadata);
+
+	g_main_loop_run (cd->main_loop);
+	g_main_loop_unref (cd->main_loop);
+
+	content = g_string_free (cd->data, FALSE);
+	g_slice_free (ContentData, cd);
+#endif
+
+	tracker_sparql_builder_subject_iri (metadata, uri);
+
+	content = get_file_content (uri);
+
+	/* NULL means unreadable or not worth indexing. */
+	if (content) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+		g_free (content);
+	}
+}
+
+/* Exported accessor of this module: returns the static data[] table
+ * defined above, whose last entry is { NULL, NULL }. The table is not
+ * allocated here, so there is nothing for the caller to free.
+ */
+TrackerExtractData *
+tracker_get_extract_data (void)
+{
+	return data;
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]