tracker r2192 - in branches/indexer-split: . src/tracker-indexer
- From: mr svn gnome org
- To: svn-commits-list gnome org
- Subject: tracker r2192 - in branches/indexer-split: . src/tracker-indexer
- Date: Wed, 3 Sep 2008 16:12:45 +0000 (UTC)
Author: mr
Date: Wed Sep 3 16:12:45 2008
New Revision: 2192
URL: http://svn.gnome.org/viewvc/tracker?rev=2192&view=rev
Log:
* src/tracker-indexer/tracker-metadata-utils.c: (get_file_content),
(tracker_metadata_utils_get_text),
(tracker_metadata_utils_get_thumbnail): Improved the
get_file_content() function to check for a '\n' in the first 64KB
buffer; if none is found, we don't index the file. Also,
check for UTF-8 validity so that, when the maximum size is reached, we
don't cut the text at exactly 'x' bytes but at the end of the last
valid character.
Modified:
branches/indexer-split/ChangeLog
branches/indexer-split/src/tracker-indexer/tracker-metadata-utils.c
Modified: branches/indexer-split/src/tracker-indexer/tracker-metadata-utils.c
==============================================================================
--- branches/indexer-split/src/tracker-indexer/tracker-metadata-utils.c (original)
+++ branches/indexer-split/src/tracker-indexer/tracker-metadata-utils.c Wed Sep 3 16:12:45 2008
@@ -46,7 +46,8 @@
#define METADATA_FILE_MODIFIED "File:Modified"
#define METADATA_FILE_ACCESSED "File:Accessed"
-#define TEXT_MAX_SIZE 1048576
+#define TEXT_MAX_SIZE 1048576 /* bytes */
+#define TEXT_CHECK_SIZE 65535 /* bytes */
typedef struct {
GPid pid;
@@ -485,11 +486,17 @@
GFile *file;
GFileInputStream *stream;
GError *error = NULL;
+ gssize bytes;
gssize bytes_read;
+ gssize bytes_read_total;
gssize bytes_remaining;
- gchar *buf;
+ gssize buf_size;
+ gchar buf[TEXT_CHECK_SIZE];
+ gboolean has_more_data;
+ gboolean has_reached_max;
+ gboolean has_cr;
+ GString *s;
- buf = g_new (gchar, TEXT_MAX_SIZE);
file = g_file_new_for_path (path);
stream = g_file_read (file, NULL, &error);
@@ -503,20 +510,96 @@
return NULL;
}
- /* bytes_max = tracker_config_get_max_text_to_index (config); */
- bytes_remaining = TEXT_MAX_SIZE;
+ s = g_string_new ("");
+ has_reached_max = FALSE;
+ has_more_data = TRUE;
+ has_cr = FALSE;
+ bytes_read_total = 0;
+ buf_size = TEXT_CHECK_SIZE - 1;
+
+ g_debug (" Starting read...");
+
+ while (has_more_data && !has_reached_max && !error) {
+ /* Leave space for NULL termination and make sure we
+ * add it at the end now.
+ */
+ bytes_remaining = buf_size;
+ bytes_read = 0;
- /* NULL termination */
- bytes_remaining--;
+ /* Loop until we hit the maximum */
+ for (bytes = -1; bytes != 0 && !error; ) {
+ bytes = g_input_stream_read (G_INPUT_STREAM (stream),
+ buf,
+ bytes_remaining,
+ NULL,
+ &error);
- for (bytes_read = -1; bytes_read != 0 && !error; ) {
- bytes_read = g_input_stream_read (G_INPUT_STREAM (stream),
- buf,
- bytes_remaining,
- NULL,
- &error);
- bytes_remaining -= bytes_read;
- }
+ bytes_read += bytes;
+ bytes_remaining -= bytes;
+
+ g_debug (" Read %d bytes",
+ bytes);
+ }
+
+ /* Set the NULL termination after the last byte read */
+ buf[TEXT_CHECK_SIZE - bytes_remaining] = '\0';
+
+ /* First of all, check if this is the first time we
+ * have tried to read the file up to the TEXT_CHECK_SIZE
+ * limit. Then make sure that we read the maximum size
+ * of the buffer. If we don't do this, there is the
+ * case where we read 10 bytes in and it is just one
+ * line with no '\n'. Once we have confirmed this we
+ * check that the buffer has a '\n' to make sure the
+ * file is worth indexing.
+ */
+ if (bytes_read_total == 0 &&
+ bytes_read == buf_size &&
+ strchr (buf, '\n') == NULL) {
+ g_debug (" No '\\n' in the first %d bytes, not indexing file",
+ buf_size);
+ break;
+ }
+
+ /* Here we increment the bytes read total to evaluate
+ * the next states. We don't do this before the
+ * previous condition so we can know when we have
+ * iterated > 1.
+ */
+ bytes_read_total += bytes_read;
+
+ if (bytes_read != buf_size || bytes_read == 0) {
+ has_more_data = FALSE;
+ }
+
+ if (bytes_read_total >= TEXT_MAX_SIZE) {
+ has_reached_max = TRUE;
+ }
+
+ g_debug (" Read %d bytes total, %d bytes this time, more data:%s, reached max:%s",
+ bytes_read_total,
+ bytes_read,
+ has_more_data ? "yes" : "no",
+ has_reached_max ? "yes" : "no");
+
+ s = g_string_append (s, buf);
+
+ if (has_reached_max) {
+ const gchar *p;
+ gssize bytes_valid;
+
+ /* Check for UTF-8 validity, since we may
+ * have cut off the end.
+ */
+ g_utf8_validate (s->str, s->len, &p);
+ bytes_valid = p - s->str;
+ s->str[bytes_valid] = '\0';
+
+ g_debug (" Maximum indexable limit reached, checking UTF-8 validity. Bytes valid %d/%d",
+ bytes_valid,
+ s->len);
+ }
+ }
if (error) {
g_message ("Couldn't get read input stream for:'%s', %s",
@@ -530,16 +613,15 @@
return NULL;
}
- buf [TEXT_MAX_SIZE - bytes_remaining] = '\0';
-
g_object_unref (file);
g_object_unref (stream);
- g_debug ("Read %d bytes from file:'%s'\n",
- TEXT_MAX_SIZE - bytes_remaining,
- path);
-
- return buf;
+ if (s->len < 1) {
+ g_string_free (s, TRUE);
+ s = NULL;
+ }
+
+ return s ? g_string_free (s, FALSE) : NULL;
}
gchar *
@@ -587,7 +669,7 @@
context = create_process_context ((const gchar **) argv);
if (!context) {
- return NULL;
+ return;
}
thumbnail = g_string_new (NULL);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]