[tracker/extractor-remove-word-counting-review] Moved the istream text reader to a separate file



commit de531f5fdd8838884733189ab1974cfe49354b05
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 11 12:38:54 2010 +0200

    Moved the istream text reader to a separate file

 src/tracker-extract/Makefile.am            |    2 +
 src/tracker-extract/tracker-extract-text.c |   82 ++------------------
 src/tracker-extract/tracker-istream.c      |  114 ++++++++++++++++++++++++++++
 src/tracker-extract/tracker-istream.h      |   34 ++++++++
 4 files changed, 156 insertions(+), 76 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index d9b6c39..07d0593 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -304,6 +304,8 @@ tracker_extract_SOURCES = 						\
 	tracker-dbus.h							\
 	tracker-extract.c						\
 	tracker-extract.h						\
+	tracker-istream.c						\
+	tracker-istream.h						\
 	tracker-main.c							\
 	tracker-main.h							\
 	tracker-albumart-generic.h
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 07f55b3..09a7340 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -27,11 +27,10 @@
 #include <libtracker-extract/tracker-extract.h>
 
 #include "tracker-main.h"
+#include "tracker-istream.h"
 
 #undef  TRY_LOCALE_TO_UTF8_CONVERSION
 
-#define TEXT_BUFFER_SIZE 65535    /* bytes */
-
 static void extract_text (const gchar          *uri,
                           TrackerSparqlBuilder *preupdate,
                           TrackerSparqlBuilder *metadata);
@@ -81,10 +80,8 @@ get_file_content (const gchar *uri,
 	GFile            *file;
 	GFileInputStream *stream;
 	GError           *error = NULL;
-	GString          *s = NULL;
-	gchar             buf[TEXT_BUFFER_SIZE];
-	gsize             n_bytes_remaining;
-	gsize             n_valid_utf8_bytes;
+	GString          *s;
+	gsize             n_valid_utf8_bytes = 0;
 
 	file = g_file_new_for_uri (uri);
 	stream = g_file_read (file, NULL, &error);
@@ -102,76 +99,9 @@ get_file_content (const gchar *uri,
 	g_debug ("  Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
 	         uri, n_bytes);
 
-	/* Reading in chunks of TEXT_BUFFER_SIZE (8192)
-	 *   Loop is halted whenever one of this conditions is met:
-	 *     a) Read bytes reached the maximum allowed (n_bytes)
-	 *     b) No more bytes to read
-	 *     c) Error reading
-	 *     d) File has less than 3 bytes
-	 *     e) File has a single line of TEXT_BUFFER_SIZE bytes with
-	 *          no EOL
-	 */
-	n_bytes_remaining = n_bytes;
-	while (n_bytes_remaining > 0) {
-		gssize bytes_read;
-
-		/* Read n_bytes_remaining or TEXT_BUFFER_SIZE bytes */
-		bytes_read = g_input_stream_read (G_INPUT_STREAM (stream),
-		                                  buf,
-		                                  MIN (TEXT_BUFFER_SIZE, n_bytes_remaining),
-		                                  NULL,
-		                                  &error);
-
-		/* If any error reading, halt the loop */
-		if (error) {
-			g_message ("Error reading from '%s': '%s'",
-			           uri,
-			           error->message);
-			g_error_free (error);
-			break;
-		}
-
-		/* If no more bytes to read, halt loop */
-		if(bytes_read == 0) {
-			break;
-		}
-
-		/* First of all, check if this is the first time we
-		 * have tried to read the file up to the TEXT_BUFFER_SIZE
-		 * limit. Then make sure that we read the maximum size
-		 * of the buffer. If we don't do this, there is the
-		 * case where we read 10 bytes in and it is just one
-		 * line with no '\n'. Once we have confirmed this we
-		 * check that the buffer has a '\n' to make sure the
-		 * file is worth indexing. Similarly if the file has
-		 * <= 3 bytes then we drop it.
-		 */
-		if (s == NULL) {
-			if (bytes_read == TEXT_BUFFER_SIZE &&
-			    g_strstr_len (buf, bytes_read, "\n") == NULL) {
-				g_debug ("  No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, not indexing file",
-				         bytes_read);
-				break;
-			} else if (bytes_read <= 2) {
-				g_debug ("  File has less than 3 characters in it, not indexing file");
-				break;
-			}
-		}
-
-		/* Update remaining bytes */
-		n_bytes_remaining -= bytes_read;
-
-		g_debug ("  Read "
-		         "%" G_GSSIZE_FORMAT " bytes this time, "
-		         "%" G_GSIZE_FORMAT " bytes remaining",
-		         bytes_read,
-		         n_bytes_remaining);
-
-		/* Append non-NIL terminated bytes */
-		s = (s == NULL ?
-		     g_string_new_len (buf, bytes_read) :
-		     g_string_append_len (s, buf, bytes_read));
-	}
+	/* Read up to n_bytes from stream */
+	s = tracker_istream_read_text (G_INPUT_STREAM (stream),
+	                               n_bytes);
 
 	/* If nothing really read, return here */
 	if (!s) {
diff --git a/src/tracker-extract/tracker-istream.c b/src/tracker-extract/tracker-istream.c
new file mode 100644
index 0000000..2d75373
--- /dev/null
+++ b/src/tracker-extract/tracker-istream.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include <string.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include "tracker-istream.h"
+
+#define BUFFER_SIZE 65535    /* bytes */
+
+/* Reads up to max_bytes from stream in chunks of BUFFER_SIZE. Returns a
+ * newly allocated GString with the bytes read, or NULL if nothing was
+ * kept (see the conditions below) or if an argument check fails.
+ */
+GString *
+tracker_istream_read_text (GInputStream  *stream,
+                           gsize          max_bytes)
+{
+	GString *s = NULL;
+	gchar    buf[BUFFER_SIZE]; /* gchar, as the GLib string API expects */
+	gsize    n_bytes_remaining;
+	GError  *error = NULL;
+
+	g_return_val_if_fail (stream, NULL);
+	g_return_val_if_fail (max_bytes > 0, NULL);
+
+	/* Reading in chunks of BUFFER_SIZE
+	 *   Loop is halted whenever one of these conditions is met:
+	 *     a) Read bytes reached the maximum allowed (max_bytes)
+	 *     b) No more bytes to read
+	 *     c) Error reading
+	 *     d) First read returned less than 3 bytes
+	 *     e) First read filled BUFFER_SIZE bytes with no EOL
+	 */
+	n_bytes_remaining = max_bytes;
+	while (n_bytes_remaining > 0) {
+		gssize bytes_read;
+
+		/* Read n_bytes_remaining or BUFFER_SIZE bytes */
+		bytes_read = g_input_stream_read (stream,
+		                                  buf,
+		                                  MIN (BUFFER_SIZE, n_bytes_remaining),
+		                                  NULL,
+		                                  &error);
+
+		/* If any error reading, halt the loop */
+		if (error) {
+			g_message ("Error reading from stream: '%s'",
+			           error->message);
+			g_error_free (error);
+			break;
+		}
+
+		/* If no more bytes to read, halt loop */
+		if (bytes_read == 0) {
+			break;
+		}
+
+		/* On the first read only (s is still NULL), decide if
+		 * the content is worth indexing: drop it if a full
+		 * BUFFER_SIZE chunk has no '\n' in it, or if fewer
+		 * than 3 bytes were read. A short first read without
+		 * '\n' is still accepted.
+		 */
+		if (s == NULL) {
+			if (bytes_read == BUFFER_SIZE &&
+			    g_strstr_len (buf, bytes_read, "\n") == NULL) {
+				g_debug ("  No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
+				         "not indexing file",
+				         bytes_read);
+				break;
+			} else if (bytes_read <= 2) {
+				g_debug ("  File has less than 3 characters in it, "
+				         "not indexing file");
+				break;
+			}
+		}
+
+		/* Update remaining bytes */
+		n_bytes_remaining -= bytes_read;
+
+		g_debug ("  Read "
+		         "%" G_GSSIZE_FORMAT " bytes this time, "
+		         "%" G_GSIZE_FORMAT " bytes remaining",
+		         bytes_read,
+		         n_bytes_remaining);
+
+		/* Append non-NUL-terminated bytes */
+		s = (s == NULL ?
+		     g_string_new_len (buf, bytes_read) :
+		     g_string_append_len (s, buf, bytes_read));
+	}
+
+	/* Return whatever we got... */
+	return s;
+}
diff --git a/src/tracker-extract/tracker-istream.h b/src/tracker-extract/tracker-istream.h
new file mode 100644
index 0000000..f155dd2
--- /dev/null
+++ b/src/tracker-extract/tracker-istream.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __TRACKER_ISTREAM_H__
+#define __TRACKER_ISTREAM_H__
+
+#include <glib.h>
+#include <gio/gio.h>
+
+G_BEGIN_DECLS
+
+GString *tracker_istream_read_text (GInputStream  *stream,
+                                    gsize          max_bytes);
+
+G_END_DECLS
+
+#endif /* __TRACKER_ISTREAM_H__ */
+



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]