[tracker/extractor-remove-word-counting-review] Fixes GB#616845 - Avoid word counting in the extractors



commit 40cea603096bf12fea95ffd7a4c2c4aa9ac43cf9
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Mon May 10 20:07:09 2010 +0200

    Fixes GB#616845 - Avoid word counting in the extractors
    
     * New max_bytes parameter added to tracker-extract config file. Extractors will
        read up to that configured limit.
     * Removed the need of reading the FTS config file from tracker-extract.
     * Word counting not done now in the extractors.
    
     Note: As a side effect, the last word extracted when max_bytes is reached may
      be cut, so only its first chunk is extracted.

 src/libtracker-extract/tracker-utils.c         |   60 +++-
 src/libtracker-extract/tracker-utils.h         |    9 +-
 src/tracker-extract/Makefile.am                |    2 -
 src/tracker-extract/tracker-config.c           |   58 +++-
 src/tracker-extract/tracker-config.h           |    4 +
 src/tracker-extract/tracker-extract-html.c     |   42 ++-
 src/tracker-extract/tracker-extract-msoffice.c |  157 ++-------
 src/tracker-extract/tracker-extract-oasis.c    |   74 ++---
 src/tracker-extract/tracker-extract-pdf.cpp    |   36 ++-
 src/tracker-extract/tracker-fts-config.c       |  429 ------------------------
 src/tracker-extract/tracker-fts-config.h       |   64 ----
 src/tracker-extract/tracker-main.c             |   13 +-
 src/tracker-extract/tracker-main.h             |    6 +-
 13 files changed, 233 insertions(+), 721 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
index a2cc6ab..f9f1084 100644
--- a/src/libtracker-extract/tracker-utils.c
+++ b/src/libtracker-extract/tracker-utils.c
@@ -183,7 +183,7 @@ tracker_coalesce (gint n_values,
  * Since: 0.9
  **/
 gchar *
-tracker_merge_const (const gchar *delimiter, 
+tracker_merge_const (const gchar *delimiter,
                      gint         n_values,
                      ...)
 {
@@ -239,7 +239,7 @@ tracker_merge_const (const gchar *delimiter,
  * Deprecated: 1.0: Use tracker_merge_const() instead.
  **/
 gchar *
-tracker_merge (const gchar *delimiter, 
+tracker_merge (const gchar *delimiter,
                gint         n_values,
                ...)
 {
@@ -304,6 +304,8 @@ tracker_merge (const gchar *delimiter,
  * be freed with g_free() when finished with, otherwise %NULL.
  *
  * Since: 0.8
+ *
+ * Deprecated: 1.0: Use tracker_text_validate_utf8() instead.
  **/
 gchar *
 tracker_text_normalize (const gchar *text,
@@ -345,10 +347,10 @@ tracker_text_normalize (const gchar *text,
 	}
 
 	if (n_words) {
-                if (!in_break) {
-                        /* Count the last word */
-                        words += 1;
-                }
+		if (!in_break) {
+			/* Count the last word */
+			words += 1;
+		}
 		*n_words = words;
 	}
 
@@ -356,6 +358,52 @@ tracker_text_normalize (const gchar *text,
 }
 
 /**
+ * tracker_text_validate_utf8:
+ * @text: the text to validate
+ * @text_len: length of @text in bytes, or -1 if NUL-terminated
+ * @str: the string where to place the validated characters
+ *
+ * Checks @text for UTF-8 validity using g_utf8_validate() and appends
+ * the leading chunk of valid bytes to @str; if *@str is %NULL a new
+ * #GString is created to hold them.
+ *
+ * Returns: %TRUE if valid UTF-8 in @text was appended to @str
+ *
+ * Since: 0.9
+ **/
+gboolean
+tracker_text_validate_utf8 (const gchar  *text,
+                            gsize         text_len,
+                            GString     **str)
+{
+	gsize len_to_validate;
+
+	g_return_val_if_fail (text, FALSE);
+	g_return_val_if_fail (str, FALSE);
+
+	/* text_len is unsigned: a caller's -1 arrives here as (gsize) -1 */
+	len_to_validate = text_len == (gsize) -1 ? strlen (text) : text_len;
+
+	if (len_to_validate > 0) {
+		const gchar *end = text;
+
+		/* Validate string, getting the pointer to first non-valid byte
+		 *  (if any) or to the end of the validated range. */
+		g_utf8_validate (text, len_to_validate, &end);
+		if (end > text) {
+			if (*str == NULL) {
+				*str = g_string_new_len (text, end - text);
+			} else {
+				*str = g_string_append_len (*str, text, end - text);
+			}
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/**
  * tracker_date_format_to_iso8601:
  * @date_string: the date in a string pointer
  * @format: the format of the @date_string
diff --git a/src/libtracker-extract/tracker-utils.h b/src/libtracker-extract/tracker-utils.h
index db2da92..6003d36 100644
--- a/src/libtracker-extract/tracker-utils.h
+++ b/src/libtracker-extract/tracker-utils.h
@@ -34,11 +34,14 @@ gchar*       tracker_coalesce               (gint         n_values,
 gchar*       tracker_merge                  (const gchar *delimiter,
                                              gint         n_values,
                                                           ...) G_GNUC_DEPRECATED;
-#endif /* TRACKER_DISABLE_DEPRECATED */
-
 gchar*       tracker_text_normalize         (const gchar *text,
                                              guint        max_words,
-                                             guint       *n_words);
+                                             guint       *n_words) G_GNUC_DEPRECATED;
+#endif /* TRACKER_DISABLE_DEPRECATED */
+
+gboolean     tracker_text_validate_utf8     (const gchar  *text,
+                                             gsize         text_len,
+                                             GString     **str);
 gchar*       tracker_date_guess             (const gchar *date_string);
 gchar*       tracker_date_format_to_iso8601 (const gchar *date_string,
                                              const gchar *format);
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 37454ba..d9b6c39 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -304,8 +304,6 @@ tracker_extract_SOURCES = 						\
 	tracker-dbus.h							\
 	tracker-extract.c						\
 	tracker-extract.h						\
-	tracker-fts-config.c						\
-	tracker-fts-config.h						\
 	tracker-main.c							\
 	tracker-main.h							\
 	tracker-albumart-generic.h
diff --git a/src/tracker-extract/tracker-config.c b/src/tracker-extract/tracker-config.c
index 0d09a2b..c5edb1d 100644
--- a/src/tracker-extract/tracker-config.c
+++ b/src/tracker-extract/tracker-config.c
@@ -30,10 +30,12 @@
 
 /* Default values */
 #define DEFAULT_VERBOSITY 0
+#define DEFAULT_MAX_BYTES 1048576 /* 1Mbyte */
 
 typedef struct {
 	/* General */
 	gint verbosity;
+	gint max_bytes;
 } TrackerConfigPrivate;
 
 typedef struct {
@@ -63,11 +65,13 @@ enum {
 	PROP_0,
 
 	/* General */
-	PROP_VERBOSITY
+	PROP_VERBOSITY,
+	PROP_MAX_BYTES
 };
 
 static ObjectToKeyFile conversions[] = {
 	{ G_TYPE_INT,     "verbosity",          GROUP_GENERAL,  "Verbosity"       },
+	{ G_TYPE_INT,     "max_bytes",          GROUP_GENERAL,  "Max Bytes"       },
 };
 
 G_DEFINE_TYPE (TrackerConfig, tracker_config, TRACKER_TYPE_CONFIG_FILE);
@@ -93,6 +97,16 @@ tracker_config_class_init (TrackerConfigClass *klass)
 	                                                   DEFAULT_VERBOSITY,
 	                                                   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
 
+	/* Own property ID, so the PROP_MAX_BYTES cases in set/get are reached */
+	g_object_class_install_property (object_class,
+	                                 PROP_MAX_BYTES,
+	                                 g_param_spec_int ("max_bytes",
+	                                                   "Max Bytes",
+	                                                   " Maximum number of UTF-8 bytes to extract [0,G_MAXINT]",
+	                                                   0, G_MAXINT,
+	                                                   DEFAULT_MAX_BYTES,
+	                                                   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+
 	g_type_class_add_private (object_class, sizeof (TrackerConfigPrivate));
 }
 
@@ -105,7 +119,7 @@ static void
 config_set_property (GObject      *object,
                      guint         param_id,
                      const GValue *value,
-                     GParamSpec           *pspec)
+                     GParamSpec   *pspec)
 {
 	switch (param_id) {
 		/* General */
@@ -114,6 +128,11 @@ config_set_property (GObject      *object,
 		                              g_value_get_int (value));
 		break;
 
+	case PROP_MAX_BYTES:
+		tracker_config_set_max_bytes (TRACKER_CONFIG (object),
+		                              g_value_get_int (value));
+		break;
+
 	default:
 		G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
 		break;
@@ -136,6 +155,10 @@ config_get_property (GObject    *object,
 		g_value_set_int (value, priv->verbosity);
 		break;
 
+	case PROP_MAX_BYTES:
+		g_value_set_int (value, priv->max_bytes);
+		break;
+
 	default:
 		G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
 		break;
@@ -317,3 +340,34 @@ tracker_config_set_verbosity (TrackerConfig *config,
 	priv->verbosity = value;
 	g_object_notify (G_OBJECT (config), "verbosity");
 }
+
+
+/* Getter for the "max_bytes" setting.
+ * Returns the max_bytes field of the private data of @config, or
+ * DEFAULT_MAX_BYTES when @config fails the TRACKER_IS_CONFIG check.
+ */
+gint
+tracker_config_get_max_bytes (TrackerConfig *config)
+{
+	g_return_val_if_fail (TRACKER_IS_CONFIG (config), DEFAULT_MAX_BYTES);
+
+	return TRACKER_CONFIG_GET_PRIVATE (config)->max_bytes;
+}
+
+/* Setter for the "max_bytes" setting.
+ * Stores @value in the private data of @config and notifies "max_bytes".
+ * A @value rejected by tracker_keyfile_object_validate_int() leaves
+ * @config untouched and emits no notification. */
+void
+tracker_config_set_max_bytes (TrackerConfig *config,
+                              gint           value)
+{
+	g_return_if_fail (TRACKER_IS_CONFIG (config));
+
+	if (tracker_keyfile_object_validate_int (config, "max_bytes", value)) {
+		TrackerConfigPrivate *priv = TRACKER_CONFIG_GET_PRIVATE (config);
+
+		priv->max_bytes = value;
+		g_object_notify (G_OBJECT (config), "max_bytes");
+	}
+}
diff --git a/src/tracker-extract/tracker-config.h b/src/tracker-extract/tracker-config.h
index cdede72..491a811 100644
--- a/src/tracker-extract/tracker-config.h
+++ b/src/tracker-extract/tracker-config.h
@@ -53,6 +53,10 @@ gint           tracker_config_get_verbosity (TrackerConfig *config);
 void           tracker_config_set_verbosity (TrackerConfig *config,
                                              gint           value);
 
+gint           tracker_config_get_max_bytes (TrackerConfig *config);
+void           tracker_config_set_max_bytes (TrackerConfig *config,
+                                             gint           value);
+
 G_END_DECLS
 
 #endif /* __TRACKER_EXTRACT_CONFIG_H__ */
diff --git a/src/tracker-extract/tracker-extract-html.c b/src/tracker-extract/tracker-extract-html.c
index 6583cdf..a59b864 100644
--- a/src/tracker-extract/tracker-extract-html.c
+++ b/src/tracker-extract/tracker-extract-html.c
@@ -41,7 +41,7 @@ typedef struct {
 	const gchar *uri;
 	guint in_body : 1;
 	GString *plain_text;
-	guint n_words;
+	guint n_bytes_remaining;
 } parser_data;
 
 static void extract_html (const gchar          *filename,
@@ -212,24 +212,28 @@ parser_characters (void          *data,
 	case READ_IGNORE:
 		break;
 	default:
-		if (pd->in_body && pd->n_words > 0) {
-			gchar *text;
-			guint n_words;
-
-			text = tracker_text_normalize (ch, pd->n_words, &n_words);
-
-			if (text && *text) {
-				g_string_append (pd->plain_text, text);
+		if (pd->in_body && pd->n_bytes_remaining > 0) {
+			gsize text_len;
+
+			text_len = strlen (ch);
+
+			if (tracker_text_validate_utf8 (ch,
+			                                (pd->n_bytes_remaining < text_len ?
+			                                 pd->n_bytes_remaining :
+			                                 text_len),
+			                                &pd->plain_text)) {
+				/* In the case of HTML, each string arriving this
+				 * callback is independent to any other previous
+				 * string, so need to add an explicit whitespace
+				 * separator */
 				g_string_append_c (pd->plain_text, ' ');
-
-				if (n_words > pd->n_words) {
-					pd->n_words = 0;
-				} else {
-					pd->n_words -= n_words;
-				}
 			}
 
-			g_free (text);
+			if (pd->n_bytes_remaining > text_len) {
+				pd->n_bytes_remaining -= text_len;
+			} else {
+				pd->n_bytes_remaining = 0;
+			}
 		}
 		break;
 	}
@@ -240,7 +244,7 @@ extract_html (const gchar          *uri,
               TrackerSparqlBuilder *preupdate,
               TrackerSparqlBuilder *metadata)
 {
-	TrackerFTSConfig *fts_config;
+	TrackerConfig *config;
 	htmlDocPtr doc;
 	parser_data pd;
 	gchar *filename;
@@ -288,8 +292,8 @@ extract_html (const gchar          *uri,
 	pd.uri = uri;
 	pd.plain_text = g_string_new (NULL);
 
-	fts_config = tracker_main_get_fts_config ();
-	pd.n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	config = tracker_main_get_config ();
+	pd.n_bytes_remaining = tracker_config_get_max_bytes (config);
 
 	filename = g_filename_from_uri (uri, NULL, NULL);
 	doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index c9c2de9..d47a1c3 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -394,8 +394,6 @@ read_32bit (const guint8 *buffer)
  * @param chunk_size Number of valid bytes in the input buffer
  * @param is_ansi If %TRUE, input text should be encoded in CP1252, and
  *  in UTF-16 otherwise.
- * @param p_words_remaining Pointer to #gint specifying how many words
- *  should still be considered.
  * @param p_words_remaining Pointer to #gsize specifying how many bytes
  *  should still be considered.
  * @param p_content Pointer to a #GString where the output normalized words
@@ -405,7 +403,6 @@ static void
 msoffice_convert_and_normalize_chunk (guint8    *buffer,
                                       gsize      chunk_size,
                                       gboolean   is_ansi,
-                                      gint      *p_words_remaining,
                                       gsize     *p_bytes_remaining,
                                       GString  **p_content)
 {
@@ -415,7 +412,6 @@ msoffice_convert_and_normalize_chunk (guint8    *buffer,
 
 	g_return_if_fail (buffer != NULL);
 	g_return_if_fail (chunk_size > 0);
-	g_return_if_fail (p_words_remaining != NULL);
 	g_return_if_fail (p_bytes_remaining != NULL);
 	g_return_if_fail (p_content != NULL);
 
@@ -432,42 +428,20 @@ msoffice_convert_and_normalize_chunk (guint8    *buffer,
 	                            &error);
 
 	if (converted_text) {
-		gchar *normalized_chunk;
-		guint n_words_normalized;
-
-		/* Get normalized chunk */
-		normalized_chunk = tracker_text_normalize (converted_text,
-		                                           *p_words_remaining,
-		                                           &n_words_normalized);
+		gsize len_to_validate;
 
-		/* Update number of words remaining.
-		 * Note that n_words_normalized should always be less or
-		 * equal than n_words_remaining */
-		*p_words_remaining = (n_words_normalized <= *p_words_remaining ?
-		                      *p_words_remaining - n_words_normalized : 0);
+		len_to_validate = MIN (*p_bytes_remaining, n_bytes_utf8);
 
-		/* Update accumulated UTF-8 bytes read */
-		*p_bytes_remaining = (n_bytes_utf8 <= *p_bytes_remaining ?
-		                      *p_bytes_remaining - n_bytes_utf8 : 0);
-
-		/* g_debug ("Words normalized: %u (remaining: %u); " */
-		/*          "Bytes read (UTF-8): %" G_GSIZE_FORMAT " bytes " */
-		/*          "(remaining: %" G_GSIZE_FORMAT ")", */
-		/*          n_words_normalized, *p_words_remaining, */
-		/*          n_bytes_utf8, *p_bytes_remaining); */
-
-		/* Append normalized chunk to the string to be returned */
-		if (*p_content) {
-			g_string_append (*p_content, normalized_chunk);
-		} else {
-			*p_content = g_string_new (normalized_chunk);
+		if (tracker_text_validate_utf8 (converted_text,
+		                                len_to_validate,
+		                                p_content)) {
+			/* A whitespace is added to separate next strings appended */
+			g_string_append_c (*p_content, ' ');
 		}
 
-		/* A whitespace is added to separate next strings appended */
-		g_string_append (*p_content, " ");
+		/* Update accumulated UTF-8 bytes read (converted_text is released below) */
+		*p_bytes_remaining -= len_to_validate;
-
 		g_free (converted_text);
-		g_free (normalized_chunk);
 	} else {
 		g_warning ("Couldn't convert %" G_GSIZE_FORMAT " bytes from %s to UTF-8: %s",
 		           chunk_size,
@@ -659,7 +633,6 @@ ppt_seek_header (GsfInput *stream,
 
 static gchar *
 extract_powerpoint_content (GsfInfile *infile,
-                            gint       max_words,
                             gsize      max_bytes,
                             gboolean  *is_encrypted)
 {
@@ -733,18 +706,16 @@ extract_powerpoint_content (GsfInfile *infile,
 	                     SLIDELISTWITHTEXT_RECORD_TYPE,
 	                     SLIDELISTWITHTEXT_RECORD_TYPE,
 	                     FALSE)) {
-		gint words_remaining = max_words;
 		gsize bytes_remaining = max_bytes;
 		guint8 *buffer = NULL;
 		gsize buffer_size = 0;
 
 		/*
 		 * Read while we have either TextBytesAtom or
-		 * TextCharsAtom and we have read less than max_words
-		 * amount of words and less than max_bytes (in UTF-8)
+		 * TextCharsAtom and we have read less than max_bytes
+		 * (in UTF-8)
 		 */
-		while (words_remaining > 0 &&
-		       bytes_remaining > 0 &&
+		while (bytes_remaining > 0 &&
 		       ppt_seek_header (stream,
 		                        TEXTBYTESATOM_RECORD_TYPE,
 		                        TEXTCHARSATOM_RECORD_TYPE,
@@ -763,7 +734,6 @@ extract_powerpoint_content (GsfInfile *infile,
 				msoffice_convert_and_normalize_chunk (buffer,
 				                                      read_size,
 				                                      FALSE, /* Always UTF-16 */
-				                                      &words_remaining,
 				                                      &bytes_remaining,
 				                                      &all_texts);
 			}
@@ -778,45 +748,6 @@ extract_powerpoint_content (GsfInfile *infile,
 }
 
 /**
- * @brief get maximum number of words to index
- * @return maximum number of words to index
- */
-static gint
-fts_max_words (void)
-{
-	TrackerFTSConfig *fts_config;
-
-	fts_config = tracker_main_get_fts_config ();
-	return tracker_fts_config_get_max_words_to_index (fts_config);
-}
-
-/**
- * @brief get min word length
- * @return min_word_length
- */
-static gint
-fts_min_word_length (void)
-{
-	TrackerFTSConfig *fts_config;
-
-	fts_config = tracker_main_get_fts_config ();
-	return tracker_fts_config_get_min_word_length (fts_config);
-}
-
-/**
- * @brief get max word length
- * @return max_word_length
- */
-static gint
-fts_max_word_length (void)
-{
-	TrackerFTSConfig *fts_config;
-
-	fts_config = tracker_main_get_fts_config ();
-	return tracker_fts_config_get_max_word_length (fts_config);
-}
-
-/**
  * @brief Open specified uri for reading and initialize gsf
  * @param uri URI of the file to open
  * @return GsfInFile of the opened file or NULL if failed to open file
@@ -847,7 +778,6 @@ open_uri (const gchar *uri)
  */
 static gchar *
 extract_msword_content (GsfInfile *infile,
-                        gint       n_words,
                         gsize      n_bytes,
                         gboolean  *is_encrypted)
 {
@@ -863,7 +793,6 @@ extract_msword_content (GsfInfile *infile,
 	GString *content = NULL;
 	guint8 *text_buffer = NULL;
 	gint text_buffer_size = 0;
-	guint n_words_remaining;
 	gsize n_bytes_remaining;
 
 	document_stream = gsf_infile_child_by_name (infile, "WordDocument");
@@ -939,14 +868,11 @@ extract_msword_content (GsfInfile *infile,
 	/* Iterate over pieces...
 	 *   Loop is halted whenever one of this conditions is met:
 	 *     a) Max bytes to be read reached
-	 *     b) Already read up to the max number of words configured
-	 *     c) No more pieces to read
+	 *     b) No more pieces to read
 	 */
 	i = 0;
-	n_words_remaining = n_words;
 	n_bytes_remaining = n_bytes;
-	while (n_words_remaining > 0 &&
-	       n_bytes_remaining > 0 &&
+	while (n_bytes_remaining > 0 &&
 	       i < piece_count) {
 		guint8 *piece_descriptor;
 		gint piece_start;
@@ -1009,7 +935,6 @@ extract_msword_content (GsfInfile *infile,
 			msoffice_convert_and_normalize_chunk (text_buffer,
 			                                      piece_size,
 			                                      is_ansi,
-			                                      &n_words_remaining,
 			                                      &n_bytes_remaining,
 			                                      &content);
 		}
@@ -1295,7 +1220,6 @@ read_excel_string (GsfInput *stream,
 static void
 xls_get_extended_record_string (GsfInput  *stream,
                                 GArray    *list,
-                                guint     *p_words_remaining,
                                 gsize     *p_bytes_remaining,
                                 GString  **p_content)
 {
@@ -1337,12 +1261,10 @@ xls_get_extended_record_string (GsfInput  *stream,
 	/* Iterate over chunks...
 	 *   Loop is halted whenever one of this conditions is met:
 	 *     a) Max bytes to be read reached
-	 *     b) Already read up to the max number of words configured
-	 *     c) No more chunks to read
+	 *     b) No more chunks to read
 	 */
 	i = 0;
-	while (*p_words_remaining > 0 &&
-	       *p_bytes_remaining > 0 &&
+	while (*p_bytes_remaining > 0 &&
 	       i < cst_unique) {
 		guint16 cch;
 		guint16 c_run;
@@ -1398,7 +1320,6 @@ xls_get_extended_record_string (GsfInput  *stream,
 		msoffice_convert_and_normalize_chunk (buffer,
 		                                      chunk_size,
 		                                      !is_high_byte,
-		                                      p_words_remaining,
 		                                      p_bytes_remaining,
 		                                      p_content);
 
@@ -1475,7 +1396,6 @@ xls_get_extended_record_string (GsfInput  *stream,
  */
 static gchar*
 extract_excel_content (GsfInfile *infile,
-                       gint       n_words,
                        gsize      n_bytes,
                        gboolean  *is_encrypted)
 {
@@ -1483,7 +1403,6 @@ extract_excel_content (GsfInfile *infile,
 	GString *content = NULL;
 	GsfInput *stream;
 	guint saved_offset;
-	guint n_words_remaining = n_words;
 	gsize n_bytes_remaining = n_bytes;
 
 	stream = gsf_infile_child_by_name (infile, "Workbook");
@@ -1493,8 +1412,7 @@ extract_excel_content (GsfInfile *infile,
 	}
 
 	/* Read until we reach eof or any of our limits reached */
-	while (n_words_remaining > 0 &&
-	       n_bytes_remaining > 0 &&
+	while (n_bytes_remaining > 0 &&
 	       !gsf_input_eof (stream)) {
 		guint8 tmp_buffer[4] = { 0 };
 
@@ -1577,7 +1495,6 @@ extract_excel_content (GsfInfile *infile,
 			/* Read extended string */
 			xls_get_extended_record_string (stream,
 			                                list,
-			                                &n_words_remaining,
 			                                &n_bytes_remaining,
 			                                &content);
 
@@ -1596,8 +1513,7 @@ extract_excel_content (GsfInfile *infile,
 
 	g_object_unref (stream);
 
-	g_debug ("Words normalized: %u, Bytes: %" G_GSIZE_FORMAT,
-	         n_words - n_words_remaining,
+	g_debug ("Bytes extracted: %" G_GSIZE_FORMAT,
 	         n_bytes - n_bytes_remaining);
 
 	return content ? g_string_free (content, FALSE) : NULL;
@@ -1696,13 +1612,13 @@ extract_msoffice (const gchar          *uri,
                   TrackerSparqlBuilder *preupdate,
                   TrackerSparqlBuilder *metadata)
 {
+	TrackerConfig *config;
 	GFile *file = NULL;
 	GFileInfo *file_info = NULL;
 	const gchar *mime_used;
 	GsfInfile *infile = NULL;
 	gchar *content = NULL;
 	gboolean is_encrypted = FALSE;
-	gint max_words;
 	gsize max_bytes;
 
 	file = g_file_new_for_uri (uri);
@@ -1738,23 +1654,19 @@ extract_msoffice (const gchar          *uri,
 
 	mime_used = g_file_info_get_content_type (file_info);
 
-	/* Set max words to read from content */
-	max_words = fts_max_words ();
-
-	/* Set max bytes to read from content.
-	 * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
-	 *  points are really pretty rare */
-	max_bytes = 3 * max_words * fts_max_word_length ();
+	/* Set max bytes to read from content */
+	config = tracker_main_get_config ();
+	max_bytes = tracker_config_get_max_bytes (config);
 
 	if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
 		/* Word file */
-		content = extract_msword_content (infile, max_words, max_bytes, &is_encrypted);
+		content = extract_msword_content (infile, max_bytes, &is_encrypted);
 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
 		/* PowerPoint file */
-		content = extract_powerpoint_content (infile, max_words, max_bytes, &is_encrypted);
+		content = extract_powerpoint_content (infile, max_bytes, &is_encrypted);
 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
 		/* Excel File */
-		content = extract_excel_content (infile, max_words, max_bytes, &is_encrypted);
+		content = extract_excel_content (infile, max_bytes, &is_encrypted);
 	} else {
 		g_message ("Mime type was not recognised:'%s'", mime_used);
 	}
@@ -1943,20 +1855,21 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 	MsOfficeXMLParserInfo *info = user_data;
 	static gboolean found = FALSE;
 	static gboolean added = FALSE;
-	guint min_word_length = fts_min_word_length();
 
 	switch (info->tag_type) {
 	case MS_OFFICE_XML_TAG_WORD_TEXT:
 		if (info->style_element_present) {
 			if (atoi (text) == 0) {
-				g_string_append_printf (info->content, "%s ", text);
+				tracker_text_validate_utf8 (text, -1, &info->content);
+				g_string_append_c (info->content, ' ');
 			}
 		}
 
 		if (info->preserve_attribute_present) {
 			gchar *keywords = g_strdup (text);
-			if (found && (strlen (keywords) >= min_word_length)) {
-				g_string_append_printf (info->content, "%s ", text);
+			if (found) {
+				tracker_text_validate_utf8 (text, -1, &info->content);
+				g_string_append_c (info->content, ' ');
 				found = FALSE;
 			} else {
 				gchar *lasts;
@@ -1979,14 +1892,14 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 		break;
 
 	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
-		if (strlen (text) > min_word_length) {
-			g_string_append_printf (info->content, "%s ", text);
-		}
+		tracker_text_validate_utf8 (text, -1, &info->content);
+		g_string_append_c (info->content, ' ');
 		break;
 
 	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
-		if ((atoi (text) == 0) && (strlen (text) > min_word_length))  {
-			g_string_append_printf (info->content, "%s ", text);
+		if (atoi (text) == 0)  {
+			tracker_text_validate_utf8 (text, -1, &info->content);
+			g_string_append_c (info->content, ' ');
 		}
 		break;
 
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index e2f482c..573e0db 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -74,7 +74,6 @@ static TrackerExtractData extract_data[] = {
 
 static gchar *
 extract_oasis_content (const gchar *uri,
-                       guint        n_words,
                        gsize        n_bytes)
 {
 	const gchar *argv[4];
@@ -93,9 +92,9 @@ extract_oasis_content (const gchar *uri,
 	argv[2] = path;
 	argv[3] = NULL;
 
-	g_debug ("Executing command:'%s %s %s' (max words: %u, "
-	         "max_bytes: %" G_GSIZE_FORMAT ")",
-	         argv[0], argv[1], argv[2], n_words, n_bytes);
+	g_debug ("Executing command:'%s %s %s' "
+	         "(max_bytes: %" G_GSIZE_FORMAT ")",
+	         argv[0], argv[1], argv[2], n_bytes);
 
 	/* Fork & spawn */
 	if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
@@ -122,50 +121,38 @@ extract_oasis_content (const gchar *uri,
 	/* Start buffered reading... */
 	else {
 		unsigned char buf[ODT_BUFFER_SIZE];
-		size_t r, accum;
-		guint n_words_remaining = n_words;
-		GString *normalized;
+		size_t r, bytes_remaining;
+		GString *validated = NULL;
 
-		accum = 0;
-		normalized = g_string_new ("");
+		bytes_remaining = n_bytes;
 
 		/* Reading in chunks of ODT_BUFFER_SIZE -1 (8192)
 		 *   Loop is halted whenever one of this conditions is met:
 		 *     a) Read bytes reached the maximum allowed (n_bytes)
-		 *     b) Already read up to the max number of words configured
-		 *     c) No more bytes to read
+		 *     b) No more bytes to read
 		 */
-		while ((accum <= n_bytes) &&
-		       (n_words_remaining > 0) &&
+		while ((bytes_remaining > 0) &&
 		       (r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) {
-			gchar *normalized_chunk;
-			guint n_words_normalized;
-
-			/* Always make sure that the read string will be
-			 * NIL-terminated  */
-			buf[r] = '\0';
-			/* Get normalized chunk */
-			normalized_chunk = tracker_text_normalize (buf,
-			                                           n_words_remaining,
-			                                           &n_words_normalized);
-			/* Update number of words remaining.
-			 * Note that n_words_normalized should always be less or
-			 * equal than n_words_remaining */
-			n_words_remaining = (n_words_normalized <= n_words_remaining ?
-			                     n_words_remaining - n_words_normalized : 0);
-			/* Update accumulated */
-			accum += r;
-
-			/* Add normalized chunk to the whole normalized string */
-			g_string_append (normalized, normalized_chunk);
-			g_free (normalized_chunk);
+			gsize len_to_validate;
+
+			len_to_validate = MIN (bytes_remaining, r);
+
+			tracker_text_validate_utf8 (buf,
+			                            len_to_validate,
+			                            &validated);
+
+			/* Note that in this case we shouldn't add a whitespace
+			 * separator between chunks read */
+
+			/* Update remaining */
+			bytes_remaining -= len_to_validate;
 		}
 
 		/* fclose() the stream, no need to close() the original FD */
 		fclose (fz);
 
 		/* Set final normalized contents to return */
-		text = g_string_free (normalized, FALSE);
+		text = validated ? g_string_free (validated, FALSE) : NULL;
 	}
 
 	g_free (path);
@@ -179,9 +166,7 @@ extract_oasis (const gchar          *uri,
                TrackerSparqlBuilder *metadata)
 {
 	gchar *content;
-	TrackerFTSConfig *fts_config;
-	guint n_words;
-	gsize n_bytes;
+	TrackerConfig *config;
 	ODTParseInfo info;
 	GMarkupParseContext *context;
 	GMarkupParser parser = {
@@ -193,7 +178,7 @@ extract_oasis (const gchar          *uri,
 	};
 
 	/* Setup conf */
-	fts_config = tracker_main_get_fts_config ();
+	config = tracker_main_get_config ();
 
 	g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
 
@@ -217,16 +202,9 @@ extract_oasis (const gchar          *uri,
 
 	/* Next, parse contents */
 
-	/* Set max words to read from content */
-	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
-
-	/* Set max bytes to read from content.
-	 * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
-	 *  points are really pretty rare */
-	n_bytes = 3 * n_words * tracker_fts_config_get_max_word_length(fts_config);
-
 	/* Extract content with the given limitations */
-	content = extract_oasis_content (uri, n_words, n_bytes);
+	content = extract_oasis_content (uri,
+	                                 tracker_config_get_max_bytes (config));
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
 		tracker_sparql_builder_object_unvalidated (metadata, content);
diff --git a/src/tracker-extract/tracker-extract-pdf.cpp b/src/tracker-extract/tracker-extract-pdf.cpp
index b7a35be..02d3441 100644
--- a/src/tracker-extract/tracker-extract-pdf.cpp
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -314,26 +314,27 @@ page_get_size (Page    *page,
 
 static gchar *
 extract_content (PDFDoc *document,
-                 guint   n_words)
+                 gsize   n_bytes)
 {
 	Page *page;
 	Catalog *catalog;
 	GString *string;
-	gint n_pages, i, words;
-	gchar *t;
+	gint n_pages, i;
+	gsize n_bytes_remaining;
 
 	n_pages = document->getNumPages();
 	string = g_string_new ("");
-	words = i = 0;
+	i = 0;
+	n_bytes_remaining = n_bytes;
 	catalog = document->getCatalog();
 
-	while (i < n_pages && words < n_words) {
-		guint normalized_words = 0;
+	while (i < n_pages && n_bytes_remaining > 0) {
 		Gfx *gfx;
 		GooString *sel_text;
 		TextOutputDev *text_dev;
 		PDFRectangle pdf_selection;
 		gdouble height = 0, width = 0;
+		gsize len_to_validate;
 
 		page = catalog->getPage (i + 1);
 		i++;
@@ -360,12 +361,17 @@ extract_content (PDFDoc *document,
 
 		sel_text = text_dev->getSelectionText (&pdf_selection, selectionStyleWord);
 
-		t = tracker_text_normalize (sel_text->getCString (), n_words - words, &normalized_words);
+		len_to_validate = MIN (n_bytes_remaining, strlen (sel_text->getCString ()));
 
-		words += normalized_words;
-		g_string_append (string, t);
+		if (tracker_text_validate_utf8 (sel_text->getCString (),
+		                                len_to_validate,
+		                                &string)) {
+			/* A whitespace is added to separate next strings appended */
+			g_string_append_c (string, ' ');
+		}
 
-		g_free (t);
+		/* Update accumulated UTF-8 bytes read */
+		n_bytes_remaining -= len_to_validate;
 
 		delete gfx;
 		delete text_dev;
@@ -494,13 +500,13 @@ extract_pdf (const gchar          *uri,
              TrackerSparqlBuilder *preupdate,
              TrackerSparqlBuilder *metadata)
 {
-	TrackerFTSConfig *fts_config;
+	TrackerConfig *config;
 	TrackerXmpData *xd = NULL;
 	PDFData pd = { 0 }; /* actual data */
 	PDFData md = { 0 }; /* for merging */
 	PDFDoc *document;
 	gchar *content;
-	guint n_words;
+	gsize n_bytes;
 	Object obj;
 	Catalog *catalog;
 
@@ -783,9 +789,9 @@ extract_pdf (const gchar          *uri,
 	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
 	tracker_sparql_builder_object_int64 (metadata, document->getNumPages());
 
-	fts_config = tracker_main_get_fts_config ();
-	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
-	content = extract_content (document, n_words);
+	config = tracker_main_get_config ();
+	n_bytes = tracker_config_get_max_bytes (config);
+	content = extract_content (document, n_bytes);
 
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
diff --git a/src/tracker-extract/tracker-main.c b/src/tracker-extract/tracker-main.c
index b31031d..3957862 100644
--- a/src/tracker-extract/tracker-main.c
+++ b/src/tracker-extract/tracker-main.c
@@ -76,7 +76,7 @@ static gboolean force_internal_extractors;
 static gchar *force_module;
 static gboolean version;
 
-static TrackerFTSConfig *fts_config;
+static TrackerConfig *config;
 
 static GOptionEntry entries[] = {
 	{ "verbosity", 'v', 0,
@@ -264,14 +264,10 @@ log_handler (const gchar    *domain,
 	}
 }
 
-TrackerFTSConfig *
-tracker_main_get_fts_config (void)
+TrackerConfig *
+tracker_main_get_config (void)
 {
-	if (G_UNLIKELY (!fts_config)) {
-		fts_config = tracker_fts_config_new ();
-	}
-
-	return fts_config;
+	return config;
 }
 
 
@@ -336,7 +332,6 @@ main (int argc, char *argv[])
 {
 	GOptionContext *context;
 	GError         *error = NULL;
-	TrackerConfig  *config;
 	TrackerExtract *object;
 	gchar          *log_filename = NULL;
 
diff --git a/src/tracker-extract/tracker-main.h b/src/tracker-extract/tracker-main.h
index d8b150f..318699e 100644
--- a/src/tracker-extract/tracker-main.h
+++ b/src/tracker-extract/tracker-main.h
@@ -21,7 +21,7 @@
 #ifndef __TRACKER_MAIN_H__
 #define __TRACKER_MAIN_H__
 
-#include "tracker-fts-config.h"
+#include "tracker-config.h"
 
 G_BEGIN_DECLS
 
@@ -29,7 +29,9 @@ G_BEGIN_DECLS
  * get more work to do.
  */
 void              tracker_main_quit_timeout_reset (void);
-TrackerFTSConfig *tracker_main_get_fts_config     (void);
+
+/* Enables getting the config object from extractors */
+TrackerConfig    *tracker_main_get_config         (void);
 
 G_END_DECLS
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]