[tracker/extractor-remove-word-counting-review] Fixes GB#616845 - Avoid word counting in the extractors
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/extractor-remove-word-counting-review] Fixes GB#616845 - Avoid word counting in the extractors
- Date: Tue, 11 May 2010 13:11:06 +0000 (UTC)
commit 40cea603096bf12fea95ffd7a4c2c4aa9ac43cf9
Author: Aleksander Morgado <aleksander lanedo com>
Date: Mon May 10 20:07:09 2010 +0200
Fixes GB#616845 - Avoid word counting in the extractors
* New max_bytes parameter added to tracker-extract config file. Extractors will
read up to that configured limit.
* Removed the need of reading the FTS config file from tracker-extract.
* Word counting not done now in the extractors.
Note: As a side-effect, the last word extracted when max_bytes is reached may
get cut, so that only its first chunk is extracted.
src/libtracker-extract/tracker-utils.c | 60 +++-
src/libtracker-extract/tracker-utils.h | 9 +-
src/tracker-extract/Makefile.am | 2 -
src/tracker-extract/tracker-config.c | 58 +++-
src/tracker-extract/tracker-config.h | 4 +
src/tracker-extract/tracker-extract-html.c | 42 ++-
src/tracker-extract/tracker-extract-msoffice.c | 157 ++-------
src/tracker-extract/tracker-extract-oasis.c | 74 ++---
src/tracker-extract/tracker-extract-pdf.cpp | 36 ++-
src/tracker-extract/tracker-fts-config.c | 429 ------------------------
src/tracker-extract/tracker-fts-config.h | 64 ----
src/tracker-extract/tracker-main.c | 13 +-
src/tracker-extract/tracker-main.h | 6 +-
13 files changed, 233 insertions(+), 721 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
index a2cc6ab..f9f1084 100644
--- a/src/libtracker-extract/tracker-utils.c
+++ b/src/libtracker-extract/tracker-utils.c
@@ -183,7 +183,7 @@ tracker_coalesce (gint n_values,
* Since: 0.9
**/
gchar *
-tracker_merge_const (const gchar *delimiter,
+tracker_merge_const (const gchar *delimiter,
gint n_values,
...)
{
@@ -239,7 +239,7 @@ tracker_merge_const (const gchar *delimiter,
* Deprecated: 1.0: Use tracker_merge_const() instead.
**/
gchar *
-tracker_merge (const gchar *delimiter,
+tracker_merge (const gchar *delimiter,
gint n_values,
...)
{
@@ -304,6 +304,8 @@ tracker_merge (const gchar *delimiter,
* be freed with g_free() when finished with, otherwise %NULL.
*
* Since: 0.8
+ *
+ * Deprecated: 1.0: Use tracker_text_validate_utf8() instead.
**/
gchar *
tracker_text_normalize (const gchar *text,
@@ -345,10 +347,10 @@ tracker_text_normalize (const gchar *text,
}
if (n_words) {
- if (!in_break) {
- /* Count the last word */
- words += 1;
- }
+ if (!in_break) {
+ /* Count the last word */
+ words += 1;
+ }
*n_words = words;
}
@@ -356,6 +358,52 @@ tracker_text_normalize (const gchar *text,
}
/**
+ * tracker_text_validate_utf8:
+ * @text: the text to validate
+ * @text_len: length of @text in bytes, or -1 if NUL-terminated
+ * @str: the string where to place the validated characters; *@str may be
+ *       %NULL, in which case a new #GString is created
+ *
+ * This function checks @text for UTF-8 validity using g_utf8_validate(),
+ * and appends the leading run of valid bytes to @str. Validation stops at
+ * the first invalid sequence; nothing after it is appended.
+ *
+ * Returns: %TRUE if valid UTF-8 in @text was appended to @str
+ *
+ * Since: 0.9
+ **/
+gboolean
+tracker_text_validate_utf8 (const gchar  *text,
+                            gsize         text_len,
+                            GString     **str)
+{
+	gsize len_to_validate;
+
+	g_return_val_if_fail (text, FALSE);
+	g_return_val_if_fail (str, FALSE);
+
+	/* @text_len is unsigned, so the documented -1 arrives as (gsize) -1
+	 * and has to be matched explicitly: a ">= 0" test is always true. */
+	len_to_validate = (text_len == (gsize) -1) ? strlen (text) : text_len;
+
+	if (len_to_validate > 0) {
+		const gchar *end = text;
+
+		/* Validate string, getting the pointer to first non-valid character
+		 * (if any) or to the end of the string. */
+		g_utf8_validate (text, (gssize) len_to_validate, &end);
+		if (end > text) {
+			/* Create string to output if not already as input */
+			if (*str == NULL) {
+				*str = g_string_new_len (text, end - text);
+			} else {
+				*str = g_string_append_len (*str, text, end - text);
+			}
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/**
* tracker_date_format_to_iso8601:
* @date_string: the date in a string pointer
* @format: the format of the @date_string
diff --git a/src/libtracker-extract/tracker-utils.h b/src/libtracker-extract/tracker-utils.h
index db2da92..6003d36 100644
--- a/src/libtracker-extract/tracker-utils.h
+++ b/src/libtracker-extract/tracker-utils.h
@@ -34,11 +34,14 @@ gchar* tracker_coalesce (gint n_values,
gchar* tracker_merge (const gchar *delimiter,
gint n_values,
...) G_GNUC_DEPRECATED;
-#endif /* TRACKER_DISABLE_DEPRECATED */
-
gchar* tracker_text_normalize (const gchar *text,
guint max_words,
- guint *n_words);
+ guint *n_words) G_GNUC_DEPRECATED;
+#endif /* TRACKER_DISABLE_DEPRECATED */
+
+gboolean tracker_text_validate_utf8 (const gchar *text,
+ gsize text_len,
+ GString **str);
gchar* tracker_date_guess (const gchar *date_string);
gchar* tracker_date_format_to_iso8601 (const gchar *date_string,
const gchar *format);
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 37454ba..d9b6c39 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -304,8 +304,6 @@ tracker_extract_SOURCES = \
tracker-dbus.h \
tracker-extract.c \
tracker-extract.h \
- tracker-fts-config.c \
- tracker-fts-config.h \
tracker-main.c \
tracker-main.h \
tracker-albumart-generic.h
diff --git a/src/tracker-extract/tracker-config.c b/src/tracker-extract/tracker-config.c
index 0d09a2b..c5edb1d 100644
--- a/src/tracker-extract/tracker-config.c
+++ b/src/tracker-extract/tracker-config.c
@@ -30,10 +30,12 @@
/* Default values */
#define DEFAULT_VERBOSITY 0
+#define DEFAULT_MAX_BYTES 1048576 /* 1Mbyte */
typedef struct {
/* General */
gint verbosity;
+ gint max_bytes;
} TrackerConfigPrivate;
typedef struct {
@@ -63,11 +65,13 @@ enum {
PROP_0,
/* General */
- PROP_VERBOSITY
+ PROP_VERBOSITY,
+ PROP_MAX_BYTES
};
static ObjectToKeyFile conversions[] = {
{ G_TYPE_INT, "verbosity", GROUP_GENERAL, "Verbosity" },
+ { G_TYPE_INT, "max_bytes", GROUP_GENERAL, "Max Bytes" },
};
G_DEFINE_TYPE (TrackerConfig, tracker_config, TRACKER_TYPE_CONFIG_FILE);
@@ -93,6 +97,16 @@ tracker_config_class_init (TrackerConfigClass *klass)
DEFAULT_VERBOSITY,
G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+	/* The id passed here is what config_set_property() and
+	 * config_get_property() switch on, so "max_bytes" must be
+	 * registered under PROP_MAX_BYTES to reach its own handlers. */
+	g_object_class_install_property (object_class,
+	                                 PROP_MAX_BYTES,
+	                                 g_param_spec_int ("max_bytes",
+	                                                   "Max Bytes",
+	                                                   " Maximum number of UTF-8 bytes to extract [0,G_MAXINT]",
+	                                                   0,
+	                                                   G_MAXINT,
+	                                                   DEFAULT_MAX_BYTES,
+	                                                   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+
g_type_class_add_private (object_class, sizeof (TrackerConfigPrivate));
}
@@ -105,7 +119,7 @@ static void
config_set_property (GObject *object,
guint param_id,
const GValue *value,
- GParamSpec *pspec)
+ GParamSpec *pspec)
{
switch (param_id) {
/* General */
@@ -114,6 +128,11 @@ config_set_property (GObject *object,
g_value_get_int (value));
break;
+ case PROP_MAX_BYTES:
+ tracker_config_set_max_bytes (TRACKER_CONFIG (object),
+ g_value_get_int (value));
+ break;
+
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
break;
@@ -136,6 +155,10 @@ config_get_property (GObject *object,
g_value_set_int (value, priv->verbosity);
break;
+ case PROP_MAX_BYTES:
+ g_value_set_int (value, priv->max_bytes);
+ break;
+
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
break;
@@ -317,3 +340,34 @@ tracker_config_set_verbosity (TrackerConfig *config,
priv->verbosity = value;
g_object_notify (G_OBJECT (config), "verbosity");
}
+
+
+/* Returns the configured maximum number of UTF-8 bytes to extract, or
+ * DEFAULT_MAX_BYTES when @config is not a TrackerConfig. */
+gint
+tracker_config_get_max_bytes (TrackerConfig *config)
+{
+	TrackerConfigPrivate *private_data;
+
+	g_return_val_if_fail (TRACKER_IS_CONFIG (config), DEFAULT_MAX_BYTES);
+
+	private_data = TRACKER_CONFIG_GET_PRIVATE (config);
+	return private_data->max_bytes;
+}
+
+/* Stores @value as the new "max_bytes" setting and emits a property
+ * notification; a value rejected by the keyfile validator is ignored. */
+void
+tracker_config_set_max_bytes (TrackerConfig *config,
+                              gint           value)
+{
+	TrackerConfigPrivate *private_data;
+
+	g_return_if_fail (TRACKER_IS_CONFIG (config));
+
+	if (tracker_keyfile_object_validate_int (config, "max_bytes", value)) {
+		private_data = TRACKER_CONFIG_GET_PRIVATE (config);
+		private_data->max_bytes = value;
+
+		g_object_notify (G_OBJECT (config), "max_bytes");
+	}
+}
diff --git a/src/tracker-extract/tracker-config.h b/src/tracker-extract/tracker-config.h
index cdede72..491a811 100644
--- a/src/tracker-extract/tracker-config.h
+++ b/src/tracker-extract/tracker-config.h
@@ -53,6 +53,10 @@ gint tracker_config_get_verbosity (TrackerConfig *config);
void tracker_config_set_verbosity (TrackerConfig *config,
gint value);
+gint tracker_config_get_max_bytes (TrackerConfig *config);
+void tracker_config_set_max_bytes (TrackerConfig *config,
+ gint value);
+
G_END_DECLS
#endif /* __TRACKER_EXTRACT_CONFIG_H__ */
diff --git a/src/tracker-extract/tracker-extract-html.c b/src/tracker-extract/tracker-extract-html.c
index 6583cdf..a59b864 100644
--- a/src/tracker-extract/tracker-extract-html.c
+++ b/src/tracker-extract/tracker-extract-html.c
@@ -41,7 +41,7 @@ typedef struct {
const gchar *uri;
guint in_body : 1;
GString *plain_text;
- guint n_words;
+ guint n_bytes_remaining;
} parser_data;
static void extract_html (const gchar *filename,
@@ -212,24 +212,28 @@ parser_characters (void *data,
case READ_IGNORE:
break;
default:
- if (pd->in_body && pd->n_words > 0) {
- gchar *text;
- guint n_words;
-
- text = tracker_text_normalize (ch, pd->n_words, &n_words);
-
- if (text && *text) {
- g_string_append (pd->plain_text, text);
+ if (pd->in_body && pd->n_bytes_remaining > 0) {
+ gsize text_len;
+
+ text_len = strlen (ch);
+
+ if (tracker_text_validate_utf8 (ch,
+ (pd->n_bytes_remaining < text_len ?
+ pd->n_bytes_remaining :
+ text_len),
+ &pd->plain_text)) {
+ /* In the case of HTML, each string arriving this
+ * callback is independent to any other previous
+ * string, so need to add an explicit whitespace
+ * separator */
g_string_append_c (pd->plain_text, ' ');
-
- if (n_words > pd->n_words) {
- pd->n_words = 0;
- } else {
- pd->n_words -= n_words;
- }
}
- g_free (text);
+ if (pd->n_bytes_remaining > text_len) {
+ pd->n_bytes_remaining -= text_len;
+ } else {
+ pd->n_bytes_remaining = 0;
+ }
}
break;
}
@@ -240,7 +244,7 @@ extract_html (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
- TrackerFTSConfig *fts_config;
+ TrackerConfig *config;
htmlDocPtr doc;
parser_data pd;
gchar *filename;
@@ -288,8 +292,8 @@ extract_html (const gchar *uri,
pd.uri = uri;
pd.plain_text = g_string_new (NULL);
- fts_config = tracker_main_get_fts_config ();
- pd.n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+ config = tracker_main_get_config ();
+ pd.n_bytes_remaining = tracker_config_get_max_bytes (config);
filename = g_filename_from_uri (uri, NULL, NULL);
doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index c9c2de9..d47a1c3 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -394,8 +394,6 @@ read_32bit (const guint8 *buffer)
* @param chunk_size Number of valid bytes in the input buffer
* @param is_ansi If %TRUE, input text should be encoded in CP1252, and
* in UTF-16 otherwise.
- * @param p_words_remaining Pointer to #gint specifying how many words
- * should still be considered.
* @param p_words_remaining Pointer to #gsize specifying how many bytes
* should still be considered.
* @param p_content Pointer to a #GString where the output normalized words
@@ -405,7 +403,6 @@ static void
msoffice_convert_and_normalize_chunk (guint8 *buffer,
gsize chunk_size,
gboolean is_ansi,
- gint *p_words_remaining,
gsize *p_bytes_remaining,
GString **p_content)
{
@@ -415,7 +412,6 @@ msoffice_convert_and_normalize_chunk (guint8 *buffer,
g_return_if_fail (buffer != NULL);
g_return_if_fail (chunk_size > 0);
- g_return_if_fail (p_words_remaining != NULL);
g_return_if_fail (p_bytes_remaining != NULL);
g_return_if_fail (p_content != NULL);
@@ -432,42 +428,20 @@ msoffice_convert_and_normalize_chunk (guint8 *buffer,
&error);
if (converted_text) {
- gchar *normalized_chunk;
- guint n_words_normalized;
-
- /* Get normalized chunk */
- normalized_chunk = tracker_text_normalize (converted_text,
- *p_words_remaining,
- &n_words_normalized);
+ gsize len_to_validate;
- /* Update number of words remaining.
- * Note that n_words_normalized should always be less or
- * equal than n_words_remaining */
- *p_words_remaining = (n_words_normalized <= *p_words_remaining ?
- *p_words_remaining - n_words_normalized : 0);
+ len_to_validate = MIN (*p_bytes_remaining, n_bytes_utf8);
- /* Update accumulated UTF-8 bytes read */
- *p_bytes_remaining = (n_bytes_utf8 <= *p_bytes_remaining ?
- *p_bytes_remaining - n_bytes_utf8 : 0);
-
- /* g_debug ("Words normalized: %u (remaining: %u); " */
- /* "Bytes read (UTF-8): %" G_GSIZE_FORMAT " bytes " */
- /* "(remaining: %" G_GSIZE_FORMAT ")", */
- /* n_words_normalized, *p_words_remaining, */
- /* n_bytes_utf8, *p_bytes_remaining); */
-
- /* Append normalized chunk to the string to be returned */
- if (*p_content) {
- g_string_append (*p_content, normalized_chunk);
- } else {
- *p_content = g_string_new (normalized_chunk);
+ if (tracker_text_validate_utf8 (converted_text,
+ len_to_validate,
+ p_content)) {
+ /* A whitespace is added to separate next strings appended */
+ g_string_append_c (*p_content, ' ');
}
- /* A whitespace is added to separate next strings appended */
- g_string_append (*p_content, " ");
+ /* Update accumulated UTF-8 bytes read */
+ *p_bytes_remaining -= len_to_validate;
+		/* tracker_text_validate_utf8() copies the bytes it keeps, so the
+		 * converted buffer still has to be released here. */
		g_free (converted_text);
-		g_free (normalized_chunk);
} else {
g_warning ("Couldn't convert %" G_GSIZE_FORMAT " bytes from %s to UTF-8: %s",
chunk_size,
@@ -659,7 +633,6 @@ ppt_seek_header (GsfInput *stream,
static gchar *
extract_powerpoint_content (GsfInfile *infile,
- gint max_words,
gsize max_bytes,
gboolean *is_encrypted)
{
@@ -733,18 +706,16 @@ extract_powerpoint_content (GsfInfile *infile,
SLIDELISTWITHTEXT_RECORD_TYPE,
SLIDELISTWITHTEXT_RECORD_TYPE,
FALSE)) {
- gint words_remaining = max_words;
gsize bytes_remaining = max_bytes;
guint8 *buffer = NULL;
gsize buffer_size = 0;
/*
* Read while we have either TextBytesAtom or
- * TextCharsAtom and we have read less than max_words
- * amount of words and less than max_bytes (in UTF-8)
+ * TextCharsAtom and we have read less than max_bytes
+ * (in UTF-8)
*/
- while (words_remaining > 0 &&
- bytes_remaining > 0 &&
+ while (bytes_remaining > 0 &&
ppt_seek_header (stream,
TEXTBYTESATOM_RECORD_TYPE,
TEXTCHARSATOM_RECORD_TYPE,
@@ -763,7 +734,6 @@ extract_powerpoint_content (GsfInfile *infile,
msoffice_convert_and_normalize_chunk (buffer,
read_size,
FALSE, /* Always UTF-16 */
- &words_remaining,
&bytes_remaining,
&all_texts);
}
@@ -778,45 +748,6 @@ extract_powerpoint_content (GsfInfile *infile,
}
/**
- * @brief get maximum number of words to index
- * @return maximum number of words to index
- */
-static gint
-fts_max_words (void)
-{
- TrackerFTSConfig *fts_config;
-
- fts_config = tracker_main_get_fts_config ();
- return tracker_fts_config_get_max_words_to_index (fts_config);
-}
-
-/**
- * @brief get min word length
- * @return min_word_length
- */
-static gint
-fts_min_word_length (void)
-{
- TrackerFTSConfig *fts_config;
-
- fts_config = tracker_main_get_fts_config ();
- return tracker_fts_config_get_min_word_length (fts_config);
-}
-
-/**
- * @brief get max word length
- * @return max_word_length
- */
-static gint
-fts_max_word_length (void)
-{
- TrackerFTSConfig *fts_config;
-
- fts_config = tracker_main_get_fts_config ();
- return tracker_fts_config_get_max_word_length (fts_config);
-}
-
-/**
* @brief Open specified uri for reading and initialize gsf
* @param uri URI of the file to open
* @return GsfInFile of the opened file or NULL if failed to open file
@@ -847,7 +778,6 @@ open_uri (const gchar *uri)
*/
static gchar *
extract_msword_content (GsfInfile *infile,
- gint n_words,
gsize n_bytes,
gboolean *is_encrypted)
{
@@ -863,7 +793,6 @@ extract_msword_content (GsfInfile *infile,
GString *content = NULL;
guint8 *text_buffer = NULL;
gint text_buffer_size = 0;
- guint n_words_remaining;
gsize n_bytes_remaining;
document_stream = gsf_infile_child_by_name (infile, "WordDocument");
@@ -939,14 +868,11 @@ extract_msword_content (GsfInfile *infile,
/* Iterate over pieces...
* Loop is halted whenever one of this conditions is met:
* a) Max bytes to be read reached
- * b) Already read up to the max number of words configured
- * c) No more pieces to read
+ * b) No more pieces to read
*/
i = 0;
- n_words_remaining = n_words;
n_bytes_remaining = n_bytes;
- while (n_words_remaining > 0 &&
- n_bytes_remaining > 0 &&
+ while (n_bytes_remaining > 0 &&
i < piece_count) {
guint8 *piece_descriptor;
gint piece_start;
@@ -1009,7 +935,6 @@ extract_msword_content (GsfInfile *infile,
msoffice_convert_and_normalize_chunk (text_buffer,
piece_size,
is_ansi,
- &n_words_remaining,
&n_bytes_remaining,
&content);
}
@@ -1295,7 +1220,6 @@ read_excel_string (GsfInput *stream,
static void
xls_get_extended_record_string (GsfInput *stream,
GArray *list,
- guint *p_words_remaining,
gsize *p_bytes_remaining,
GString **p_content)
{
@@ -1337,12 +1261,10 @@ xls_get_extended_record_string (GsfInput *stream,
/* Iterate over chunks...
* Loop is halted whenever one of this conditions is met:
* a) Max bytes to be read reached
- * b) Already read up to the max number of words configured
- * c) No more chunks to read
+ * b) No more chunks to read
*/
i = 0;
- while (*p_words_remaining > 0 &&
- *p_bytes_remaining > 0 &&
+ while (*p_bytes_remaining > 0 &&
i < cst_unique) {
guint16 cch;
guint16 c_run;
@@ -1398,7 +1320,6 @@ xls_get_extended_record_string (GsfInput *stream,
msoffice_convert_and_normalize_chunk (buffer,
chunk_size,
!is_high_byte,
- p_words_remaining,
p_bytes_remaining,
p_content);
@@ -1475,7 +1396,6 @@ xls_get_extended_record_string (GsfInput *stream,
*/
static gchar*
extract_excel_content (GsfInfile *infile,
- gint n_words,
gsize n_bytes,
gboolean *is_encrypted)
{
@@ -1483,7 +1403,6 @@ extract_excel_content (GsfInfile *infile,
GString *content = NULL;
GsfInput *stream;
guint saved_offset;
- guint n_words_remaining = n_words;
gsize n_bytes_remaining = n_bytes;
stream = gsf_infile_child_by_name (infile, "Workbook");
@@ -1493,8 +1412,7 @@ extract_excel_content (GsfInfile *infile,
}
/* Read until we reach eof or any of our limits reached */
- while (n_words_remaining > 0 &&
- n_bytes_remaining > 0 &&
+ while (n_bytes_remaining > 0 &&
!gsf_input_eof (stream)) {
guint8 tmp_buffer[4] = { 0 };
@@ -1577,7 +1495,6 @@ extract_excel_content (GsfInfile *infile,
/* Read extended string */
xls_get_extended_record_string (stream,
list,
- &n_words_remaining,
&n_bytes_remaining,
&content);
@@ -1596,8 +1513,7 @@ extract_excel_content (GsfInfile *infile,
g_object_unref (stream);
- g_debug ("Words normalized: %u, Bytes: %" G_GSIZE_FORMAT,
- n_words - n_words_remaining,
+ g_debug ("Bytes extracted: %" G_GSIZE_FORMAT,
n_bytes - n_bytes_remaining);
return content ? g_string_free (content, FALSE) : NULL;
@@ -1696,13 +1612,13 @@ extract_msoffice (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
+ TrackerConfig *config;
GFile *file = NULL;
GFileInfo *file_info = NULL;
const gchar *mime_used;
GsfInfile *infile = NULL;
gchar *content = NULL;
gboolean is_encrypted = FALSE;
- gint max_words;
gsize max_bytes;
file = g_file_new_for_uri (uri);
@@ -1738,23 +1654,19 @@ extract_msoffice (const gchar *uri,
mime_used = g_file_info_get_content_type (file_info);
- /* Set max words to read from content */
- max_words = fts_max_words ();
-
- /* Set max bytes to read from content.
- * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
- * points are really pretty rare */
- max_bytes = 3 * max_words * fts_max_word_length ();
+ /* Set max bytes to read from content */
+ config = tracker_main_get_config ();
+ max_bytes = tracker_config_get_max_bytes (config);
if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
/* Word file */
- content = extract_msword_content (infile, max_words, max_bytes, &is_encrypted);
+ content = extract_msword_content (infile, max_bytes, &is_encrypted);
} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
/* PowerPoint file */
- content = extract_powerpoint_content (infile, max_words, max_bytes, &is_encrypted);
+ content = extract_powerpoint_content (infile, max_bytes, &is_encrypted);
} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
/* Excel File */
- content = extract_excel_content (infile, max_words, max_bytes, &is_encrypted);
+ content = extract_excel_content (infile, max_bytes, &is_encrypted);
} else {
g_message ("Mime type was not recognised:'%s'", mime_used);
}
@@ -1943,20 +1855,21 @@ xml_text_handler_document_data (GMarkupParseContext *context,
MsOfficeXMLParserInfo *info = user_data;
static gboolean found = FALSE;
static gboolean added = FALSE;
- guint min_word_length = fts_min_word_length();
switch (info->tag_type) {
case MS_OFFICE_XML_TAG_WORD_TEXT:
if (info->style_element_present) {
if (atoi (text) == 0) {
- g_string_append_printf (info->content, "%s ", text);
+ tracker_text_validate_utf8 (text, -1, &info->content);
+ g_string_append_c (info->content, ' ');
}
}
if (info->preserve_attribute_present) {
gchar *keywords = g_strdup (text);
- if (found && (strlen (keywords) >= min_word_length)) {
- g_string_append_printf (info->content, "%s ", text);
+ if (found) {
+ tracker_text_validate_utf8 (text, -1, &info->content);
+ g_string_append_c (info->content, ' ');
found = FALSE;
} else {
gchar *lasts;
@@ -1979,14 +1892,14 @@ xml_text_handler_document_data (GMarkupParseContext *context,
break;
case MS_OFFICE_XML_TAG_SLIDE_TEXT:
- if (strlen (text) > min_word_length) {
- g_string_append_printf (info->content, "%s ", text);
- }
+ tracker_text_validate_utf8 (text, -1, &info->content);
+ g_string_append_c (info->content, ' ');
break;
case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
- if ((atoi (text) == 0) && (strlen (text) > min_word_length)) {
- g_string_append_printf (info->content, "%s ", text);
+ if (atoi (text) == 0) {
+ tracker_text_validate_utf8 (text, -1, &info->content);
+ g_string_append_c (info->content, ' ');
}
break;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index e2f482c..573e0db 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -74,7 +74,6 @@ static TrackerExtractData extract_data[] = {
static gchar *
extract_oasis_content (const gchar *uri,
- guint n_words,
gsize n_bytes)
{
const gchar *argv[4];
@@ -93,9 +92,9 @@ extract_oasis_content (const gchar *uri,
argv[2] = path;
argv[3] = NULL;
- g_debug ("Executing command:'%s %s %s' (max words: %u, "
- "max_bytes: %" G_GSIZE_FORMAT ")",
- argv[0], argv[1], argv[2], n_words, n_bytes);
+ g_debug ("Executing command:'%s %s %s' "
+ "(max_bytes: %" G_GSIZE_FORMAT ")",
+ argv[0], argv[1], argv[2], n_bytes);
/* Fork & spawn */
if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
@@ -122,50 +121,38 @@ extract_oasis_content (const gchar *uri,
/* Start buffered reading... */
else {
unsigned char buf[ODT_BUFFER_SIZE];
- size_t r, accum;
- guint n_words_remaining = n_words;
- GString *normalized;
+ size_t r, bytes_remaining;
+ GString *validated = NULL;
- accum = 0;
- normalized = g_string_new ("");
+ bytes_remaining = n_bytes;
/* Reading in chunks of ODT_BUFFER_SIZE -1 (8192)
* Loop is halted whenever one of this conditions is met:
* a) Read bytes reached the maximum allowed (n_bytes)
- * b) Already read up to the max number of words configured
- * c) No more bytes to read
+ * b) No more bytes to read
*/
- while ((accum <= n_bytes) &&
- (n_words_remaining > 0) &&
+ while ((bytes_remaining > 0) &&
(r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) {
- gchar *normalized_chunk;
- guint n_words_normalized;
-
- /* Always make sure that the read string will be
- * NIL-terminated */
- buf[r] = '\0';
- /* Get normalized chunk */
- normalized_chunk = tracker_text_normalize (buf,
- n_words_remaining,
- &n_words_normalized);
- /* Update number of words remaining.
- * Note that n_words_normalized should always be less or
- * equal than n_words_remaining */
- n_words_remaining = (n_words_normalized <= n_words_remaining ?
- n_words_remaining - n_words_normalized : 0);
- /* Update accumulated */
- accum += r;
-
- /* Add normalized chunk to the whole normalized string */
- g_string_append (normalized, normalized_chunk);
- g_free (normalized_chunk);
+ gsize len_to_validate;
+
+ len_to_validate = MIN (bytes_remaining, r);
+
+ tracker_text_validate_utf8 (buf,
+ len_to_validate,
+ &validated);
+
+ /* Note that in this case we shouldn't add a whitespace
+ * separator between chunks read */
+
+ /* Update remaining */
+ bytes_remaining -= len_to_validate;
}
/* fclose() the stream, no need to close() the original FD */
fclose (fz);
/* Set final normalized contents to return */
- text = g_string_free (normalized, FALSE);
+		/* validated is still NULL when no valid chunk was read */
+		text = validated ? g_string_free (validated, FALSE) : NULL;
}
g_free (path);
@@ -179,9 +166,7 @@ extract_oasis (const gchar *uri,
TrackerSparqlBuilder *metadata)
{
gchar *content;
- TrackerFTSConfig *fts_config;
- guint n_words;
- gsize n_bytes;
+ TrackerConfig *config;
ODTParseInfo info;
GMarkupParseContext *context;
GMarkupParser parser = {
@@ -193,7 +178,7 @@ extract_oasis (const gchar *uri,
};
/* Setup conf */
- fts_config = tracker_main_get_fts_config ();
+ config = tracker_main_get_config ();
g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
@@ -217,16 +202,9 @@ extract_oasis (const gchar *uri,
/* Next, parse contents */
- /* Set max words to read from content */
- n_words = tracker_fts_config_get_max_words_to_index (fts_config);
-
- /* Set max bytes to read from content.
- * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
- * points are really pretty rare */
- n_bytes = 3 * n_words * tracker_fts_config_get_max_word_length(fts_config);
-
/* Extract content with the given limitations */
- content = extract_oasis_content (uri, n_words, n_bytes);
+ content = extract_oasis_content (uri,
+ tracker_config_get_max_bytes (config));
if (content) {
tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
tracker_sparql_builder_object_unvalidated (metadata, content);
diff --git a/src/tracker-extract/tracker-extract-pdf.cpp b/src/tracker-extract/tracker-extract-pdf.cpp
index b7a35be..02d3441 100644
--- a/src/tracker-extract/tracker-extract-pdf.cpp
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -314,26 +314,27 @@ page_get_size (Page *page,
static gchar *
extract_content (PDFDoc *document,
- guint n_words)
+ gsize n_bytes)
{
Page *page;
Catalog *catalog;
GString *string;
- gint n_pages, i, words;
- gchar *t;
+ gint n_pages, i;
+ gsize n_bytes_remaining;
n_pages = document->getNumPages();
string = g_string_new ("");
- words = i = 0;
+ i = 0;
+ n_bytes_remaining = n_bytes;
catalog = document->getCatalog();
- while (i < n_pages && words < n_words) {
- guint normalized_words = 0;
+ while (i < n_pages && n_bytes_remaining > 0) {
Gfx *gfx;
GooString *sel_text;
TextOutputDev *text_dev;
PDFRectangle pdf_selection;
gdouble height = 0, width = 0;
+ gsize len_to_validate;
page = catalog->getPage (i + 1);
i++;
@@ -360,12 +361,17 @@ extract_content (PDFDoc *document,
sel_text = text_dev->getSelectionText (&pdf_selection, selectionStyleWord);
- t = tracker_text_normalize (sel_text->getCString (), n_words - words, &normalized_words);
+ len_to_validate = MIN (n_bytes_remaining, strlen (sel_text->getCString ()));
- words += normalized_words;
- g_string_append (string, t);
+ if (tracker_text_validate_utf8 (sel_text->getCString (),
+ len_to_validate,
+ &string)) {
+ /* A whitespace is added to separate next strings appended */
+ g_string_append_c (string, ' ');
+ }
- g_free (t);
+ /* Update accumulated UTF-8 bytes read */
+ n_bytes_remaining -= len_to_validate;
delete gfx;
delete text_dev;
@@ -494,13 +500,13 @@ extract_pdf (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
- TrackerFTSConfig *fts_config;
+ TrackerConfig *config;
TrackerXmpData *xd = NULL;
PDFData pd = { 0 }; /* actual data */
PDFData md = { 0 }; /* for merging */
PDFDoc *document;
gchar *content;
- guint n_words;
+ gsize n_bytes;
Object obj;
Catalog *catalog;
@@ -783,9 +789,9 @@ extract_pdf (const gchar *uri,
tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
tracker_sparql_builder_object_int64 (metadata, document->getNumPages());
- fts_config = tracker_main_get_fts_config ();
- n_words = tracker_fts_config_get_max_words_to_index (fts_config);
- content = extract_content (document, n_words);
+ config = tracker_main_get_config ();
+ n_bytes = tracker_config_get_max_bytes (config);
+ content = extract_content (document, n_bytes);
if (content) {
tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
diff --git a/src/tracker-extract/tracker-main.c b/src/tracker-extract/tracker-main.c
index b31031d..3957862 100644
--- a/src/tracker-extract/tracker-main.c
+++ b/src/tracker-extract/tracker-main.c
@@ -76,7 +76,7 @@ static gboolean force_internal_extractors;
static gchar *force_module;
static gboolean version;
-static TrackerFTSConfig *fts_config;
+static TrackerConfig *config;
static GOptionEntry entries[] = {
{ "verbosity", 'v', 0,
@@ -264,14 +264,10 @@ log_handler (const gchar *domain,
}
}
-TrackerFTSConfig *
-tracker_main_get_fts_config (void)
+/* Returns the file-scope config pointer as-is; nothing is created on
+ * demand. NOTE(review): presumably assigned in main() -- confirm that no
+ * extractor can call this before then. */
+TrackerConfig *
+tracker_main_get_config (void)
{
- if (G_UNLIKELY (!fts_config)) {
- fts_config = tracker_fts_config_new ();
- }
-
- return fts_config;
+ return config;
}
@@ -336,7 +332,6 @@ main (int argc, char *argv[])
{
GOptionContext *context;
GError *error = NULL;
- TrackerConfig *config;
TrackerExtract *object;
gchar *log_filename = NULL;
diff --git a/src/tracker-extract/tracker-main.h b/src/tracker-extract/tracker-main.h
index d8b150f..318699e 100644
--- a/src/tracker-extract/tracker-main.h
+++ b/src/tracker-extract/tracker-main.h
@@ -21,7 +21,7 @@
#ifndef __TRACKER_MAIN_H__
#define __TRACKER_MAIN_H__
-#include "tracker-fts-config.h"
+#include "tracker-config.h"
G_BEGIN_DECLS
@@ -29,7 +29,9 @@ G_BEGIN_DECLS
* get more work to do.
*/
void tracker_main_quit_timeout_reset (void);
-TrackerFTSConfig *tracker_main_get_fts_config (void);
+
+/* Enables getting the config object from extractors */
+TrackerConfig *tracker_main_get_config (void);
G_END_DECLS
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]