[tracker/rss-enclosures] Fixes GB#616158/doc: Improve reading msoffice doc files
- From: Roberto Guido <rguido src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/rss-enclosures] Fixes GB#616158/doc: Improve reading msoffice doc files
- Date: Mon, 3 May 2010 00:40:48 +0000 (UTC)
commit 43b70a4fc2593c75f3128650aea0827ff5396175
Author: Aleksander Morgado <aleksander lanedo com>
Date: Mon Apr 19 12:44:12 2010 +0200
Fixes GB#616158/doc: Improve reading msoffice doc files
* Limit the max number of bytes to be read from the stream to a safe
limit such as 3*max_words*max_word_size (see the sketch below).
* Don't load the whole document into the heap: use a buffer to read the
contents, then convert to UTF-8, normalize and count words chunk by chunk.
* Stop reading the contents when the max number of bytes is reached.
* Stop reading the contents when the max number of words is reached.
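The stopping logic described above can be summarized with a small, self-contained
sketch. This is not part of the patch: read_piece() is a stub, and the hard-coded
limits stand in for the TrackerFTSConfig values queried in the real code.

/* Sketch of the chunked extraction loop: a word budget and a byte
 * budget are decremented per piece, and reading stops as soon as
 * either budget is exhausted or there are no pieces left. */
#include <glib.h>

/* Stub: pretend every piece yields 100 normalized words / 600 UTF-8
 * bytes. The real code gets these numbers from g_convert() and
 * tracker_text_normalize(). */
static void
read_piece (guint piece, gsize *bytes_read, guint *words_read)
{
	*bytes_read = 600;
	*words_read = 100;
}

int
main (void)
{
	const guint max_words = 1000;     /* stand-in for tracker_fts_config_get_max_words_to_index() */
	const guint max_word_length = 30; /* stand-in for tracker_fts_config_get_max_word_length() */
	const guint piece_count = 50;

	/* Assume 3 bytes per code point in UTF-8; 4-byte sequences are rare */
	gsize n_bytes_remaining = 3 * (gsize) max_words * max_word_length;
	guint n_words_remaining = max_words;
	guint i = 0;

	while (n_words_remaining > 0 &&
	       n_bytes_remaining > 0 &&
	       i < piece_count) {
		gsize bytes_read;
		guint words_read;

		read_piece (i, &bytes_read, &words_read);

		n_bytes_remaining -= MIN (bytes_read, n_bytes_remaining);
		n_words_remaining -= MIN (words_read, n_words_remaining);
		i++;
	}

	g_print ("Stopped after %u pieces\n", i);

	return 0;
}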
src/tracker-extract/tracker-extract-msoffice.c | 183 ++++++++++++++++++------
1 files changed, 141 insertions(+), 42 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index b0cdde6..f99d2c5 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -726,7 +726,9 @@ extract_powerpoint_content (GsfInfile *infile,
static gint
fts_max_words (void)
{
- TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
+ TrackerFTSConfig *fts_config;
+
+ fts_config = tracker_main_get_fts_config ();
return tracker_fts_config_get_max_words_to_index (fts_config);
}
@@ -737,11 +739,26 @@ fts_max_words (void)
static gint
fts_min_word_length (void)
{
- TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
+ TrackerFTSConfig *fts_config;
+
+ fts_config = tracker_main_get_fts_config ();
return tracker_fts_config_get_min_word_length (fts_config);
}
/**
+ * @brief get max word length
+ * @return max_word_length
+ */
+static gint
+fts_max_word_length (void)
+{
+ TrackerFTSConfig *fts_config;
+
+ fts_config = tracker_main_get_fts_config ();
+ return tracker_fts_config_get_max_word_length (fts_config);
+}
+
+/**
* @brief Open specified uri for reading and initialize gsf
* @param uri URI of the file to open
* @return GsfInFile of the opened file or NULL if failed to open file
@@ -773,6 +790,7 @@ open_uri (const gchar *uri)
static gchar *
extract_msword_content (GsfInfile *infile,
gint n_words,
+ gsize n_bytes,
gboolean *is_encrypted)
{
GsfInput *document_stream, *table_stream;
@@ -785,7 +803,10 @@ extract_msword_content (GsfInfile *infile,
gint piece_count;
gint32 fc;
GString *content = NULL;
- gchar *normalized = NULL;
+ guint8 *text_buffer = NULL;
+ gint text_buffer_size = 0;
+ guint n_words_remaining;
+ gsize n_bytes_remaining;
document_stream = gsf_infile_child_by_name (infile, "WordDocument");
if (document_stream == NULL) {
@@ -857,10 +878,19 @@ extract_msword_content (GsfInfile *infile,
}
}
- /* iterate over pieces and save text to the content -variable */
- for (i = 0; i < piece_count; i++) {
+ /* Iterate over pieces...
+ * Loop is halted whenever one of these conditions is met:
+ * a) Max bytes to be read reached
+ * b) Already read up to the max number of words configured
+ * c) No more pieces to read
+ */
+ i = 0;
+ n_words_remaining = n_words;
+ n_bytes_remaining = n_bytes;
+ while (n_words_remaining > 0 &&
+ n_bytes_remaining > 0 &&
+ i < piece_count) {
gchar *converted_text;
- guint8 *text_buffer;
guint8 *piece_descriptor;
gint piece_start;
gint piece_end;
@@ -887,53 +917,110 @@ extract_msword_content (GsfInfile *infile,
fc = (fc & 0xBFFFFFFF) >> 1;
}
- /* unicode uses twice as many bytes as CP1252 */
+
piece_size = piece_end - piece_start;
+
+ /* NOTE: Very long pieces may appear; in fact, a single-piece
+ * document seems to be quite normal. Thus, we limit here the
+ * number of bytes to read from the stream, based on the
+ * maximum number of bytes in UTF-8. We assume, then, that a
+ * safe limit is 2*n_bytes_remaining for UTF-16 input, and
+ * just n_bytes_remaining for CP1252 input */
+ piece_size = MIN (piece_size, n_bytes_remaining);
+
+ /* UTF-16 uses twice as many bytes as CP1252.
+ * NOTE: Not strictly true; some Unicode code points are
+ * encoded using 4 bytes in UTF-16 (surrogate pairs) */
if (!is_ansi) {
piece_size *= 2;
}
- if (piece_size < 1) {
- continue;
- }
+ /* Avoid empty pieces */
+ if (piece_size >= 1) {
+ GError *error = NULL;
+ gsize n_bytes_utf8;
+ guint n_words_normalized;
+
+ /* Re-allocate buffer to make it bigger if needed.
+ * This text buffer is re-used over and over in each
+ * iteration. */
+ if (piece_size > text_buffer_size) {
+ text_buffer = g_realloc (text_buffer, piece_size);
+ text_buffer_size = piece_size;
+ }
- /* read single text piece from document_stream */
- text_buffer = g_malloc (piece_size);
- gsf_input_seek (document_stream, fc, G_SEEK_SET);
- gsf_input_read (document_stream, piece_size, text_buffer);
-
- /* pieces can have different encoding */
- converted_text = g_convert (text_buffer,
- piece_size,
- "UTF-8",
- is_ansi ? "CP1252" : "UTF-16",
- NULL,
- NULL,
- NULL);
-
- if (converted_text) {
- if (!content) {
- content = g_string_new (converted_text);
- } else {
- g_string_append (content, converted_text);
+ /* read single text piece from document_stream */
+ gsf_input_seek (document_stream, fc, G_SEEK_SET);
+ gsf_input_read (document_stream, piece_size, text_buffer);
+
+ /* pieces can have different encoding
+ * TODO: Using g_iconv, this extra heap allocation could be
+ * avoided, re-using over and over again the same output buffer
+ * for the UTF-8 encoded string */
+ converted_text = g_convert (text_buffer,
+ piece_size,
+ "UTF-8",
+ is_ansi ? "CP1252" : "UTF-16",
+ NULL,
+ &n_bytes_utf8,
+ &error);
+
+ if (converted_text) {
+ gchar *normalized_chunk;
+
+ /* Get normalized chunk */
+ normalized_chunk = tracker_text_normalize (converted_text,
+ n_words_remaining,
+ &n_words_normalized);
+
+ /* Update number of words remaining.
+ * Note that n_words_normalized should always be less than
+ * or equal to n_words_remaining */
+ n_words_remaining = (n_words_normalized <= n_words_remaining ?
+ n_words_remaining - n_words_normalized : 0);
+
+ /* Update accumulated UTF-8 bytes read */
+ n_bytes_remaining = (n_bytes_utf8 <= n_bytes_remaining ?
+ n_bytes_remaining - n_bytes_utf8 : 0);
+
+ g_debug ("(%s) Piece %u; Words normalized: %u (remaining: %u); "
+ "Bytes read (UTF-8): %" G_GSIZE_FORMAT " bytes "
+ "(remaining: %" G_GSIZE_FORMAT ")",
+ __FUNCTION__, i, n_words_normalized, n_words_remaining,
+ n_bytes_utf8, n_bytes_remaining);
+
+ /* Append normalized chunk to the string to be returned */
+ if (!content) {
+ content = g_string_new (normalized_chunk);
+ } else {
+ g_string_append (content, normalized_chunk);
+ }
+
+ g_free (converted_text);
+ g_free (normalized_chunk);
+ }
+ else {
+ g_warning ("Couldn't convert %d bytes from %s to UTF-8: %s",
+ piece_size,
+ is_ansi ? "CP1252" : "UTF-16",
+ error ? error->message : NULL);
}
- g_free (converted_text);
+ /* Note that error may be set even if some converted text is
+ * available, due to G_CONVERT_ERROR_ILLEGAL_SEQUENCE for example */
+ g_clear_error (&error);
}
- g_free (text_buffer);
+ /* Go on to next piece */
+ i++;
}
+ g_free (text_buffer);
g_object_unref (document_stream);
g_object_unref (table_stream);
g_free (clx);
- if (content) {
- normalized = tracker_text_normalize (content->str, n_words, NULL);
- g_string_free (content, TRUE);
- }
-
- return normalized;
+ return content ? g_string_free (content, FALSE) : NULL;
}
@@ -1410,6 +1497,8 @@ extract_msoffice (const gchar *uri,
GsfInfile *infile = NULL;
gchar *content = NULL;
gboolean is_encrypted = FALSE;
+ gint max_words;
+ gsize max_bytes;
file = g_file_new_for_uri (uri);
@@ -1444,15 +1533,25 @@ extract_msoffice (const gchar *uri,
mime_used = g_file_info_get_content_type (file_info);
+ /* Set max words to read from content */
+ max_words = fts_max_words ();
+
+ /* Set max bytes to read from content.
+ * Assuming 3 bytes per Unicode code point in UTF-8, as 4-byte UTF-8
+ * sequences are really pretty rare */
+ max_bytes = 3 * max_words * fts_max_word_length ();
+
if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
/* Word file*/
- content = extract_msword_content (infile, fts_max_words (), &is_encrypted);
+ content = extract_msword_content (infile, max_words, max_bytes, &is_encrypted);
} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
- /* PowerPoint file */
- content = extract_powerpoint_content (infile, fts_max_words (), &is_encrypted);
+ /* PowerPoint file
+ * TODO: Limit max bytes to read */
+ content = extract_powerpoint_content (infile, max_words, &is_encrypted);
} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
- /* Excel File */
- content = extract_excel_content (infile, fts_max_words (), &is_encrypted);
+ /* Excel File
+ * TODO: Limit max bytes to read */
+ content = extract_excel_content (infile, max_words, &is_encrypted);
} else {
g_message ("Mime type was not recognised:'%s'", mime_used);
}
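The TODO in the piece-conversion hunk above suggests that g_iconv() could avoid
the per-piece heap allocation made by g_convert(). A rough, self-contained sketch
of that idea (not part of this commit; error handling omitted) could look like this:

/* Rough sketch of the g_iconv() idea from the TODO: one converter and
 * one reusable output buffer instead of a fresh g_convert() allocation
 * for every piece. */
#include <glib.h>
#include <string.h>

int
main (void)
{
	GIConv converter = g_iconv_open ("UTF-8", "CP1252");
	gchar  out_buffer[1024];              /* reused for every piece */
	gchar  piece[] = "one text piece";    /* stand-in for a document piece */

	gchar *inbuf = piece;
	gsize  inbytes = strlen (piece);
	gchar *outbuf = out_buffer;
	gsize  outbytes = sizeof (out_buffer) - 1;

	/* Convert this piece into the reusable buffer; the next piece would
	 * simply overwrite it instead of allocating a new string. */
	g_iconv (converter, &inbuf, &inbytes, &outbuf, &outbytes);
	*outbuf = '\0';

	g_print ("Converted %" G_GSIZE_FORMAT " UTF-8 bytes: %s\n",
	         (gsize) (outbuf - out_buffer), out_buffer);

	g_iconv_close (converter);

	return 0;
}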