[tracker/extractor-remove-word-counting-review: 5/14] tracker_text_validate_utf8 can return only the number of valid UTF-8 bytes
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/extractor-remove-word-counting-review: 5/14] tracker_text_validate_utf8 can return only the number of valid UTF-8 bytes
- Date: Tue, 18 May 2010 10:35:39 +0000 (UTC)
commit afbd72a8cc91c1e9f1f45c3de078d7e920087909
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue May 11 11:42:08 2010 +0200
tracker_text_validate_utf8 can return only the number of valid UTF-8 bytes
src/libtracker-extract/tracker-utils.c | 31 +++++++++++++++--------
src/libtracker-extract/tracker-utils.h | 3 +-
src/tracker-extract/tracker-extract-html.c | 3 +-
src/tracker-extract/tracker-extract-msoffice.c | 11 ++++----
src/tracker-extract/tracker-extract-oasis.c | 3 +-
src/tracker-extract/tracker-extract-pdf.cpp | 3 +-
6 files changed, 34 insertions(+), 20 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
index f9f1084..fe5eaec 100644
--- a/src/libtracker-extract/tracker-utils.c
+++ b/src/libtracker-extract/tracker-utils.c
@@ -361,25 +361,27 @@ tracker_text_normalize (const gchar *text,
* tracker_text_validate_utf8:
* @text: the text to validate
* @text_len: length of @text, or -1 if NIL-terminated
- * @str: the string where to place the validated characters
+ * @str: the string where to place the validated UTF-8 characters, or %NULL if
+ * not needed.
+ * @p_utf8_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
*
* This function iterates through @text checking for UTF-8 validity
- * using g_utf8_validate(), and appends the first chunk of valid characters
- * to @str.
+ * using g_utf8_validate(), appends the first chunk of valid characters
+ * to @str, and gives the number of valid UTF-8 bytes in @p_utf8_len.
*
- * Returns: %TRUE if valid UTF-8 in @text was appended to @str
+ * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
*
* Since: 0.9
**/
gboolean
tracker_text_validate_utf8 (const gchar *text,
gsize text_len,
- GString **str)
+ GString **str,
+ gsize *p_utf8_len)
{
gsize len_to_validate;
g_return_val_if_fail (text, FALSE);
- g_return_val_if_fail (str, FALSE);
len_to_validate = text_len >= 0 ? text_len : strlen (text);
@@ -390,12 +392,19 @@ tracker_text_validate_utf8 (const gchar *text,
* (if any) or to the end of the string. */
g_utf8_validate (text, len_to_validate, &end);
if (end > text) {
- /* Create string to output if not already as input */
- if (*str == NULL) {
- *str = g_string_new_len (text, end-text);
- } else {
- *str = g_string_append_len (*str, text, end-text);
+ /* If str output required... */
+ if (str) {
+ /* Create string to output if not already as input */
+ *str = (*str == NULL ?
+ g_string_new_len (text, end - text) :
+ g_string_append_len (*str, text, end - text));
+ }
+
+ /* If utf8 len output required... */
+ if (p_utf8_len) {
+ *p_utf8_len = end - text;
}
+
return TRUE;
}
}
diff --git a/src/libtracker-extract/tracker-utils.h b/src/libtracker-extract/tracker-utils.h
index 6003d36..760fc4b 100644
--- a/src/libtracker-extract/tracker-utils.h
+++ b/src/libtracker-extract/tracker-utils.h
@@ -41,7 +41,8 @@ gchar* tracker_text_normalize (const gchar *text,
gboolean tracker_text_validate_utf8 (const gchar *text,
gsize text_len,
- GString **str);
+ GString **str,
+ gsize *p_utf8_len);
gchar* tracker_date_guess (const gchar *date_string);
gchar* tracker_date_format_to_iso8601 (const gchar *date_string,
const gchar *format);
diff --git a/src/tracker-extract/tracker-extract-html.c b/src/tracker-extract/tracker-extract-html.c
index a59b864..acd99b7 100644
--- a/src/tracker-extract/tracker-extract-html.c
+++ b/src/tracker-extract/tracker-extract-html.c
@@ -221,7 +221,8 @@ parser_characters (void *data,
(pd->n_bytes_remaining < text_len ?
pd->n_bytes_remaining :
text_len),
- &pd->plain_text)) {
+ &pd->plain_text,
+ NULL)) {
/* In the case of HTML, each string arriving this
* callback is independent to any other previous
* string, so need to add an explicit whitespace
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index d47a1c3..30c2046 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -434,7 +434,8 @@ msoffice_convert_and_normalize_chunk (guint8 *buffer,
if (tracker_text_validate_utf8 (converted_text,
len_to_validate,
- p_content)) {
+ p_content,
+ NULL)) {
/* A whitespace is added to separate next strings appended */
g_string_append_c (*p_content, ' ');
}
@@ -1860,7 +1861,7 @@ xml_text_handler_document_data (GMarkupParseContext *context,
case MS_OFFICE_XML_TAG_WORD_TEXT:
if (info->style_element_present) {
if (atoi (text) == 0) {
- tracker_text_validate_utf8 (text, -1, &info->content);
+ tracker_text_validate_utf8 (text, -1, &info->content, NULL);
g_string_append_c (info->content, ' ');
}
}
@@ -1868,7 +1869,7 @@ xml_text_handler_document_data (GMarkupParseContext *context,
if (info->preserve_attribute_present) {
gchar *keywords = g_strdup (text);
if (found) {
- tracker_text_validate_utf8 (text, -1, &info->content);
+ tracker_text_validate_utf8 (text, -1, &info->content, NULL);
g_string_append_c (info->content, ' ');
found = FALSE;
} else {
@@ -1892,13 +1893,13 @@ xml_text_handler_document_data (GMarkupParseContext *context,
break;
case MS_OFFICE_XML_TAG_SLIDE_TEXT:
- tracker_text_validate_utf8 (text, -1, &info->content);
+ tracker_text_validate_utf8 (text, -1, &info->content, NULL);
g_string_append_c (info->content, ' ');
break;
case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
if (atoi (text) == 0) {
- tracker_text_validate_utf8 (text, -1, &info->content);
+ tracker_text_validate_utf8 (text, -1, &info->content, NULL);
g_string_append_c (info->content, ' ');
}
break;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 573e0db..da21440 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -139,7 +139,8 @@ extract_oasis_content (const gchar *uri,
tracker_text_validate_utf8 (buf,
len_to_validate,
- &validated);
+ &validated,
+ NULL);
/* Note that in this case we shouldn't add a whitespace
* separator between chunks read */
diff --git a/src/tracker-extract/tracker-extract-pdf.cpp b/src/tracker-extract/tracker-extract-pdf.cpp
index 78c2df8..22016db 100644
--- a/src/tracker-extract/tracker-extract-pdf.cpp
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -365,7 +365,8 @@ extract_content (PDFDoc *document,
if (tracker_text_validate_utf8 (sel_text->getCString (),
len_to_validate,
- &string)) {
+ &string,
+ NULL)) {
/* A whitespace is added to separate next strings appended */
g_string_append_c (string, ' ');
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]