[tracker] Add tracker_text_normalize()



commit 0a436f923bd2a4486054c1ce9de0881e11400c88
Author: Carlos Garnacho <carlos lanedo com>
Date:   Thu Oct 8 18:11:04 2009 +0200

    Add tracker_text_normalize()
    
    This function is meant for FTS in extractors. It receives UTF-8 text and
    tries to strip non-text characters, extra spaces, carriage returns and such,
    providing a suitable string for nie:plainTextContent.

 src/libtracker-common/tracker-utils.c |   46 +++++++++++++++++++++++++++++++++
 src/libtracker-common/tracker-utils.h |    4 +++
 2 files changed, 50 insertions(+), 0 deletions(-)
---
diff --git a/src/libtracker-common/tracker-utils.c b/src/libtracker-common/tracker-utils.c
index 5006beb..f10bbec 100644
--- a/src/libtracker-common/tracker-utils.c
+++ b/src/libtracker-common/tracker-utils.c
@@ -563,3 +563,49 @@ tracker_merge (const gchar *delim, gint n_values,
 
 	return g_string_free (str, FALSE);
 }
+
+/**
+ * tracker_text_normalize:
+ * @text: NUL-terminated UTF-8 text to normalize
+ * @max_words: maximum number of words to copy into the result
+ * @n_words: return location for the number of words copied, or %NULL
+ *
+ * Builds a copy of @text that contains only letters. Every run of other
+ * characters that follows a word (digits, punctuation, whitespace, control
+ * characters, invalid UTF-8 sequences) is replaced by a single space, and
+ * leading non-letters are dropped. Copying stops once @max_words words
+ * have been collected. The result ends with a space whenever the last
+ * copied word was followed by a non-letter.
+ *
+ * Returns: a newly allocated string that the caller must g_free(), or
+ * %NULL if @text is %NULL.
+ **/
+gchar *
+tracker_text_normalize (const gchar *text,
+			guint        max_words,
+			guint       *n_words)
+{
+	GString *string;
+	gboolean in_break = TRUE;
+	gunichar ch;
+	guint words = 0;
+
+	if (n_words) {
+		*n_words = 0;
+	}
+
+	g_return_val_if_fail (text != NULL, NULL);
+
+	string = g_string_new (NULL);
+
+	/* Invalid sequences come back as (gunichar) -1 or -2; both are
+	 * non-zero, so only the terminating NUL ends the loop and bad
+	 * bytes simply count as non-letters below.
+	 */
+	while ((ch = g_utf8_get_char_validated (text, -1)) > 0) {
+		if (g_unichar_isalpha (ch)) {
+			if (in_break && words >= max_words) {
+				/* Limit reached, do not start another word */
+				break;
+			}
+
+			g_string_append_unichar (string, ch);
+			in_break = FALSE;
+		} else if (!in_break) {
+			/* First non-letter after a word closes that word */
+			g_string_append_c (string, ' ');
+			in_break = TRUE;
+			words++;
+		}
+
+		text = g_utf8_find_next_char (text, NULL);
+	}
+
+	if (!in_break) {
+		/* Text ended inside a word, count it as well */
+		words++;
+	}
+
+	if (n_words) {
+		*n_words = words;
+	}
+
+	return g_string_free (string, FALSE);
+}
diff --git a/src/libtracker-common/tracker-utils.h b/src/libtracker-common/tracker-utils.h
index ce6a837..e0525d0 100644
--- a/src/libtracker-common/tracker-utils.h
+++ b/src/libtracker-common/tracker-utils.h
@@ -50,6 +50,10 @@ gchar *  tracker_coalesce                   (gint n_values,
 gchar *  tracker_merge                      (const gchar *delim, gint n_values,
 					     ...);
 
+/* Returns a newly allocated copy of @text reduced to letters and single
+ * spaces; the word count is stored in @n_words when it is not NULL. */
+gchar *  tracker_text_normalize             (const gchar *text,
+					     guint        max_words,
+					     guint       *n_words);
+
 /* Temporary: Just here until we upgrade to GLib 2.18. */
 G_CONST_RETURN gchar *
          tracker_dngettext                  (const gchar *domain,



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]