[tracker/parser-unicode-libs-review: 66/85] Added normalization and case folding
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 66/85] Added normalization and case folding
- Date: Tue, 4 May 2010 17:29:33 +0000 (UTC)
commit f31b3ff06608057266149026c8ed415947857940
Author: Aleksander Morgado <aleksander lanedo com>
Date: Wed Apr 28 12:15:09 2010 +0200
Added normalization and case folding
src/libtracker-fts/tracker-parser.c | 230 ++++++++++++++++++++++++++---------
1 files changed, 172 insertions(+), 58 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index 3d37874..47b2071 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -27,15 +27,21 @@
#endif
#ifdef HAVE_LIBUNISTRING
-/* # include <unistr.h> */
-# include <uniwbrk.h>
-# include <unictype.h>
+/* libunistring versions prior to 9.1.2 need this hack */
+#define _UNUSED_PARAMETER_
+#include <unistr.h>
+#include <uniwbrk.h>
+#include <unictype.h>
+#include <unicase.h>
#else
#include <pango/pango.h>
#endif
#include "tracker-parser.h"
+/* Max length in bytes of a UTF-8 encoded word buffer (just a safety limit) */
+#define WORD_BUFFER_LENGTH 512
+
#ifndef HAVE_LIBUNISTRING
@@ -534,14 +540,16 @@ parser_next (TrackerParser *parser,
/* Use libunistring
* void u8_wordbreaks (const uint8_t *s, size_t n, char *p)
* int u8_strmbtouc (ucs4_t *puc, const uint8_t *s)
+ * uint8_t * u8_casefold (const uint8_t *s, size_t n, const char *iso639_language, uninorm_t nf, uint8_t *resultbuf, size_t *lengthp)
*/
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
gint *byte_offset_end)
{
- gchar *word = NULL;
- gsize word_length;
+
+ gsize word_length = 0;
+ gchar *processed_word = NULL;
*byte_offset_start = 0;
*byte_offset_end = 0;
@@ -549,7 +557,7 @@ parser_next (TrackerParser *parser,
g_return_val_if_fail (parser, FALSE);
/* Loop to look for next valid word */
- while (!word &&
+ while (!processed_word &&
parser->cursor < parser->txt_size) {
ucs4_t first_unichar;
gint first_unichar_len;
@@ -571,7 +579,6 @@ parser_next (TrackerParser *parser,
word_length = i - parser->cursor;
if (first_unichar_len > 0) {
-
/* We only want the words where the first character
* in the word is either a letter, a number or a symbol.
* This is needed because the word break algorithm also
@@ -581,10 +588,31 @@ parser_next (TrackerParser *parser,
* should be compatible with all Unicode normalization
* methods.
*/
- if (uc_is_general_category (first_unichar, parser->allowed_start)) {
- word = g_malloc (word_length + 1);
- memcpy (word, &(parser->txt[parser->cursor]), word_length);
- word[word_length] = '\0';
+ if (uc_is_general_category (first_unichar,
+ parser->allowed_start)) {
+ gchar word_buffer [WORD_BUFFER_LENGTH];
+ gsize new_length;
+
+ /* compute truncated word length if needed */
+ new_length = (word_length < WORD_BUFFER_LENGTH ?
+ word_length :
+ WORD_BUFFER_LENGTH - 1);
+
+ /* The word must always be NUL-terminated here */
+ memcpy (word_buffer, &(parser->txt[parser->cursor]), new_length);
+ word_buffer[new_length] = '\0';
+
+ /* Process the word here. If processing fails, we can still
+ * move on to the next one. On success the result is always
+ * a newly allocated string */
+ processed_word = tracker_parser_process_word (parser,
+ word_buffer,
+ new_length,
+ TRUE);
+ if (!processed_word) {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ }
} else {
/* Skip this word and keep on looping */
parser->cursor += word_length;
@@ -596,10 +624,8 @@ parser_next (TrackerParser *parser,
}
}
- /* If we got a word here, process it */
- if (word) {
- gchar *processed_word;
-
+ /* If we got a word here, set output */
+ if (processed_word) {
/* Set outputs */
*byte_offset_start = parser->cursor;
*byte_offset_end = parser->cursor + word_length;
@@ -607,20 +633,10 @@ parser_next (TrackerParser *parser,
/* Update cursor */
parser->cursor += word_length;
- /* g_debug ("start: '%d', end: '%d', new cursor at: '%d'", */
- /* *byte_offset_start, *byte_offset_end, (gint)parser->cursor); */
-
- /* TODO: tolower, do_strip */
-
- processed_word = tracker_parser_process_word (parser, word, word_length, TRUE);
- g_free (word);
-
- if (processed_word) {
- parser->word_length = strlen (processed_word);
- parser->word = processed_word;
+ parser->word_length = strlen (processed_word);
+ parser->word = processed_word;
- return TRUE;
- }
+ return TRUE;
}
/* No more words... */
@@ -753,6 +769,8 @@ tracker_parser_reset (TrackerParser *parser,
#endif /* !HAVE_LIBUNISTRING */
}
+
+#ifndef HAVE_LIBUNISTRING
gchar *
tracker_parser_process_word (TrackerParser *parser,
const gchar *word,
@@ -768,51 +786,147 @@ tracker_parser_process_word (TrackerParser *parser,
str = NULL;
- if (word) {
- if (length == -1) {
- bytes = strlen (word);
- } else {
- bytes = length;
- }
+ bytes = length == -1 ? strlen (word) : length;
- g_debug ("ORIGINAL word: '%s'", word);
+ g_debug ("ORIGINAL word: '%s'", word);
- str = g_utf8_normalize (word,
- bytes,
- G_NORMALIZE_NFC);
- if (!str) {
- return NULL;
- }
+ str = g_utf8_normalize (word,
+ bytes,
+ G_NORMALIZE_NFC);
+ if (!str) {
+ return NULL;
+ }
- len = strlen (str);
- g_debug (" After NFC normalization: '%s'", str);
+ len = strlen (str);
- if (do_strip) {
- gchar *stripped_word;
+ g_debug (" After NFC normalization: '%s'", str);
- stripped_word = strip_word (str, len, &len);
- g_debug (" After UNAC stripping: '%s'", stripped_word);
- g_free (str);
- str = stripped_word;
- }
+ if (do_strip) {
+ gchar *stripped_word;
+
+ stripped_word = strip_word (str, len, &len);
+ g_debug (" After UNAC stripping: '%s'", stripped_word);
+ g_free (str);
+ str = stripped_word;
+ }
- if (!parser->enable_stemmer) {
- return str;
+ if (!parser->enable_stemmer) {
+ return str;
+ }
+
+ stem_word = tracker_language_stem_word (parser->language, str, len);
+ g_debug (" After Stemming: '%s'", stem_word);
+
+ if (stem_word) {
+ g_free (str);
+
+ return stem_word;
+ }
+
+ return str;
+}
+
+#else
+
+/* libunistring version of the word processor. */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ gboolean do_strip)
+{
+ gchar word_buffer [WORD_BUFFER_LENGTH];
+ gchar *normalized = NULL;
+ gchar *stripped = NULL;
+ gchar *stemmed = NULL;
+ size_t new_word_length;
+
+ g_return_val_if_fail (parser != NULL, NULL);
+ g_return_val_if_fail (word != NULL, NULL);
+
+ if (length < 0) {
+ length = strlen (word);
+ }
+
+ g_debug ("Original word: '%s'", word);
+
+ /* Leave room for the trailing NUL byte */
+ new_word_length = WORD_BUFFER_LENGTH - 1;
+
+ /* Casefold and NFC normalization in output.
+ * NOTE: if the output buffer is not big enough, u8_casefold will
+ * return a newly-allocated buffer. */
+ normalized = u8_casefold ((const uint8_t *)word,
+ length,
+ uc_locale_language (),
+ UNINORM_NFC,
+ word_buffer,
+ &new_word_length);
+
+ /* Case folding + Normalization failed, skip this word */
+ g_return_val_if_fail (normalized != NULL, NULL);
+
+ /* If the output buffer is not the one passed to
+ * u8_casefold, we know it was newly allocated, so it must
+ * be grown by 1 byte to hold the trailing NUL */
+ if (normalized != word_buffer) {
+ normalized = g_realloc (normalized, new_word_length + 1);
+ }
+
+ /* NUL-terminate the output */
+ normalized[new_word_length] = '\0';
+
+ g_debug (" After Casefolding and NFC normalization: '%s'", normalized);
+
+ /* UNAC stripping needed? */
+ if (do_strip) {
+ guint32 stripped_word_length;
+
+ stripped = strip_word (normalized,
+ new_word_length,
+ &stripped_word_length);
+
+ if (stripped) {
+ g_debug (" After UNAC stripping: '%s'", stripped);
+ new_word_length = stripped_word_length;
}
+ }
- stem_word = tracker_language_stem_word (parser->language, str, len);
- g_debug (" After Stemming: '%s'", stem_word);
+ /* Stemming needed? */
+ if (parser->enable_stemmer) {
+ stemmed = tracker_language_stem_word (parser->language,
+ stripped ? stripped : normalized,
+ new_word_length);
- if (stem_word) {
- g_free (str);
+ g_debug (" After Stemming: '%s'", stemmed);
+ }
- return stem_word;
+ /* If stemming was requested and succeeded, free the previous strings and return the stem */
+ if (stemmed) {
+ g_free (stripped);
+ if (normalized != word_buffer) {
+ g_free (normalized);
}
+ return stemmed;
}
- return str;
+ /* If stripping was requested and succeeded, free the previous string and return the stripped one */
+ if (stripped) {
+ if (normalized != word_buffer) {
+ g_free (normalized);
+ }
+ return stripped;
+ }
+
+ /* It may be the case that neither stripping nor stemming was needed, and
+ * that the output buffer on the stack was large enough for case-folding and
+ * normalization. In that case the string must be strdup()ed before returning it */
+ return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
}
+#endif /* !HAVE_LIBUNISTRING */
+
+
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]