[tracker] FTS parsers: remove tracker_parser_process_word() from parser API
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] FTS parsers: remove tracker_parser_process_word() from parser API
- Date: Fri, 28 May 2010 08:02:04 +0000 (UTC)
commit 9b1ffc04f3ba006a710babe46d95181ac8020492
Author: Aleksander Morgado <aleksander lanedo com>
Date: Fri May 28 10:00:10 2010 +0200
FTS parsers: remove tracker_parser_process_word() from parser API
src/libtracker-fts/tracker-parser-glib.c | 156 +++++-----
src/libtracker-fts/tracker-parser-libicu.c | 387 +++++++++-------------
src/libtracker-fts/tracker-parser-libunistring.c | 268 +++++++--------
src/libtracker-fts/tracker-parser.h | 5 -
4 files changed, 364 insertions(+), 452 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index 9892829..670a46f 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -162,6 +162,83 @@ get_encoding (const gchar *txt)
}
+static gchar *
+process_word_utf8 (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ gboolean do_strip)
+{
+ gchar *stem_word;
+ gchar *str;
+ gchar *stripped_word;
+ gsize bytes, len;
+
+ g_return_val_if_fail (parser != NULL, NULL);
+ g_return_val_if_fail (word != NULL, NULL);
+
+ str = NULL;
+ stripped_word = NULL;
+
+ if (word) {
+ if (length == -1) {
+ bytes = strlen (word);
+ } else {
+ bytes = length;
+ }
+
+ /* Log original word */
+ tracker_parser_message_hex ("ORIGINAL word",
+ word, bytes);
+
+ if (parser->enable_unaccent && do_strip) {
+ stripped_word = tracker_parser_unaccent_utf8_word (word,
+ bytes,
+ &len);
+
+ /* Log after UNAC stripping */
+ tracker_parser_message_hex (" After UNAC stripping",
+ stripped_word, len);
+ } else {
+ stripped_word = NULL;
+ }
+
+ if (!stripped_word) {
+ str = g_utf8_normalize (word,
+ bytes,
+ G_NORMALIZE_NFC);
+ } else {
+ str = g_utf8_normalize (stripped_word,
+ len,
+ G_NORMALIZE_NFC);
+ g_free (stripped_word);
+ }
+
+ /* Log after normalization */
+ tracker_parser_message_hex (" After NFC normalization",
+ str, strlen ((gchar *)str));
+
+ if (!str) {
+ return NULL;
+ }
+
+ if (!parser->enable_stemmer) {
+ return str;
+ }
+
+ len = strlen (str);
+
+ stem_word = tracker_language_stem_word (parser->language, str, len);
+
+ if (stem_word) {
+ g_free (str);
+
+ return stem_word;
+ }
+ }
+
+ return str;
+}
+
static gboolean
pango_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -400,7 +477,7 @@ parser_next (TrackerParser *parser,
parser->cursor = parser->txt + *byte_offset_end;
- processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
+ processed_word = process_word_utf8 (parser, utf8, bytes, do_strip);
g_free (utf8);
if (processed_word) {
@@ -503,83 +580,6 @@ tracker_parser_reset (TrackerParser *parser,
}
}
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
- gint length,
- gboolean do_strip)
-{
- gchar *stem_word;
- gchar *str;
- gchar *stripped_word;
- gsize bytes, len;
-
- g_return_val_if_fail (parser != NULL, NULL);
- g_return_val_if_fail (word != NULL, NULL);
-
- str = NULL;
- stripped_word = NULL;
-
- if (word) {
- if (length == -1) {
- bytes = strlen (word);
- } else {
- bytes = length;
- }
-
- /* Log original word */
- tracker_parser_message_hex ("ORIGINAL word",
- word, bytes);
-
- if (parser->enable_unaccent && do_strip) {
- stripped_word = tracker_parser_unaccent_utf8_word (word,
- bytes,
- &len);
-
- /* Log after UNAC stripping */
- tracker_parser_message_hex (" After UNAC stripping",
- stripped_word, len);
- } else {
- stripped_word = NULL;
- }
-
- if (!stripped_word) {
- str = g_utf8_normalize (word,
- bytes,
- G_NORMALIZE_NFC);
- } else {
- str = g_utf8_normalize (stripped_word,
- len,
- G_NORMALIZE_NFC);
- g_free (stripped_word);
- }
-
- /* Log after normalization */
- tracker_parser_message_hex (" After NFC normalization",
- str, strlen ((gchar *)str));
-
- if (!str) {
- return NULL;
- }
-
- if (!parser->enable_stemmer) {
- return str;
- }
-
- len = strlen (str);
-
- stem_word = tracker_language_stem_word (parser->language, str, len);
-
- if (stem_word) {
- g_free (str);
-
- return stem_word;
- }
- }
-
- return str;
-}
-
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 0fdde7b..3e1ad98 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -44,11 +44,6 @@ typedef enum {
/* Max possible length of a UChar encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512
-static gchar *process_word_uchar (TrackerParser *parser,
- const UChar *word,
- gint length,
- TrackerParserWordType type);
-
struct TrackerParser {
const gchar *txt;
gint txt_size;
@@ -143,6 +138,168 @@ get_word_info (const UChar *word,
return TRUE;
}
+static gchar *
+process_word_uchar (TrackerParser *parser,
+ const UChar *word,
+ gint length,
+ TrackerParserWordType type)
+{
+ UErrorCode error = U_ZERO_ERROR;
+ UChar normalized_buffer [WORD_BUFFER_LENGTH];
+ gchar *utf8_str = NULL;
+ gchar *stemmed = NULL;
+ size_t new_word_length;
+
+ /* Log original word */
+ tracker_parser_message_hex ("ORIGINAL word",
+ (guint8 *)word,
+ length * sizeof (UChar));
+
+
+ if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+ UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+
+ /* Casefold... */
+ new_word_length = u_strFoldCase (casefolded_buffer,
+ WORD_BUFFER_LENGTH,
+ word,
+ length,
+ U_FOLD_CASE_DEFAULT,
+ &error);
+ if (U_FAILURE (error)) {
+ g_warning ("Error casefolding: '%s'",
+ u_errorName (error));
+ return NULL;
+ }
+ if (new_word_length > WORD_BUFFER_LENGTH)
+ new_word_length = WORD_BUFFER_LENGTH;
+
+ /* Log after casefolding */
+ tracker_parser_message_hex (" After Casefolding",
+ (guint8 *)casefolded_buffer,
+ new_word_length * sizeof (UChar));
+
+ /* NFC normalization... */
+ new_word_length = unorm_normalize (casefolded_buffer,
+ new_word_length,
+ UNORM_NFC,
+ 0,
+ normalized_buffer,
+ WORD_BUFFER_LENGTH,
+ &error);
+ if (U_FAILURE (error)) {
+ g_warning ("Error normalizing: '%s'",
+ u_errorName (error));
+ return NULL;
+ }
+
+ if (new_word_length > WORD_BUFFER_LENGTH)
+ new_word_length = WORD_BUFFER_LENGTH;
+
+ /* Log after casefolding */
+ tracker_parser_message_hex (" After Normalization",
+ (guint8 *)normalized_buffer,
+ new_word_length * sizeof (UChar));
+ } else {
+ /* For ASCII-only, just tolower() each character */
+ new_word_length = u_strToLower (normalized_buffer,
+ WORD_BUFFER_LENGTH,
+ word,
+ length,
+ NULL,
+ &error);
+ if (U_FAILURE (error)) {
+ g_warning ("Error lowercasing: '%s'",
+ u_errorName (error));
+ return NULL;
+ }
+
+ /* Log after casefolding */
+ tracker_parser_message_hex (" After lowercase",
+ (guint8 *)normalized_buffer,
+ new_word_length * sizeof (UChar));
+ }
+
+ /* UNAC stripping needed? (for non-CJK and non-ASCII) */
+ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+ gsize stripped_word_length;
+
+ /* Get unaccented string in UTF-8 */
+ utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
+ new_word_length,
+ &stripped_word_length);
+ if (utf8_str) {
+ new_word_length = stripped_word_length;
+
+ /* Log after unaccenting */
+ tracker_parser_message_hex (" After UNAC",
+ utf8_str,
+ new_word_length);
+ }
+ }
+
+ /* If stripping failed or not needed, convert to UTF-8 */
+ if (!utf8_str) {
+ UErrorCode icu_error = U_ZERO_ERROR;
+ UConverter *converter;
+ gsize utf8_len;
+
+ /* Open converter UChar to UTF-16BE */
+ converter = ucnv_open ("UTF-8", &icu_error);
+ if (!converter) {
+ g_warning ("Cannot open UTF-8 converter: '%s'",
+ U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+ return NULL;
+ }
+ /* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
+ * in UTF-8. */
+ utf8_str = g_malloc (2 * new_word_length * sizeof (UChar) + 1);
+
+ /* Convert from UChar to UTF-8 (NIL-terminated) */
+ utf8_len = ucnv_fromUChars (converter,
+ utf8_str,
+ 2 * new_word_length * sizeof (UChar) + 1,
+ normalized_buffer,
+ new_word_length,
+ &icu_error);
+ if (U_FAILURE (icu_error)) {
+ g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+ u_errorName (icu_error));
+ g_free (utf8_str);
+ ucnv_close (converter);
+ return NULL;
+ }
+
+ new_word_length = utf8_len;
+ ucnv_close (converter);
+
+ /* Log after unaccenting */
+ tracker_parser_message_hex (" After UTF8 conversion",
+ utf8_str,
+ new_word_length);
+ }
+
+ /* Stemming needed? */
+ if (parser->enable_stemmer) {
+ /* Input for stemmer ALWAYS in UTF-8, as well as output */
+ stemmed = tracker_language_stem_word (parser->language,
+ utf8_str,
+ new_word_length);
+
+ /* Log after stemming */
+ tracker_parser_message_hex (" After stemming",
+ stemmed, strlen (stemmed));
+ }
+
+ /* If stemmed wanted and succeeded, free previous and return it */
+ if (stemmed) {
+ g_free (utf8_str);
+ return stemmed;
+ }
+
+ return utf8_str;
+}
+
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -397,226 +554,6 @@ tracker_parser_reset (TrackerParser *parser,
ucnv_close (converter);
}
-static gchar *
-process_word_uchar (TrackerParser *parser,
- const UChar *word,
- gint length,
- TrackerParserWordType type)
-{
- UErrorCode error = U_ZERO_ERROR;
- UChar normalized_buffer [WORD_BUFFER_LENGTH];
- gchar *utf8_str = NULL;
- gchar *stemmed = NULL;
- size_t new_word_length;
-
- /* Log original word */
- tracker_parser_message_hex ("ORIGINAL word",
- (guint8 *)word,
- length * sizeof (UChar));
-
-
- if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
- UChar casefolded_buffer [WORD_BUFFER_LENGTH];
-
- /* Casefold... */
- new_word_length = u_strFoldCase (casefolded_buffer,
- WORD_BUFFER_LENGTH,
- word,
- length,
- U_FOLD_CASE_DEFAULT,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error casefolding: '%s'",
- u_errorName (error));
- return NULL;
- }
- if (new_word_length > WORD_BUFFER_LENGTH)
- new_word_length = WORD_BUFFER_LENGTH;
-
- /* Log after casefolding */
- tracker_parser_message_hex (" After Casefolding",
- (guint8 *)casefolded_buffer,
- new_word_length * sizeof (UChar));
-
- /* NFC normalization... */
- new_word_length = unorm_normalize (casefolded_buffer,
- new_word_length,
- UNORM_NFC,
- 0,
- normalized_buffer,
- WORD_BUFFER_LENGTH,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error normalizing: '%s'",
- u_errorName (error));
- return NULL;
- }
-
- if (new_word_length > WORD_BUFFER_LENGTH)
- new_word_length = WORD_BUFFER_LENGTH;
-
- /* Log after casefolding */
- tracker_parser_message_hex (" After Normalization",
- (guint8 *)normalized_buffer,
- new_word_length * sizeof (UChar));
- } else {
- /* For ASCII-only, just tolower() each character */
- new_word_length = u_strToLower (normalized_buffer,
- WORD_BUFFER_LENGTH,
- word,
- length,
- NULL,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error lowercasing: '%s'",
- u_errorName (error));
- return NULL;
- }
-
- /* Log after casefolding */
- tracker_parser_message_hex (" After lowercase",
- (guint8 *)normalized_buffer,
- new_word_length * sizeof (UChar));
- }
-
- /* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
- gsize stripped_word_length;
-
- /* Get unaccented string in UTF-8 */
- utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
- new_word_length,
- &stripped_word_length);
- if (utf8_str) {
- new_word_length = stripped_word_length;
-
- /* Log after unaccenting */
- tracker_parser_message_hex (" After UNAC",
- utf8_str,
- new_word_length);
- }
- }
-
- /* If stripping failed or not needed, convert to UTF-8 */
- if (!utf8_str) {
- UErrorCode icu_error = U_ZERO_ERROR;
- UConverter *converter;
- gsize utf8_len;
-
- /* Open converter UChar to UTF-16BE */
- converter = ucnv_open ("UTF-8", &icu_error);
- if (!converter) {
- g_warning ("Cannot open UTF-8 converter: '%s'",
- U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
- return NULL;
- }
- /* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
- * in UTF-8. */
- utf8_str = g_malloc (2 * new_word_length * sizeof (UChar) + 1);
-
- /* Convert from UChar to UTF-8 (NIL-terminated) */
- utf8_len = ucnv_fromUChars (converter,
- utf8_str,
- 2 * new_word_length * sizeof (UChar) + 1,
- normalized_buffer,
- new_word_length,
- &icu_error);
- if (U_FAILURE (icu_error)) {
- g_warning ("Cannot convert from UChar to UTF-8: '%s'",
- u_errorName (icu_error));
- g_free (utf8_str);
- ucnv_close (converter);
- return NULL;
- }
-
- new_word_length = utf8_len;
- ucnv_close (converter);
-
- /* Log after unaccenting */
- tracker_parser_message_hex (" After UTF8 conversion",
- utf8_str,
- new_word_length);
- }
-
- /* Stemming needed? */
- if (parser->enable_stemmer) {
- /* Input for stemmer ALWAYS in UTF-8, as well as output */
- stemmed = tracker_language_stem_word (parser->language,
- utf8_str,
- new_word_length);
-
- /* Log after stemming */
- tracker_parser_message_hex (" After stemming",
- stemmed, strlen (stemmed));
- }
-
- /* If stemmed wanted and succeeded, free previous and return it */
- if (stemmed) {
- g_free (utf8_str);
- return stemmed;
- }
-
- return utf8_str;
-}
-
-
-/* Both Input and Output are always UTF-8 */
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
- gint length,
- gboolean do_strip)
-{
- UErrorCode icu_error = U_ZERO_ERROR;
- UConverter *converter;
- UChar *uchar_word;
- gsize uchar_len;
- gchar *processed;
-
- /* Open converter UTF-8 to UChar */
- converter = ucnv_open ("UTF-8", &icu_error);
- if (!converter) {
- g_warning ("Cannot open UTF-8 converter: '%s'",
- U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
- return NULL;
- }
-
- /* Compute length if not already as input */
- if (length < 0) {
- length = strlen (word);
- }
-
- /* Twice the size of the UTF-8 string for UChars */
- uchar_word = g_malloc (2 * length);
-
- /* Convert from UTF-8 to UChars*/
- uchar_len = ucnv_toUChars (converter,
- uchar_word,
- 2 * length,
- word,
- length,
- &icu_error);
- if (U_FAILURE (icu_error)) {
- g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
- u_errorName (icu_error));
- g_free (uchar_word);
- ucnv_close (converter);
- return NULL;
- }
-
- ucnv_close (converter);
-
- /* Process UChar based word */
- processed = process_word_uchar (parser,
- uchar_word,
- uchar_len,
- (do_strip ?
- TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
- TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC));
- g_free (uchar_word);
- return processed;
-}
-
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 7b21947..67dda5f 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -43,11 +43,6 @@ typedef enum {
/* Max possible length of a UTF-8 encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512
-static gchar *process_word_utf8 (TrackerParser *parser,
- const gchar *word,
- gint length,
- TrackerParserWordType type);
-
struct TrackerParser {
const gchar *txt;
gint txt_size;
@@ -144,6 +139,130 @@ get_word_info (TrackerParser *parser,
return TRUE;
}
+static gchar *
+process_word_utf8 (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ TrackerParserWordType type)
+{
+ gchar word_buffer [WORD_BUFFER_LENGTH];
+ gchar *normalized = NULL;
+ gchar *stripped = NULL;
+ gchar *stemmed = NULL;
+ size_t new_word_length;
+
+ g_return_val_if_fail (parser != NULL, NULL);
+ g_return_val_if_fail (word != NULL, NULL);
+
+ /* If length is set as -1, the input word MUST be NIL-terminated.
+ * Otherwise, this restriction is not needed as the length to process
+ * is given as input argument */
+ if (length < 0) {
+ length = strlen (word);
+ }
+
+ /* Log original word */
+ tracker_parser_message_hex ("ORIGINAL word",
+ word, length);
+
+ /* Normalization and case-folding ONLY for non-ASCII */
+ if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+ /* Leave space for last NIL */
+ new_word_length = WORD_BUFFER_LENGTH - 1;
+
+ /* Casefold and NFC normalization in output.
+ * NOTE: if the output buffer is not big enough, u8_casefold will
+ * return a newly-allocated buffer. */
+ normalized = u8_casefold ((const uint8_t *)word,
+ length,
+ uc_locale_language (),
+ UNINORM_NFC,
+ word_buffer,
+ &new_word_length);
+
+ /* Case folding + Normalization failed, ignore this word */
+ g_return_val_if_fail (normalized != NULL, NULL);
+
+ /* If output buffer is not the same as the one passed to
+ * u8_casefold, we know it was newly-allocated, so need
+ * to resize it in 1 byte to add last NIL */
+ if (normalized != word_buffer) {
+ normalized = g_realloc (normalized, new_word_length + 1);
+ }
+
+ /* Log after Normalization */
+ tracker_parser_message_hex (" After Casefolding and NFC normalization",
+ normalized, new_word_length);
+ } else {
+ /* For ASCII-only, just tolower() each character */
+ gsize i;
+
+ normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
+
+ for (i = 0; i < length; i++) {
+ normalized[i] = g_ascii_tolower (word[i]);
+ }
+
+ new_word_length = length;
+
+ /* Log after tolower */
+ tracker_parser_message_hex (" After Lowercasing",
+ normalized, new_word_length);
+ }
+
+ /* Set output NIL */
+ normalized[new_word_length] = '\0';
+
+ /* UNAC stripping needed? (for non-CJK and non-ASCII) */
+ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+ gsize stripped_word_length;
+
+ stripped = tracker_parser_unaccent_utf8_word (normalized,
+ new_word_length,
+ &stripped_word_length);
+
+ if (stripped) {
+ /* Log after UNAC stripping */
+ tracker_parser_message_hex (" After UNAC stripping",
+ stripped, stripped_word_length);
+ new_word_length = stripped_word_length;
+ }
+ }
+
+ /* Stemming needed? */
+ if (parser->enable_stemmer) {
+ stemmed = tracker_language_stem_word (parser->language,
+ stripped ? stripped : normalized,
+ new_word_length);
+
+ /* Log after stemming */
+ tracker_parser_message_hex (" After stemming",
+ stemmed, strlen (stemmed));
+ }
+
+ /* If stemmed wanted and succeeded, free previous and return it */
+ if (stemmed) {
+ g_free (stripped);
+ if (normalized != word_buffer) {
+ g_free (normalized);
+ }
+ return stemmed;
+ }
+
+ /* If stripped wanted and succeeded, free previous and return it */
+ if (stripped) {
+ if (normalized != word_buffer) {
+ g_free (normalized);
+ }
+ return stripped;
+ }
+
+ /* It may be the case that no stripping and no stemming was needed, and
+ * that the output buffer in stack was enough for case-folding and
+ * normalization. In this case, need to strdup() the string to return it */
+ return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
+}
+
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -315,145 +434,6 @@ tracker_parser_reset (TrackerParser *parser,
}
}
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
- gint length,
- gboolean do_strip)
-{
-
- return process_word_utf8 (parser,
- word,
- length,
- (do_strip ?
- TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
- TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC));
-}
-
-static gchar *
-process_word_utf8 (TrackerParser *parser,
- const gchar *word,
- gint length,
- TrackerParserWordType type)
-{
- gchar word_buffer [WORD_BUFFER_LENGTH];
- gchar *normalized = NULL;
- gchar *stripped = NULL;
- gchar *stemmed = NULL;
- size_t new_word_length;
-
- g_return_val_if_fail (parser != NULL, NULL);
- g_return_val_if_fail (word != NULL, NULL);
-
- /* If length is set as -1, the input word MUST be NIL-terminated.
- * Otherwise, this restriction is not needed as the length to process
- * is given as input argument */
- if (length < 0) {
- length = strlen (word);
- }
-
- /* Log original word */
- tracker_parser_message_hex ("ORIGINAL word",
- word, length);
-
- /* Normalization and case-folding ONLY for non-ASCII */
- if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
- /* Leave space for last NIL */
- new_word_length = WORD_BUFFER_LENGTH - 1;
-
- /* Casefold and NFC normalization in output.
- * NOTE: if the output buffer is not big enough, u8_casefold will
- * return a newly-allocated buffer. */
- normalized = u8_casefold ((const uint8_t *)word,
- length,
- uc_locale_language (),
- UNINORM_NFC,
- word_buffer,
- &new_word_length);
-
- /* Case folding + Normalization failed, ignore this word */
- g_return_val_if_fail (normalized != NULL, NULL);
-
- /* If output buffer is not the same as the one passed to
- * u8_casefold, we know it was newly-allocated, so need
- * to resize it in 1 byte to add last NIL */
- if (normalized != word_buffer) {
- normalized = g_realloc (normalized, new_word_length + 1);
- }
-
- /* Log after Normalization */
- tracker_parser_message_hex (" After Casefolding and NFC normalization",
- normalized, new_word_length);
- } else {
- /* For ASCII-only, just tolower() each character */
- gsize i;
-
- normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
-
- for (i = 0; i < length; i++) {
- normalized[i] = g_ascii_tolower (word[i]);
- }
-
- new_word_length = length;
-
- /* Log after tolower */
- tracker_parser_message_hex (" After Lowercasing",
- normalized, new_word_length);
- }
-
- /* Set output NIL */
- normalized[new_word_length] = '\0';
-
- /* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
- gsize stripped_word_length;
-
- stripped = tracker_parser_unaccent_utf8_word (normalized,
- new_word_length,
- &stripped_word_length);
-
- if (stripped) {
- /* Log after UNAC stripping */
- tracker_parser_message_hex (" After UNAC stripping",
- stripped, stripped_word_length);
- new_word_length = stripped_word_length;
- }
- }
-
- /* Stemming needed? */
- if (parser->enable_stemmer) {
- stemmed = tracker_language_stem_word (parser->language,
- stripped ? stripped : normalized,
- new_word_length);
-
- /* Log after stemming */
- tracker_parser_message_hex (" After stemming",
- stemmed, strlen (stemmed));
- }
-
- /* If stemmed wanted and succeeded, free previous and return it */
- if (stemmed) {
- g_free (stripped);
- if (normalized != word_buffer) {
- g_free (normalized);
- }
- return stemmed;
- }
-
- /* If stripped wanted and succeeded, free previous and return it */
- if (stripped) {
- if (normalized != word_buffer) {
- g_free (normalized);
- }
- return stripped;
- }
-
- /* It may be the case that no stripping and no stemming was needed, and
- * that the output buffer in stack was enough for case-folding and
- * normalization. In this case, need to strdup() the string to return it */
- return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
-}
-
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 57426c3..cc12398 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -48,11 +48,6 @@ const gchar * tracker_parser_next (TrackerParser *parser,
gboolean *stop_word,
gint *word_length);
-gchar * tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
- gint length,
- gboolean do_strip);
-
void tracker_parser_free (TrackerParser *parser);
G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]