[tracker/parser-unicode-libs-review] Improve ASCII-only parsing
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review] Improve ASCII-only parsing
- Date: Wed, 5 May 2010 10:42:41 +0000 (UTC)
commit b16612e50feb5cb944d83b374e2ab3d0898c3be6
Author: Aleksander Morgado <aleksander lanedo com>
Date: Wed May 5 12:41:21 2010 +0200
Improve ASCII-only parsing
* Don't perform normalization if ASCII-only
* Don't perform full case-folding if ASCII-only (only lowercase)
src/libtracker-fts/tracker-parser-libicu.c | 125 +++++++-----
src/libtracker-fts/tracker-parser-libunistring.c | 227 ++++++++++++++--------
2 files changed, 220 insertions(+), 132 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 40e740b..5a4f1e3 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -42,6 +42,13 @@
((c) >= 0x4E00 && (c) <= 0x9FA5) || \
((c) >= 0x20000 && (c) <= 0x2A6D6))
+/* Type of words detected */
+typedef enum {
+ TRACKER_PARSER_WORD_TYPE_ASCII,
+ TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
+ TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
+} TrackerParserWordType;
+
/* Max possible length of a UChar encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512
@@ -49,7 +56,7 @@
static gchar *process_word_uchar (TrackerParser *parser,
const UChar *word,
gint length,
- gboolean do_strip);
+ TrackerParserWordType type);
struct TrackerParser {
@@ -84,19 +91,15 @@ struct TrackerParser {
static gboolean
-get_word_info (const UChar *word,
- gsize word_length,
- gboolean *p_is_allowed_word_start,
- gboolean *p_is_ascii_or_cjk)
+get_word_info (const UChar *word,
+ gsize word_length,
+ gboolean *p_is_allowed_word_start,
+ TrackerParserWordType *p_word_type)
{
UCharIterator iter;
UChar32 unichar;
guint8 unichar_gc;
- /* Defaults... */
- *p_is_allowed_word_start = TRUE;
- *p_is_ascii_or_cjk = TRUE;
-
/* Get first character of the word as UCS4 */
uiter_setString (&iter, word, word_length);
unichar = uiter_current32 (&iter);
@@ -128,10 +131,13 @@ get_word_info (const UChar *word,
unichar_gc != U_OTHER_SYMBOL) {
*p_is_allowed_word_start = FALSE;
return TRUE;
+ } else {
+ *p_is_allowed_word_start = TRUE;
}
/* Word starts with a CJK character? */
if (IS_CJK_UCS4 ((guint32)unichar)) {
+ *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
return TRUE;
}
@@ -139,17 +145,16 @@ get_word_info (const UChar *word,
while (unichar != U_SENTINEL)
{
if (!IS_ASCII_UCS4 ((guint32)unichar)) {
- *p_is_ascii_or_cjk = FALSE;
+ *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
return TRUE;
}
unichar = uiter_next32 (&iter);
}
+ *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
return TRUE;
}
-
-/* libunistring-based parser */
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -168,7 +173,7 @@ parser_next (TrackerParser *parser,
/* Loop to look for next valid word */
while (!processed_word &&
parser->cursor < parser->utxt_size) {
- gboolean is_ascii_or_cjk;
+ TrackerParserWordType type;
gboolean is_allowed;
gsize next_word_offset_uchar;
gsize next_word_offset_utf8;
@@ -204,7 +209,7 @@ parser_next (TrackerParser *parser,
if (!get_word_info (&parser->utxt[parser->cursor],
word_length_uchar,
&is_allowed,
- &is_ascii_or_cjk)) {
+ &type)) {
/* Quit loop just in case */
parser->cursor = parser->utxt_size;
break;
@@ -243,7 +248,7 @@ parser_next (TrackerParser *parser,
processed_word = process_word_uchar (parser,
&(parser->utxt[parser->cursor]),
truncated_length,
- !is_ascii_or_cjk);
+ type);
if (!processed_word) {
/* Skip this word and keep on looping */
parser->cursor = next_word_offset_uchar;
@@ -407,53 +412,69 @@ tracker_parser_reset (TrackerParser *parser,
}
static gchar *
-process_word_uchar (TrackerParser *parser,
- const UChar *word,
- gint length,
- gboolean do_strip)
+process_word_uchar (TrackerParser *parser,
+ const UChar *word,
+ gint length,
+ TrackerParserWordType type)
{
UErrorCode error = U_ZERO_ERROR;
- UChar casefolded_buffer [WORD_BUFFER_LENGTH];
UChar normalized_buffer [WORD_BUFFER_LENGTH];
gchar *utf8_str = NULL;
gchar *stemmed = NULL;
size_t new_word_length;
- /* Casefold... */
- new_word_length = u_strFoldCase (casefolded_buffer,
- WORD_BUFFER_LENGTH,
- word,
- length,
- U_FOLD_CASE_DEFAULT,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error casefolding: '%s'",
- u_errorName (error));
- return NULL;
- }
- if (new_word_length > WORD_BUFFER_LENGTH)
- new_word_length = WORD_BUFFER_LENGTH;
-
- /* NFC normalization... */
- new_word_length = unorm_normalize (casefolded_buffer,
- new_word_length,
- UNORM_NFC,
- 0,
- normalized_buffer,
- WORD_BUFFER_LENGTH,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error normalizing: '%s'",
- u_errorName (error));
- return NULL;
- }
+ if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+ UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+
+ /* Casefold... */
+ new_word_length = u_strFoldCase (casefolded_buffer,
+ WORD_BUFFER_LENGTH,
+ word,
+ length,
+ U_FOLD_CASE_DEFAULT,
+ &error);
+ if (U_FAILURE (error)) {
+ g_warning ("Error casefolding: '%s'",
+ u_errorName (error));
+ return NULL;
+ }
+ if (new_word_length > WORD_BUFFER_LENGTH)
+ new_word_length = WORD_BUFFER_LENGTH;
+
+ /* NFC normalization... */
+ new_word_length = unorm_normalize (casefolded_buffer,
+ new_word_length,
+ UNORM_NFC,
+ 0,
+ normalized_buffer,
+ WORD_BUFFER_LENGTH,
+ &error);
+ if (U_FAILURE (error)) {
+ g_warning ("Error normalizing: '%s'",
+ u_errorName (error));
+ return NULL;
+ }
- if (new_word_length > WORD_BUFFER_LENGTH)
- new_word_length = WORD_BUFFER_LENGTH;
+ if (new_word_length > WORD_BUFFER_LENGTH)
+ new_word_length = WORD_BUFFER_LENGTH;
+ } else {
+ /* For ASCII-only, just tolower() each character */
+ new_word_length = u_strToLower (normalized_buffer,
+ WORD_BUFFER_LENGTH,
+ word,
+ length,
+ NULL,
+ &error);
+ if (U_FAILURE (error)) {
+ g_warning ("Error lowercasing: '%s'",
+ u_errorName (error));
+ return NULL;
+ }
+ }
- /* UNAC stripping needed? */
- if (do_strip) {
+ /* UNAC stripping needed? (for non-CJK and non-ASCII) */
+ if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
gsize stripped_word_length;
/* Get unaccented string in UTF-8 */
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 6fec131..f022cbb 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -33,6 +33,7 @@
#include "tracker-parser.h"
#include "tracker-parser-utils.h"
+
/* ASCII-7 is in range [0x00,0x7F] */
#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
@@ -41,9 +42,20 @@
((c) >= 0x4E00 && (c) <= 0x9FA5) || \
((c) >= 0x20000 && (c) <= 0x2A6D6))
+/* Type of words detected */
+typedef enum {
+ TRACKER_PARSER_WORD_TYPE_ASCII,
+ TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
+ TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
+} TrackerParserWordType;
+
/* Max possible length of a UTF-8 encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512
+static gchar *process_word_utf8 (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ TrackerParserWordType type);
struct TrackerParser {
const gchar *txt;
@@ -70,25 +82,76 @@ struct TrackerParser {
uc_general_category_t allowed_start;
};
-/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
- * UNAC stripping.
- * Just check byte per byte, and if any of the bytes is >127, then it's not
- * ASCII-7 */
static gboolean
-is_ascii_word (const gchar *word,
- gsize length)
+get_word_info (TrackerParser *parser,
+ gsize *p_word_length,
+ gboolean *p_is_allowed_word_start,
+ TrackerParserWordType *p_word_type)
{
+ ucs4_t first_unichar;
+ gint first_unichar_len;
gsize i;
+ gboolean ascii_only;
+
+ /* Defaults */
+ *p_is_allowed_word_start = TRUE;
+
+ /* Get first character of the word as UCS4 */
+ first_unichar_len = u8_strmbtouc (&first_unichar,
+ &(parser->txt[parser->cursor]));
+ if (first_unichar_len <= 0) {
+ /* This should only happen if NIL was passed to u8_strmbtouc,
+ * so better just force stop here */
+ return FALSE;
+ } else {
+ /* If first character has length 1, it's ASCII-7 */
+ ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
+ }
+
+ /* Find next word break, and in the same loop checking if only ASCII
+ * characters */
+ i = parser->cursor + first_unichar_len;
+ while (i < parser->txt_size &&
+ !parser->word_break_flags [i]) {
- for (i = 0; i < length; i++) {
- if (!IS_ASCII_BYTE ((guchar)word[i])) {
- return FALSE;
+ if (ascii_only &&
+ !IS_ASCII_BYTE ((guchar)parser->txt[i])) {
+ ascii_only = FALSE;
}
+
+ i++;
+ }
+
+ /* Word end is the first byte after the word, which is either the
+ * start of next word or the end of the string */
+ *p_word_length = i - parser->cursor;
+
+ /* We only want the words where the first character
+ * in the word is either a letter, a number or a symbol.
+ * This is needed because the word break algorithm also
+ * considers word breaks after for example commas or other
+ * punctuation marks.
+ * Note that looking at the first character in the string
+ * should be compatible with all Unicode normalization
+ * methods.
+ */
+ if (!uc_is_general_category (first_unichar,
+ parser->allowed_start)) {
+ *p_is_allowed_word_start = FALSE;
+ return TRUE;
+ }
+
+ /* Decide word type */
+ if (ascii_only) {
+ *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
+ } else if (IS_CJK_UCS4 (first_unichar)) {
+ *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+ } else {
+ *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
}
return TRUE;
}
-/* libunistring-based parser */
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -105,44 +168,22 @@ parser_next (TrackerParser *parser,
/* Loop to look for next valid word */
while (!processed_word &&
parser->cursor < parser->txt_size) {
- ucs4_t first_unichar;
- gint first_unichar_len;
- gsize i;
+ TrackerParserWordType type;
gsize truncated_length;
- gboolean do_strip;
-
- /* Get first character of the word as UCS4 */
- first_unichar_len = u8_strmbtouc (&first_unichar,
- &(parser->txt[parser->cursor]));
- if (first_unichar_len <= 0) {
- /* This should only happen if NIL was passed to u8_strmbtouc,
- * so better just force stop here */
+ gboolean is_allowed;
+
+ /* Get word info */
+ if (!get_word_info (parser,
+ &word_length,
+ &is_allowed,
+ &type)) {
+ /* Quit loop just in case */
parser->cursor = parser->txt_size;
break;
}
- /* Find next word break */
- i = parser->cursor + first_unichar_len;
- while (i < parser->txt_size &&
- !parser->word_break_flags [i]) {
- i++;
- }
-
- /* Word end is the first byte after the word, which is either the
- * start of next word or the end of the string */
- word_length = i - parser->cursor;
-
- /* We only want the words where the first character
- * in the word is either a letter, a number or a symbol.
- * This is needed because the word break algorithm also
- * considers word breaks after for example commas or other
- * punctuation marks.
- * Note that looking at the first character in the string
- * should be compatible with all Unicode normalization
- * methods.
- */
- if (!uc_is_general_category (first_unichar,
- parser->allowed_start)) {
+ /* Skip the word if not an allowed word start */
+ if (!is_allowed) {
/* Skip this word and keep on looping */
parser->cursor += word_length;
continue;
@@ -164,18 +205,13 @@ parser_next (TrackerParser *parser,
word_length :
WORD_BUFFER_LENGTH - 1);
- /* Enable UNAC stripping only if no ASCII and no CJK */
- do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
- truncated_length) &&
- !IS_CJK_UCS4 (first_unichar));
-
/* Process the word here. If it fails, we can still go
* to the next one. Returns newly allocated string
* always */
- processed_word = tracker_parser_process_word (parser,
- &(parser->txt[parser->cursor]),
- truncated_length,
- do_strip);
+ processed_word = process_word_utf8 (parser,
+ &(parser->txt[parser->cursor]),
+ truncated_length,
+ type);
if (!processed_word) {
/* Skip this word and keep on looping */
parser->cursor += word_length;
@@ -283,13 +319,26 @@ tracker_parser_reset (TrackerParser *parser,
parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
}
-/* libunistring version of the word processor. */
gchar *
tracker_parser_process_word (TrackerParser *parser,
const gchar *word,
gint length,
gboolean do_strip)
{
+ return process_word_utf8 (parser,
+ word,
+ length,
+ (do_strip ?
+ TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
+ TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC));
+}
+
+static gchar *
+process_word_utf8 (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ TrackerParserWordType type)
+{
gchar word_buffer [WORD_BUFFER_LENGTH];
gchar *normalized = NULL;
gchar *stripped = NULL;
@@ -310,38 +359,57 @@ tracker_parser_process_word (TrackerParser *parser,
tracker_parser_message_hex ("ORIGINAL word",
word, length);
- /* Leave space for last NIL */
- new_word_length = WORD_BUFFER_LENGTH - 1;
+ /* Normalization and case-folding ONLY for non-ASCII */
+ if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+ /* Leave space for last NIL */
+ new_word_length = WORD_BUFFER_LENGTH - 1;
+
+ /* Casefold and NFC normalization in output.
+ * NOTE: if the output buffer is not big enough, u8_casefold will
+ * return a newly-allocated buffer. */
+ normalized = u8_casefold ((const uint8_t *)word,
+ length,
+ uc_locale_language (),
+ UNINORM_NFC,
+ word_buffer,
+ &new_word_length);
+
+ /* Case folding + Normalization failed, skip this word */
+ g_return_val_if_fail (normalized != NULL, NULL);
+
+ /* If output buffer is not the same as the one passed to
+ * u8_casefold, we know it was newly-allocated, so need
+ * to resize it in 1 byte to add last NIL */
+ if (normalized != word_buffer) {
+ normalized = g_realloc (normalized, new_word_length + 1);
+ }
- /* Casefold and NFC normalization in output.
- * NOTE: if the output buffer is not big enough, u8_casefold will
- * return a newly-allocated buffer. */
- normalized = u8_casefold ((const uint8_t *)word,
- length,
- uc_locale_language (),
- UNINORM_NFC,
- word_buffer,
- &new_word_length);
-
- /* Case folding + Normalization failed, skip this word */
- g_return_val_if_fail (normalized != NULL, NULL);
-
- /* If output buffer is not the same as the one passed to
- * u8_casefold, we know it was newly-allocated, so need
- * to resize it in 1 byte to add last NIL */
- if (normalized != word_buffer) {
- normalized = g_realloc (normalized, new_word_length + 1);
+ /* Log after Normalization */
+ tracker_parser_message_hex (" After Casefolding and NFC normalization",
+ normalized, new_word_length);
+ }
+ else {
+ /* For ASCII-only, just tolower() each character */
+ gsize i;
+
+ normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length) : word_buffer;
+
+ for (i = 0; i < length; i++) {
+ normalized[i] = g_ascii_tolower (word[i]);
+ }
+
+ new_word_length = length;
+
+ /* Log after tolower */
+ tracker_parser_message_hex (" After Lowercasing",
+ normalized, new_word_length);
}
/* Set output NIL */
normalized[new_word_length] = '\0';
- /* Log after Normalization */
- tracker_parser_message_hex (" After Casefolding and NFC normalization",
- normalized, new_word_length);
-
- /* UNAC stripping needed? */
- if (do_strip) {
+ /* UNAC stripping needed? (for non-CJK and non-ASCII) */
+ if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
gsize stripped_word_length;
stripped = tracker_parser_unaccent_utf8_word (normalized,
@@ -356,7 +424,6 @@ tracker_parser_process_word (TrackerParser *parser,
}
}
-
/* Stemming needed? */
if (parser->enable_stemmer) {
stemmed = tracker_language_stem_word (parser->language,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]