[tracker/parser-unicode-libs-review] Filtering rules updated
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review] Filtering rules updated
- Date: Fri, 7 May 2010 12:01:33 +0000 (UTC)
commit a88554a8a5d66030ab4819efe73103e170eaadef
Author: Aleksander Morgado <aleksander lanedo com>
Date: Fri May 7 13:46:43 2010 +0200
Filtering rules updated
* By default, skip numbers
* Enable underscore as word starter
* Remove symbols from valid word starters
src/libtracker-fts/tracker-fts.c | 6 +-
src/libtracker-fts/tracker-parser-glib.c | 19 +++++----
src/libtracker-fts/tracker-parser-libicu.c | 47 ++++++++++------------
src/libtracker-fts/tracker-parser-libunistring.c | 30 ++++++--------
src/libtracker-fts/tracker-parser-utils.h | 11 +++++
src/libtracker-fts/tracker-parser.h | 3 +-
tests/libtracker-fts/tracker-parser-test.c | 1 +
7 files changed, 62 insertions(+), 55 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index 857b3f8..f6c5b8b 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -3658,7 +3658,7 @@ static void snippetOffsetsOfColumn(
pVtab = pQuery->pFts;
nColumn = pVtab->nColumn;
- tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, FALSE);
+ tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, TRUE, TRUE);
aTerm = pQuery->pTerms;
nTerm = pQuery->nTerms;
@@ -4355,7 +4355,7 @@ static int tokenizeSegment(
int firstIndex = pQuery->nTerms;
int nTerm = 1;
- tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, TRUE);
+ tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, FALSE, TRUE);
while( 1 ){
const char *pToken;
@@ -4808,7 +4808,7 @@ int Catid,
if (!zText) return SQLITE_OK;
- tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, FALSE);
+ tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, TRUE, TRUE);
while( 1 ){
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index 83a969b..555331f 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -77,7 +77,8 @@ struct TrackerParser {
guint max_words_to_index;
guint max_word_length;
gboolean delimit_words;
- gboolean parse_reserved_words;
+ gboolean skip_reserved_words;
+ gboolean skip_numbers;
/* Private members */
gchar *word;
@@ -278,14 +279,14 @@ parser_next (TrackerParser *parser,
/* word break */
/* check if word is reserved */
- if (is_valid && parser->parse_reserved_words) {
+ if (is_valid && parser->skip_reserved_words) {
if (length == 2 && word[0] == 'o' && word[1] == 'r') {
- break;
+ is_valid = FALSE;
}
}
if (!is_valid ||
- word_type == TRACKER_PARSER_WORD_NUM) {
+ (parser->skip_numbers && word_type == TRACKER_PARSER_WORD_NUM)) {
word_type = TRACKER_PARSER_WORD_IGNORE;
is_valid = TRUE;
length = 0;
@@ -312,12 +313,12 @@ parser_next (TrackerParser *parser,
* underscore if we are filtering.
*/
- if (type == TRACKER_PARSER_WORD_NUM) {
+ if (parser->skip_numbers && type == TRACKER_PARSER_WORD_NUM) {
is_valid = FALSE;
continue;
} else {
if (type == TRACKER_PARSER_WORD_HYPHEN) {
- is_valid = parser->parse_reserved_words;
+ is_valid = !parser->skip_reserved_words;
continue;
}
}
@@ -461,7 +462,8 @@ tracker_parser_reset (TrackerParser *parser,
gboolean delimit_words,
gboolean enable_stemmer,
gboolean enable_stop_words,
- gboolean parse_reserved_words)
+ gboolean skip_reserved_words,
+ gboolean skip_numbers)
{
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
@@ -479,7 +481,8 @@ tracker_parser_reset (TrackerParser *parser,
parser->txt_size = txt_size;
parser->txt = txt;
- parser->parse_reserved_words = parse_reserved_words;
+ parser->skip_reserved_words = skip_reserved_words;
+ parser->skip_numbers = skip_numbers;
g_free (parser->word);
parser->word = NULL;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 33c062c..190931c 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -34,14 +34,6 @@
#include "tracker-parser.h"
#include "tracker-parser-utils.h"
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
-#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
- ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
- ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
/* Type of words detected */
typedef enum {
TRACKER_PARSER_WORD_TYPE_ASCII,
@@ -69,7 +61,8 @@ struct TrackerParser {
guint max_words_to_index;
guint max_word_length;
gboolean delimit_words;
- gboolean parse_reserved_words;
+ gboolean skip_reserved_words;
+ gboolean skip_numbers;
/* Private members */
gchar *word;
@@ -93,6 +86,7 @@ struct TrackerParser {
static gboolean
get_word_info (const UChar *word,
gsize word_length,
+ gboolean skip_numbers,
gboolean *p_is_allowed_word_start,
TrackerParserWordType *p_word_type)
{
@@ -117,22 +111,20 @@ get_word_info (const UChar *word,
* methods.
*/
unichar_gc = u_charType (unichar);
- if (unichar_gc != U_UPPERCASE_LETTER &&
- unichar_gc != U_LOWERCASE_LETTER &&
- unichar_gc != U_TITLECASE_LETTER &&
- unichar_gc != U_MODIFIER_LETTER &&
- unichar_gc != U_OTHER_LETTER &&
- unichar_gc != U_DECIMAL_DIGIT_NUMBER &&
- unichar_gc != U_LETTER_NUMBER &&
- unichar_gc != U_OTHER_NUMBER &&
- unichar_gc != U_MATH_SYMBOL &&
- unichar_gc != U_CURRENCY_SYMBOL &&
- unichar_gc != U_MODIFIER_SYMBOL &&
- unichar_gc != U_OTHER_SYMBOL) {
+ if (unichar_gc == U_UPPERCASE_LETTER ||
+ unichar_gc == U_LOWERCASE_LETTER ||
+ unichar_gc == U_TITLECASE_LETTER ||
+ unichar_gc == U_MODIFIER_LETTER ||
+ unichar_gc == U_OTHER_LETTER ||
+ IS_UNDERSCORE_UCS4 ((guint32)unichar) ||
+ (!skip_numbers &&
+ (unichar_gc == U_DECIMAL_DIGIT_NUMBER ||
+ unichar_gc == U_LETTER_NUMBER ||
+ unichar_gc == U_OTHER_NUMBER))) {
+ *p_is_allowed_word_start = TRUE;
+ } else {
*p_is_allowed_word_start = FALSE;
return TRUE;
- } else {
- *p_is_allowed_word_start = TRUE;
}
/* Word starts with a CJK character? */
@@ -215,6 +207,7 @@ parser_next (TrackerParser *parser,
/* Get word info... */
if (!get_word_info (&parser->utxt[parser->cursor],
word_length_uchar,
+ parser->skip_numbers,
&is_allowed,
&type)) {
/* Quit loop just in case */
@@ -230,7 +223,7 @@ parser_next (TrackerParser *parser,
}
/* check if word is reserved (looking at ORIGINAL UTF-8 buffer here! */
- if (parser->parse_reserved_words &&
+ if (parser->skip_reserved_words &&
tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
word_length_utf8)) {
/* Skip this word and keep on looping */
@@ -333,7 +326,8 @@ tracker_parser_reset (TrackerParser *parser,
gboolean delimit_words,
gboolean enable_stemmer,
gboolean enable_stop_words,
- gboolean parse_reserved_words)
+ gboolean skip_reserved_words,
+ gboolean skip_numbers)
{
UErrorCode error = U_ZERO_ERROR;
UConverter *converter;
@@ -349,7 +343,8 @@ tracker_parser_reset (TrackerParser *parser,
parser->txt_size = txt_size;
parser->txt = txt;
- parser->parse_reserved_words = parse_reserved_words;
+ parser->skip_reserved_words = skip_reserved_words;
+ parser->skip_numbers = skip_numbers;
g_free (parser->word);
parser->word = NULL;
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index bad3cea..4a6ff35 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -33,15 +33,6 @@
#include "tracker-parser.h"
#include "tracker-parser-utils.h"
-
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
-#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
- ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
- ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
/* Type of words detected */
typedef enum {
TRACKER_PARSER_WORD_TYPE_ASCII,
@@ -67,7 +58,8 @@ struct TrackerParser {
guint max_words_to_index;
guint max_word_length;
gboolean delimit_words;
- gboolean parse_reserved_words;
+ gboolean skip_reserved_words;
+ gboolean skip_numbers;
/* Private members */
gchar *word;
@@ -115,7 +107,7 @@ get_word_info (TrackerParser *parser,
!parser->word_break_flags [i]) {
if (ascii_only &&
- !IS_ASCII_BYTE ((guchar)parser->txt[i])) {
+ !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
ascii_only = FALSE;
}
@@ -135,7 +127,8 @@ get_word_info (TrackerParser *parser,
* should be compatible with all Unicode normalization
* methods.
*/
- if (!uc_is_general_category (first_unichar,
+ if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
+ !uc_is_general_category (first_unichar,
parser->allowed_start)) {
*p_is_allowed_word_start = FALSE;
return TRUE;
@@ -197,7 +190,7 @@ parser_next (TrackerParser *parser,
}
/* check if word is reserved and skip it if so */
- if (parser->parse_reserved_words &&
+ if (parser->skip_reserved_words &&
tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
word_length)) {
/* Skip this word and keep on looping */
@@ -288,7 +281,8 @@ tracker_parser_reset (TrackerParser *parser,
gboolean delimit_words,
gboolean enable_stemmer,
gboolean enable_stop_words,
- gboolean parse_reserved_words)
+ gboolean skip_reserved_words,
+ gboolean skip_numbers)
{
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
@@ -299,7 +293,8 @@ tracker_parser_reset (TrackerParser *parser,
parser->txt_size = txt_size;
parser->txt = txt;
- parser->parse_reserved_words = parse_reserved_words;
+ parser->skip_reserved_words = skip_reserved_words;
+ parser->skip_numbers = skip_numbers;
g_free (parser->word);
parser->word = NULL;
@@ -321,8 +316,9 @@ tracker_parser_reset (TrackerParser *parser,
/* Prepare a custom category which is a combination of the
* desired ones */
parser->allowed_start = UC_LETTER;
- parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
- parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
+ if (!parser->skip_numbers) {
+ parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
+ }
}
gchar *
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index 9c007bd..50805c1 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -30,6 +30,17 @@
G_BEGIN_DECLS
+/* ASCII-7 is in range [0x00,0x7F] */
+#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
+
+/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
+#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
+ ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
+ ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
+
+
gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
gsize ilength,
gsize *p_olength);
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 3175b22..cad4442 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -38,7 +38,8 @@ void tracker_parser_reset (TrackerParser *parser,
gboolean delimit_words,
gboolean enable_stemmer,
gboolean enable_stop_words,
- gboolean parse_reserved_words);
+ gboolean skip_reserved_words,
+ gboolean skip_numbers);
const gchar * tracker_parser_next (TrackerParser *parser,
gint *position,
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 1c38215..970c8cc 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -149,6 +149,7 @@ run_parsing (void)
TRUE,
TRUE,
TRUE,
+ TRUE,
TRUE);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]