[tracker/miner-web: 38/77] libtracker-fts: Do not limit word length in prefix queries
- From: Adrien Bustany <abustany src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/miner-web: 38/77] libtracker-fts: Do not limit word length in prefix queries
- Date: Wed, 3 Mar 2010 12:51:53 +0000 (UTC)
commit e38a2a2ca74c99084347e864d1c90544cac14292
Author: Mikael Ottela <mikael ottela ixonos com>
Date: Thu Feb 25 14:43:56 2010 +0100
libtracker-fts: Do not limit word length in prefix queries
Index short words for properties that specify tracker:fulltextNoLimit.
Limit the word length in exact match queries but not in prefix ones.
src/libtracker-data/tracker-data-update.c | 7 ++-
src/libtracker-fts/tracker-fts.c | 63 ++++++++++++++++------------
src/libtracker-fts/tracker-fts.h | 2 +-
src/libtracker-fts/tracker-parser.c | 9 +---
src/libtracker-fts/tracker-parser.h | 6 ++-
5 files changed, 48 insertions(+), 39 deletions(-)
---
diff --git a/src/libtracker-data/tracker-data-update.c b/src/libtracker-data/tracker-data-update.c
index a574ad6..f2713b7 100644
--- a/src/libtracker-data/tracker-data-update.c
+++ b/src/libtracker-data/tracker-data-update.c
@@ -666,7 +666,9 @@ tracker_data_resource_buffer_flush (GError **error)
g_string_append (fts, g_value_get_string (g_value_array_get_nth (values, i)));
g_string_append_c (fts, ' ');
}
- tracker_fts_update_text (resource_buffer->id, tracker_data_query_resource_id (tracker_property_get_uri (prop)), fts->str);
+ tracker_fts_update_text (resource_buffer->id,
+ tracker_data_query_resource_id (tracker_property_get_uri (prop)),
+ fts->str, !tracker_property_get_fulltext_no_limit (prop));
g_string_free (fts, TRUE);
}
}
@@ -1036,7 +1038,8 @@ get_old_property_values (TrackerProperty *property,
/* delete old fts entries */
for (i = 0; i < old_values->n_values; i++) {
tracker_fts_update_text (resource_buffer->id, -1,
- g_value_get_string (g_value_array_get_nth (old_values, i)));
+ g_value_get_string (g_value_array_get_nth (old_values, i)),
+ !tracker_property_get_fulltext_no_limit (prop));
}
}
}
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index 2b89649..b80f102 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -2323,6 +2323,7 @@ struct fulltext_vtab {
TrackerParser *parser; /* tokenizer for inserts and queries */
gboolean stop_words;
int max_words;
+ int min_word_length;
/* Precompiled statements which we keep as long as the table is
** open.
@@ -3358,8 +3359,8 @@ static int constructVtab(
max_len = tracker_fts_config_get_max_word_length (config);
v->max_words = tracker_fts_config_get_max_words_to_index (config);
-
- v->parser = tracker_parser_new (language, max_len, min_len);
+ v->min_word_length = min_len;
+ v->parser = tracker_parser_new (language, max_len);
/* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests */
v->stop_words = g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") != 0;
@@ -4332,14 +4333,19 @@ static int tokenizeSegment(
pToken = tracker_parser_next (parser, &iPos,
- &iBegin,
- &iEnd,
- &stop_word,
- &nToken);
+ &iBegin,
+ &iEnd,
+ &stop_word,
+ &nToken);
if (!pToken) {
break;
}
+ /* If prefix search ignore the word length limit */
+ if( nToken < v->min_word_length && !(iEnd<nSegment && pSegment[iEnd]=='*') ){
+ continue;
+ }
+
// printf("token being indexed is %s, pos is %d, begin is %d, end is %d and length is %d\n", pToken, iPos, iBegin, iEnd, nToken);
if( !inPhrase &&
@@ -4363,10 +4369,10 @@ static int tokenizeSegment(
continue;
}
if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4
- && pToken[0]=='n'
- && pToken[1]=='e'
- && pToken[2]=='a'
- && pToken[3]=='r'
+ && pToken[0]=='n'
+ && pToken[1]=='e'
+ && pToken[2]=='a'
+ && pToken[3]=='r'
){
QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1];
if( (iBegin+6)<nSegment
@@ -4380,10 +4386,10 @@ static int tokenizeSegment(
iEnd++;
}
pToken = tracker_parser_next (parser, &iPos,
- &iBegin,
- &iEnd,
- &stop_word,
- &nToken);
+ &iBegin,
+ &iEnd,
+ &stop_word,
+ &nToken);
if (!pToken) {
break;
}
@@ -4756,7 +4762,8 @@ static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
#ifdef STORE_CATEGORY
int Catid,
#endif
- const char *zText, int iColumn){
+ const char *zText, int iColumn,
+ gboolean limit_word_length){
const char *pToken;
int nTokenBytes;
@@ -4773,24 +4780,24 @@ int Catid,
while( 1 ){
pToken = tracker_parser_next (parser, &iPosition,
- &iStartOffset,
- &iEndOffset,
- &stop_word,
- &nTokenBytes);
+ &iStartOffset,
+ &iEndOffset,
+ &stop_word,
+ &nTokenBytes);
if (!pToken) {
break;
}
+ if (limit_word_length && nTokenBytes < v->min_word_length) {
+ continue;
+ }
+
// printf("token being indexed is %s, begin is %d, end is %d and length is %d\n", pToken, iStartOffset, iEndOffset, nTokenBytes);
if (stop_word) {
continue;
}
-
-
-
-
/* Positions can't be negative; we use -1 as a terminator
* internally. Token can't be NULL or empty. */
if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){
@@ -4947,7 +4954,7 @@ static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
/* tracker - as for col id we want col 0 to be the default metadata field (file:contents or email:body) ,
col 1 to be meatdata id 1, col 2 to be metadat id 2 etc so need to decrement i here */
- int rc = buildTerms(v, iRow, sqlite3_value_int (pValues[0]), zText, delete ? -1 : (i-1));
+ int rc = buildTerms(v, iRow, sqlite3_value_int (pValues[0]), zText, delete ? -1 : (i-1), TRUE);
if( rc!=SQLITE_OK ) return rc;
}
@@ -4955,7 +4962,7 @@ static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
for(i = 0; i < v->nColumn ; ++i){
char *zText = (char*)sqlite3_value_text(pValues[i]);
- rc = buildTerms(v, iRow, zText, delete ? -1 : i);
+ rc = buildTerms(v, iRow, zText, delete ? -1 : i, TRUE);
if( rc!=SQLITE_OK ) return rc;
}
@@ -7775,8 +7782,10 @@ int tracker_fts_update_init(int id){
return initPendingTerms(tracker_fts_vtab, id);
}
-int tracker_fts_update_text(int id, int column_id, const char *text){
- return buildTerms(tracker_fts_vtab, id, text, column_id);
+int tracker_fts_update_text(int id, int column_id,
+ const char *text, gboolean limit_word_length){
+ return buildTerms(tracker_fts_vtab, id, text,
+ column_id, limit_word_length);
}
void tracker_fts_update_commit(void){
diff --git a/src/libtracker-fts/tracker-fts.h b/src/libtracker-fts/tracker-fts.h
index b39c4c5..491404d 100644
--- a/src/libtracker-fts/tracker-fts.h
+++ b/src/libtracker-fts/tracker-fts.h
@@ -24,7 +24,7 @@ G_BEGIN_DECLS
int tracker_fts_init (sqlite3 *db);
int tracker_fts_update_init (int id);
-int tracker_fts_update_text (int id, int column_id, const char *text);
+int tracker_fts_update_text (int id, int column_id, const char *text, gboolean limit_word_length);
void tracker_fts_update_commit (void);
void tracker_fts_update_rollback (void);
gchar * tracker_fts_get_create_fts_table_query (void);
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index cf1d1e3..84eda5a 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -79,7 +79,6 @@ struct TrackerParser {
gboolean enable_stop_words;
guint max_words_to_index;
guint max_word_length;
- guint min_word_length;
gboolean delimit_words;
gboolean parse_reserved_words;
@@ -323,7 +322,6 @@ parser_next (TrackerParser *parser,
}
if (!is_valid ||
- length < parser->min_word_length ||
word_type == TRACKER_PARSER_WORD_NUM) {
word_type = TRACKER_PARSER_WORD_IGNORE;
is_valid = TRUE;
@@ -460,21 +458,18 @@ parser_next (TrackerParser *parser,
TrackerParser *
tracker_parser_new (TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length)
+ gint max_word_length)
{
TrackerParser *parser;
g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
- g_return_val_if_fail (min_word_length > 0, NULL);
- g_return_val_if_fail (min_word_length < max_word_length, NULL);
+ g_return_val_if_fail (max_word_length > 0, NULL);
parser = g_new0 (TrackerParser, 1);
parser->language = g_object_ref (language);
parser->max_word_length = max_word_length;
- parser->min_word_length = min_word_length;
parser->word_length = 0;
parser->attrs = NULL;
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 6e6b7fa..f6503ac 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -30,8 +30,8 @@ G_BEGIN_DECLS
typedef struct TrackerParser TrackerParser;
TrackerParser *tracker_parser_new (TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length);
+ gint max_word_length);
+
void tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
@@ -39,12 +39,14 @@ void tracker_parser_reset (TrackerParser *parser,
gboolean enable_stemmer,
gboolean enable_stop_words,
gboolean parse_reserved_words);
+
const gchar * tracker_parser_next (TrackerParser *parser,
gint *position,
gint *byte_offset_start,
gint *byte_offset_end,
gboolean *stop_word,
gint *word_length);
+
gchar * tracker_parser_process_word (TrackerParser *parser,
const char *word,
gint length,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]