[tracker/miner-web-review: 38/74] libtracker-fts: Do not limit word length in prefix queries



commit e38a2a2ca74c99084347e864d1c90544cac14292
Author: Mikael Ottela <mikael ottela ixonos com>
Date:   Thu Feb 25 14:43:56 2010 +0100

    libtracker-fts: Do not limit word length in prefix queries
    
    Index short words for properties that specify tracker:fulltextNoLimit.
    Limit the word length in exact match queries but not in prefix ones.

 src/libtracker-data/tracker-data-update.c |    7 ++-
 src/libtracker-fts/tracker-fts.c          |   63 ++++++++++++++++------------
 src/libtracker-fts/tracker-fts.h          |    2 +-
 src/libtracker-fts/tracker-parser.c       |    9 +---
 src/libtracker-fts/tracker-parser.h       |    6 ++-
 5 files changed, 48 insertions(+), 39 deletions(-)
---
diff --git a/src/libtracker-data/tracker-data-update.c b/src/libtracker-data/tracker-data-update.c
index a574ad6..f2713b7 100644
--- a/src/libtracker-data/tracker-data-update.c
+++ b/src/libtracker-data/tracker-data-update.c
@@ -666,7 +666,9 @@ tracker_data_resource_buffer_flush (GError **error)
 					g_string_append (fts, g_value_get_string (g_value_array_get_nth (values, i)));
 					g_string_append_c (fts, ' ');
 				}
-				tracker_fts_update_text (resource_buffer->id, tracker_data_query_resource_id (tracker_property_get_uri (prop)), fts->str);
+				tracker_fts_update_text (resource_buffer->id,
+							 tracker_data_query_resource_id (tracker_property_get_uri (prop)),
+							 fts->str, !tracker_property_get_fulltext_no_limit (prop));
 				g_string_free (fts, TRUE);
 			}
 		}
@@ -1036,7 +1038,8 @@ get_old_property_values (TrackerProperty  *property,
 					/* delete old fts entries */
 					for (i = 0; i < old_values->n_values; i++) {
 						tracker_fts_update_text (resource_buffer->id, -1,
-						                         g_value_get_string (g_value_array_get_nth (old_values, i)));
+						                         g_value_get_string (g_value_array_get_nth (old_values, i)),
+									 !tracker_property_get_fulltext_no_limit (prop));
 					}
 				}
 			}
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index 2b89649..b80f102 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -2323,6 +2323,7 @@ struct fulltext_vtab {
   TrackerParser *parser;	   /* tokenizer for inserts and queries */
   gboolean stop_words;
   int max_words;
+  int min_word_length;
 
   /* Precompiled statements which we keep as long as the table is
   ** open.
@@ -3358,8 +3359,8 @@ static int constructVtab(
   max_len = tracker_fts_config_get_max_word_length (config);
 
   v->max_words = tracker_fts_config_get_max_words_to_index (config);
-
-  v->parser = tracker_parser_new (language, max_len, min_len);
+  v->min_word_length = min_len;
+  v->parser = tracker_parser_new (language, max_len);
 
   /* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests */
   v->stop_words = g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") != 0;
@@ -4332,14 +4333,19 @@ static int tokenizeSegment(
 
 
     pToken = tracker_parser_next (parser, &iPos,
-				     &iBegin,
-				     &iEnd,
-				     &stop_word,
-				     &nToken);
+				  &iBegin,
+				  &iEnd,
+				  &stop_word,
+				  &nToken);
     if (!pToken) {
       break;
      }
 
+    /* For a prefix search (token followed by '*'), ignore the word length limit */
+    if( nToken < v->min_word_length && !(iEnd<nSegment && pSegment[iEnd]=='*') ){
+      continue;
+    }
+
 //   printf("token being indexed  is %s, pos is %d, begin is %d, end is %d and length is %d\n", pToken, iPos, iBegin, iEnd, nToken);
 
     if( !inPhrase &&
@@ -4363,10 +4369,10 @@ static int tokenizeSegment(
       continue;
     }
     if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4
-      && pToken[0]=='n'
-      && pToken[1]=='e'
-      && pToken[2]=='a'
-      && pToken[3]=='r'
+	&& pToken[0]=='n'
+	&& pToken[1]=='e'
+	&& pToken[2]=='a'
+	&& pToken[3]=='r'
     ){
       QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1];
       if( (iBegin+6)<nSegment
@@ -4380,10 +4386,10 @@ static int tokenizeSegment(
 	  iEnd++;
 	}
 	pToken = tracker_parser_next (parser, &iPos,
-				     &iBegin,
-				     &iEnd,
-				     &stop_word,
-				     &nToken);
+				      &iBegin,
+				      &iEnd,
+				      &stop_word,
+				      &nToken);
 	if (!pToken) {
 	  break;
 	}
@@ -4756,7 +4762,8 @@ static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
 #ifdef STORE_CATEGORY
 int Catid,
 #endif
-		      const char *zText, int iColumn){
+		      const char *zText, int iColumn,
+		      gboolean limit_word_length){
 
   const char *pToken;
   int nTokenBytes;
@@ -4773,24 +4780,24 @@ int Catid,
   while( 1 ){
 
     pToken = tracker_parser_next (parser, &iPosition,
-				     &iStartOffset,
-				     &iEndOffset,
-				     &stop_word,
-				     &nTokenBytes);
+				  &iStartOffset,
+				  &iEndOffset,
+				  &stop_word,
+				  &nTokenBytes);
    if (!pToken) {
 	break;
    }
 
+   if (limit_word_length && nTokenBytes < v->min_word_length) {
+	continue;
+   }
+
   // printf("token being indexed  is %s, begin is %d, end is %d and length is %d\n", pToken, iStartOffset, iEndOffset, nTokenBytes);
 
    if (stop_word) {
 	continue;
    }
 
-
-
-
-
     /* Positions can't be negative; we use -1 as a terminator
      * internally.  Token can't be NULL or empty. */
     if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){
@@ -4947,7 +4954,7 @@ static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
 
     /* tracker - as for col id we want col 0 to be the default metadata field (file:contents or email:body) ,
     col 1 to be meatdata id 1, col 2 to be metadat id 2 etc so need to decrement i here */
-    int rc = buildTerms(v, iRow, sqlite3_value_int (pValues[0]), zText, delete ? -1 : (i-1));
+    int rc = buildTerms(v, iRow, sqlite3_value_int (pValues[0]), zText, delete ? -1 : (i-1), TRUE);
     if( rc!=SQLITE_OK ) return rc;
   }
 
@@ -4955,7 +4962,7 @@ static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
 
   for(i = 0; i < v->nColumn ; ++i){
     char *zText = (char*)sqlite3_value_text(pValues[i]);
-    rc = buildTerms(v, iRow, zText, delete ? -1 : i);
+    rc = buildTerms(v, iRow, zText, delete ? -1 : i, TRUE);
     if( rc!=SQLITE_OK ) return rc;
   }
 
@@ -7775,8 +7782,10 @@ int tracker_fts_update_init(int id){
   return initPendingTerms(tracker_fts_vtab, id);
 }
 
-int tracker_fts_update_text(int id, int column_id, const char *text){
-  return buildTerms(tracker_fts_vtab, id, text, column_id);
+int tracker_fts_update_text(int id, int column_id,
+			    const char *text, gboolean limit_word_length){
+	return buildTerms(tracker_fts_vtab, id, text,
+			  column_id, limit_word_length);
 }
 
 void tracker_fts_update_commit(void){
diff --git a/src/libtracker-fts/tracker-fts.h b/src/libtracker-fts/tracker-fts.h
index b39c4c5..491404d 100644
--- a/src/libtracker-fts/tracker-fts.h
+++ b/src/libtracker-fts/tracker-fts.h
@@ -24,7 +24,7 @@ G_BEGIN_DECLS
 
 int tracker_fts_init (sqlite3 *db);
 int tracker_fts_update_init (int id);
-int tracker_fts_update_text (int id, int column_id, const char *text);
+int tracker_fts_update_text (int id, int column_id, const char *text, gboolean limit_word_length);
 void tracker_fts_update_commit (void);
 void tracker_fts_update_rollback (void);
 gchar * tracker_fts_get_create_fts_table_query (void);
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index cf1d1e3..84eda5a 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -79,7 +79,6 @@ struct TrackerParser {
 	gboolean               enable_stop_words;
 	guint                  max_words_to_index;
 	guint                  max_word_length;
-	guint                  min_word_length;
 	gboolean               delimit_words;
 	gboolean               parse_reserved_words;
 
@@ -323,7 +322,6 @@ parser_next (TrackerParser *parser,
 				}
 
 				if (!is_valid ||
-				    length < parser->min_word_length ||
 				    word_type == TRACKER_PARSER_WORD_NUM) {
 					word_type = TRACKER_PARSER_WORD_IGNORE;
 					is_valid = TRUE;
@@ -460,21 +458,18 @@ parser_next (TrackerParser *parser,
 
 TrackerParser *
 tracker_parser_new (TrackerLanguage *language,
-                    gint             max_word_length,
-                    gint             min_word_length)
+                    gint             max_word_length)
 {
 	TrackerParser *parser;
 
 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-	g_return_val_if_fail (min_word_length > 0, NULL);
-	g_return_val_if_fail (min_word_length < max_word_length, NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
 
 	parser = g_new0 (TrackerParser, 1);
 
 	parser->language = g_object_ref (language);
 
 	parser->max_word_length = max_word_length;
-	parser->min_word_length = min_word_length;
 	parser->word_length = 0;
 	parser->attrs = NULL;
 
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 6e6b7fa..f6503ac 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -30,8 +30,8 @@ G_BEGIN_DECLS
 typedef struct TrackerParser TrackerParser;
 
 TrackerParser *tracker_parser_new             (TrackerLanguage *language,
-                                               gint             max_word_length,
-                                               gint             min_word_length);
+                                               gint             max_word_length);
+
 void           tracker_parser_reset           (TrackerParser   *parser,
                                                const gchar     *txt,
                                                gint             txt_size,
@@ -39,12 +39,14 @@ void           tracker_parser_reset           (TrackerParser   *parser,
                                                gboolean                 enable_stemmer,
                                                gboolean                 enable_stop_words,
                                                gboolean                 parse_reserved_words);
+
 const gchar *  tracker_parser_next            (TrackerParser   *parser,
                                                gint            *position,
                                                gint            *byte_offset_start,
                                                gint            *byte_offset_end,
                                                gboolean        *stop_word,
                                                gint            *word_length);
+
 gchar *        tracker_parser_process_word    (TrackerParser   *parser,
                                                const char      *word,
                                                gint             length,



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]