[tracker/wip/fts4: 2/21] libtracker-fts: Implement tokenizer FTS module using TrackerParser



commit 5485a2dd7a3059d4c8020750617a5c330be9ebbe
Author: Carlos Garnacho <carlos lanedo com>
Date:   Thu Oct 27 17:54:49 2011 +0200

    libtracker-fts: Implement tokenizer FTS module using TrackerParser

 src/libtracker-fts/Makefile.am             |    2 +
 src/libtracker-fts/tracker-fts-tokenizer.c |  216 ++++++++++++++++++++++++++++
 src/libtracker-fts/tracker-fts-tokenizer.h |   31 ++++
 src/libtracker-fts/tracker-fts.c           |    9 +-
 4 files changed, 257 insertions(+), 1 deletions(-)
---
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index a0306e0..1780f06 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -31,6 +31,8 @@ libtracker_fts_la_SOURCES =                            \
 	tracker-fts.h                                  \
 	tracker-fts-config.c                           \
 	tracker-fts-config.h                           \
+	tracker-fts-tokenizer.c                        \
+	tracker-fts-tokenizer.h                        \
 	tracker-parser-utils.c                         \
 	tracker-parser-utils.h                         \
 	tracker-parser.h
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.c b/src/libtracker-fts/tracker-fts-tokenizer.c
new file mode 100644
index 0000000..8e8f6fb
--- /dev/null
+++ b/src/libtracker-fts/tracker-fts-tokenizer.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2011 Nokia <ivan frade nokia com>
+ *
+ * Author: Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+/* FTS3/4 Tokenizer using TrackerParser */
+
+#include <assert.h>
+#include <string.h>
+#include "tracker-fts-config.h"
+#include "tracker-parser.h"
+#include "fts3_tokenizer.h"
+
+typedef struct TrackerTokenizer TrackerTokenizer;
+typedef struct TrackerCursor TrackerCursor;
+/* Tokenizer instance: one parser plus the FTS settings captured by trackerCreate(). */
+struct TrackerTokenizer {
+  sqlite3_tokenizer base;      /* Kept first: this struct is cast to and from sqlite3_tokenizer* */
+  TrackerParser *parser;       /* Created in trackerCreate(), reset in trackerOpen(), freed in trackerDestroy() */
+  int max_word_length;         /* Handed to tracker_parser_reset() by trackerOpen() */
+  int max_words;               /* From tracker_fts_config_get_max_words_to_index(); not read elsewhere in this file */
+  gboolean enable_stemmer;     /* Handed to tracker_parser_reset() by trackerOpen() */
+  gboolean enable_unaccent;    /* Handed to tracker_parser_reset() by trackerOpen() */
+  gboolean ignore_numbers;     /* Handed to tracker_parser_reset() by trackerOpen() */
+  gboolean ignore_stop_words;  /* Handed to the parser and re-checked in trackerNext(); FALSE when TRACKER_FTS_STOP_WORDS=0 */
+};
+/* Cursor handed back by trackerOpen(); it only points at its tokenizer, whose parser does the iteration. */
+struct TrackerCursor {
+  sqlite3_tokenizer_cursor base;  /* Kept first: this struct is cast to and from sqlite3_tokenizer_cursor* */
+
+  TrackerTokenizer *tokenizer;    /* Set in trackerOpen(); trackerClose() frees the cursor only, not this */
+};
+
+/*
+** xCreate: build a tokenizer with its own parser and a snapshot of the FTS
+** configuration. argc/argv are unused. SQLITE_NOMEM if allocation fails.
+*/
+static int trackerCreate(
+  int argc,                            /* Number of entries in argv[] */
+  const char * const *argv,            /* Tokenizer creation arguments */
+  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
+){
+  TrackerTokenizer *tokenizer = (TrackerTokenizer *)sqlite3_malloc(sizeof(TrackerTokenizer));
+  TrackerFTSConfig *fts_config;
+
+  if( tokenizer==NULL ){
+    return SQLITE_NOMEM;
+  }
+  memset(tokenizer, 0, sizeof(TrackerTokenizer));
+  /* NOTE(review): this TrackerLanguage is never unreffed in this file -- confirm tracker_parser_new() takes ownership. */
+  tokenizer->parser = tracker_parser_new (tracker_language_new (NULL));
+
+  fts_config = tracker_fts_config_new ();
+  tokenizer->max_word_length = tracker_fts_config_get_max_word_length (fts_config);
+  tokenizer->max_words = tracker_fts_config_get_max_words_to_index (fts_config);
+  tokenizer->enable_stemmer = tracker_fts_config_get_enable_stemmer (fts_config);
+  tokenizer->enable_unaccent = tracker_fts_config_get_enable_unaccent (fts_config);
+  tokenizer->ignore_numbers = tracker_fts_config_get_ignore_numbers (fts_config);
+
+  /* Tests set TRACKER_FTS_STOP_WORDS=0 to force stop words off; otherwise the configuration decides. */
+  if (g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") == 0) {
+    tokenizer->ignore_stop_words = FALSE;
+  } else {
+    tokenizer->ignore_stop_words = tracker_fts_config_get_ignore_stop_words (fts_config);
+  }
+
+  g_object_unref (fts_config);
+
+  *ppTokenizer = &tokenizer->base;
+  return SQLITE_OK;
+}
+
+/*
+** xDestroy: free the parser owned by the tokenizer, then the tokenizer
+** itself. Always reports SQLITE_OK.
+*/
+static int trackerDestroy(sqlite3_tokenizer *pTokenizer){
+  tracker_parser_free (((TrackerTokenizer *)pTokenizer)->parser);
+  sqlite3_free(pTokenizer);
+  return SQLITE_OK;
+}
+
+/*
+** xOpen: prepare to tokenize zInput[0..nInput-1] and return a new cursor in
+** *ppCursor. A negative nInput means zInput is NUL-terminated.
+** The tokenizer's single parser is reset here, and every cursor of this
+** tokenizer reads from that same parser.
+** Returns SQLITE_NOMEM (parser untouched) if the cursor cannot be allocated.
+*/
+static int trackerOpen(
+  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
+  const char *zInput,                    /* Input string */
+  int nInput,                            /* Length of zInput in bytes */
+  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
+){
+  TrackerTokenizer *p = (TrackerTokenizer *)pTokenizer;
+  TrackerCursor *pCsr;
+
+  /* Allocate first so an out-of-memory failure has no side effects. */
+  pCsr = (TrackerCursor *)sqlite3_malloc(sizeof(TrackerCursor));
+  if( !pCsr ){
+    return SQLITE_NOMEM;
+  }
+  memset(pCsr, 0, sizeof(TrackerCursor));
+  pCsr->tokenizer = p;
+
+  if ( nInput<0 ){
+    nInput = strlen(zInput);
+  }
+  tracker_parser_reset (p->parser, zInput, nInput, p->max_word_length,
+			p->enable_stemmer, p->enable_unaccent,
+			p->ignore_stop_words, TRUE, p->ignore_numbers);
+
+  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
+  return SQLITE_OK;
+}
+
+/*
+** xClose: dispose of a cursor created by trackerOpen(). The tokenizer and
+** its parser are left untouched. Always reports SQLITE_OK.
+*/
+static int trackerClose(sqlite3_tokenizer_cursor *pCursor){
+  /* The cursor struct is the only allocation trackerOpen() makes. */
+  sqlite3_free(pCursor);
+  return SQLITE_OK;
+}
+
+/*
+** xNext: fetch the next token from the parser behind pCursor.
+** Tokens the parser flags as stop words are skipped while the tokenizer
+** has ignore_stop_words set. Returns SQLITE_OK with the token in *ppToken
+** (when ppToken is non-NULL), or SQLITE_DONE once the parser returns NULL.
+*/
+static int trackerNext(
+  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by trackerOpen */
+  const char **ppToken,               /* OUT: *ppToken is the token text */
+  int *pnBytes,                       /* OUT: Number of bytes in token */
+  int *piStartOffset,                 /* OUT: Starting offset of token */
+  int *piEndOffset,                   /* OUT: Ending offset of token */
+  int *piPosition                     /* OUT: Position integer of token */
+){
+  TrackerTokenizer *tokenizer = ((TrackerCursor *) pCursor)->tokenizer;
+  const gchar *word;
+  gboolean is_stop_word;
+
+  for (;;) {
+    word = tracker_parser_next (tokenizer->parser,
+				piPosition, piStartOffset, piEndOffset,
+				&is_stop_word, pnBytes);
+    if (word == NULL) {
+      return SQLITE_DONE;
+    }
+    /* Accept the word unless it is a stop word we were asked to drop. */
+    if (!is_stop_word || !tokenizer->ignore_stop_words) {
+      break;
+    }
+  }
+
+  if (ppToken != NULL) {
+    *ppToken = word;
+  }
+
+  return SQLITE_OK;
+}
+
+/*
+** Routines implementing the tracker tokenizer; tracker_tokenizer_initialize() registers this table.
+*/
+static const sqlite3_tokenizer_module trackerTokenizerModule = {
+  0,                           /* iVersion */
+  trackerCreate,               /* xCreate  */
+  trackerDestroy,              /* xDestroy */
+  trackerOpen,                 /* xOpen    */
+  trackerClose,                /* xClose   */
+  trackerNext,                 /* xNext    */
+};
+
+/*
+** Register the tracker tokenizer on db as "TrackerTokenizer" by passing the
+** module pointer as a blob to fts3_tokenizer(). TRUE on success, else FALSE.
+*/
+gboolean tracker_tokenizer_initialize (sqlite3 *db) {
+  const sqlite3_tokenizer_module *pTokenizer = &trackerTokenizerModule;
+  sqlite3_stmt *stmt;
+  int rc, rcFinal;
+
+  rc = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?, ?)", -1, &stmt, 0);
+  if (rc != SQLITE_OK) {
+    return FALSE;
+  }
+  rc = sqlite3_bind_text(stmt, 1, "TrackerTokenizer", -1, SQLITE_STATIC);
+  if (rc == SQLITE_OK) {
+    rc = sqlite3_bind_blob(stmt, 2, &pTokenizer, sizeof(pTokenizer), SQLITE_STATIC);
+  }
+  if (rc == SQLITE_OK) {
+    sqlite3_step(stmt);  /* a failed step is reported by sqlite3_finalize() */
+  }
+  rcFinal = sqlite3_finalize(stmt);  /* always release the statement */
+  return (rc == SQLITE_OK && rcFinal == SQLITE_OK);
+}
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.h b/src/libtracker-fts/tracker-fts-tokenizer.h
new file mode 100644
index 0000000..3e86295
--- /dev/null
+++ b/src/libtracker-fts/tracker-fts-tokenizer.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2011 Nokia <ivan frade nokia com>
+ *
+ * Author: Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#ifndef __TRACKER_FTS_TOKENIZER_H__
+#define __TRACKER_FTS_TOKENIZER_H__
+#include <sqlite3.h>
+#include <glib.h>
+#include "fts3_tokenizer.h"
+
+/* Registers the "TrackerTokenizer" FTS tokenizer module on db; TRUE on success. */
+gboolean tracker_tokenizer_initialize (sqlite3 *db);
+
+#endif /* __TRACKER_FTS_TOKENIZER_H__ */
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index 9b23b65..57b8ef0 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -20,6 +20,7 @@
  */
 
 #include <sqlite3.h>
+#include "tracker-fts-tokenizer.h"
 #include "tracker-fts.h"
 #include "fts3.h"
 
@@ -38,8 +39,14 @@ gboolean tracker_fts_init (void) {
 gboolean tracker_fts_init_db (sqlite3 *db, int create){
 	int rc = SQLITE_OK;
 
+	if (!tracker_tokenizer_initialize (db)) {
+		return FALSE;
+	}
+
 	if (create){
-		rc = sqlite3_exec(db, "CREATE VIRTUAL TABLE fts USING fts4",
+		rc = sqlite3_exec(db,
+		                  "CREATE VIRTUAL TABLE fts "
+		                  "USING fts4(tokenize=TrackerTokenizer)",
 		                  NULL, 0, NULL);
 	}
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]