[tracker/wip/fts4: 2/21] libtracker-fts: Implement tokenizer FTS module using TrackerParser
- From: Carlos Garnacho <carlosg src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/wip/fts4: 2/21] libtracker-fts: Implement tokenizer FTS module using TrackerParser
- Date: Mon, 28 Jan 2013 10:02:20 +0000 (UTC)
commit 5485a2dd7a3059d4c8020750617a5c330be9ebbe
Author: Carlos Garnacho <carlos lanedo com>
Date: Thu Oct 27 17:54:49 2011 +0200
libtracker-fts: Implement tokenizer FTS module using TrackerParser
src/libtracker-fts/Makefile.am | 2 +
src/libtracker-fts/tracker-fts-tokenizer.c | 216 ++++++++++++++++++++++++++++
src/libtracker-fts/tracker-fts-tokenizer.h | 31 ++++
src/libtracker-fts/tracker-fts.c | 9 +-
4 files changed, 257 insertions(+), 1 deletions(-)
---
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index a0306e0..1780f06 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -31,6 +31,8 @@ libtracker_fts_la_SOURCES = \
tracker-fts.h \
tracker-fts-config.c \
tracker-fts-config.h \
+ tracker-fts-tokenizer.c \
+ tracker-fts-tokenizer.h \
tracker-parser-utils.c \
tracker-parser-utils.h \
tracker-parser.h
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.c b/src/libtracker-fts/tracker-fts-tokenizer.c
new file mode 100644
index 0000000..8e8f6fb
--- /dev/null
+++ b/src/libtracker-fts/tracker-fts-tokenizer.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2011 Nokia <ivan frade nokia com>
+ *
+ * Author: Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+/* FTS3/4 Tokenizer using TrackerParser */
+
+#include <assert.h>
+#include <string.h>
+#include "tracker-fts-config.h"
+#include "tracker-parser.h"
+#include "fts3_tokenizer.h"
+
+/*
+** Tokenizer instance handed to SQLite: the generic base plus the parser
+** and the settings copied from TrackerFTSConfig in trackerCreate.
+*/
+typedef struct TrackerTokenizer {
+  sqlite3_tokenizer base; /* first member: instances are cast to/from sqlite3_tokenizer */
+  TrackerParser *parser; /* made in trackerCreate, reset in trackerOpen, freed in trackerDestroy */
+  int max_word_length; /* handed to tracker_parser_reset */
+  int max_words; /* "max words to index" setting; only stored by the code in this file */
+  gboolean enable_stemmer;
+  gboolean enable_unaccent;
+  gboolean ignore_numbers;
+  gboolean ignore_stop_words; /* also makes trackerNext skip stop words */
+} TrackerTokenizer;
+
+typedef struct TrackerCursor {
+  sqlite3_tokenizer_cursor base; /* first member: cast to/from sqlite3_tokenizer_cursor */
+  TrackerTokenizer *tokenizer; /* tokenizer this cursor was opened on */
+} TrackerCursor;
+
+/*
+** Create a new tokenizer instance.
+**
+** Allocates a zeroed TrackerTokenizer, gives it a TrackerParser and copies
+** the current TrackerFTSConfig settings into it. argc/argv are not used.
+** Returns SQLITE_NOMEM if the allocation fails, SQLITE_OK otherwise.
+*/
+static int trackerCreate(
+  int argc, /* Number of entries in argv[] */
+  const char * const *argv, /* Tokenizer creation arguments */
+  sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
+){
+  TrackerFTSConfig *settings;
+  TrackerTokenizer *self = (TrackerTokenizer *)sqlite3_malloc(sizeof(TrackerTokenizer));
+
+  if( self==NULL ){
+    return SQLITE_NOMEM;
+  }
+  memset(self, 0, sizeof(TrackerTokenizer));
+  /* NOTE(review): the TrackerLanguage made here is never released in this file - confirm who owns it */
+  self->parser = tracker_parser_new (tracker_language_new (NULL));
+
+  settings = tracker_fts_config_new ();
+  self->max_word_length = tracker_fts_config_get_max_word_length (settings);
+  self->enable_stemmer = tracker_fts_config_get_enable_stemmer (settings);
+  self->enable_unaccent = tracker_fts_config_get_enable_unaccent (settings);
+  self->ignore_numbers = tracker_fts_config_get_ignore_numbers (settings);
+  /* TRACKER_FTS_STOP_WORDS=0 (used by tests) turns stop words off; otherwise the config decides */
+  if (g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") == 0) {
+    self->ignore_stop_words = FALSE;
+  } else {
+    self->ignore_stop_words = tracker_fts_config_get_ignore_stop_words (settings);
+  }
+  self->max_words = tracker_fts_config_get_max_words_to_index (settings);
+  g_object_unref (settings);
+  *ppTokenizer = (sqlite3_tokenizer *)self;
+  return SQLITE_OK;
+}
+
+/*
+** Destroy a tokenizer: free its parser first, then the instance itself.
+** Always returns SQLITE_OK.
+*/
+static int trackerDestroy(sqlite3_tokenizer *pTokenizer){
+  tracker_parser_free (((TrackerTokenizer *)pTokenizer)->parser);
+  sqlite3_free(pTokenizer);
+  return SQLITE_OK;
+}
+
+/*
+** Prepare to begin tokenizing a particular string. The input string to
+** be tokenized is zInput[0..nInput-1]; when nInput is negative the length
+** is taken with strlen(zInput). A cursor used to incrementally tokenize
+** this string is returned in *ppCursor. Returns SQLITE_NOMEM if the
+** cursor cannot be allocated (*ppCursor is then untouched), else SQLITE_OK.
+*/
+static int trackerOpen(
+  sqlite3_tokenizer *pTokenizer, /* The tokenizer */
+  const char *zInput, /* Input string */
+  int nInput, /* Length of zInput in bytes */
+  sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
+){
+  TrackerTokenizer *p = (TrackerTokenizer *)pTokenizer;
+  TrackerCursor *pCsr;
+
+  /* Allocate first, so an out-of-memory failure leaves the parser untouched. */
+  pCsr = (TrackerCursor *)sqlite3_malloc(sizeof(TrackerCursor));
+  if( !pCsr ){
+    return SQLITE_NOMEM;
+  }
+  memset(pCsr, 0, sizeof(TrackerCursor));
+  pCsr->tokenizer = p;
+
+  if ( nInput<0 ){
+    nInput = (int)strlen(zInput);
+  }
+  /* The parser belongs to the tokenizer, not the cursor: every open restarts it. */
+  tracker_parser_reset (p->parser, zInput, nInput,
+                        p->max_word_length, p->enable_stemmer, p->enable_unaccent,
+                        p->ignore_stop_words, TRUE, p->ignore_numbers);
+  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
+  return SQLITE_OK;
+}
+
+/*
+** Close a tokenization cursor by freeing the memory allocated for it in
+** trackerOpen. The tokenizer it points at is left alone. Returns SQLITE_OK.
+*/
+static int trackerClose(sqlite3_tokenizer_cursor *pCursor){
+  /* base is the first member of TrackerCursor, so pCursor is the allocation. */
+  sqlite3_free(pCursor);
+  return SQLITE_OK;
+}
+
+/*
+** Extract the next token from a tokenization cursor.
+**
+** Tokens come from the TrackerParser of the cursor's tokenizer. While
+** ignore_stop_words is set, tokens the parser flags as stop words are
+** skipped. Returns SQLITE_DONE once the parser yields no token, else SQLITE_OK.
+*/
+static int trackerNext(
+  sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by trackerOpen */
+  const char **ppToken, /* OUT: *ppToken is the token text */
+  int *pnBytes, /* OUT: Number of bytes in token */
+  int *piStartOffset, /* OUT: Starting offset of token */
+  int *piEndOffset, /* OUT: Ending offset of token */
+  int *piPosition /* OUT: Position integer of token */
+){
+  TrackerTokenizer *owner = ((TrackerCursor *) pCursor)->tokenizer;
+  const gchar *word;
+  gboolean is_stop_word;
+
+  for (;;) {
+    word = tracker_parser_next (owner->parser, piPosition, piStartOffset,
+                                piEndOffset, &is_stop_word, pnBytes);
+    if (word == NULL) {
+      return SQLITE_DONE;
+    }
+    /* Keep the token unless it is a stop word and stop words are ignored. */
+    if (!is_stop_word || !owner->ignore_stop_words) {
+      break;
+    }
+  }
+
+  /* ppToken is optional; the other outputs were handed to the parser above. */
+  if (ppToken != NULL) {
+    *ppToken = word;
+  }
+  return SQLITE_OK;
+}
+
+/* The set of routines that implement the tracker tokenizer.
+** tracker_tokenizer_initialize() passes a pointer to this table to SQLite.
+*/
+static const sqlite3_tokenizer_module trackerTokenizerModule = {
+ 0, /* iVersion */
+ trackerCreate, /* xCreate */
+ trackerDestroy, /* xDestroy */
+ trackerOpen, /* xOpen */
+ trackerClose, /* xClose */
+ trackerNext, /* xNext */
+};
+
+/*
+** Register the tracker tokenizer on db as "TrackerTokenizer" by running
+** "SELECT fts3_tokenizer(?, ?)" with the module pointer bound as a blob.
+** Returns TRUE on success, FALSE if preparing, binding or running fails.
+*/
+gboolean tracker_tokenizer_initialize (sqlite3 *db) {
+  const sqlite3_tokenizer_module *pTokenizer = &trackerTokenizerModule; /* keeps the table's const */
+  sqlite3_stmt *stmt = NULL;
+  int rc, rcFinal;
+
+  if (sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?, ?)", -1, &stmt, 0) != SQLITE_OK) {
+    return FALSE;
+  }
+  rc = sqlite3_bind_text(stmt, 1, "TrackerTokenizer", -1, SQLITE_STATIC);
+  if (rc == SQLITE_OK) { /* SQLITE_STATIC is safe: stmt is finalized before pTokenizer goes away */
+    rc = sqlite3_bind_blob(stmt, 2, &pTokenizer, sizeof(pTokenizer), SQLITE_STATIC);
+  }
+  if (rc == SQLITE_OK) {
+    sqlite3_step(stmt);
+  }
+  rcFinal = sqlite3_finalize(stmt); /* also reports the error of a failed step */
+  return (rc == SQLITE_OK && rcFinal == SQLITE_OK);
+}
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.h b/src/libtracker-fts/tracker-fts-tokenizer.h
new file mode 100644
index 0000000..3e86295
--- /dev/null
+++ b/src/libtracker-fts/tracker-fts-tokenizer.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2011 Nokia <ivan frade nokia com>
+ *
+ * Author: Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef __TRACKER_FTS_TOKENIZER_H__
+#define __TRACKER_FTS_TOKENIZER_H__
+
+#include <sqlite3.h>
+#include <glib.h>
+#include "fts3_tokenizer.h"
+
+/* Registers the "TrackerTokenizer" FTS tokenizer on db; TRUE on success. */
+gboolean tracker_tokenizer_initialize (sqlite3 *db);
+#endif /* __TRACKER_FTS_TOKENIZER_H__ */
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index 9b23b65..57b8ef0 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -20,6 +20,7 @@
*/
#include <sqlite3.h>
+#include "tracker-fts-tokenizer.h"
#include "tracker-fts.h"
#include "fts3.h"
@@ -38,8 +39,14 @@ gboolean tracker_fts_init (void) {
gboolean tracker_fts_init_db (sqlite3 *db, int create){
int rc = SQLITE_OK;
+ if (!tracker_tokenizer_initialize (db)) {
+ return FALSE;
+ }
+
if (create){
- rc = sqlite3_exec(db, "CREATE VIRTUAL TABLE fts USING fts4",
+ rc = sqlite3_exec(db,
+ "CREATE VIRTUAL TABLE fts "
+ "USING fts4(tokenize=TrackerTokenizer)",
NULL, 0, NULL);
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]