tracker r2125 - in branches/indexer-split: . src/libtracker-common src/tracker-indexer src/tracker-indexer/modules src/trackerd tests/libtracker-common tests/libtracker-db
- From: mr svn gnome org
- To: svn-commits-list gnome org
- Subject: tracker r2125 - in branches/indexer-split: . src/libtracker-common src/tracker-indexer src/tracker-indexer/modules src/trackerd tests/libtracker-common tests/libtracker-db
- Date: Thu, 21 Aug 2008 08:49:34 +0000 (UTC)
Author: mr
Date: Thu Aug 21 08:49:34 2008
New Revision: 2125
URL: http://svn.gnome.org/viewvc/tracker?rev=2125&view=rev
Log:
* src/libtracker-common/tracker-file-utils.c: Removed
is_text_file() which is no longer used.
* src/tracker-indexer/Makefile.am:
* src/tracker-indexer/modules/Makefile.am:
* src/trackerd/Makefile.am:
* tests/libtracker-common/Makefile.am:
* tests/libtracker-db/Makefile.am: Removed references to
libxdgmime.la which was breaking the build.
Modified:
branches/indexer-split/ChangeLog
branches/indexer-split/src/libtracker-common/tracker-file-utils.c
branches/indexer-split/src/libtracker-common/tracker-parser.c
branches/indexer-split/src/libtracker-common/tracker-parser.h
branches/indexer-split/src/tracker-indexer/Makefile.am
branches/indexer-split/src/tracker-indexer/modules/Makefile.am
branches/indexer-split/src/trackerd/Makefile.am
branches/indexer-split/tests/libtracker-common/Makefile.am
branches/indexer-split/tests/libtracker-db/Makefile.am
Modified: branches/indexer-split/src/libtracker-common/tracker-file-utils.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-file-utils.c (original)
+++ branches/indexer-split/src/libtracker-common/tracker-file-utils.c Thu Aug 21 08:49:34 2008
@@ -144,52 +144,6 @@
return FALSE;
}
-static gboolean
-is_text_file (const gchar *uri)
-{
- gchar buffer[TEXT_SNIFF_SIZE];
- gint buffer_length = 0;
- gint fd;
- gboolean result = FALSE;
-
- fd = tracker_file_open (uri, FALSE);
- buffer_length = read (fd, buffer, TEXT_SNIFF_SIZE);
-
- /* Don't allow embedded zeros in textfiles. */
- if (buffer_length > 2 &&
- memchr (buffer, 0, buffer_length) == NULL) {
- if (is_utf8 (buffer, buffer_length)) {
- result = TRUE;
- } else {
- GError *error = NULL;
- gchar *tmp;
-
- tmp = g_locale_to_utf8 (buffer,
- buffer_length,
- NULL,
- NULL,
- &error);
- g_free (tmp);
-
- if (error) {
- gboolean result = FALSE;
-
- if (error->code != G_CONVERT_ERROR_ILLEGAL_SEQUENCE &&
- error->code != G_CONVERT_ERROR_FAILED &&
- error->code != G_CONVERT_ERROR_NO_CONVERSION) {
- result = TRUE;
- }
-
- g_error_free (error);
- }
- }
- }
-
- tracker_file_close (fd, !result);
-
- return result;
-}
-
gboolean
tracker_file_is_valid (const gchar *uri)
{
Modified: branches/indexer-split/src/libtracker-common/tracker-parser.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.c (original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.c Thu Aug 21 08:49:34 2008
@@ -1,5 +1,7 @@
-/* Tracker - indexer and metadata database engine
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
* Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
+ * Copyright (C) 2008, Nokia
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -21,7 +23,6 @@
#include <string.h>
-
#include "tracker-parser.h"
#include "tracker-log.h"
#include "tracker-utils.h"
@@ -36,11 +37,11 @@
((c) >= 0x20000 && (c) <= 0x2A6D6))
#define IS_LATIN(c) (((c) <= 0x02AF) || \
((c) >= 0x1E00 && (c) <= 0x1EFF))
-#define IS_ASCII(c) ((c) <= 0x007F)
+#define IS_ASCII(c) ((c) <= 0x007F)
#define IS_ASCII_ALPHA_LOWER(c) ((c) >= 0x0061 && (c) <= 0x007A)
#define IS_ASCII_ALPHA_HIGHER(c) ((c) >= 0x0041 && (c) <= 0x005A)
#define IS_ASCII_NUMERIC(c) ((c) >= 0x0030 && (c) <= 0x0039)
-#define IS_ASCII_IGNORE(c) ((c) <= 0x002C)
+#define IS_ASCII_IGNORE(c) ((c) <= 0x002C)
#define IS_HYPHEN(c) ((c) == 0x002D)
#define IS_UNDERSCORE(c) ((c) == 0x005F)
#define IS_NEWLINE(c) ((c) == 0x000D)
@@ -56,7 +57,7 @@
TRACKER_PARSER_WORD_ALPHA,
TRACKER_PARSER_WORD_ALPHA_NUM,
TRACKER_PARSER_WORD_IGNORE,
- TRACKER_PARSER_WORD_NEWLINE
+ TRACKER_PARSER_WORD_NEWLINE
} TrackerParserWordType;
static inline TrackerParserWordType
@@ -74,7 +75,7 @@
}
if (IS_ASCII_IGNORE (c)) {
- return TRACKER_PARSER_WORD_IGNORE;
+ return TRACKER_PARSER_WORD_IGNORE;
}
if (IS_ASCII_NUMERIC (c)) {
@@ -88,7 +89,7 @@
if (IS_UNDERSCORE (c)) {
return TRACKER_PARSER_WORD_UNDERSCORE;
}
-
+
if (IS_NEWLINE (c)) {
return TRACKER_PARSER_WORD_NEWLINE;
}
@@ -101,15 +102,15 @@
}
} else if (g_unichar_isdigit (c)) {
return TRACKER_PARSER_WORD_NUM;
- }
+ }
}
return TRACKER_PARSER_WORD_IGNORE;
}
static inline gchar *
-strip_word (const gchar *str,
- gint length,
+strip_word (const gchar *str,
+ gint length,
guint32 *len)
{
#ifdef HAVE_UNAC
@@ -122,12 +123,10 @@
return s;
#else
*len = length;
- return NULL;
+ return NULL;
#endif
}
-
-
static gboolean
text_needs_pango (const gchar *text)
{
@@ -148,18 +147,16 @@
}
}
- return FALSE;
+ return FALSE;
}
-
-
static TrackerParserEncoding
-get_encoding (const char *txt)
+get_encoding (const gchar *txt)
{
const gchar *p;
gunichar c;
gint i = 0;
-
+
/* Grab first 255 non-whitespace chars and test */
for (p = txt; *p && i < 255; p = g_utf8_next_char (p)) {
c = g_utf8_get_char (p);
@@ -167,16 +164,16 @@
if (!g_unichar_isspace (c)) {
i++;
}
-
+
if (IS_ASCII(c)) continue;
-
+
if (IS_LATIN(c)) return TRACKER_PARSER_ENCODING_LATIN;
-
+
if (NEED_PANGO(c)) return TRACKER_PARSER_ENCODING_CJK;
-
+
return TRACKER_PARSER_ENCODING_OTHER;
}
-
+
return TRACKER_PARSER_ENCODING_ASCII;
}
@@ -186,21 +183,23 @@
const gchar *word)
{
GHashTable *stop_words;
-
- if (!word) return FALSE;
-
+
+ if (!word) {
+ return FALSE;
+ }
+
stop_words = tracker_language_get_stop_words (language);
return g_hash_table_lookup (stop_words, word) != NULL;
}
static const gchar *
-analyze_text (const gchar *text,
+analyze_text (const gchar *text,
TrackerLanguage *language,
gint max_word_length,
gint min_word_length,
- gboolean filter_words,
- gboolean filter_numbers,
+ gboolean filter_words,
+ gboolean filter_numbers,
gboolean delimit_hyphen,
gchar **index_word)
{
@@ -229,28 +228,28 @@
for (p = text; *p; p = g_utf8_next_char (p)) {
TrackerParserWordType type;
gunichar c;
-
+
c = g_utf8_get_char (p);
type = get_word_type (c);
-
+
if (type == TRACKER_PARSER_WORD_IGNORE || type == TRACKER_PARSER_WORD_NEWLINE ||
- (delimit_hyphen &&
- (type == TRACKER_PARSER_WORD_HYPHEN ||
+ (delimit_hyphen &&
+ (type == TRACKER_PARSER_WORD_HYPHEN ||
type == TRACKER_PARSER_WORD_UNDERSCORE))) {
if (!start) {
continue;
} else {
break;
}
- }
-
+ }
+
if (!is_valid) {
continue;
}
-
+
if (!start) {
start = p;
-
+
/* Valid words must start with an alpha or
* underscore if we are filtering.
*/
@@ -263,65 +262,65 @@
is_valid = FALSE;
continue;
}
- }
- }
+ }
+ }
}
-
+
if (length >= max_word_length) {
continue;
}
-
+
length++;
-
+
switch (type) {
- case TRACKER_PARSER_WORD_ASCII_HIGHER:
+ case TRACKER_PARSER_WORD_ASCII_HIGHER:
c += 32;
-
- case TRACKER_PARSER_WORD_ASCII_LOWER:
+
+ case TRACKER_PARSER_WORD_ASCII_LOWER:
case TRACKER_PARSER_WORD_HYPHEN:
case TRACKER_PARSER_WORD_UNDERSCORE:
- if (word_type == TRACKER_PARSER_WORD_NUM ||
+ if (word_type == TRACKER_PARSER_WORD_NUM ||
word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
} else {
word_type = TRACKER_PARSER_WORD_ALPHA;
}
-
+
break;
-
- case TRACKER_PARSER_WORD_NUM:
- if (word_type == TRACKER_PARSER_WORD_ALPHA ||
+
+ case TRACKER_PARSER_WORD_NUM:
+ if (word_type == TRACKER_PARSER_WORD_ALPHA ||
word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
} else {
word_type = TRACKER_PARSER_WORD_NUM;
}
break;
-
- case TRACKER_PARSER_WORD_ALPHA_HIGHER:
+
+ case TRACKER_PARSER_WORD_ALPHA_HIGHER:
c = g_unichar_tolower (c);
-
- case TRACKER_PARSER_WORD_ALPHA_LOWER:
+
+ case TRACKER_PARSER_WORD_ALPHA_LOWER:
if (!do_strip) {
do_strip = TRUE;
}
-
- if (word_type == TRACKER_PARSER_WORD_NUM ||
+
+ if (word_type == TRACKER_PARSER_WORD_NUM ||
word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
} else {
word_type = TRACKER_PARSER_WORD_ALPHA;
}
-
+
break;
-
- default:
+
+ default:
break;
}
-
+
word[length -1] = c;
}
-
+
if (!is_valid) {
return p;
}
@@ -329,166 +328,94 @@
if (word_type == TRACKER_PARSER_WORD_NUM) {
if (!filter_numbers || length >= INDEX_NUMBER_MIN_LENGTH) {
*index_word = g_ucs4_to_utf8 (word, length, NULL, NULL, NULL);
- }
+ }
} else if (length >= min_word_length) {
const gchar *stem_word;
gchar *stripped_word;
gchar *str;
gchar *utf8;
guint32 len;
-
+
utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
-
+
if (!utf8) {
return p;
}
-
- if (do_strip && get_encoding (utf8) == TRACKER_PARSER_ENCODING_LATIN) {
+ if (do_strip && get_encoding (utf8) == TRACKER_PARSER_ENCODING_LATIN) {
stripped_word = strip_word (utf8, bytes, &len);
} else {
stripped_word = NULL;
}
-
+
if (!stripped_word) {
- str = g_utf8_normalize (utf8,
- bytes,
+ str = g_utf8_normalize (utf8,
+ bytes,
G_NORMALIZE_NFC);
} else {
- str = g_utf8_normalize (stripped_word,
- len,
+ str = g_utf8_normalize (stripped_word,
+ len,
G_NORMALIZE_NFC);
g_free (stripped_word);
}
-
+
g_free (utf8);
-
- stem_word = tracker_language_stem_word (language,
- str,
+
+ stem_word = tracker_language_stem_word (language,
+ str,
strlen (str));
g_free (str);
-
+
if (!filter_words || !is_stop_word (language, stem_word)) {
*index_word = g_strdup (stem_word);
}
}
-
- return p;
-}
-
-
-
-TrackerParser *
-tracker_parser_new (TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length)
-{
-
-
- TrackerParser *parser = g_new (TrackerParser, 1);
-
- parser->language = language;
- parser->max_word_length = max_word_length;
- parser->min_word_length = min_word_length;
-
- parser->attrs = NULL;
-
- return parser;
-}
-
-void
-tracker_parser_reset (TrackerParser *parser,
- const gchar *txt,
- gint txt_size,
- gboolean delimit_words,
- gboolean enable_stemmer,
- gboolean enable_stop_words)
-{
- g_return_if_fail (txt && parser);
-
- if (parser->attrs) g_free (parser->attrs);
-
- parser->enable_stemmer = enable_stemmer;
- parser->enable_stop_words = enable_stop_words;
- parser->delimit_words = delimit_words;
- parser->encoding = get_encoding (txt);
- parser->txt_size = txt_size;
- parser->txt = txt;
-
- parser->word_position = 0;
-
- parser->cursor = txt;
-
-
- if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
-
- PangoLogAttr *attrs;
-
- parser->attr_length = g_utf8_strlen (parser->txt, parser->txt_size) + 1;
-
- attrs = g_new0 (PangoLogAttr, parser->attr_length);
-
- pango_get_log_attrs (parser->txt,
- txt_size,
- 0,
- pango_language_from_string ("C"),
- attrs,
- parser->attr_length);
-
- parser->attrs = attrs;
- parser->attr_pos = 0;
-
- }
+ return p;
+}
-}
-
-
-
static gchar *
pango_next (TrackerParser *parser,
- guint skip_words,
- guint *byte_offset_start,
- guint *byte_offset_end,
- gboolean *is_new_paragraph)
+ guint skip_words,
+ guint *byte_offset_start,
+ guint *byte_offset_end,
+ gboolean *is_new_paragraph)
{
-
/* CJK text does not need stemming or other treatment */
- int word_start = -1;
- int old_word_start = -1;
- guint words_parsed = 0;
+ gint word_start = -1;
+ gint old_word_start = -1;
+ guint words_parsed = 0;
guint32 i;
-
+
*is_new_paragraph = FALSE;
-
+
for (i = parser->attr_pos; i < parser->attr_length; i++) {
-
if (parser->attrs[i].is_word_start) {
- word_start = i;
+ word_start = i;
continue;
- }
-
+ }
+
if (parser->attrs[i].is_word_end && word_start != old_word_start) {
-
+ gchar *start_word, *end_word;
+
old_word_start = word_start;
-
words_parsed++;
-
- if (words_parsed <= skip_words) continue;
- gchar *start_word, *end_word;
+ if (words_parsed <= skip_words) {
+ continue;
+ }
start_word = g_utf8_offset_to_pointer (parser->txt, word_start);
end_word = g_utf8_offset_to_pointer (parser->txt, i);
if (start_word != end_word) {
- gchar *str;
- gchar *index_word;
+ gchar *str;
+ gchar *index_word;
/* Normalize word */
str = g_utf8_casefold (start_word, end_word - start_word);
if (!str) {
- continue;
+ continue;
}
index_word = g_utf8_normalize (str, -1, G_NORMALIZE_NFC);
@@ -501,31 +428,97 @@
if (word_start > 1 && parser->attrs[word_start -1].is_sentence_boundary) {
*is_new_paragraph = TRUE;
}
-
+
*byte_offset_start = (start_word - parser->txt);
*byte_offset_end = *byte_offset_start + (end_word - start_word);
parser->attr_pos = i;
+
return index_word;
-
+
}
+
word_start = i;
}
}
+
parser->attr_pos = i;
-
+
return NULL;
-
-}
-
-static gchar *
-tracker_parser_process_word (TrackerParser *parser, const char *word, gint length, gboolean do_strip)
+}
+
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+ gint max_word_length,
+ gint min_word_length)
{
- guint bytes, len;
- char *str = NULL, *stripped_word = NULL;
- const char *stem_word;
- if (word) {
+ TrackerParser *parser = g_new0 (TrackerParser, 1);
+
+ parser->language = language;
+ parser->max_word_length = max_word_length;
+ parser->min_word_length = min_word_length;
+
+ parser->attrs = NULL;
+
+ return parser;
+}
+
+void
+tracker_parser_reset (TrackerParser *parser,
+ const gchar *txt,
+ gint txt_size,
+ gboolean delimit_words,
+ gboolean enable_stemmer,
+ gboolean enable_stop_words)
+{
+ g_return_if_fail (parser != NULL);
+ g_return_if_fail (txt != NULL);
+
+ g_free (parser->attrs);
+
+ parser->enable_stemmer = enable_stemmer;
+ parser->enable_stop_words = enable_stop_words;
+ parser->delimit_words = delimit_words;
+ parser->encoding = get_encoding (txt);
+ parser->txt_size = txt_size;
+ parser->txt = txt;
+
+ parser->word_position = 0;
+
+ parser->cursor = txt;
+
+ if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
+ PangoLogAttr *attrs;
+
+ parser->attr_length = g_utf8_strlen (parser->txt, parser->txt_size) + 1;
+
+ attrs = g_new0 (PangoLogAttr, parser->attr_length);
+
+ pango_get_log_attrs (parser->txt,
+ txt_size,
+ 0,
+ pango_language_from_string ("C"),
+ attrs,
+ parser->attr_length);
+
+ parser->attrs = attrs;
+ parser->attr_pos = 0;
+ }
+}
+
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+ const char *word,
+ gint length,
+ gboolean do_strip)
+{
+ const gchar *stem_word;
+ gchar *str = NULL;
+ gchar *stripped_word = NULL;
+ guint bytes, len;
+
+ if (word) {
if (length == -1) {
bytes = strlen (word);
} else {
@@ -537,46 +530,45 @@
} else {
stripped_word = NULL;
}
-
+
if (!stripped_word) {
- str = g_utf8_normalize (word,
- bytes,
+ str = g_utf8_normalize (word,
+ bytes,
G_NORMALIZE_NFC);
} else {
- str = g_utf8_normalize (stripped_word,
- len,
+ str = g_utf8_normalize (stripped_word,
+ len,
G_NORMALIZE_NFC);
g_free (stripped_word);
}
-
-
+
if (!parser->enable_stemmer) {
return str;
}
-
+
len = strlen (str);
-
+
stem_word = tracker_language_stem_word (parser->language, str, len);
-
-
- if (stem_word) {
- char *result = g_strdup (stem_word);
-
- g_free (str);
-
- return result;
+
+ if (stem_word) {
+ gchar *result;
+
+ result = g_strdup (stem_word);
+ g_free (str);
+
+ return result;
}
- }
+ }
+
return str;
+}
-}
-
static gchar *
parser_next (TrackerParser *parser,
- guint skip_words,
- guint *byte_offset_start,
- guint *byte_offset_end,
- gboolean *is_new_paragraph)
+ guint skip_words,
+ guint *byte_offset_start,
+ guint *byte_offset_end,
+ gboolean *is_new_paragraph)
{
TrackerParserWordType word_type;
gunichar word[64];
@@ -585,9 +577,9 @@
guint length;
gint char_count = 0;
glong bytes;
- const char *p;
- const char *start;
- const char *end;
+ const gchar *p;
+ const gchar *start;
+ const gchar *end;
guint words_skipped = 0;
gboolean do_strip = FALSE;
@@ -596,7 +588,7 @@
*is_new_paragraph = FALSE;
g_return_val_if_fail (parser, NULL);
-
+
if (!parser->cursor) {
return NULL;
}
@@ -605,11 +597,10 @@
is_valid = TRUE;
length = 0;
bytes = 0;
-
+
start = NULL;
end = NULL;
-
for (p = parser->cursor; *p; p = g_utf8_next_char (p)) {
TrackerParserWordType type;
gunichar c;
@@ -617,25 +608,32 @@
char_count++;
c = g_utf8_get_char (p);
type = get_word_type (c);
-
+
if (type == TRACKER_PARSER_WORD_NEWLINE) {
*is_new_paragraph = TRUE;
}
-
- if (type == TRACKER_PARSER_WORD_IGNORE || type == TRACKER_PARSER_WORD_NEWLINE ||
- (parser->delimit_words &&
- (type == TRACKER_PARSER_WORD_HYPHEN ||
+
+ if (type == TRACKER_PARSER_WORD_IGNORE || type == TRACKER_PARSER_WORD_NEWLINE ||
+ (parser->delimit_words &&
+ (type == TRACKER_PARSER_WORD_HYPHEN ||
type == TRACKER_PARSER_WORD_UNDERSCORE))) {
if (!start) {
continue;
} else {
/* word break */
-
- if (!is_valid || length < parser->min_word_length || word_type == TRACKER_PARSER_WORD_NUM || words_skipped < skip_words) {
+
+ if (!is_valid ||
+ length < parser->min_word_length ||
+ word_type == TRACKER_PARSER_WORD_NUM ||
+ words_skipped < skip_words) {
*is_new_paragraph = FALSE;
-
- if (is_valid && length >= parser->min_word_length && word_type != TRACKER_PARSER_WORD_NUM && words_skipped < skip_words) words_skipped++;
-
+
+ if (is_valid && length >= parser->min_word_length &&
+ word_type != TRACKER_PARSER_WORD_NUM &&
+ words_skipped < skip_words) {
+ words_skipped++;
+ }
+
word_type = TRACKER_PARSER_WORD_IGNORE;
is_valid = TRUE;
length = 0;
@@ -643,22 +641,21 @@
start = NULL;
end = NULL;
do_strip = FALSE;
+
continue;
-
}
-
-
+
break;
}
- }
-
+ }
+
if (!is_valid) {
continue;
}
-
+
if (!start) {
start = g_utf8_offset_to_pointer (parser->cursor, char_count);
-
+
/* Valid words must start with an alpha or
* underscore if we are filtering.
*/
@@ -671,96 +668,96 @@
is_valid = FALSE;
continue;
}
- }
- }
+ }
+ }
}
-
+
if (length >= parser->max_word_length) {
continue;
}
-
+
length++;
-
+
switch (type) {
- case TRACKER_PARSER_WORD_ASCII_HIGHER:
+ case TRACKER_PARSER_WORD_ASCII_HIGHER:
c += 32;
-
- case TRACKER_PARSER_WORD_ASCII_LOWER:
+
+ case TRACKER_PARSER_WORD_ASCII_LOWER:
case TRACKER_PARSER_WORD_HYPHEN:
case TRACKER_PARSER_WORD_UNDERSCORE:
- if (word_type == TRACKER_PARSER_WORD_NUM ||
+ if (word_type == TRACKER_PARSER_WORD_NUM ||
word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
} else {
word_type = TRACKER_PARSER_WORD_ALPHA;
}
-
+
break;
-
- case TRACKER_PARSER_WORD_NUM:
- if (word_type == TRACKER_PARSER_WORD_ALPHA ||
+
+ case TRACKER_PARSER_WORD_NUM:
+ if (word_type == TRACKER_PARSER_WORD_ALPHA ||
word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
} else {
word_type = TRACKER_PARSER_WORD_NUM;
}
break;
-
- case TRACKER_PARSER_WORD_ALPHA_HIGHER:
+
+ case TRACKER_PARSER_WORD_ALPHA_HIGHER:
c = g_unichar_tolower (c);
-
- case TRACKER_PARSER_WORD_ALPHA_LOWER:
+
+ case TRACKER_PARSER_WORD_ALPHA_LOWER:
if (!do_strip) {
do_strip = TRUE;
}
-
- if (word_type == TRACKER_PARSER_WORD_NUM ||
+
+ if (word_type == TRACKER_PARSER_WORD_NUM ||
word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
} else {
word_type = TRACKER_PARSER_WORD_ALPHA;
}
-
+
break;
-
- default:
+
+ default:
break;
}
-
+
word[length -1] = c;
}
-
+
if (!is_valid) {
parser->cursor = NULL;
return NULL;
}
end = g_utf8_offset_to_pointer (parser->cursor, char_count);
-
+
parser->cursor = end;
-
+
if (word_type == TRACKER_PARSER_WORD_ALPHA_NUM || word_type == TRACKER_PARSER_WORD_ALPHA) {
gchar *utf8;
gchar *processed_word;
-
-
-
+
+
+
utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
-
+
if (!utf8) {
return NULL;
}
- *byte_offset_start = start - parser->txt;
+ *byte_offset_start = start - parser->txt;
*byte_offset_end = end - parser->txt;
-
+
processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
-
+
g_free (utf8);
-
+
return processed_word;
-
+
}
-
- return NULL;
+
+ return NULL;
}
@@ -776,8 +773,7 @@
return result;
}
-
-gchar *
+gchar *
tracker_parser_next (TrackerParser *parser,
guint *position,
guint *byte_offset_start,
@@ -789,16 +785,16 @@
guint byte_start, byte_end;
gboolean new_para;
char *str;
-
-
-
+
+
+
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
str = pango_next (parser, 0, &byte_start, &byte_end, &new_para);
-
+
parser->word_position++;
*stop_word = FALSE;
-
+
} else {
str = parser_next (parser, 0, &byte_start, &byte_end, &new_para);
parser->word_position++;
@@ -812,24 +808,22 @@
*position = parser->word_position;
*byte_offset_start = byte_start;
*byte_offset_end = byte_end;
- *new_paragraph = new_para;
-
+ *new_paragraph = new_para;
+
return str;
-}
-
-
-
+}
+
void
tracker_parser_set_posititon (TrackerParser *parser,
guint position)
{
guint byte_start, byte_end;
gboolean para;
-
-
- parser->word_position = 0;
- parser->cursor = parser->txt;
+
+
+ parser->word_position = 0;
+ parser->cursor = parser->txt;
parser->attr_pos = 0;
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
@@ -838,34 +832,26 @@
} else {
char *s = parser_next (parser, position, &byte_start, &byte_end, &para);
g_free (s);
- }
-
-}
-
-void
-tracker_parser_free (TrackerParser *parser)
-{
- if (parser->attrs) g_free (parser->attrs);
-
- g_free (parser);
+ }
}
+void
+tracker_parser_free (TrackerParser *parser)
+{
+ if (parser->attrs) g_free (parser->attrs);
+ g_free (parser);
-
-
-
-/* old stuff */
-
+}
gchar *
-tracker_parser_text_to_string (const gchar *txt,
+tracker_parser_text_to_string (const gchar *txt,
TrackerLanguage *language,
gint max_word_length,
gint min_word_length,
- gboolean filter_words,
- gboolean filter_numbers,
+ gboolean filter_words,
+ gboolean filter_numbers,
gboolean delimit)
{
const gchar *p = txt;
@@ -893,54 +879,54 @@
PangoLogAttr *attrs;
guint nb_bytes, str_len, word_start;
GString *strs;
-
+
nb_bytes = strlen (txt);
str_len = g_utf8_strlen (txt, -1);
-
+
strs = g_string_new (" ");
-
+
attrs = g_new0 (PangoLogAttr, str_len + 1);
-
- pango_get_log_attrs (txt,
- nb_bytes,
- 0,
- pango_language_from_string ("C"),
- attrs,
+
+ pango_get_log_attrs (txt,
+ nb_bytes,
+ 0,
+ pango_language_from_string ("C"),
+ attrs,
str_len + 1);
-
+
word_start = 0;
-
+
for (i = 0; i < str_len + 1; i++) {
if (attrs[i].is_word_end) {
gchar *start_word, *end_word;
-
+
start_word = g_utf8_offset_to_pointer (txt, word_start);
end_word = g_utf8_offset_to_pointer (txt, i);
-
+
if (start_word != end_word) {
/* Normalize word */
gchar *s;
gchar *index_word;
-
+
s = g_utf8_casefold (start_word, end_word - start_word);
index_word = g_utf8_normalize (s, -1, G_NORMALIZE_NFC);
g_free (s);
-
+
strs = g_string_append (strs, index_word);
strs = g_string_append_c (strs, ' ');
g_free (index_word);
}
-
+
word_start = i;
}
-
+
if (attrs[i].is_word_start) {
word_start = i;
}
}
-
+
g_free (attrs);
-
+
parsed_text = g_string_free (strs, FALSE);
return g_strstrip (parsed_text);
} else {
@@ -948,24 +934,24 @@
gchar *word;
str = g_string_new (" ");
-
+
while (TRUE) {
i++;
p = analyze_text (p,
- language,
+ language,
max_word_length,
min_word_length,
- filter_words,
- filter_numbers,
+ filter_words,
+ filter_numbers,
delimit,
&word);
-
+
if (word) {
g_string_append (str, word);
g_string_append_c (str, ' ');
g_free (word);
}
-
+
if (!p || !*p) {
parsed_text = g_string_free (str, FALSE);
return g_strstrip (parsed_text);
@@ -989,12 +975,12 @@
g_return_val_if_fail (language != NULL, NULL);
- s = tracker_parser_text_to_string (text,
- language,
+ s = tracker_parser_text_to_string (text,
+ language,
max_word_length,
min_word_length,
- TRUE,
- FALSE,
+ TRUE,
+ FALSE,
FALSE);
strv = g_strsplit (g_strstrip (s), " ", -1);
g_free (s);
@@ -1004,21 +990,21 @@
GHashTable *
tracker_parser_text_fast (GHashTable *word_table,
- const gchar *txt,
+ const gchar *txt,
gint weight)
{
- gchar **tmp;
+ gchar **tmp;
gchar **array;
gpointer k = 0;
gpointer v = 0;
/* Use this for already processed text only */
if (!word_table) {
- word_table = g_hash_table_new_full (g_str_hash,
+ word_table = g_hash_table_new_full (g_str_hash,
g_str_equal,
g_free,
NULL);
- }
+ }
if (!txt || weight == 0) {
return word_table;
@@ -1032,13 +1018,13 @@
}
if (!g_hash_table_lookup_extended (word_table, *tmp, &k, &v)) {
- g_hash_table_insert (word_table,
- g_strdup (*tmp),
- GINT_TO_POINTER (GPOINTER_TO_INT (v) + weight));
+ g_hash_table_insert (word_table,
+ g_strdup (*tmp),
+ GINT_TO_POINTER (GPOINTER_TO_INT (v) + weight));
} else {
- g_hash_table_insert (word_table,
- *tmp,
- GINT_TO_POINTER (GPOINTER_TO_INT (v) + weight));
+ g_hash_table_insert (word_table,
+ *tmp,
+ GINT_TO_POINTER (GPOINTER_TO_INT (v) + weight));
}
}
@@ -1049,23 +1035,23 @@
static gboolean
word_table_increment (GHashTable *word_table,
- gchar *index_word,
+ gchar *index_word,
gint weight,
gint total_words,
- gint max_words_to_index)
+ gint max_words_to_index)
{
gboolean update_count;
update_count = total_words <= max_words_to_index;
- if (update_count) {
+ if (update_count) {
gpointer p;
gint count;
-
+
p = g_hash_table_lookup (word_table, index_word);
count = GPOINTER_TO_INT (p);
- g_hash_table_replace (word_table,
+ g_hash_table_replace (word_table,
index_word,
GINT_TO_POINTER (count + weight));
} else {
@@ -1077,14 +1063,14 @@
GHashTable *
-tracker_parser_text (GHashTable *word_table,
- const gchar *txt,
- gint weight,
+tracker_parser_text (GHashTable *word_table,
+ const gchar *txt,
+ gint weight,
TrackerLanguage *language,
gint max_words_to_index,
gint max_word_length,
gint min_word_length,
- gboolean filter_words,
+ gboolean filter_words,
gboolean delimit_words)
{
const gchar *p;
@@ -1096,7 +1082,7 @@
g_return_val_if_fail (language != NULL, NULL);
if (!word_table) {
- word_table = g_hash_table_new_full (g_str_hash,
+ word_table = g_hash_table_new_full (g_str_hash,
g_str_equal,
g_free,
NULL);
@@ -1109,7 +1095,7 @@
return word_table;
}
- p = txt;
+ p = txt;
i = 0;
if (text_needs_pango (txt)) {
@@ -1123,13 +1109,13 @@
attrs = g_new0 (PangoLogAttr, str_len + 1);
- pango_get_log_attrs (txt,
- nb_bytes,
- 0,
- pango_language_from_string ("C"),
- attrs,
+ pango_get_log_attrs (txt,
+ nb_bytes,
+ 0,
+ pango_language_from_string ("C"),
+ attrs,
str_len + 1);
-
+
word_start = 0;
for (i = 0; i < str_len + 1; i++) {
@@ -1158,9 +1144,9 @@
}
total_words++;
- was_updated = word_table_increment (word_table,
- index_word,
- weight,
+ was_updated = word_table_increment (word_table,
+ index_word,
+ weight,
total_words,
max_words_to_index);
@@ -1171,38 +1157,38 @@
word_start = i;
}
-
+
if (attrs[i].is_word_start) {
word_start = i;
}
}
- g_free (attrs);
+ g_free (attrs);
} else {
gchar *word;
while (TRUE) {
i++;
- p = analyze_text (p,
+ p = analyze_text (p,
language,
max_word_length,
min_word_length,
- filter_words,
- filter_words,
+ filter_words,
+ filter_words,
delimit_words,
&word);
if (word) {
total_words++;
- if (!word_table_increment (word_table,
- word,
- weight,
+ if (!word_table_increment (word_table,
+ word,
+ weight,
total_words,
max_words_to_index)) {
break;
}
- }
+ }
if (!p || !*p) {
break;
Modified: branches/indexer-split/src/libtracker-common/tracker-parser.h
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.h (original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.h Thu Aug 21 08:49:34 2008
@@ -1,6 +1,7 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
+ * Copyright (C) 2008, Nokia
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -28,7 +29,6 @@
G_BEGIN_DECLS
-
typedef enum {
TRACKER_PARSER_ENCODING_ASCII,
TRACKER_PARSER_ENCODING_LATIN,
@@ -37,62 +37,52 @@
} TrackerParserEncoding;
typedef struct {
- const gchar *txt;
- gint txt_size;
- TrackerLanguage *language;
- gboolean enable_stemmer;
- gboolean enable_stop_words;
- guint max_words_to_index;
- guint max_word_length;
- guint min_word_length;
- gboolean delimit_words;
-
- /* private members */
- guint word_position;
- TrackerParserEncoding encoding;
- const gchar *cursor;
+ const gchar *txt;
+ gint txt_size;
+
+ TrackerLanguage *language;
+ gboolean enable_stemmer;
+ gboolean enable_stop_words;
+ guint max_words_to_index;
+ guint max_word_length;
+ guint min_word_length;
+ gboolean delimit_words;
- /* pango members for CJK text parsing */
- PangoLogAttr * attrs;
- guint attr_length;
- guint attr_pos;
+ /* Private members */
+ guint word_position;
+ TrackerParserEncoding encoding;
+ const gchar *cursor;
+ /* Pango members for CJK text parsing */
+ PangoLogAttr *attrs;
+ guint attr_length;
+ guint attr_pos;
} TrackerParser;
-
-
-TrackerParser * tracker_parser_new (TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length);
-
-
-void tracker_parser_reset (TrackerParser *parser,
- const gchar *txt,
- gint txt_size,
- gboolean delimit_words,
- gboolean enable_stemmer,
- gboolean enable_stop_words);
-
-
-
-gchar * tracker_parser_next (TrackerParser *parser,
- guint *position,
- guint *byte_offset_start,
- guint *byte_offset_end,
- gboolean *new_paragraph,
- gboolean *stop_word);
-
-
-void tracker_parser_set_posititon (TrackerParser *parser,
- guint position);
-
-gboolean tracker_parser_is_stop_word (TrackerParser *parser, const gchar *word);
-
-static gchar * tracker_parser_process_word (TrackerParser *parser, const char *word, gint length, gboolean do_strip);
-
-void tracker_parser_free (TrackerParser *parser);
-
-
+TrackerParser *tracker_parser_new (TrackerLanguage *language,
+ gint max_word_length,
+ gint min_word_length);
+void tracker_parser_reset (TrackerParser *parser,
+ const gchar *txt,
+ gint txt_size,
+ gboolean delimit_words,
+ gboolean enable_stemmer,
+ gboolean enable_stop_words);
+gchar * tracker_parser_next (TrackerParser *parser,
+ guint *position,
+ guint *byte_offset_start,
+ guint *byte_offset_end,
+ gboolean *new_paragraph,
+ gboolean *stop_word);
+void tracker_parser_set_posititon (TrackerParser *parser,
+ guint position);
+gboolean tracker_parser_is_stop_word (TrackerParser *parser,
+ const gchar *word);
+gchar * tracker_parser_process_word (TrackerParser *parser,
+ const char *word,
+ gint length,
+ gboolean do_strip);
+void tracker_parser_free (TrackerParser *parser);
/*
@@ -110,30 +100,30 @@
*
* Returns the word_table.
*/
+GHashTable * tracker_parser_text (GHashTable *word_table,
+ const gchar *txt,
+ gint weight,
+ TrackerLanguage *language,
+ gint max_words_to_index,
+ gint max_word_length,
+ gint min_word_length,
+ gboolean filter_words,
+ gboolean delimit_words);
+GHashTable * tracker_parser_text_fast (GHashTable *word_table,
+ const char *txt,
+ gint weight);
+gchar * tracker_parser_text_to_string (const gchar *txt,
+ TrackerLanguage *language,
+ gint max_word_length,
+ gint min_word_length,
+ gboolean filter_words,
+ gboolean filter_numbers,
+ gboolean delimit);
+gchar ** tracker_parser_text_into_array (const gchar *text,
+ TrackerLanguage *language,
+ gint max_word_length,
+ gint min_word_length);
-GHashTable *tracker_parser_text (GHashTable *word_table,
- const gchar *txt,
- gint weight,
- TrackerLanguage *language,
- gint max_words_to_index,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean delimit_words);
-GHashTable *tracker_parser_text_fast (GHashTable *word_table,
- const char *txt,
- gint weight);
-gchar * tracker_parser_text_to_string (const gchar *txt,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean filter_numbers,
- gboolean delimit);
-gchar ** tracker_parser_text_into_array (const gchar *text,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length);
G_END_DECLS
Modified: branches/indexer-split/src/tracker-indexer/Makefile.am
==============================================================================
--- branches/indexer-split/src/tracker-indexer/Makefile.am (original)
+++ branches/indexer-split/src/tracker-indexer/Makefile.am Thu Aug 21 08:49:34 2008
@@ -35,7 +35,6 @@
$(top_builddir)/src/libtracker-db/libtracker-db.la \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(top_builddir)/src/libstemmer/libstemmer.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
$(trackerd_win_libs) \
$(DBUS_LIBS) \
$(GMODULE_LIBS) \
Modified: branches/indexer-split/src/tracker-indexer/modules/Makefile.am
==============================================================================
--- branches/indexer-split/src/tracker-indexer/modules/Makefile.am (original)
+++ branches/indexer-split/src/tracker-indexer/modules/Makefile.am Thu Aug 21 08:49:34 2008
@@ -33,7 +33,6 @@
libtracker_indexer_files_la_LDFLAGS = $(module_flags)
libtracker_indexer_files_la_LIBADD = \
$(top_builddir)/src/libtracker-db/libtracker-db.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
$(GMODULE_LIBS) \
$(GIO_LIBS) \
$(GLIB2_LIBS)
Modified: branches/indexer-split/src/trackerd/Makefile.am
==============================================================================
--- branches/indexer-split/src/trackerd/Makefile.am (original)
+++ branches/indexer-split/src/trackerd/Makefile.am Thu Aug 21 08:49:34 2008
@@ -76,7 +76,6 @@
$(top_builddir)/src/libtracker-db/libtracker-db.la \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(top_builddir)/src/libstemmer/libstemmer.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
$(inotify_libs) \
$(GMIME_LIBS) \
$(FAM_LIBS) \
Modified: branches/indexer-split/tests/libtracker-common/Makefile.am
==============================================================================
--- branches/indexer-split/tests/libtracker-common/Makefile.am (original)
+++ branches/indexer-split/tests/libtracker-common/Makefile.am Thu Aug 21 08:49:34 2008
@@ -58,7 +58,6 @@
tracker_file_utils_LDADD = \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(top_builddir)/tests/common/libtracker-testcommon.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
$(GMODULE_LIBS) \
$(GTHREAD_LIBS) \
$(GLIB2_LIBS)
@@ -70,7 +69,6 @@
tracker_parser_LDADD = \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(top_builddir)/tests/common/libtracker-testcommon.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
$(top_builddir)/src/libstemmer/libstemmer.la \
$(GMODULE_LIBS) \
$(GTHREAD_LIBS) \
Modified: branches/indexer-split/tests/libtracker-db/Makefile.am
==============================================================================
--- branches/indexer-split/tests/libtracker-db/Makefile.am (original)
+++ branches/indexer-split/tests/libtracker-db/Makefile.am Thu Aug 21 08:49:34 2008
@@ -33,7 +33,6 @@
$(top_builddir)/src/libtracker-db/libtracker-db.la \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(top_builddir)/tests/common/libtracker-testcommon.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
$(SQLITE3_LIBS) \
$(QDBM_LIBS) \
$(GMODULE_LIBS) \
@@ -41,56 +40,53 @@
$(GLIB2_LIBS) \
-lz
-tracker_db_manager_attach_SOURCES = \
- tracker-db-manager-test-attach.c \
- tracker-db-manager-common.c \
- tracker-db-manager-common.h
-
-tracker_db_manager_unattach_LDADD = \
- $(top_builddir)/src/libtracker-db/libtracker-db.la \
- $(top_builddir)/src/libtracker-common/libtracker-common.la \
- $(top_builddir)/tests/common/libtracker-testcommon.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
- $(SQLITE3_LIBS) \
- $(QDBM_LIBS) \
- $(GMODULE_LIBS) \
- $(GTHREAD_LIBS) \
- $(GLIB2_LIBS) \
- -lz
-
-tracker_db_manager_unattach_SOURCES = \
- tracker-db-manager-test-unattach.c \
- tracker-db-manager-common.c \
- tracker-db-manager-common.h
-
-tracker_db_manager_attach_LDADD = \
- $(top_builddir)/src/libtracker-db/libtracker-db.la \
- $(top_builddir)/src/libtracker-common/libtracker-common.la \
- $(top_builddir)/tests/common/libtracker-testcommon.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
- $(SQLITE3_LIBS) \
- $(QDBM_LIBS) \
- $(GMODULE_LIBS) \
- $(GTHREAD_LIBS) \
- $(GLIB2_LIBS) \
- -lz
-
-tracker_db_manager_custom_SOURCES = \
- tracker-db-manager-test-custom.c \
- tracker-db-manager-common.c \
- tracker-db-manager-common.h
-
-tracker_db_manager_custom_LDADD = \
- $(top_builddir)/src/libtracker-db/libtracker-db.la \
- $(top_builddir)/src/libtracker-common/libtracker-common.la \
- $(top_builddir)/tests/common/libtracker-testcommon.la \
- $(top_builddir)/src/xdgmime/libxdgmime.la \
- $(SQLITE3_LIBS) \
- $(QDBM_LIBS) \
- $(GMODULE_LIBS) \
- $(GTHREAD_LIBS) \
- $(GLIB2_LIBS) \
- -lz
+# tracker_db_manager_attach_SOURCES = \
+# tracker-db-manager-test-attach.c \
+# tracker-db-manager-common.c \
+# tracker-db-manager-common.h
+
+# tracker_db_manager_attach_LDADD = \
+# $(top_builddir)/src/libtracker-db/libtracker-db.la \
+# $(top_builddir)/src/libtracker-common/libtracker-common.la \
+# $(top_builddir)/tests/common/libtracker-testcommon.la \
+# $(SQLITE3_LIBS) \
+# $(QDBM_LIBS) \
+# $(GMODULE_LIBS) \
+# $(GTHREAD_LIBS) \
+# $(GLIB2_LIBS) \
+# -lz
+
+# tracker_db_manager_unattach_SOURCES = \
+# tracker-db-manager-test-unattach.c \
+# tracker-db-manager-common.c \
+# tracker-db-manager-common.h
+
+# tracker_db_manager_unattach_LDADD = \
+# $(top_builddir)/src/libtracker-db/libtracker-db.la \
+# $(top_builddir)/src/libtracker-common/libtracker-common.la \
+# $(top_builddir)/tests/common/libtracker-testcommon.la \
+# $(SQLITE3_LIBS) \
+# $(QDBM_LIBS) \
+# $(GMODULE_LIBS) \
+# $(GTHREAD_LIBS) \
+# $(GLIB2_LIBS) \
+# -lz
+
+# tracker_db_manager_custom_SOURCES = \
+# tracker-db-manager-test-custom.c \
+# tracker-db-manager-common.c \
+# tracker-db-manager-common.h
+
+# tracker_db_manager_custom_LDADD = \
+# $(top_builddir)/src/libtracker-db/libtracker-db.la \
+# $(top_builddir)/src/libtracker-common/libtracker-common.la \
+# $(top_builddir)/tests/common/libtracker-testcommon.la \
+# $(SQLITE3_LIBS) \
+# $(QDBM_LIBS) \
+# $(GMODULE_LIBS) \
+# $(GTHREAD_LIBS) \
+# $(GLIB2_LIBS) \
+# -lz
tracker_db_dbus_SOURCES = \
tracker-db-dbus-test.c
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]