[tracker/parser-unicode-libs-review: 72/85] Some refactoring cleaning up the code of the parser
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 72/85] Some refactoring cleaning up the code of the parser
- Date: Tue, 4 May 2010 17:30:03 +0000 (UTC)
commit cdc54c5073bd5c2f94b29d29d727db2584635cfa
Author: Aleksander Morgado <aleksander lanedo com>
Date: Thu Apr 29 12:54:24 2010 +0200
Some refactoring cleaning up the code of the parser
src/libtracker-common/tracker-language.c | 26 +-
src/libtracker-common/tracker-language.h | 2 +
src/libtracker-common/tracker-utils.c | 55 ++
src/libtracker-common/tracker-utils.h | 20 +-
src/libtracker-fts/Makefile.am | 2 +
src/libtracker-fts/tracker-parser-glib.c | 502 +---------------
src/libtracker-fts/tracker-parser-libunistring.c | 707 +---------------------
src/libtracker-fts/tracker-parser-utils.c | 80 +++
src/libtracker-fts/tracker-parser-utils.h | 33 +
src/libtracker-fts/tracker-parser.h | 10 +-
10 files changed, 235 insertions(+), 1202 deletions(-)
---
diff --git a/src/libtracker-common/tracker-language.c b/src/libtracker-common/tracker-language.c
index 07df05a..d67d13d 100644
--- a/src/libtracker-common/tracker-language.c
+++ b/src/libtracker-common/tracker-language.c
@@ -330,7 +330,7 @@ tracker_language_new (const gchar *language_code)
{
TrackerLanguage *language;
- language = g_object_new (TRACKER_TYPE_LANGUAGE,
+ language = g_object_new (TRACKER_TYPE_LANGUAGE,
"language-code", language_code,
NULL);
@@ -380,6 +380,30 @@ tracker_language_get_stop_words (TrackerLanguage *language)
}
/**
+ * tracker_language_is_stop_word:
+ * @language: a #TrackerLanguage
+ * @word: a string containing a word, or %NULL
+ *
+ * Checks whether the given @word is in the list of stop words of the
+ * given @language. A %NULL @word is never considered a stop word.
+ *
+ * Returns: %TRUE if @word is a stop word. %FALSE otherwise.
+ */
+gboolean
+tracker_language_is_stop_word (TrackerLanguage *language,
+                               const gchar     *word)
+{
+	TrackerLanguagePriv *priv;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), FALSE);
+
+	priv = GET_PRIV (language);
+
+	/* A NULL word is not an error here; it simply isn't a stop word */
+	return word != NULL && g_hash_table_lookup (priv->stop_words, word) != NULL;
+}
+
+/**
* tracker_language_get_language_code:
* @language: a #TrackerLanguage
*
diff --git a/src/libtracker-common/tracker-language.h b/src/libtracker-common/tracker-language.h
index f0ff3cd..71e00f1 100644
--- a/src/libtracker-common/tracker-language.h
+++ b/src/libtracker-common/tracker-language.h
@@ -52,6 +52,8 @@ TrackerLanguage *tracker_language_new (const gchar *language_
gboolean tracker_language_get_enable_stemmer (TrackerLanguage *language);
GHashTable * tracker_language_get_stop_words (TrackerLanguage *language);
+gboolean tracker_language_is_stop_word (TrackerLanguage *language,
+ const gchar *word);
const gchar * tracker_language_get_language_code (TrackerLanguage *language);
void tracker_language_set_enable_stemmer (TrackerLanguage *language,
diff --git a/src/libtracker-common/tracker-utils.c b/src/libtracker-common/tracker-utils.c
index c2154f1..80e574b 100644
--- a/src/libtracker-common/tracker-utils.c
+++ b/src/libtracker-common/tracker-utils.c
@@ -20,6 +20,7 @@
#include "config.h"
+#include <stdio.h>
#include <string.h>
#include <locale.h>
@@ -151,3 +152,57 @@ tracker_seconds_to_string (gdouble seconds_elapsed,
+/**
+ * tracker_strhex:
+ * @data: The input array of bytes
+ * @size: Number of bytes in the input array
+ * @delimiter: Character to use as separator between each printed byte
+ *
+ * Returns the contents of @data as a printable, NUL-terminated string in
+ * uppercase hexadecimal representation (e.g. "0A:FF" for ':' delimiter).
+ * If @size is 0, an empty string is returned.
+ *
+ * Based on GNU PDF's pdf_text_test_get_hex()
+ *
+ * Returns: A newly allocated string which should be disposed with g_free()
+ **/
+gchar *
+tracker_strhex (const guint8 *data,
+                gsize         size,
+                gchar         delimiter)
+{
+	static const gchar hex_digits[] = "0123456789ABCDEF";
+	gsize i;
+	gsize j;
+	gchar *new_str;
+
+	g_return_val_if_fail (data != NULL || size == 0, NULL);
+
+	/* g_malloc0 (0) returns NULL, so handle the empty input explicitly
+	 * to always honour the "newly allocated string" contract */
+	if (size == 0) {
+		return g_strdup ("");
+	}
+
+	/* If input has N bytes, we need:
+	 *  - 2N bytes for the hexadecimal representation of each byte
+	 *  - N-1 bytes for the delimiters between them
+	 *  - 1 byte for the trailing NUL
+	 * So a total of 3N bytes, zero-initialized so that the string
+	 * is already NUL-terminated. */
+	new_str = g_malloc0 (3 * size);
+
+	for (i = 0, j = 0; i < size; i++, j += 3) {
+		new_str[j] = hex_digits[data[i] >> 4];
+		new_str[j + 1] = hex_digits[data[i] & 0x0F];
+		/* No delimiter after the last byte */
+		if (i != size - 1) {
+			new_str[j + 2] = delimiter;
+		}
+	}
+
+	return new_str;
+}
+
+
+
diff --git a/src/libtracker-common/tracker-utils.h b/src/libtracker-common/tracker-utils.h
index 7af38eb..6364eaf 100644
--- a/src/libtracker-common/tracker-utils.h
+++ b/src/libtracker-common/tracker-utils.h
@@ -29,15 +29,17 @@ G_BEGIN_DECLS
#error "only <libtracker-common/tracker-common.h> must be included directly."
#endif
-gboolean tracker_is_empty_string (const char *str);
-gboolean tracker_is_blank_string (const char *str);
-gchar * tracker_seconds_estimate_to_string (gdouble seconds_elapsed,
- gboolean short_string,
- guint items_done,
- guint items_remaining);
-gchar * tracker_seconds_to_string (gdouble seconds_elapsed,
- gboolean short_string);
-
+gboolean tracker_is_empty_string (const char *str);
+gboolean tracker_is_blank_string (const char *str);
+gchar * tracker_seconds_estimate_to_string (gdouble seconds_elapsed,
+ gboolean short_string,
+ guint items_done,
+ guint items_remaining);
+gchar * tracker_seconds_to_string (gdouble seconds_elapsed,
+ gboolean short_string);
+gchar * tracker_strhex (const guint8 *data,
+ gsize size,
+ gchar delimiter);
G_END_DECLS
#endif /* __LIBTRACKER_COMMON_UTILS_H__ */
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 9b469c6..667cece 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -26,6 +26,8 @@ libtracker_fts_la_SOURCES = \
tracker-fts-config.h \
tracker-fts-hash.c \
tracker-fts-hash.h \
+ tracker-parser-utils.c \
+ tracker-parser-utils.h \
tracker-parser.h
if HAVE_LIBUNISTRING
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index c1f3a29..891d9f6 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -20,33 +20,22 @@
#include "config.h"
-#include <stdio.h>
#include <string.h>
#ifdef HAVE_UNAC
#include <unac.h>
#endif
-#ifdef HAVE_LIBUNISTRING
-/* libunistring versions prior to 9.1.2 need this hack */
-#define _UNUSED_PARAMETER_
-#include <unistr.h>
-#include <uniwbrk.h>
-#include <unictype.h>
-#include <unicase.h>
-#else
#include <pango/pango.h>
-#endif
+#include <libtracker-common/tracker-common.h>
#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
/* Define to 1 if you want to enable debugging logs showing HEX contents
* of the words being parsed */
#define TRACKER_PARSER_DEBUG_HEX 0
-
-#ifndef HAVE_LIBUNISTRING
-
/* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
* 0x9FA5, 0x20000 - <= 0x2A6D6
*/
@@ -86,21 +75,6 @@ typedef enum {
TRACKER_PARSER_ENCODING_OTHER
} TrackerParserEncoding;
-#else
-
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
-#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
- ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
- ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
-/* Max possible length of a UTF-8 encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
-#endif /* !HAVE_LIBUNISTRING */
-
struct TrackerParser {
const gchar *txt;
@@ -118,8 +92,6 @@ struct TrackerParser {
gchar *word;
gint word_length;
guint word_position;
-
-#ifndef HAVE_LIBUNISTRING
TrackerParserEncoding encoding;
const gchar *cursor;
@@ -127,61 +99,8 @@ struct TrackerParser {
PangoLogAttr *attrs;
guint attr_length;
guint attr_pos;
-#else
- /* Cursor, as index of the input array of bytes */
- gsize cursor;
- /* libunistring flags array */
- gchar *word_break_flags;
- /* general category of the start character in words */
- uc_general_category_t allowed_start;
-#endif /* !HAVE_LIBUNISTRING */
};
-
-#if TRACKER_PARSER_DEBUG_HEX
-/* Based on GNU PDF's pdf_text_test_get_hex() */
-static gchar *
-tracker_strhex (const gchar *data,
- const gsize size,
- gchar delimiter)
-{
- gint i;
- gint j;
- guint new_str_length;
- gchar *new_str;
- gchar new_hex_char [3];
-
- /* Get new string length. If input string has N bytes, we need:
- * - 1 byte for last NUL char
- * - 2N bytes for hexadecimal char representation of each byte...
- * - N-1 bytes for the separator ':'
- * So... a total of (1+2N+N-1) = 3N bytes are needed... */
- new_str_length = 3 * size;
-
- /* Allocate memory for new array and initialize contents to NUL */
- new_str = g_malloc0 (new_str_length);
-
- /* Print hexadecimal representation of each byte... */
- for(i=0, j=0; i<size; i++, j+=3) {
- memset (new_hex_char, 0, 3);
- /* Print character in helper array... */
- sprintf (new_hex_char, "%02X", (guint8)(data[i]));
- /* Copy to output string... */
- memcpy (&new_str[j],&new_hex_char[0],2);
- /* And if needed, add separator */
- if(i != (size-1) ) {
- new_str[j+2] = delimiter;
- }
- }
-
- /* Set output string */
- return new_str;
-}
-
-#endif /* TRACKER_PARSER_DEBUG_HEX */
-
-
-#ifndef HAVE_LIBUNISTRING
static inline TrackerParserWordType
get_word_type (gunichar c)
{
@@ -224,64 +143,7 @@ get_word_type (gunichar c)
return TRACKER_PARSER_WORD_IGNORE;
}
-#endif /* !HAVE_LIBUNISTRING */
-
-
-
-static inline gchar *
-strip_word (const gchar *str,
- gint length,
- guint32 *len)
-{
-#ifdef HAVE_UNAC
- GError *error = NULL;
- gchar *str_utf16;
- gsize utf16_len, unaccented_len, final_len;
- gchar *unaccented_str = NULL;
- gchar *s = NULL;
-
- *len = 0;
-
- /* unac_string() does roughly the same than below, plus it
- * corrupts memory in 64bit systems, so avoid it for now.
- */
- str_utf16 = g_convert (str, length, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
- if (error) {
- g_warning ("Could not convert to UTF-16: %s", error->message);
- g_error_free (error);
- return NULL;
- }
-
- if (unac_string_utf16 (str_utf16, utf16_len,
- &unaccented_str, &unaccented_len) != 0) {
- g_warning ("UNAC failed to strip accents");
- g_free (str_utf16);
- return NULL;
- }
-
- g_free (str_utf16);
-
- s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
- g_free (unaccented_str);
-
- if (error) {
- g_warning ("Could not convert back to UTF-8: %s", error->message);
- g_error_free (error);
- return NULL;
- }
-
- *len = (guint32) final_len;
-
- return s;
-#else
- *len = length;
- return NULL;
-#endif
-}
-
-#ifndef HAVE_LIBUNISTRING
static TrackerParserEncoding
get_encoding (const gchar *txt)
{
@@ -310,27 +172,6 @@ get_encoding (const gchar *txt)
}
-#endif /* !HAVE_LIBUNISTRING */
-
-
-static gboolean
-is_stop_word (TrackerLanguage *language,
- const gchar *word)
-{
- GHashTable *stop_words;
-
- if (!word) {
- return FALSE;
- }
-
- stop_words = tracker_language_get_stop_words (language);
-
- return g_hash_table_lookup (stop_words, word) != NULL;
-}
-
-
-#ifndef HAVE_LIBUNISTRING
-
static gboolean
pango_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -559,8 +400,6 @@ parser_next (TrackerParser *parser,
gchar *utf8;
gchar *processed_word;
-
-
utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
if (!utf8) {
@@ -588,146 +427,6 @@ parser_next (TrackerParser *parser,
}
-#else
-
-
-/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
- * UNAC stripping.
- * Just check byte per byte, and if any of the bytes is >127, then it's not
- * ASCII-7 */
-static gboolean
-is_ascii_word (const gchar *word,
- gsize length)
-{
- gsize i;
-
- for (i = 0; i < length; i++) {
- if (!IS_ASCII_BYTE ((guchar)word[i])) {
- return FALSE;
- }
- }
- return TRUE;
-}
-
-
-/* libunistring-based parser */
-static gboolean
-parser_next (TrackerParser *parser,
- gint *byte_offset_start,
- gint *byte_offset_end)
-{
-
- gsize word_length = 0;
- gchar *processed_word = NULL;
-
- *byte_offset_start = 0;
- *byte_offset_end = 0;
-
- g_return_val_if_fail (parser, FALSE);
-
- /* Loop to look for next valid word */
- while (!processed_word &&
- parser->cursor < parser->txt_size) {
- ucs4_t first_unichar;
- gint first_unichar_len;
- gsize i;
- gsize truncated_length;
- gboolean do_strip;
-
- /* Get first character of the word as UCS4 */
- first_unichar_len = u8_strmbtouc (&first_unichar,
- &(parser->txt[parser->cursor]));
- if (first_unichar_len <= 0) {
- /* This should only happen if NIL was passed to u8_strmbtouc,
- * so better just force stop here */
- parser->cursor = parser->txt_size;
- break;
- }
-
- /* Find next word break */
- i = parser->cursor + first_unichar_len;
- while (i < parser->txt_size &&
- !parser->word_break_flags [i]) {
- i++;
- }
-
- /* Word end is the first byte after the word, which is either the
- * start of next word or the end of the string */
- word_length = i - parser->cursor;
-
- /* We only want the words where the first character
- * in the word is either a letter, a number or a symbol.
- * This is needed because the word break algorithm also
- * considers word breaks after for example commas or other
- * punctuation marks.
- * Note that looking at the first character in the string
- * should be compatible with all Unicode normalization
- * methods.
- */
- if (!uc_is_general_category (first_unichar,
- parser->allowed_start)) {
- /* Skip this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
-
- /* check if word is reserved */
- if (parser->parse_reserved_words &&
- word_length == 2 &&
- parser->txt[parser->cursor] == 'o' &&
- parser->txt[parser->cursor + 1] == 'r') {
- /* Skip this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
-
- /* compute truncated word length if needed (to avoid extremely
- * long words)*/
- truncated_length = (word_length < WORD_BUFFER_LENGTH ?
- word_length :
- WORD_BUFFER_LENGTH - 1);
-
- /* Enable UNAC stripping only if no ASCII and no CJK */
- do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
- truncated_length) &&
- !IS_CJK_UCS4 (first_unichar));
-
- /* Process the word here. If it fails, we can still go
- * to the next one. Returns newly allocated string
- * always */
- processed_word = tracker_parser_process_word (parser,
- &(parser->txt[parser->cursor]),
- truncated_length,
- do_strip);
- if (!processed_word) {
- /* Skip this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
- }
-
- /* If we got a word here, set output */
- if (processed_word) {
- /* Set outputs */
- *byte_offset_start = parser->cursor;
- *byte_offset_end = parser->cursor + word_length;
-
- /* Update cursor */
- parser->cursor += word_length;
-
- parser->word_length = strlen (processed_word);
- parser->word = processed_word;
-
- return TRUE;
- }
-
- /* No more words... */
- return FALSE;
-}
-
-#endif /* !HAVE_LIBUNISTRING */
-
-
TrackerParser *
tracker_parser_new (TrackerLanguage *language,
gint max_word_length)
@@ -743,12 +442,7 @@ tracker_parser_new (TrackerLanguage *language,
parser->max_word_length = max_word_length;
parser->word_length = 0;
-
-#ifndef HAVE_LIBUNISTRING
parser->attrs = NULL;
-#else
- parser->word_break_flags = NULL;
-#endif /* !HAVE_LIBUNISTRING */
return parser;
}
@@ -762,11 +456,7 @@ tracker_parser_free (TrackerParser *parser)
g_object_unref (parser->language);
}
-#ifndef HAVE_LIBUNISTRING
g_free (parser->attrs);
-#else
- g_free (parser->word_break_flags);
-#endif /* !HAVE_LIBUNISTRING */
g_free (parser->word);
@@ -785,13 +475,12 @@ tracker_parser_reset (TrackerParser *parser,
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
-#ifndef HAVE_LIBUNISTRING
+
g_free (parser->attrs);
parser->attrs = NULL;
parser->cursor = txt;
parser->encoding = get_encoding (txt);
-#endif
parser->enable_stemmer = enable_stemmer;
parser->enable_stop_words = enable_stop_words;
@@ -806,27 +495,6 @@ tracker_parser_reset (TrackerParser *parser,
parser->word_position = 0;
-#ifdef HAVE_LIBUNISTRING
-
- parser->cursor = 0;
-
- g_free (parser->word_break_flags);
-
- /* Create array of flags, same size as original text. */
- parser->word_break_flags = g_malloc (txt_size);
-
- /* Get wordbreak flags in the whole string */
- u8_wordbreaks ((const uint8_t *)txt,
- (size_t) txt_size,
- (char *)parser->word_break_flags);
-
- /* Prepare a custom category which is a combination of the
- * desired ones */
- parser->allowed_start = UC_LETTER;
- parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
- parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
-
-#else
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
PangoLogAttr *attrs;
@@ -848,11 +516,8 @@ tracker_parser_reset (TrackerParser *parser,
parser->attrs = attrs;
parser->attr_pos = 0;
}
-#endif /* !HAVE_LIBUNISTRING */
}
-
-#ifndef HAVE_LIBUNISTRING
gchar *
tracker_parser_process_word (TrackerParser *parser,
const gchar *word,
@@ -862,7 +527,7 @@ tracker_parser_process_word (TrackerParser *parser,
gchar *stem_word;
gchar *str;
gchar *stripped_word;
- guint bytes, len;
+ gsize bytes, len;
g_return_val_if_fail (parser != NULL, NULL);
g_return_val_if_fail (word != NULL, NULL);
@@ -889,7 +554,7 @@ tracker_parser_process_word (TrackerParser *parser,
#endif
if (do_strip) {
- stripped_word = strip_word (word, bytes, &len);
+ stripped_word = tracker_parser_unaccent_string (word, bytes, &len);
/* Log after UNAC stripping */
#if TRACKER_PARSER_DEBUG_HEX
@@ -951,152 +616,6 @@ tracker_parser_process_word (TrackerParser *parser,
return str;
}
-#else
-
-/* libunistring version of the word processor. */
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
- gint length,
- gboolean do_strip)
-{
- gchar word_buffer [WORD_BUFFER_LENGTH];
- gchar *normalized = NULL;
- gchar *stripped = NULL;
- gchar *stemmed = NULL;
- size_t new_word_length;
-
- g_return_val_if_fail (parser != NULL, NULL);
- g_return_val_if_fail (word != NULL, NULL);
-
- /* If length is set as -1, the input word MUST be NIL-terminated.
- * Otherwise, this restriction is not needed as the length to process
- * is given as input argument */
- if (length < 0) {
- length = strlen (word);
- }
-
- /* Log original word */
-#if TRACKER_PARSER_DEBUG_HEX
- {
- gchar *aux;
- gchar *word_aux;
-
- /* Word may not come NIL-terminated */
- word_aux = g_malloc (length + 1);
- memcpy (word_aux, word, length);
- word_aux[length] = '\0';
-
- aux = tracker_strhex (word, length, ':');
- g_message ("ORIGINAL word: '%s' (%s)",
- word_aux, aux);
- g_free (aux);
- g_free (word_aux);
- }
-#endif
-
- /* Leave space for last NIL */
- new_word_length = WORD_BUFFER_LENGTH - 1;
-
- /* Casefold and NFC normalization in output.
- * NOTE: if the output buffer is not big enough, u8_casefold will
- * return a newly-allocated buffer. */
- normalized = u8_casefold ((const uint8_t *)word,
- length,
- uc_locale_language (),
- UNINORM_NFC,
- word_buffer,
- &new_word_length);
-
- /* Case folding + Normalization failed, skip this word */
- g_return_val_if_fail (normalized != NULL, NULL);
-
- /* If output buffer is not the same as the one passed to
- * u8_casefold, we know it was newly-allocated, so need
- * to resize it in 1 byte to add last NIL */
- if (normalized != word_buffer) {
- normalized = g_realloc (normalized, new_word_length + 1);
- }
-
- /* Set output NIL */
- normalized[new_word_length] = '\0';
-
-#if TRACKER_PARSER_DEBUG_HEX
- {
- gchar *aux;
- aux = tracker_strhex (normalized, new_word_length, ':');
- g_message (" After Casefolding and NFC normalization: '%s' (%s)",
- normalized, aux);
- g_free (aux);
- }
-#endif
-
- /* UNAC stripping needed? */
- if (do_strip) {
- guint32 stripped_word_length;
-
- stripped = strip_word (normalized,
- new_word_length,
- &stripped_word_length);
-
- if (stripped) {
- /* Log after UNAC stripping */
-#if TRACKER_PARSER_DEBUG_HEX
- {
- gchar *aux;
- aux = tracker_strhex (stripped, stripped_word_length, ':');
- g_message (" After UNAC stripping: '%s' (%s)",
- stripped, aux);
- g_free (aux);
- }
-#endif
- new_word_length = stripped_word_length;
- }
- }
-
-
- /* Stemming needed? */
- if (parser->enable_stemmer) {
- stemmed = tracker_language_stem_word (parser->language,
- stripped ? stripped : normalized,
- new_word_length);
-#if TRACKER_PARSER_DEBUG_HEX
- if (stemmed) {
- gchar *aux;
- aux = tracker_strhex (stemmed, strlen (stemmed), ':');
- g_message (" After stemming: '%s' (%s)",
- stemmed, aux);
- g_free (aux);
- }
-#endif
- }
-
- /* If stemmed wanted and succeeded, free previous and return it */
- if (stemmed) {
- g_free (stripped);
- if (normalized != word_buffer) {
- g_free (normalized);
- }
- return stemmed;
- }
-
- /* If stripped wanted and succeeded, free previous and return it */
- if (stripped) {
- if (normalized != word_buffer) {
- g_free (normalized);
- }
- return stripped;
- }
-
- /* It may be the case that no stripping and no stemming was needed, and
- * that the output buffer in stack was enough for case-folding and
- * normalization. In this case, need to strdup() the string to return it */
- return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
-}
-
-#endif /* !HAVE_LIBUNISTRING */
-
-
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
@@ -1113,8 +632,6 @@ tracker_parser_next (TrackerParser *parser,
g_free (parser->word);
parser->word = NULL;
-
-#ifndef HAVE_LIBUNISTRING
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
if (pango_next (parser, &byte_start, &byte_end)) {
str = parser->word;
@@ -1122,14 +639,13 @@ tracker_parser_next (TrackerParser *parser,
parser->word_position++;
*stop_word = FALSE;
- } else
-#endif /* !HAVE_LIBUNISTRING */
- {
+ } else {
if (parser_next (parser, &byte_start, &byte_end)) {
str = parser->word;
}
- if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
+ if (parser->enable_stop_words &&
+ tracker_language_is_stop_word (parser->language, str)) {
*stop_word = TRUE;
} else {
parser->word_position++;
@@ -1145,5 +661,3 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
-
-
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 5ffa73e..0052cb0 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -1,3 +1,4 @@
+
/*
* Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
* Copyright (C) 2008, Nokia <ivan frade nokia com>
@@ -27,67 +28,21 @@
#include <unac.h>
#endif
-#ifdef HAVE_LIBUNISTRING
/* libunistring versions prior to 9.1.2 need this hack */
#define _UNUSED_PARAMETER_
#include <unistr.h>
#include <uniwbrk.h>
#include <unictype.h>
#include <unicase.h>
-#else
-#include <pango/pango.h>
-#endif
+#include <libtracker-common/tracker-common.h>
#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
/* Define to 1 if you want to enable debugging logs showing HEX contents
* of the words being parsed */
#define TRACKER_PARSER_DEBUG_HEX 0
-
-#ifndef HAVE_LIBUNISTRING
-
-/* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
- * 0x9FA5, 0x20000 - <= 0x2A6D6
- */
-#define NEED_PANGO(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
- ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
- ((c) >= 0x20000 && (c) <= 0x2A6D6))
-#define IS_LATIN(c) (((c) <= 0x02AF) || \
- ((c) >= 0x1E00 && (c) <= 0x1EFF))
-#define IS_ASCII(c) ((c) <= 0x007F)
-#define IS_ASCII_ALPHA_LOWER(c) ((c) >= 0x0061 && (c) <= 0x007A)
-#define IS_ASCII_ALPHA_HIGHER(c) ((c) >= 0x0041 && (c) <= 0x005A)
-#define IS_ASCII_NUMERIC(c) ((c) >= 0x0030 && (c) <= 0x0039)
-#define IS_ASCII_IGNORE(c) ((c) <= 0x002C)
-#define IS_HYPHEN(c) ((c) == 0x002D)
-#define IS_UNDERSCORE(c) ((c) == 0x005F)
-#define IS_NEWLINE(c) ((c) == 0x000D)
-#define IS_O(c) ((c) == 0x006F)
-#define IS_R(c) ((c) == 0x0072)
-
-typedef enum {
- TRACKER_PARSER_WORD_ASCII_HIGHER,
- TRACKER_PARSER_WORD_ASCII_LOWER,
- TRACKER_PARSER_WORD_HYPHEN,
- TRACKER_PARSER_WORD_UNDERSCORE,
- TRACKER_PARSER_WORD_NUM,
- TRACKER_PARSER_WORD_ALPHA_HIGHER,
- TRACKER_PARSER_WORD_ALPHA_LOWER,
- TRACKER_PARSER_WORD_ALPHA,
- TRACKER_PARSER_WORD_ALPHA_NUM,
- TRACKER_PARSER_WORD_IGNORE
-} TrackerParserWordType;
-
-typedef enum {
- TRACKER_PARSER_ENCODING_ASCII,
- TRACKER_PARSER_ENCODING_LATIN,
- TRACKER_PARSER_ENCODING_CJK,
- TRACKER_PARSER_ENCODING_OTHER
-} TrackerParserEncoding;
-
-#else
-
/* ASCII-7 is in range [0x00,0x7F] */
#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
@@ -99,8 +54,6 @@ typedef enum {
/* Max possible length of a UTF-8 encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512
-#endif /* !HAVE_LIBUNISTRING */
-
struct TrackerParser {
const gchar *txt;
@@ -119,480 +72,14 @@ struct TrackerParser {
gint word_length;
guint word_position;
-#ifndef HAVE_LIBUNISTRING
- TrackerParserEncoding encoding;
- const gchar *cursor;
-
- /* Pango members for CJK text parsing */
- PangoLogAttr *attrs;
- guint attr_length;
- guint attr_pos;
-#else
/* Cursor, as index of the input array of bytes */
gsize cursor;
/* libunistring flags array */
gchar *word_break_flags;
/* general category of the start character in words */
uc_general_category_t allowed_start;
-#endif /* !HAVE_LIBUNISTRING */
};
-
-#if TRACKER_PARSER_DEBUG_HEX
-/* Based on GNU PDF's pdf_text_test_get_hex() */
-static gchar *
-tracker_strhex (const gchar *data,
- const gsize size,
- gchar delimiter)
-{
- gint i;
- gint j;
- guint new_str_length;
- gchar *new_str;
- gchar new_hex_char [3];
-
- /* Get new string length. If input string has N bytes, we need:
- * - 1 byte for last NUL char
- * - 2N bytes for hexadecimal char representation of each byte...
- * - N-1 bytes for the separator ':'
- * So... a total of (1+2N+N-1) = 3N bytes are needed... */
- new_str_length = 3 * size;
-
- /* Allocate memory for new array and initialize contents to NUL */
- new_str = g_malloc0 (new_str_length);
-
- /* Print hexadecimal representation of each byte... */
- for(i=0, j=0; i<size; i++, j+=3) {
- memset (new_hex_char, 0, 3);
- /* Print character in helper array... */
- sprintf (new_hex_char, "%02X", (guint8)(data[i]));
- /* Copy to output string... */
- memcpy (&new_str[j],&new_hex_char[0],2);
- /* And if needed, add separator */
- if(i != (size-1) ) {
- new_str[j+2] = delimiter;
- }
- }
-
- /* Set output string */
- return new_str;
-}
-
-#endif /* TRACKER_PARSER_DEBUG_HEX */
-
-
-#ifndef HAVE_LIBUNISTRING
-static inline TrackerParserWordType
-get_word_type (gunichar c)
-{
- /* Fast ascii handling */
- if (IS_ASCII (c)) {
- if (IS_ASCII_ALPHA_LOWER (c)) {
- return TRACKER_PARSER_WORD_ASCII_LOWER;
- }
-
- if (IS_ASCII_ALPHA_HIGHER (c)) {
- return TRACKER_PARSER_WORD_ASCII_HIGHER;
- }
-
- if (IS_ASCII_IGNORE (c)) {
- return TRACKER_PARSER_WORD_IGNORE;
- }
-
- if (IS_ASCII_NUMERIC (c)) {
- return TRACKER_PARSER_WORD_NUM;
- }
-
- if (IS_HYPHEN (c)) {
- return TRACKER_PARSER_WORD_HYPHEN;
- }
-
- if (IS_UNDERSCORE (c)) {
- return TRACKER_PARSER_WORD_UNDERSCORE;
- }
- } else {
- if (g_unichar_isalpha (c)) {
- if (!g_unichar_isupper (c)) {
- return TRACKER_PARSER_WORD_ALPHA_LOWER;
- } else {
- return TRACKER_PARSER_WORD_ALPHA_HIGHER;
- }
- } else if (g_unichar_isdigit (c)) {
- return TRACKER_PARSER_WORD_NUM;
- }
- }
-
- return TRACKER_PARSER_WORD_IGNORE;
-}
-#endif /* !HAVE_LIBUNISTRING */
-
-
-
-static inline gchar *
-strip_word (const gchar *str,
- gint length,
- guint32 *len)
-{
-#ifdef HAVE_UNAC
- GError *error = NULL;
- gchar *str_utf16;
- gsize utf16_len, unaccented_len, final_len;
- gchar *unaccented_str = NULL;
- gchar *s = NULL;
-
- *len = 0;
-
- /* unac_string() does roughly the same than below, plus it
- * corrupts memory in 64bit systems, so avoid it for now.
- */
- str_utf16 = g_convert (str, length, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
- if (error) {
- g_warning ("Could not convert to UTF-16: %s", error->message);
- g_error_free (error);
- return NULL;
- }
-
- if (unac_string_utf16 (str_utf16, utf16_len,
- &unaccented_str, &unaccented_len) != 0) {
- g_warning ("UNAC failed to strip accents");
- g_free (str_utf16);
- return NULL;
- }
-
- g_free (str_utf16);
-
- s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
- g_free (unaccented_str);
-
- if (error) {
- g_warning ("Could not convert back to UTF-8: %s", error->message);
- g_error_free (error);
- return NULL;
- }
-
- *len = (guint32) final_len;
-
- return s;
-#else
- *len = length;
- return NULL;
-#endif
-}
-
-
-#ifndef HAVE_LIBUNISTRING
-static TrackerParserEncoding
-get_encoding (const gchar *txt)
-{
- const gchar *p;
- gunichar c;
- gint i = 0;
-
- /* Grab first 255 non-whitespace chars and test */
- for (p = txt; *p && i < 255; p = g_utf8_next_char (p)) {
- c = g_utf8_get_char (p);
-
- if (!g_unichar_isspace (c)) {
- i++;
- }
-
- if (IS_ASCII(c)) continue;
-
- if (IS_LATIN(c)) return TRACKER_PARSER_ENCODING_LATIN;
-
- if (NEED_PANGO(c)) return TRACKER_PARSER_ENCODING_CJK;
-
- return TRACKER_PARSER_ENCODING_OTHER;
- }
-
- return TRACKER_PARSER_ENCODING_ASCII;
-
-}
-
-#endif /* !HAVE_LIBUNISTRING */
-
-
-static gboolean
-is_stop_word (TrackerLanguage *language,
- const gchar *word)
-{
- GHashTable *stop_words;
-
- if (!word) {
- return FALSE;
- }
-
- stop_words = tracker_language_get_stop_words (language);
-
- return g_hash_table_lookup (stop_words, word) != NULL;
-}
-
-
-#ifndef HAVE_LIBUNISTRING
-
-static gboolean
-pango_next (TrackerParser *parser,
- gint *byte_offset_start,
- gint *byte_offset_end)
-
-{
- /* CJK text does not need stemming or other treatment */
- gint word_start = -1;
- gint old_word_start = -1;
- guint i;
-
- for (i = parser->attr_pos; i < parser->attr_length; i++) {
- if (parser->attrs[i].is_word_start) {
- word_start = i;
- continue;
- }
-
- if (parser->attrs[i].is_word_end && word_start != old_word_start) {
- gchar *start_word, *end_word;
-
- old_word_start = word_start;
-
- start_word = g_utf8_offset_to_pointer (parser->txt, word_start);
- end_word = g_utf8_offset_to_pointer (parser->txt, i);
-
- if (start_word != end_word) {
- gchar *str;
- gchar *index_word;
-
- /* Normalize word */
- str = g_utf8_casefold (start_word, end_word - start_word);
- if (!str) {
- continue;
- }
-
- index_word = g_utf8_normalize (str, -1, G_NORMALIZE_NFC);
- g_free (str);
-
- if (!index_word) {
- continue;
- }
-
- parser->word_length = strlen (index_word);
- parser->word = index_word;
-
- *byte_offset_start = (start_word - parser->txt);
- *byte_offset_end = *byte_offset_start + (end_word - start_word);
- parser->attr_pos = i;
-
-
- return TRUE;
-
- }
-
- word_start = i;
- }
- }
-
- parser->attr_pos = i;
-
- return FALSE;
-}
-
-
-static gboolean
-parser_next (TrackerParser *parser,
- gint *byte_offset_start,
- gint *byte_offset_end)
-{
- TrackerParserWordType word_type;
- gunichar word[64];
- gboolean is_valid;
- guint length;
- gint char_count = 0;
- glong bytes;
- const gchar *p;
- const gchar *start;
- const gchar *end;
- gboolean do_strip = FALSE;
-
- *byte_offset_start = 0;
- *byte_offset_end = 0;
-
- g_return_val_if_fail (parser, FALSE);
-
- if (!parser->cursor) {
- return FALSE;
- }
-
- word_type = TRACKER_PARSER_WORD_IGNORE;
- is_valid = TRUE;
- length = 0;
- bytes = 0;
-
- start = NULL;
- end = NULL;
-
- for (p = parser->cursor; *p && *p != '\0'; p = g_utf8_next_char (p)) {
- TrackerParserWordType type;
- gunichar c;
-
- char_count++;
- c = g_utf8_get_char (p);
- type = get_word_type (c);
-
- if (type == TRACKER_PARSER_WORD_IGNORE ||
- (parser->delimit_words &&
- (type == TRACKER_PARSER_WORD_HYPHEN ||
- type == TRACKER_PARSER_WORD_UNDERSCORE))) {
- if (!start) {
- continue;
- } else {
- /* word break */
-
- /* check if word is reserved */
- if (is_valid && parser->parse_reserved_words) {
- if (length == 2 && word[0] == 'o' && word[1] == 'r') {
- break;
- }
- }
-
- if (!is_valid ||
- word_type == TRACKER_PARSER_WORD_NUM) {
- word_type = TRACKER_PARSER_WORD_IGNORE;
- is_valid = TRUE;
- length = 0;
- bytes = 0;
- start = NULL;
- end = NULL;
- do_strip = FALSE;
-
- continue;
- }
-
- break;
- }
- }
-
- if (!is_valid) {
- continue;
- }
-
- if (!start) {
- start = g_utf8_offset_to_pointer (parser->cursor, char_count-1);
-
- /* Valid words must start with an alpha or
- * underscore if we are filtering.
- */
-
- if (type == TRACKER_PARSER_WORD_NUM) {
- is_valid = FALSE;
- continue;
- } else {
- if (type == TRACKER_PARSER_WORD_HYPHEN) {
- is_valid = parser->parse_reserved_words;
- continue;
- }
- }
- }
-
- if (length >= parser->max_word_length) {
- continue;
- }
-
- length++;
-
- switch (type) {
- case TRACKER_PARSER_WORD_ASCII_HIGHER:
- c += 32;
-
- /* Fall through */
- case TRACKER_PARSER_WORD_ASCII_LOWER:
- case TRACKER_PARSER_WORD_HYPHEN:
- case TRACKER_PARSER_WORD_UNDERSCORE:
- if (word_type == TRACKER_PARSER_WORD_NUM ||
- word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
- word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
- } else {
- word_type = TRACKER_PARSER_WORD_ALPHA;
- }
-
- break;
-
- case TRACKER_PARSER_WORD_NUM:
- if (word_type == TRACKER_PARSER_WORD_ALPHA ||
- word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
- word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
- } else {
- word_type = TRACKER_PARSER_WORD_NUM;
- }
- break;
-
- case TRACKER_PARSER_WORD_ALPHA_HIGHER:
- c = g_unichar_tolower (c);
-
- /* Fall through */
- case TRACKER_PARSER_WORD_ALPHA_LOWER:
- if (!do_strip) {
- do_strip = TRUE;
- }
-
- if (word_type == TRACKER_PARSER_WORD_NUM ||
- word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
- word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
- } else {
- word_type = TRACKER_PARSER_WORD_ALPHA;
- }
-
- break;
-
- case TRACKER_PARSER_WORD_ALPHA:
- case TRACKER_PARSER_WORD_ALPHA_NUM:
- case TRACKER_PARSER_WORD_IGNORE:
- default:
- break;
- }
-
- word[length -1] = c;
- }
-
- parser->cursor = NULL;
-
- if (!is_valid) {
- return FALSE;
- }
-
- if (word_type == TRACKER_PARSER_WORD_ALPHA_NUM || word_type == TRACKER_PARSER_WORD_ALPHA) {
- gchar *utf8;
- gchar *processed_word;
-
-
-
- utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
-
- if (!utf8) {
- return FALSE;
- }
-
- *byte_offset_start = start-parser->txt;
- *byte_offset_end = *byte_offset_start + bytes;
-
- parser->cursor = parser->txt + *byte_offset_end;
-
- processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
- g_free (utf8);
-
- if (processed_word) {
- parser->word_length = strlen (processed_word);
- parser->word = processed_word;
-
- return TRUE;
- }
-
- }
-
- return FALSE;
-
-}
-
-#else
-
-
/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
* UNAC stripping.
* Just check byte per byte, and if any of the bytes is >127, then it's not
@@ -611,14 +98,12 @@ is_ascii_word (const gchar *word,
return TRUE;
}
-
/* libunistring-based parser */
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
gint *byte_offset_end)
{
-
gsize word_length = 0;
gchar *processed_word = NULL;
@@ -727,9 +212,6 @@ parser_next (TrackerParser *parser,
return FALSE;
}
-#endif /* !HAVE_LIBUNISTRING */
-
-
TrackerParser *
tracker_parser_new (TrackerLanguage *language,
gint max_word_length)
@@ -746,11 +228,7 @@ tracker_parser_new (TrackerLanguage *language,
parser->max_word_length = max_word_length;
parser->word_length = 0;
-#ifndef HAVE_LIBUNISTRING
- parser->attrs = NULL;
-#else
parser->word_break_flags = NULL;
-#endif /* !HAVE_LIBUNISTRING */
return parser;
}
@@ -764,11 +242,7 @@ tracker_parser_free (TrackerParser *parser)
g_object_unref (parser->language);
}
-#ifndef HAVE_LIBUNISTRING
- g_free (parser->attrs);
-#else
g_free (parser->word_break_flags);
-#endif /* !HAVE_LIBUNISTRING */
g_free (parser->word);
@@ -787,14 +261,6 @@ tracker_parser_reset (TrackerParser *parser,
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
-#ifndef HAVE_LIBUNISTRING
- g_free (parser->attrs);
- parser->attrs = NULL;
-
- parser->cursor = txt;
- parser->encoding = get_encoding (txt);
-#endif
-
parser->enable_stemmer = enable_stemmer;
parser->enable_stop_words = enable_stop_words;
parser->delimit_words = delimit_words;
@@ -808,8 +274,6 @@ tracker_parser_reset (TrackerParser *parser,
parser->word_position = 0;
-#ifdef HAVE_LIBUNISTRING
-
parser->cursor = 0;
g_free (parser->word_break_flags);
@@ -827,134 +291,8 @@ tracker_parser_reset (TrackerParser *parser,
parser->allowed_start = UC_LETTER;
parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
-
-#else
- if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
- PangoLogAttr *attrs;
-
- if (parser->txt_size == -1) {
- parser->txt_size = strlen (parser->txt);
- }
-
- parser->attr_length = g_utf8_strlen (parser->txt, parser->txt_size) + 1;
-
- attrs = g_new0 (PangoLogAttr, parser->attr_length);
-
- pango_get_log_attrs (parser->txt,
- txt_size,
- 0,
- pango_language_from_string ("C"),
- attrs,
- parser->attr_length);
-
- parser->attrs = attrs;
- parser->attr_pos = 0;
- }
-#endif /* !HAVE_LIBUNISTRING */
}
-
-#ifndef HAVE_LIBUNISTRING
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
- gint length,
- gboolean do_strip)
-{
- gchar *stem_word;
- gchar *str;
- gchar *stripped_word;
- guint bytes, len;
-
- g_return_val_if_fail (parser != NULL, NULL);
- g_return_val_if_fail (word != NULL, NULL);
-
- str = NULL;
- stripped_word = NULL;
-
- if (word) {
- if (length == -1) {
- bytes = strlen (word);
- } else {
- bytes = length;
- }
-
- /* Log original word */
-#if TRACKER_PARSER_DEBUG_HEX
- {
- gchar *aux;
- aux = tracker_strhex (word, bytes, ':');
- g_message ("ORIGINAL word: '%s' (%s)",
- word, aux);
- g_free (aux);
- }
-#endif
-
- if (do_strip) {
- stripped_word = strip_word (word, bytes, &len);
-
- /* Log after UNAC stripping */
-#if TRACKER_PARSER_DEBUG_HEX
- {
- gchar *aux;
- aux = tracker_strhex (stripped_word, len, ':');
- g_message (" After UNAC stripping: '%s' (%s)",
- stripped_word, aux);
- g_free (aux);
- }
-#endif
- } else {
- stripped_word = NULL;
- }
-
-
- if (!stripped_word) {
- str = g_utf8_normalize (word,
- bytes,
- G_NORMALIZE_NFC);
- } else {
- str = g_utf8_normalize (stripped_word,
- len,
- G_NORMALIZE_NFC);
- g_free (stripped_word);
- }
-
- /* Log after normalization */
-#if TRACKER_PARSER_DEBUG_HEX
- {
- gchar *aux;
- aux = tracker_strhex (str, strlen ((gchar *)str), ':');
- g_message (" After NFC normalization: '%s' (%s)",
- str, aux);
- g_free (aux);
- }
-#endif
-
-
- if (!str) {
- return NULL;
- }
-
- if (!parser->enable_stemmer) {
- return str;
- }
-
- len = strlen (str);
-
- stem_word = tracker_language_stem_word (parser->language, str, len);
-
- if (stem_word) {
- g_free (str);
-
- return stem_word;
- }
- }
-
- return str;
-}
-
-#else
-
/* libunistring version of the word processor. */
gchar *
tracker_parser_process_word (TrackerParser *parser,
@@ -1035,11 +373,11 @@ tracker_parser_process_word (TrackerParser *parser,
/* UNAC stripping needed? */
if (do_strip) {
- guint32 stripped_word_length;
+ gsize stripped_word_length;
- stripped = strip_word (normalized,
- new_word_length,
- &stripped_word_length);
+ stripped = tracker_parser_unaccent_string (normalized,
+ new_word_length,
+ &stripped_word_length);
if (stripped) {
/* Log after UNAC stripping */
@@ -1096,9 +434,6 @@ tracker_parser_process_word (TrackerParser *parser,
return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
}
-#endif /* !HAVE_LIBUNISTRING */
-
-
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
@@ -1115,28 +450,16 @@ tracker_parser_next (TrackerParser *parser,
g_free (parser->word);
parser->word = NULL;
+ if (parser_next (parser, &byte_start, &byte_end)) {
+ str = parser->word;
+ }
-#ifndef HAVE_LIBUNISTRING
- if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
- if (pango_next (parser, &byte_start, &byte_end)) {
- str = parser->word;
- }
+ if (parser->enable_stop_words &&
+ tracker_language_is_stop_word (parser->language, str)) {
+ *stop_word = TRUE;
+ } else {
parser->word_position++;
-
*stop_word = FALSE;
- } else
-#endif /* !HAVE_LIBUNISTRING */
- {
- if (parser_next (parser, &byte_start, &byte_end)) {
- str = parser->word;
- }
-
- if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
- *stop_word = TRUE;
- } else {
- parser->word_position++;
- *stop_word = FALSE;
- }
}
*word_length = parser->word_length;
@@ -1147,5 +470,3 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
-
-
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
new file mode 100644
index 0000000..503de1b
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#ifdef HAVE_UNAC
+#include <unac.h>
+#endif
+
+#include "tracker-parser-utils.h"
+
+/**
+ * tracker_parser_unaccent_string:
+ * @str: UTF-8 text to strip accents from
+ * @ilength: length of @str, in bytes
+ * @p_olength: return location for the length, in bytes, of the result
+ *
+ * Strips accents from @str using UNAC. The text is converted to
+ * UTF-16BE, handed to unac_string_utf16() and the outcome converted
+ * back to UTF-8.
+ *
+ * @p_olength is always reset to 0 on entry and only updated when the
+ * whole operation succeeds. When built without HAVE_UNAC nothing is
+ * stripped and %NULL is returned.
+ *
+ * Returns: a newly allocated UTF-8 string which must be freed with
+ * g_free(), or %NULL on failure.
+ */
+gchar *
+tracker_parser_unaccent_string (const gchar *str,
+                                gsize        ilength,
+                                gsize       *p_olength)
+{
+#ifdef HAVE_UNAC
+	GError *error = NULL;
+	gchar *str_utf16;
+	gsize utf16_len, unaccented_len, final_len;
+	gchar *unaccented_str = NULL;
+	gchar *s;
+#endif
+
+	g_return_val_if_fail (str != NULL, NULL);
+	g_return_val_if_fail (p_olength != NULL, NULL);
+
+	/* Reset in every build configuration, so that callers never read
+	 * an uninitialized length after a NULL return.
+	 */
+	*p_olength = 0;
+
+#ifdef HAVE_UNAC
+	/* unac_string() does roughly the same as below, plus it
+	 * corrupts memory in 64bit systems, so avoid it for now.
+	 */
+	str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
+
+	/* Test the return value, not the GError: only a non-NULL result
+	 * guarantees that utf16_len was filled in.
+	 */
+	if (!str_utf16) {
+		g_warning ("Could not convert to UTF-16: %s",
+		           error ? error->message : "unknown error");
+		g_clear_error (&error);
+		return NULL;
+	}
+
+	if (unac_string_utf16 (str_utf16, utf16_len,
+	                       &unaccented_str, &unaccented_len) != 0) {
+		g_warning ("UNAC failed to strip accents");
+		g_free (str_utf16);
+		return NULL;
+	}
+
+	g_free (str_utf16);
+
+	s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
+	g_free (unaccented_str);
+
+	if (!s) {
+		g_warning ("Could not convert back to UTF-8: %s",
+		           error ? error->message : "unknown error");
+		g_clear_error (&error);
+		return NULL;
+	}
+
+	*p_olength = final_len;
+
+	return s;
+#else
+	return NULL;
+#endif
+}
+
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
new file mode 100644
index 0000000..4e869b1
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef __TRACKER_PARSER_UTILS_H__
+#define __TRACKER_PARSER_UTILS_H__
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+gchar *tracker_parser_unaccent_string (const gchar *str,
+ gsize ilength,
+ gsize *p_olength);
+
+G_END_DECLS
+
+#endif /* __TRACKER_PARSER_UTILS_H__ */
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 66535c9..e118ae7 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -35,10 +35,10 @@ TrackerParser *tracker_parser_new (TrackerLanguage *language,
void tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
- gboolean delimit_words,
- gboolean enable_stemmer,
- gboolean enable_stop_words,
- gboolean parse_reserved_words);
+ gboolean delimit_words,
+ gboolean enable_stemmer,
+ gboolean enable_stop_words,
+ gboolean parse_reserved_words);
const gchar * tracker_parser_next (TrackerParser *parser,
gint *position,
@@ -50,7 +50,7 @@ const gchar * tracker_parser_next (TrackerParser *parser,
gchar * tracker_parser_process_word (TrackerParser *parser,
const char *word,
gint length,
- gboolean do_strip);
+ gboolean do_strip);
void tracker_parser_free (TrackerParser *parser);
G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]