[tracker/drop-unac] Fixes GB#619244: Use a custom unaccenting method instead of libunac
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/drop-unac] Fixes GB#619244: Use a custom unaccenting method instead of libunac
- Date: Mon, 7 Jun 2010 09:10:18 +0000 (UTC)
commit b85f3bd11cf7c4da72748abc8c80aaaa725303c7
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue Jun 1 17:17:27 2010 +0300
Fixes GB#619244: Use a custom unaccenting method instead of libunac
* Notes: Output strings are now always normalized using
compatibility decomposition (NFKD). This is actually the best
normalization form for text search.
* If unaccenting is requested, all combining diacritical marks
are removed from the string.
* This new method avoids the extra conversion to UTF-16, and
removes the marks in place without any extra allocation.
* libunac dependency is completely removed.
configure.ac | 34 ----
src/libtracker-fts/tracker-parser-glib.c | 121 +++++++++----
src/libtracker-fts/tracker-parser-libicu.c | 207 ++++++++++++++--------
src/libtracker-fts/tracker-parser-libunistring.c | 95 +++++++---
src/libtracker-fts/tracker-parser-utils.c | 149 ----------------
src/libtracker-fts/tracker-parser-utils.h | 25 ++--
tests/libtracker-fts/tracker-parser-test.c | 55 +++---
7 files changed, 324 insertions(+), 362 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 191f476..bf5709f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -142,7 +142,6 @@ HAL_REQUIRED=0.5
UPOWER_REQUIRED=0.9.0
GDKPIXBUF_REQUIRED=2.12.0
QUILL_REQUIRED=1.0.0
-UNAC_REQUIRED=1.0.0
POPPLER_REQUIRED=0.12.2
CAIRO_REQUIRED=1.0
GDK_REQUIRED=1.0
@@ -735,38 +734,6 @@ AC_SUBST(SQLITE3_CFLAGS)
AC_SUBST(SQLITE3_LIBS)
##################################################################
-# Enable UNAC support?
-##################################################################
-
-AC_ARG_ENABLE(unac,
- AS_HELP_STRING([--enable-unac],
- [enable UNAC support, required for stripping accents [[default=auto]]]),,
- [enable_unac=auto])
-
-if test "x$enable_unac" != "xno"; then
- PKG_CHECK_MODULES(UNAC,
- [unac >= $UNAC_REQUIRED],
- [have_unac=yes],
- [have_unac=no])
- AC_SUBST(UNAC_LIBS)
- AC_SUBST(UNAC_CFLAGS)
-
- if test "x$have_unac" = "xyes"; then
- AC_DEFINE(HAVE_UNAC, [], [Define if we have UNAC for accent stripping])
- fi
-else
- have_unac="no (disabled)"
-fi
-
-if test "x$enable_unac" = "xyes"; then
- if test "x$have_unac" != "xyes"; then
- AC_MSG_ERROR([Couldn't find UNAC >= $UNAC_REQUIRED.])
- fi
-fi
-
-AM_CONDITIONAL(HAVE_UNAC, test "x$have_unac" = "xyes")
-
-##################################################################
# Enable Gnome Keyring support to store credentials (for web miners)
##################################################################
@@ -1963,7 +1930,6 @@ Build Configuration:
Support for HAL: $have_hal
Support for UPower: $have_upower
Support for file monitoring: $have_file_monitor
- Support for accent stripping (unac): $have_unac
Support for Cyrillic languages (enca): $have_enca
Support for network status detection: $have_network_manager
Unicode support library: $with_unicode_support
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index d521c9c..06858f8 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -42,9 +42,6 @@
#define IS_ASCII_IGNORE(c) ((c) <= 0x002C)
#define IS_HYPHEN(c) ((c) == 0x002D)
#define IS_UNDERSCORE(c) ((c) == 0x005F)
-#define IS_NEWLINE(c) ((c) == 0x000D)
-#define IS_O(c) ((c) == 0x006F)
-#define IS_R(c) ((c) == 0x0072)
typedef enum {
TRACKER_PARSER_WORD_ASCII_HIGHER,
@@ -162,64 +159,108 @@ get_encoding (const gchar *txt)
}
+static gboolean
+tracker_parser_unaccent_nfkd_word (gchar *word,
+ gsize *word_length)
+{
+ /* The input word in this method MUST be normalized in NFKD form */
+ gsize i;
+ gsize j;
+
+ g_return_val_if_fail (word, FALSE);
+ g_return_val_if_fail (word_length, FALSE);
+ g_return_val_if_fail (*word_length > 0, FALSE);
+
+ i = 0;
+ j = 0;
+ while (i < *word_length) {
+ gunichar unichar;
+ gchar *next_utf8;
+ gint utf8_len;
+
+ /* Get next character of the word as UCS4 */
+ unichar = g_utf8_get_char_validated (&word[i], -1);
+
+ /* Invalid UTF-8 character or end of original string. */
+ if (unichar == (gunichar)-1 ||
+ unichar == (gunichar)-2) {
+ break;
+ }
+
+ /* Find next UTF-8 character */
+ next_utf8 = g_utf8_next_char (&word[i]);
+ utf8_len = next_utf8 - &word[i];
+
+ /* If the given unichar is a combining diacritical mark,
+ * just update the original index, not the output one */
+ if (IS_CDM_UCS4 ((guint32)unichar)) {
+ i += utf8_len;
+ continue;
+ }
+
+ /* If already found a previous combining
+ * diacritical mark, indexes are different so
+ * need to copy characters. As output and input
+ * buffers may overlap, need to use memmove
+ * instead of memcpy */
+ if (i != j) {
+ memmove (&word[j], &word[i], utf8_len);
+ }
+
+ /* Update both indexes */
+ i += utf8_len;
+ j += utf8_len;
+ }
+
+ /* Force proper string end */
+ word[j] = '\0';
+ /* Set new output length */
+ *word_length = j;
+
+ return TRUE;
+}
+
static gchar *
process_word_utf8 (TrackerParser *parser,
- const gchar *word,
- gint length,
+ const gchar *word,
+ gint length,
gboolean do_strip,
gboolean *stop_word)
{
gchar *stem_word;
gchar *str;
- gchar *stripped_word;
gsize bytes, len;
g_return_val_if_fail (parser != NULL, NULL);
g_return_val_if_fail (word != NULL, NULL);
str = NULL;
- stripped_word = NULL;
if (word) {
- if (length == -1) {
- bytes = strlen (word);
- } else {
- bytes = length;
- }
+ bytes = length == -1 ? strlen (word) : length;
/* Log original word */
tracker_parser_message_hex ("ORIGINAL word",
word, bytes);
- if (parser->enable_unaccent && do_strip) {
- stripped_word = tracker_parser_unaccent_utf8_word (word,
- bytes,
- &len);
-
- /* Log after UNAC stripping */
- tracker_parser_message_hex (" After UNAC stripping",
- stripped_word, len);
- } else {
- stripped_word = NULL;
+ str = g_utf8_normalize (word, bytes, G_NORMALIZE_NFKD);
+ if (!str) {
+ return NULL;
}
- if (!stripped_word) {
- str = g_utf8_normalize (word,
- bytes,
- G_NORMALIZE_NFC);
- } else {
- str = g_utf8_normalize (stripped_word,
- len,
- G_NORMALIZE_NFC);
- g_free (stripped_word);
- }
+ /* Update string length */
+ bytes = strlen (str);
/* Log after normalization */
- tracker_parser_message_hex (" After NFC normalization",
- str, strlen ((gchar *)str));
+ tracker_parser_message_hex (" After NFKD normalization",
+ str, bytes);
- if (!str) {
- return NULL;
+ if (parser->enable_unaccent &&
+ do_strip &&
+ tracker_parser_unaccent_nfkd_word (str, &bytes)) {
+ /* Log after UNAC stripping */
+ tracker_parser_message_hex (" After UNAC stripping",
+ str, bytes);
}
/* Check if stop word */
@@ -232,9 +273,9 @@ process_word_utf8 (TrackerParser *parser,
return str;
}
- len = strlen (str);
-
- stem_word = tracker_language_stem_word (parser->language, str, len);
+ stem_word = tracker_language_stem_word (parser->language,
+ str,
+ bytes);
if (stem_word) {
g_free (str);
@@ -414,7 +455,7 @@ parser_next (TrackerParser *parser,
case TRACKER_PARSER_WORD_ASCII_HIGHER:
c += 32;
- /* Fall through */
+ /* Fall through */
case TRACKER_PARSER_WORD_ASCII_LOWER:
case TRACKER_PARSER_WORD_HYPHEN:
case TRACKER_PARSER_WORD_UNDERSCORE:
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 55151ec..f69fb2a 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -138,6 +138,109 @@ get_word_info (const UChar *word,
return TRUE;
}
+static gboolean
+tracker_parser_unaccent_nfkd_word (UChar *word,
+ gsize *word_length)
+{
+ /* The input word in this method MUST be normalized in NFKD form */
+ gsize i;
+ gsize j;
+
+ g_return_val_if_fail (word, FALSE);
+ g_return_val_if_fail (word_length, FALSE);
+ g_return_val_if_fail (*word_length > 0, FALSE);
+
+ i = 0;
+ j = 0;
+ while (i < *word_length) {
+ UChar32 unichar;
+ gint utf16_len; /* given in UChars */
+ gsize aux_i;
+
+ /* Get next character of the word as UCS4 */
+ aux_i = i;
+ U16_NEXT (word, aux_i, *word_length, unichar);
+ utf16_len = aux_i - i;
+
+ /* Invalid UTF-16 character or end of original string. */
+ if (utf16_len <= 0) {
+ break;
+ }
+
+ /* If the given unichar is a combining diacritical mark,
+ * just update the original index, not the output one */
+ if (IS_CDM_UCS4 ((guint32)unichar)) {
+ i += utf16_len;
+ continue;
+ }
+
+ /* If already found a previous combining
+ * diacritical mark, indexes are different so
+ * need to copy characters. As output and input
+ * buffers may overlap, need to use memmove
+ * instead of memcpy */
+ if (i != j) {
+ memmove (&word[j], &word[i], sizeof (UChar) * utf16_len);
+ }
+
+ /* Update both indexes */
+ i += utf16_len;
+ j += utf16_len;
+ }
+
+ /* Force proper string end */
+ word[j] = (UChar)0;
+ /* Set new output length */
+ *word_length = j;
+
+ return TRUE;
+}
+
+static gchar *
+convert_UChar_to_utf8 (const UChar *word,
+ gsize uchar_len,
+ gsize *utf8_len)
+{
+ gchar *utf8_str;
+ UErrorCode icu_error = U_ZERO_ERROR;
+ UConverter *converter;
+ gsize new_utf8_len;
+
+ g_return_val_if_fail (word, NULL);
+ g_return_val_if_fail (utf8_len, NULL);
+
+ /* Open converter UChar to UTF-8 */
+ converter = ucnv_open ("UTF-8", &icu_error);
+ if (!converter) {
+ g_warning ("Cannot open UTF-8 converter: '%s'",
+ U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+ return NULL;
+ }
+
+ /* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
+ * in UTF-8. */
+ utf8_str = g_malloc (2 * uchar_len * sizeof (UChar) + 1);
+
+ /* Convert from UChar to UTF-8 (NIL-terminated) */
+ new_utf8_len = ucnv_fromUChars (converter,
+ utf8_str,
+ 2 * uchar_len * sizeof (UChar) + 1,
+ word,
+ uchar_len,
+ &icu_error);
+ if (U_FAILURE (icu_error)) {
+ g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+ u_errorName (icu_error));
+ g_free (utf8_str);
+ ucnv_close (converter);
+ return NULL;
+ }
+
+ *utf8_len = new_utf8_len;
+ ucnv_close (converter);
+ return utf8_str;
+}
+
static gchar *
process_word_uchar (TrackerParser *parser,
const UChar *word,
@@ -148,13 +251,12 @@ process_word_uchar (TrackerParser *parser,
UErrorCode error = U_ZERO_ERROR;
UChar normalized_buffer [WORD_BUFFER_LENGTH];
gchar *utf8_str = NULL;
- gchar *stemmed = NULL;
- size_t new_word_length;
+ gsize new_word_length;
/* Log original word */
tracker_parser_message_hex ("ORIGINAL word",
(guint8 *)word,
- length * sizeof (UChar));
+ length * sizeof (UChar));
if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
@@ -178,12 +280,12 @@ process_word_uchar (TrackerParser *parser,
/* Log after casefolding */
tracker_parser_message_hex (" After Casefolding",
(guint8 *)casefolded_buffer,
- new_word_length * sizeof (UChar));
+ new_word_length * sizeof (UChar));
- /* NFC normalization... */
+ /* NFKD normalization... */
new_word_length = unorm_normalize (casefolded_buffer,
new_word_length,
- UNORM_NFC,
+ UNORM_NFKD,
0,
normalized_buffer,
WORD_BUFFER_LENGTH,
@@ -200,7 +302,7 @@ process_word_uchar (TrackerParser *parser,
/* Log after casefolding */
tracker_parser_message_hex (" After Normalization",
(guint8 *)normalized_buffer,
- new_word_length * sizeof (UChar));
+ new_word_length * sizeof (UChar));
} else {
/* For ASCII-only, just tolower() each character */
new_word_length = u_strToLower (normalized_buffer,
@@ -218,67 +320,29 @@ process_word_uchar (TrackerParser *parser,
/* Log after casefolding */
tracker_parser_message_hex (" After lowercase",
(guint8 *)normalized_buffer,
- new_word_length * sizeof (UChar));
+ new_word_length * sizeof (UChar));
}
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
- gsize stripped_word_length;
-
- /* Get unaccented string in UTF-8 */
- utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
- new_word_length,
- &stripped_word_length);
- if (utf8_str) {
- new_word_length = stripped_word_length;
-
- /* Log after unaccenting */
- tracker_parser_message_hex (" After UNAC",
- utf8_str,
- new_word_length);
- }
+ if (parser->enable_unaccent &&
+ type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
+ tracker_parser_unaccent_nfkd_word (normalized_buffer,
+ &new_word_length)) {
+ /* Log after unaccenting */
+ tracker_parser_message_hex (" After UNAC",
+ (guint8 *)normalized_buffer,
+ new_word_length * sizeof (UChar));
}
- /* If stripping failed or not needed, convert to UTF-8 */
- if (!utf8_str) {
- UErrorCode icu_error = U_ZERO_ERROR;
- UConverter *converter;
- gsize utf8_len;
-
- /* Open converter UChar to UTF-16BE */
- converter = ucnv_open ("UTF-8", &icu_error);
- if (!converter) {
- g_warning ("Cannot open UTF-8 converter: '%s'",
- U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
- return NULL;
- }
- /* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
- * in UTF-8. */
- utf8_str = g_malloc (2 * new_word_length * sizeof (UChar) + 1);
-
- /* Convert from UChar to UTF-8 (NIL-terminated) */
- utf8_len = ucnv_fromUChars (converter,
- utf8_str,
- 2 * new_word_length * sizeof (UChar) + 1,
- normalized_buffer,
- new_word_length,
- &icu_error);
- if (U_FAILURE (icu_error)) {
- g_warning ("Cannot convert from UChar to UTF-8: '%s'",
- u_errorName (icu_error));
- g_free (utf8_str);
- ucnv_close (converter);
- return NULL;
- }
+ /* Finally, convert to UTF-8 */
+ utf8_str = convert_UChar_to_utf8 (normalized_buffer,
+ new_word_length,
+ &new_word_length);
- new_word_length = utf8_len;
- ucnv_close (converter);
-
- /* Log after unaccenting */
- tracker_parser_message_hex (" After UTF8 conversion",
- utf8_str,
- new_word_length);
- }
+ /* Log after UTF-8 conversion */
+ tracker_parser_message_hex (" After UTF8 conversion",
+ utf8_str,
+ new_word_length);
/* Check if stop word */
if (parser->ignore_stop_words) {
@@ -287,21 +351,24 @@ process_word_uchar (TrackerParser *parser,
}
/* Stemming needed? */
- if (parser->enable_stemmer) {
+ if (utf8_str &&
+ parser->enable_stemmer) {
+ gchar *stemmed;
+
/* Input for stemmer ALWAYS in UTF-8, as well as output */
stemmed = tracker_language_stem_word (parser->language,
utf8_str,
new_word_length);
/* Log after stemming */
- tracker_parser_message_hex (" After stemming",
+ tracker_parser_message_hex (" After stemming",
stemmed, strlen (stemmed));
- }
- /* If stemmed wanted and succeeded, free previous and return it */
- if (stemmed) {
- g_free (utf8_str);
- return stemmed;
+ /* If stemmed wanted and succeeded, free previous and return it */
+ if (stemmed) {
+ g_free (utf8_str);
+ return stemmed;
+ }
}
return utf8_str;
@@ -510,7 +577,7 @@ tracker_parser_reset (TrackerParser *parser,
if (!converter) {
g_warning ("Cannot open UTF-8 converter: '%s'",
U_FAILURE (error) ? u_errorName (error) : "none");
- return;
+ return;
}
/* Allocate UChars and offsets buffers */
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 07f638d..240ea44 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -139,6 +139,61 @@ get_word_info (TrackerParser *parser,
return TRUE;
}
+static gboolean
+tracker_parser_unaccent_nfkd_word (gchar *word,
+ gsize *word_length)
+{
+ /* The input word in this method MUST be normalized in NFKD form */
+ gsize i;
+ gsize j;
+
+ g_return_val_if_fail (word, FALSE);
+ g_return_val_if_fail (word_length, FALSE);
+ g_return_val_if_fail (*word_length > 0, FALSE);
+
+ i = 0;
+ j = 0;
+ while (i < *word_length) {
+ ucs4_t unichar;
+ gint utf8_len;
+
+ /* Get next character of the word as UCS4 */
+ utf8_len = u8_strmbtouc (&unichar, &word[i]);
+
+ /* Invalid UTF-8 character or end of original string. */
+ if (utf8_len <= 0) {
+ break;
+ }
+
+ /* If the given unichar is a combining diacritical mark,
+ * just update the original index, not the output one */
+ if (IS_CDM_UCS4 ((guint32)unichar)) {
+ i += utf8_len;
+ continue;
+ }
+
+ /* If already found a previous combining
+ * diacritical mark, indexes are different so
+ * need to copy characters. As output and input
+ * buffers may overlap, need to use memmove
+ * instead of memcpy */
+ if (i != j) {
+ memmove (&word[j], &word[i], utf8_len);
+ }
+
+ /* Update both indexes */
+ i += utf8_len;
+ j += utf8_len;
+ }
+
+ /* Force proper string end */
+ word[j] = '\0';
+ /* Set new output length */
+ *word_length = j;
+
+ return TRUE;
+}
+
static gchar *
process_word_utf8 (TrackerParser *parser,
const gchar *word,
@@ -148,7 +203,6 @@ process_word_utf8 (TrackerParser *parser,
{
gchar word_buffer [WORD_BUFFER_LENGTH];
gchar *normalized = NULL;
- gchar *stripped = NULL;
gchar *stemmed = NULL;
size_t new_word_length;
@@ -171,13 +225,13 @@ process_word_utf8 (TrackerParser *parser,
/* Leave space for last NIL */
new_word_length = WORD_BUFFER_LENGTH - 1;
- /* Casefold and NFC normalization in output.
+ /* Casefold and NFKD normalization in output.
* NOTE: if the output buffer is not big enough, u8_casefold will
* return a newly-allocated buffer. */
normalized = u8_casefold ((const uint8_t *)word,
length,
uc_locale_language (),
- UNINORM_NFC,
+ UNINORM_NFKD,
word_buffer,
&new_word_length);
@@ -192,7 +246,7 @@ process_word_utf8 (TrackerParser *parser,
}
/* Log after Normalization */
- tracker_parser_message_hex (" After Casefolding and NFC normalization",
+ tracker_parser_message_hex (" After Casefolding and NFKD normalization",
normalized, new_word_length);
} else {
/* For ASCII-only, just tolower() each character */
@@ -215,31 +269,25 @@ process_word_utf8 (TrackerParser *parser,
normalized[new_word_length] = '\0';
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
- gsize stripped_word_length;
-
- stripped = tracker_parser_unaccent_utf8_word (normalized,
- new_word_length,
- &stripped_word_length);
-
- if (stripped) {
- /* Log after UNAC stripping */
- tracker_parser_message_hex (" After UNAC stripping",
- stripped, stripped_word_length);
- new_word_length = stripped_word_length;
- }
+ if (parser->enable_unaccent &&
+ type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
+ tracker_parser_unaccent_nfkd_word (normalized,
+ &new_word_length)) {
+ /* Log after UNAC stripping */
+ tracker_parser_message_hex (" After UNAC stripping",
+ normalized, new_word_length);
}
/* Check if stop word */
if (parser->ignore_stop_words) {
*stop_word = tracker_language_is_stop_word (parser->language,
- stripped ? stripped : normalized);
+ normalized);
}
/* Stemming needed? */
if (parser->enable_stemmer) {
stemmed = tracker_language_stem_word (parser->language,
- stripped ? stripped : normalized,
+ normalized,
new_word_length);
/* Log after stemming */
@@ -249,21 +297,12 @@ process_word_utf8 (TrackerParser *parser,
/* If stemmed wanted and succeeded, free previous and return it */
if (stemmed) {
- g_free (stripped);
if (normalized != word_buffer) {
g_free (normalized);
}
return stemmed;
}
- /* If stripped wanted and succeeded, free previous and return it */
- if (stripped) {
- if (normalized != word_buffer) {
- g_free (normalized);
- }
- return stripped;
- }
-
/* It may be the case that no stripping and no stemming was needed, and
* that the output buffer in stack was enough for case-folding and
* normalization. In this case, need to strdup() the string to return it */
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
index 9c24bd0..76a8ecb 100644
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -21,158 +21,9 @@
#include <string.h>
-#ifdef HAVE_UNAC
-#include <unac.h>
-#endif
-
-#ifdef HAVE_LIBICU
-#include <unicode/utypes.h>
-#include <unicode/ucnv.h>
-#endif
-
#include <libtracker-common/tracker-common.h>
#include "tracker-parser-utils.h"
-
-/* Output is always UTF-8. */
-gchar *
-tracker_parser_unaccent_utf16be_word (const gchar *string,
- gsize ilength,
- gsize *p_olength)
-{
-#ifdef HAVE_UNAC
- GError *error = NULL;
- gchar *unaccented_str = NULL;
- gchar *str_utf8 = NULL;
- gsize unaccented_len;
- gsize utf8_len;
-
- *p_olength = 0;
-
- if (unac_string_utf16 (string, ilength,
- &unaccented_str, &unaccented_len) != 0) {
- g_warning ("UNAC failed to strip accents");
- return NULL;
- }
-
- /* Convert from UTF-16BE to UTF-8 */
- str_utf8 = g_convert (unaccented_str,
- unaccented_len,
- "UTF-8",
- "UTF-16BE",
- NULL,
- &utf8_len,
- &error);
- g_free (unaccented_str);
-
- if (error) {
- g_warning ("Could not convert back to UTF-8: %s",
- error->message);
- g_error_free (error);
- return NULL;
- }
-
- *p_olength = utf8_len;
- return str_utf8;
-#else
- return NULL;
-#endif
-}
-
-
-#ifdef HAVE_LIBICU
-/* NOTE: Internally, UChars are UTF-16, but conversion needed just in case,
- * as libunac needs UTF-16BE. Output is always UTF-8.*/
-gchar *
-tracker_parser_unaccent_UChar_word (const UChar *string,
- gsize ilength,
- gsize *p_olength)
-{
-#ifdef HAVE_UNAC
- UErrorCode icu_error = U_ZERO_ERROR;
- UConverter *converter;
- gchar *str_utf16;
- gchar *str_utf8 = NULL;
- gsize utf16_len;
-
- *p_olength = 0;
-
- /* Open converter UChar to UTF-16BE */
- converter = ucnv_open ("UTF-16BE", &icu_error);
- if (!converter) {
- g_warning ("Cannot open UTF-16BE converter: '%s'",
- U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
- return NULL;
- }
-
- /* Allocate buffer, same size as input string.
- * Note that ilength specifies number of UChars not
- * number of bytes */
- str_utf16 = g_malloc ((ilength + 1) * 2);
-
- /* Convert from UChar to UTF-16BE */
- utf16_len = ucnv_fromUChars (converter,
- str_utf16,
- (ilength + 1) * 2,
- string,
- ilength,
- &icu_error);
- if (U_FAILURE (icu_error)) {
- g_warning ("Cannot convert from UChar to UTF-16BE: '%s' "
- "(ilength: %" G_GSIZE_FORMAT ")",
- u_errorName (icu_error),
- ilength);
- } else {
- str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
- utf16_len,
- p_olength);
- }
- ucnv_close (converter);
- g_free (str_utf16);
- return str_utf8;
-#else
- return NULL;
-#endif
-}
-#endif
-
-gchar *
-tracker_parser_unaccent_utf8_word (const gchar *str,
- gsize ilength,
- gsize *p_olength)
-{
-#ifdef HAVE_UNAC
- GError *error = NULL;
- gchar *str_utf16 = NULL;
- gchar *str_utf8 = NULL;
- gsize utf16_len;
-
- *p_olength = 0;
-
- /* unac_string() does roughly the same than below, plus it
- * corrupts memory in 64bit systems, so avoid it for now.
- */
- str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
- if (error) {
- g_warning ("Could not convert to UTF-16: %s", error->message);
- g_error_free (error);
- return NULL;
- } else {
-
- str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
- utf16_len,
- p_olength);
- }
-
- g_free (str_utf16);
- return str_utf8;
-#else
- return NULL;
-#endif
-}
-
-
/*
* Definition of the possible reserved words.
* Length of word is explicitly given to avoid strlen() calls
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index 50805c1..f3f884e 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -38,22 +38,19 @@ G_BEGIN_DECLS
((c) >= 0x4E00 && (c) <= 0x9FA5) || \
((c) >= 0x20000 && (c) <= 0x2A6D6))
+/* ASCII underscore? */
#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
-
-gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
- gsize ilength,
- gsize *p_olength);
-
-gchar *tracker_parser_unaccent_utf8_word (const gchar *string,
- gsize ilength,
- gsize *p_olength);
-
-#ifdef HAVE_LIBICU
-gchar *tracker_parser_unaccent_UChar_word (const UChar *string,
- gsize ilength,
- gsize *p_olength);
-#endif
+/* Combining diacritical mark?
+ * Basic range: [0x0300,0x036F]
+ * Supplement: [0x1DC0,0x1DFF]
+ * For Symbols: [0x20D0,0x20FF]
+ * Half marks: [0xFE20,0xFE2F]
+ */
+#define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F) || \
+ ((c) >= 0x1DC0 && (c) <= 0x1DFF) || \
+ ((c) >= 0x20D0 && (c) <= 0x20FF) || \
+ ((c) >= 0xFE20 && (c) <= 0xFE2F))
gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 813ce38..5390989 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -170,6 +170,7 @@ expected_word_check (TrackerParserTestFixture *fixture,
{
const TestDataExpectedWord *testdata = data;
const gchar *word;
+ gchar *expected_nfkd;
gint position;
gint byte_offset_start;
gint byte_offset_end;
@@ -195,8 +196,15 @@ expected_word_check (TrackerParserTestFixture *fixture,
&stop_word,
&word_length);
+ /* Expected word MUST always be in NFKD normalization */
+ expected_nfkd = g_utf8_normalize (testdata->expected,
+ -1,
+ G_NORMALIZE_NFKD);
+
/* Check if input is same as expected */
- g_assert_cmpstr (word, == , testdata->expected);
+ g_assert_cmpstr (word, == , expected_nfkd);
+
+ g_free (expected_nfkd);
}
/* -------------- STOP WORD TESTS ----------------- */
@@ -247,7 +255,6 @@ stop_word_check (TrackerParserTestFixture *fixture,
/* -------------- LIST OF TESTS ----------------- */
-#ifdef HAVE_UNAC
/* Normalization-related tests (unaccenting) */
static const TestDataExpectedWord test_data_normalization[] = {
{ "école", "ecole", FALSE, TRUE },
@@ -263,36 +270,30 @@ static const TestDataExpectedWord test_data_normalization[] = {
/* Unaccenting-related tests */
static const TestDataExpectedWord test_data_unaccent[] = {
- { "Murciélago", "murcielago", FALSE, TRUE },
- { "camión", "camion", FALSE, TRUE },
- { "desagüe", "desague", FALSE, TRUE },
+ { "Murciélago", "murcielago", FALSE, TRUE },
+ { "camión", "camion", FALSE, TRUE },
+ { "desagüe", "desague", FALSE, TRUE },
+ { "Ὰ", "α", FALSE, TRUE }, /* greek capital alpha with U+0300, composed */
+ { "ὰ", "α", FALSE, TRUE }, /* greek small alpha with U+0300, composed */
+ { "Ὶ", "ι", FALSE, TRUE }, /* greek capital iota with U+0300, composed */
+ { "ὶ", "ι", FALSE, TRUE }, /* greek small iota with U+0300, composed */
+ { "Ὼ", "ω", FALSE, TRUE }, /* greek capital omega with U+0300, composed */
+ { "ὼ", "ω", FALSE, TRUE }, /* greek small omega with U+0300, composed */
+#ifdef FULL_UNICODE_TESTS /* glib/pango does not like NFD strings */
+ { "Α" "\xCC\x80", "α", FALSE, TRUE }, /* capital alpha with U+0300, decomposed */
+ { "α" "\xCC\x80", "α", FALSE, TRUE }, /* small alpha with U+0300, decomposed */
+ { "Ι" "\xCC\x80", "ι", FALSE, TRUE }, /* capital iota with U+0300, decomposed */
+ { "ι" "\xCC\x80", "ι", FALSE, TRUE }, /* small iota with U+0300, decomposed */
+ { "Ω" "\xCC\x80", "ω", FALSE, TRUE }, /* capital omega with U+0300, decomposed */
+ { "ω" "\xCC\x80", "ω", FALSE, TRUE }, /* small omega with U+0300, decomposed */
+ { "aN" "\xCD\xA1" "Ga", "anga", FALSE, TRUE }, /* U+0361 affects two characters */
+ { "aNG" "\xCD\xA1" "a", "anga", FALSE, TRUE }, /* U+0361 affects two characters */
+#endif
{ "Murciélago", "murciélago", FALSE, FALSE },
{ "camión", "camión", FALSE, FALSE },
{ "desagüe", "desagüe", FALSE, FALSE },
{ NULL, NULL, FALSE, FALSE }
};
-#else
-/* Normalization-related tests (not unaccenting) */
-static const TestDataExpectedWord test_data_normalization[] = {
- { "école", "école", FALSE, FALSE },
- { "ÉCOLE", "école", FALSE, FALSE },
- { "École", "école", FALSE, FALSE },
-#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
- { "e" "\xCC\x81" "cole", "école", FALSE, FALSE },
- { "E" "\xCC\x81" "COLE", "école", FALSE, FALSE },
- { "E" "\xCC\x81" "cole", "école", FALSE, FALSE },
-#endif
- { "école", "école", FALSE, TRUE },
- { "ÉCOLE", "école", FALSE, TRUE },
- { "École", "école", FALSE, TRUE },
-#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
- { "e" "\xCC\x81" "cole", "école", FALSE, TRUE },
- { "E" "\xCC\x81" "COLE", "école", FALSE, TRUE },
- { "E" "\xCC\x81" "cole", "école", FALSE, TRUE },
-#endif
- { NULL, NULL, FALSE, FALSE }
-};
-#endif /* !HAVE_UNAC */
/* Stemming-related tests */
static const TestDataExpectedWord test_data_stemming[] = {
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]