[tracker] libtracker-data: new 'tracker:unaccent' method
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] libtracker-data: new 'tracker:unaccent' method
- Date: Tue, 21 Jan 2014 11:19:10 +0000 (UTC)
commit b3fb86ea9cfc7062240aa8527dbc0a9ac897396e
Author: Aleksander Morgado <aleksander aleksander es>
Date: Wed Jan 15 15:52:07 2014 +0100
libtracker-data: new 'tracker:unaccent' method
https://bugzilla.gnome.org/show_bug.cgi?id=722254
This method allows removing combining diacritical marks (accents) from strings
used in SPARQL queries. It expects a single argument, the string to be
unaccented.
Note that the output string will also be NFKD-normalized.
Example:
1) First, insert a new element which has accents in the nie:title. In the
example we insert the word 'école' which in UTF-8 NFC looks like
"0xC3 0xA9 0x63 0x6F 0x6C 0x65":
$ tracker-sparql -u -q "
INSERT { <abc> a nie:InformationElement .
<abc> nie:title 'école' }"
2) Second, get hexdump of querying nie:title, we should get the original string
in UTF-8 and NFC normalization:
$ tracker-sparql -q "
SELECT ?title
WHERE { <abc> nie:title ?title }" | hexdump
0000000 6552 7573 746c 3a73 200a c320 63a9 6c6f
0000010 0a65 000a
0000013
Or, without the hexdump...
$ tracker-sparql -q "
SELECT ?title
WHERE { <abc> nie:title ?title }"
Results:
école
3) Last, apply the unaccenting method. The expected string should look like
"0×65 0×63 0x6F 0x6C 0×65" (i.e. without the combining diacritical mark):
$ tracker-sparql -q "
SELECT tracker:unaccent(?title)
WHERE { <abc> nie:title ?title }" | hexdump
0000000 6552 7573 746c 3a73 200a 6520 6f63 656c
0000010 0a0a
0000012
Or, without the hexdump...
$ tracker-sparql -q "
SELECT tracker:unaccent(?title)
WHERE { <abc> nie:title ?title }"
Results:
ecole
src/libtracker-data/tracker-db-interface-sqlite.c | 78 ++++++++++++++++++++
src/libtracker-data/tracker-sparql-expression.vala | 6 ++
src/libtracker-fts/tracker-parser-libicu.c | 29 +++++---
src/libtracker-fts/tracker-parser-libunistring.c | 26 ++++---
src/libtracker-fts/tracker-parser.h | 5 +
5 files changed, 123 insertions(+), 21 deletions(-)
---
diff --git a/src/libtracker-data/tracker-db-interface-sqlite.c
b/src/libtracker-data/tracker-db-interface-sqlite.c
index 2581d45..1fabf1b 100644
--- a/src/libtracker-data/tracker-db-interface-sqlite.c
+++ b/src/libtracker-data/tracker-db-interface-sqlite.c
@@ -33,6 +33,7 @@
#if HAVE_TRACKER_FTS
#include <libtracker-fts/tracker-fts.h>
+#include <libtracker-fts/tracker-parser.h>
#endif
#ifdef HAVE_LIBUNISTRING
@@ -659,6 +660,34 @@ function_sparql_normalize (sqlite3_context *context,
sqlite3_result_text16 (context, zOutput, written * 2, free);
}
+static void
+function_sparql_unaccent (sqlite3_context *context,
+ int argc,
+ sqlite3_value *argv[])
+{
+ const gchar *zInput;
+ gchar *zOutput;
+ gsize written = 0;
+ int nInput;
+
+ g_assert (argc == 1);
+
+ zInput = sqlite3_value_text (argv[0]);
+
+ if (!zInput) {
+ return;
+ }
+
+ nInput = sqlite3_value_bytes (argv[0]);
+
+ zOutput = u8_normalize (UNINORM_NFKD, zInput, nInput, NULL, &written);
+
+ /* Unaccenting is done in place */
+ tracker_parser_unaccent_nfkd_string (zOutput, &written);
+
+ sqlite3_result_text (context, zOutput, written, free);
+}
+
#elif HAVE_LIBICU
static void
@@ -805,6 +834,51 @@ function_sparql_normalize (sqlite3_context *context,
sqlite3_result_text16 (context, zOutput, -1, sqlite3_free);
}
+static void
+function_sparql_unaccent (sqlite3_context *context,
+ int argc,
+ sqlite3_value *argv[])
+{
+ const gchar *nfstr;
+ const uint16_t *zInput;
+ uint16_t *zOutput;
+ int nInput;
+ gsize nOutput;
+ UErrorCode status = U_ZERO_ERROR;
+
+ g_assert (argc == 1);
+
+ zInput = sqlite3_value_text16 (argv[0]);
+
+ if (!zInput) {
+ return;
+ }
+
+ nInput = sqlite3_value_bytes16 (argv[0]);
+
+ nOutput = nInput * 2 + 2;
+ zOutput = sqlite3_malloc (nOutput);
+
+ if (!zOutput) {
+ return;
+ }
+
+ nOutput = unorm_normalize (zInput, nInput/2, UNORM_NFKD, 0, zOutput, nOutput/2, &status);
+ if (!U_SUCCESS (status)) {
+ char zBuf[128];
+ sqlite3_snprintf (128, zBuf, "ICU error: unorm_normalize: %s", u_errorName (status));
+ zBuf[127] = '\0';
+ sqlite3_free (zOutput);
+ sqlite3_result_error (context, zBuf, -1);
+ return;
+ }
+
+ /* Unaccenting is done in place */
+ tracker_parser_unaccent_nfkd_string (zOutput, &nOutput);
+
+ sqlite3_result_text16 (context, zOutput, -1, sqlite3_free);
+}
+
#endif
static inline int
@@ -933,6 +1007,10 @@ open_database (TrackerDBInterface *db_interface,
db_interface, &function_sparql_normalize,
NULL, NULL);
+ sqlite3_create_function (db_interface->db, "SparqlUnaccent", 1, SQLITE_ANY,
+ db_interface, &function_sparql_unaccent,
+ NULL, NULL);
+
sqlite3_create_function (db_interface->db, "SparqlFormatTime", 1, SQLITE_ANY,
db_interface, &function_sparql_format_time,
NULL, NULL);
diff --git a/src/libtracker-data/tracker-sparql-expression.vala
b/src/libtracker-data/tracker-sparql-expression.vala
index a2981af..e11930f 100644
--- a/src/libtracker-data/tracker-sparql-expression.vala
+++ b/src/libtracker-data/tracker-sparql-expression.vala
@@ -496,6 +496,12 @@ class Tracker.Sparql.Expression : Object {
translate_expression_as_string (sql);
sql.append (")");
return PropertyType.STRING;
+ } else if (uri == TRACKER_NS + "unaccent") {
+ // conversion to string
+ sql.append ("SparqlUnaccent (");
+ translate_expression_as_string (sql);
+ sql.append (")");
+ return PropertyType.STRING;
} else if (uri == FN_NS + "contains") {
// fn:contains('A','B') => 'A' GLOB '*B*'
sql.append ("(");
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 69f75ed..b26722c 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -141,28 +141,35 @@ get_word_info (const UChar *word,
return TRUE;
}
-static gboolean
-parser_unaccent_nfkd_word (UChar *word,
- gsize *word_length)
+/* The input word in this method MUST be normalized in NFKD form,
+ * and given in UChars, where str_length is the number of UChars
+ * (not the number of bytes) */
+gboolean
+tracker_parser_unaccent_nfkd_string (gpointer str,
+ gsize *str_length)
{
- /* The input word in this method MUST be normalized in NFKD form */
+ UChar *word;
+ gsize word_length;
gsize i;
gsize j;
- g_return_val_if_fail (word, FALSE);
- g_return_val_if_fail (word_length, FALSE);
- g_return_val_if_fail (*word_length > 0, FALSE);
+ g_return_val_if_fail (str != NULL, FALSE);
+ g_return_val_if_fail (str_length != NULL, FALSE);
+ g_return_val_if_fail (*str_length > 0, FALSE);
+
+ word = (UChar *)str;
+ word_length = *str_length;
i = 0;
j = 0;
- while (i < *word_length) {
+ while (i < word_length) {
UChar32 unichar;
gint utf16_len; /* given in UChars */
gsize aux_i;
/* Get next character of the word as UCS4 */
aux_i = i;
- U16_NEXT (word, aux_i, *word_length, unichar);
+ U16_NEXT (word, aux_i, word_length, unichar);
utf16_len = aux_i - i;
/* Invalid UTF-16 character or end of original string. */
@@ -195,7 +202,7 @@ parser_unaccent_nfkd_word (UChar *word,
word[j] = (UChar) 0;
/* Set new output length */
- *word_length = j;
+ *str_length = j;
return TRUE;
}
@@ -331,7 +338,7 @@ process_word_uchar (TrackerParser *parser,
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
if (parser->enable_unaccent &&
type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
- parser_unaccent_nfkd_word (normalized_buffer, &new_word_length)) {
+ tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) {
/* Log after unaccenting */
tracker_parser_message_hex (" After UNAC",
(guint8 *) normalized_buffer,
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c
b/src/libtracker-fts/tracker-parser-libunistring.c
index 1824528..9de6e46 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -156,21 +156,27 @@ get_word_info (TrackerParser *parser,
return TRUE;
}
-static gboolean
-parser_unaccent_nfkd_word (gchar *word,
- gsize *word_length)
+/* The input word in this method MUST be normalized in NFKD form,
+ * and given in UTF-8, where str_length is the byte-length */
+gboolean
+tracker_parser_unaccent_nfkd_string (gpointer str,
+ gsize *str_length)
{
- /* The input word in this method MUST be normalized in NFKD form */
+ gchar *word;
+ gsize word_length;
gsize i;
gsize j;
- g_return_val_if_fail (word, FALSE);
- g_return_val_if_fail (word_length, FALSE);
- g_return_val_if_fail (*word_length > 0, FALSE);
+ g_return_val_if_fail (str != NULL, FALSE);
+ g_return_val_if_fail (str_length != NULL, FALSE);
+ g_return_val_if_fail (*str_length > 0, FALSE);
+
+ word = (gchar *)str;
+ word_length = *str_length;
i = 0;
j = 0;
- while (i < *word_length) {
+ while (i < word_length) {
ucs4_t unichar;
gint utf8_len;
@@ -207,7 +213,7 @@ parser_unaccent_nfkd_word (gchar *word,
word[j] = '\0';
/* Set new output length */
- *word_length = j;
+ *str_length = j;
return TRUE;
}
@@ -289,7 +295,7 @@ process_word_utf8 (TrackerParser *parser,
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
if (parser->enable_unaccent &&
type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
- parser_unaccent_nfkd_word (normalized, &new_word_length)) {
+ tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
/* Log after UNAC stripping */
tracker_parser_message_hex (" After UNAC stripping",
normalized, new_word_length);
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index cc12398..e6cb10e 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -50,6 +50,11 @@ const gchar * tracker_parser_next (TrackerParser *parser,
void tracker_parser_free (TrackerParser *parser);
+/* Other helper methods */
+
+gboolean tracker_parser_unaccent_nfkd_string (gpointer str,
+ gsize *str_length);
+
G_END_DECLS
#endif /* __LIBTRACKER_FTS_PARSER_H__ */
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]