[tracker] libtracker-data: new 'tracker:unaccent' method



commit b3fb86ea9cfc7062240aa8527dbc0a9ac897396e
Author: Aleksander Morgado <aleksander aleksander es>
Date:   Wed Jan 15 15:52:07 2014 +0100

    libtracker-data: new 'tracker:unaccent' method
    
    https://bugzilla.gnome.org/show_bug.cgi?id=722254
    
    This method allows removing combining diacritical marks (accents) from strings
    used in SPARQL queries. It expects a single argument, the string to be
    unaccented.
    
    Note that the output string will also be NFKD-normalized.
    
    Example:
    
    1) First, insert a new element which has accents in the nie:title. In the
    example we insert the word 'école' which in UTF-8 NFC looks like
    "0xC3 0xA9 0x63 0x6F 0x6C 0x65":
    
        $ tracker-sparql -u -q "
            INSERT { <abc> a         nie:InformationElement .
                     <abc> nie:title 'école' }"
    
    2) Second, hexdump the result of querying nie:title; we should get the original
    string in UTF-8 with NFC normalization:
    
        $ tracker-sparql -q "
            SELECT ?title
            WHERE { <abc> nie:title ?title }" | hexdump
        0000000 6552 7573 746c 3a73 200a c320 63a9 6c6f
        0000010 0a65 000a
        0000013
    
    Or, without the hexdump...
    
        $ tracker-sparql -q "
            SELECT ?title
            WHERE { <abc> nie:title ?title }"
        Results:
          école
    
    3) Last, apply the unaccenting method. The expected string should look like
    "0x65 0x63 0x6F 0x6C 0x65" (i.e. without the combining diacritical mark):
    
        $ tracker-sparql -q "
            SELECT tracker:unaccent(?title)
            WHERE { <abc> nie:title ?title }" | hexdump
        0000000 6552 7573 746c 3a73 200a 6520 6f63 656c
        0000010 0a0a
        0000012
    
    Or, without the hexdump...
    
        $ tracker-sparql -q "
            SELECT tracker:unaccent(?title)
            WHERE { <abc> nie:title ?title }"
        Results:
          ecole

 src/libtracker-data/tracker-db-interface-sqlite.c  |   78 ++++++++++++++++++++
 src/libtracker-data/tracker-sparql-expression.vala |    6 ++
 src/libtracker-fts/tracker-parser-libicu.c         |   29 +++++---
 src/libtracker-fts/tracker-parser-libunistring.c   |   26 ++++---
 src/libtracker-fts/tracker-parser.h                |    5 +
 5 files changed, 123 insertions(+), 21 deletions(-)
---
diff --git a/src/libtracker-data/tracker-db-interface-sqlite.c b/src/libtracker-data/tracker-db-interface-sqlite.c
index 2581d45..1fabf1b 100644
--- a/src/libtracker-data/tracker-db-interface-sqlite.c
+++ b/src/libtracker-data/tracker-db-interface-sqlite.c
@@ -33,6 +33,7 @@
 
 #if HAVE_TRACKER_FTS
 #include <libtracker-fts/tracker-fts.h>
+#include <libtracker-fts/tracker-parser.h>
 #endif
 
 #ifdef HAVE_LIBUNISTRING
@@ -659,6 +660,34 @@ function_sparql_normalize (sqlite3_context *context,
        sqlite3_result_text16 (context, zOutput, written * 2, free);
 }
 
+/* SparqlUnaccent(str): NFKD-normalizes str and strips its accents (UTF-8) */
+static void
+function_sparql_unaccent (sqlite3_context *context,
+                          int              argc,
+                          sqlite3_value   *argv[])
+{
+       const gchar *zInput;
+       gchar *zNormalized, *zOutput;
+       gsize written = 0;
+
+       g_assert (argc == 1);
+       zInput = sqlite3_value_text (argv[0]);
+       if (!zInput) {
+               return;
+       }
+       zNormalized = u8_normalize (UNINORM_NFKD, zInput, sqlite3_value_bytes (argv[0]), NULL, &written);
+       zOutput = zNormalized ? realloc (zNormalized, written + 1) : NULL; /* room for trailing NUL */
+       if (!zOutput) {
+               free (zNormalized);
+               sqlite3_result_error_nomem (context);
+               return;
+       }
+       if (written > 0) { /* the in-place unaccent helper rejects empty strings */
+               tracker_parser_unaccent_nfkd_string (zOutput, &written);
+       }
+       sqlite3_result_text (context, zOutput, written, free);
+}
+
 #elif HAVE_LIBICU
 
 static void
@@ -805,6 +834,51 @@ function_sparql_normalize (sqlite3_context *context,
        sqlite3_result_text16 (context, zOutput, -1, sqlite3_free);
 }
 
+/* SparqlUnaccent(str): NFKD-normalizes str and strips its accents (UTF-16).
+ * A NULL argument leaves the result unset; ICU failures become SQL errors. */
+static void
+function_sparql_unaccent (sqlite3_context *context,
+                          int              argc,
+                          sqlite3_value   *argv[])
+{
+       const uint16_t *zInput;
+       uint16_t *zOutput;
+       int nInput;
+       gsize nOutput;
+       UErrorCode status = U_ZERO_ERROR;
+
+       g_assert (argc == 1);
+       zInput = sqlite3_value_text16 (argv[0]);
+       if (!zInput) {
+               return;
+       }
+       nInput = sqlite3_value_bytes16 (argv[0]);
+
+       /* Allocate one UChar more than the capacity given to ICU, so that the
+        * terminator stays in bounds even when the output fills that capacity. */
+       nOutput = nInput * 2 + 2;
+       zOutput = sqlite3_malloc (nOutput + sizeof (uint16_t));
+       if (!zOutput) {
+               sqlite3_result_error_nomem (context);
+               return;
+       }
+
+       nOutput = unorm_normalize (zInput, nInput/2, UNORM_NFKD, 0, zOutput, nOutput/2, &status);
+       if (!U_SUCCESS (status)) {
+               char zBuf[128];
+               sqlite3_snprintf (128, zBuf, "ICU error: unorm_normalize: %s", u_errorName (status));
+               sqlite3_free (zOutput);
+               sqlite3_result_error (context, zBuf, -1);
+               return;
+       }
+       zOutput[nOutput] = 0;
+       /* Unaccenting is done in place; the helper rejects empty strings */
+       if (nOutput > 0) {
+               tracker_parser_unaccent_nfkd_string (zOutput, &nOutput);
+       }
+       sqlite3_result_text16 (context, zOutput, -1, sqlite3_free);
+}
+
 #endif
 
 static inline int
@@ -933,6 +1007,10 @@ open_database (TrackerDBInterface  *db_interface,
                                 db_interface, &function_sparql_normalize,
                                 NULL, NULL);
 
+       sqlite3_create_function (db_interface->db, "SparqlUnaccent", 1, SQLITE_ANY,
+                                db_interface, &function_sparql_unaccent,
+                                NULL, NULL);
+
        sqlite3_create_function (db_interface->db, "SparqlFormatTime", 1, SQLITE_ANY,
                                 db_interface, &function_sparql_format_time,
                                 NULL, NULL);
diff --git a/src/libtracker-data/tracker-sparql-expression.vala b/src/libtracker-data/tracker-sparql-expression.vala
index a2981af..e11930f 100644
--- a/src/libtracker-data/tracker-sparql-expression.vala
+++ b/src/libtracker-data/tracker-sparql-expression.vala
@@ -496,6 +496,12 @@ class Tracker.Sparql.Expression : Object {
                        translate_expression_as_string (sql);
                        sql.append (")");
                        return PropertyType.STRING;
+               } else if (uri == TRACKER_NS + "unaccent") {
+                       // conversion to string
+                       sql.append ("SparqlUnaccent (");
+                       translate_expression_as_string (sql);
+                       sql.append (")");
+                       return PropertyType.STRING;
                } else if (uri == FN_NS + "contains") {
                        // fn:contains('A','B') => 'A' GLOB '*B*'
                        sql.append ("(");
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 69f75ed..b26722c 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -141,28 +141,35 @@ get_word_info (const UChar           *word,
        return TRUE;
 }
 
-static gboolean
-parser_unaccent_nfkd_word (UChar *word,
-                          gsize *word_length)
+/* The input word in this method MUST be normalized in NFKD form,
+ * and given in UChars, where str_length is the number of UChars
+ * (not the number of bytes) */
+gboolean
+tracker_parser_unaccent_nfkd_string (gpointer  str,
+                                     gsize    *str_length)
 {
-       /* The input word in this method MUST be normalized in NFKD form */
+       UChar *word;
+       gsize word_length;
        gsize i;
        gsize j;
 
-       g_return_val_if_fail (word, FALSE);
-       g_return_val_if_fail (word_length, FALSE);
-       g_return_val_if_fail (*word_length > 0, FALSE);
+       g_return_val_if_fail (str != NULL, FALSE);
+       g_return_val_if_fail (str_length != NULL, FALSE);
+       g_return_val_if_fail (*str_length > 0, FALSE);
+
+       word = (UChar *)str;
+       word_length = *str_length;
 
        i = 0;
        j = 0;
-       while (i < *word_length) {
+       while (i < word_length) {
                UChar32 unichar;
                gint utf16_len; /* given in UChars */
                gsize aux_i;
 
                /* Get next character of the word as UCS4 */
                aux_i = i;
-               U16_NEXT (word, aux_i, *word_length, unichar);
+               U16_NEXT (word, aux_i, word_length, unichar);
                utf16_len = aux_i - i;
 
                /* Invalid UTF-16 character or end of original string. */
@@ -195,7 +202,7 @@ parser_unaccent_nfkd_word (UChar *word,
        word[j] = (UChar) 0;
 
        /* Set new output length */
-       *word_length = j;
+       *str_length = j;
 
        return TRUE;
 }
@@ -331,7 +338,7 @@ process_word_uchar (TrackerParser         *parser,
        /* UNAC stripping needed? (for non-CJK and non-ASCII) */
        if (parser->enable_unaccent &&
            type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
-           parser_unaccent_nfkd_word (normalized_buffer, &new_word_length)) {
+           tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) {
                /* Log after unaccenting */
                tracker_parser_message_hex ("  After UNAC",
                                            (guint8 *) normalized_buffer,
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 1824528..9de6e46 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -156,21 +156,27 @@ get_word_info (TrackerParser         *parser,
        return TRUE;
 }
 
-static gboolean
-parser_unaccent_nfkd_word (gchar *word,
-                          gsize *word_length)
+/* The input word in this method MUST be normalized in NFKD form,
+ * and given in UTF-8, where str_length is the byte-length */
+gboolean
+tracker_parser_unaccent_nfkd_string (gpointer  str,
+                                     gsize    *str_length)
 {
-       /* The input word in this method MUST be normalized in NFKD form */
+       gchar *word;
+       gsize word_length;
        gsize i;
        gsize j;
 
-       g_return_val_if_fail (word, FALSE);
-       g_return_val_if_fail (word_length, FALSE);
-       g_return_val_if_fail (*word_length > 0, FALSE);
+       g_return_val_if_fail (str != NULL, FALSE);
+       g_return_val_if_fail (str_length != NULL, FALSE);
+       g_return_val_if_fail (*str_length > 0, FALSE);
+
+       word = (gchar *)str;
+       word_length = *str_length;
 
        i = 0;
        j = 0;
-       while (i < *word_length) {
+       while (i < word_length) {
                ucs4_t unichar;
                gint utf8_len;
 
@@ -207,7 +213,7 @@ parser_unaccent_nfkd_word (gchar *word,
        word[j] = '\0';
 
        /* Set new output length */
-       *word_length = j;
+       *str_length = j;
 
        return TRUE;
 }
@@ -289,7 +295,7 @@ process_word_utf8 (TrackerParser         *parser,
        /* UNAC stripping needed? (for non-CJK and non-ASCII) */
        if (parser->enable_unaccent &&
            type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
-           parser_unaccent_nfkd_word (normalized, &new_word_length)) {
+           tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
                /* Log after UNAC stripping */
                tracker_parser_message_hex ("  After UNAC stripping",
                                            normalized, new_word_length);
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index cc12398..e6cb10e 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -50,6 +50,11 @@ const gchar *  tracker_parser_next            (TrackerParser   *parser,
 
 void           tracker_parser_free            (TrackerParser   *parser);
 
+/* Other helper methods */
+
+gboolean       tracker_parser_unaccent_nfkd_string (gpointer  str,
+                                                    gsize    *str_length);
+
 G_END_DECLS
 
 #endif /* __LIBTRACKER_FTS_PARSER_H__ */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]