[tracker] libtracker-data: new 'tracker:normalize' method



commit 8e00e18156328725c03210edb1a7585922c32984
Author: Aleksander Morgado <aleksander aleksander es>
Date:   Wed Jan 15 14:16:05 2014 +0100

    libtracker-data: new 'tracker:normalize' method
    
    https://bugzilla.gnome.org/show_bug.cgi?id=722254
    
    This method allows normalizing the strings used in SPARQL queries. It expects
    two arguments: First, the string to be normalized, and second, one of "nfc",
    "nfd", "nfkc" or "nfkd" specifying the type of normalization to apply to the
    string.
    
    Example:
    
    1) First, insert a new element which has accents in the nie:title. In the
    example we insert the word 'école' which in UTF-8 NFC looks like
    "0xC3 0xA9 0x63 0x6F 0x6C 0x65":
    
        $ tracker-sparql -u -q "
            INSERT { <abc> a         nie:InformationElement .
                     <abc> nie:title 'école' }"
    
    2) Second, hexdump the result of querying nie:title; we should get the original
    string in UTF-8 with NFC normalization:
    
        $ tracker-sparql -q "
            SELECT ?title
            WHERE { <abc> nie:title ?title }" | hexdump
        0000000 6552 7573 746c 3a73 200a c320 63a9 6c6f
        0000010 0a65 000a
        0000013
    
    3) Third, explicitly apply NFC normalization; we should get the same output:
    
        $ tracker-sparql -q "
            SELECT tracker:normalize(?title,'nfc')
            WHERE { <abc> nie:title ?title }" | hexdump
        0000000 6552 7573 746c 3a73 200a c320 63a9 6c6f
        0000010 0a65 000a
        0000013
    
    4) Last, apply an NFKD decomposition (for this string it yields the same result
    as NFD); the expected decomposed string should look like
    "0x65 0xCC 0x81 0x63 0x6F 0x6C 0x65":
    
        $ tracker-sparql -q "
            SELECT tracker:normalize(?title,'nfkd')
            WHERE { <abc> nie:title ?title }" | hexdump
        0000000 6552 7573 746c 3a73 200a 6520 81cc 6f63
        0000010 656c 0a0a
        0000014

 src/libtracker-data/tracker-db-interface-sqlite.c  |  108 ++++++++++++++++++++
 src/libtracker-data/tracker-sparql-expression.vala |    9 ++
 2 files changed, 117 insertions(+), 0 deletions(-)
---
diff --git a/src/libtracker-data/tracker-db-interface-sqlite.c b/src/libtracker-data/tracker-db-interface-sqlite.c
index 5ac09d1..2581d45 100644
--- a/src/libtracker-data/tracker-db-interface-sqlite.c
+++ b/src/libtracker-data/tracker-db-interface-sqlite.c
@@ -615,6 +615,50 @@ function_sparql_case_fold (sqlite3_context *context,
        sqlite3_result_text16 (context, zOutput, written * 2, free);
 }
 
+/*
+ * SQLite function SparqlNormalize(string, form), libunistring backend.
+ *
+ * Applies the Unicode normalization form named by the second argument
+ * ("nfc", "nfd", "nfkc" or "nfkd", compared ASCII case-insensitively) to the
+ * first argument, read as UTF-16, and sets the normalized UTF-16 text as the
+ * function result.
+ *
+ * If the first argument has no text value (sqlite3_value_text16() returns
+ * NULL) the function returns without setting a result. A wrong argument
+ * count, a missing or unknown form name, or a u16_normalize() failure is
+ * reported as an SQL error.
+ */
+static void
+function_sparql_normalize (sqlite3_context *context,
+                           int              argc,
+                           sqlite3_value   *argv[])
+{
+       const gchar *nfstr;
+       const uint16_t *zInput;
+       uint16_t *zOutput;
+       size_t written = 0;
+       int nInput;
+       uninorm_t nf;
+
+       if (argc != 2) {
+               sqlite3_result_error (context, "Invalid argument count", -1);
+               return;
+       }
+
+       zInput = sqlite3_value_text16 (argv[0]);
+
+       if (!zInput) {
+               return;
+       }
+
+       /* sqlite3_value_text() returns NULL for an SQL NULL argument and
+        * g_ascii_strcasecmp() must not be given NULL; an empty name matches
+        * no form, so it falls through to the error branch below. */
+       nfstr = (const gchar *) sqlite3_value_text (argv[1]);
+       if (!nfstr)
+               nfstr = "";
+
+       if (g_ascii_strcasecmp (nfstr, "nfc") == 0)
+               nf = UNINORM_NFC;
+       else if (g_ascii_strcasecmp (nfstr, "nfd") == 0)
+               nf = UNINORM_NFD;
+       else if (g_ascii_strcasecmp (nfstr, "nfkc") == 0)
+               nf = UNINORM_NFKC;
+       else if (g_ascii_strcasecmp (nfstr, "nfkd") == 0)
+               nf = UNINORM_NFKD;
+       else {
+               sqlite3_result_error (context, "Invalid normalization specified, options are 'nfc', 'nfd', 'nfkc' or 'nfkd'", -1);
+               return;
+       }
+
+       nInput = sqlite3_value_bytes16 (argv[0]);
+
+       /* nInput is a byte count while u16_normalize() takes a count of
+        * UTF-16 units. With a NULL result buffer it returns newly allocated
+        * memory (handed to SQLite below with free() as destructor), or NULL
+        * on failure. */
+       zOutput = u16_normalize (nf, zInput, nInput/2, NULL, &written);
+
+       if (!zOutput) {
+               sqlite3_result_error (context, "Unable to normalize string", -1);
+               return;
+       }
+
+       sqlite3_result_text16 (context, zOutput, written * 2, free);
+}
+
 #elif HAVE_LIBICU
 
 static void
@@ -701,6 +745,66 @@ function_sparql_case_fold (sqlite3_context *context,
        sqlite3_result_text16 (context, zOutput, -1, sqlite3_free);
 }
 
+/*
+ * SQLite function SparqlNormalize(string, form), ICU backend.
+ *
+ * Applies the Unicode normalization form named by the second argument
+ * ("nfc", "nfd", "nfkc" or "nfkd", compared ASCII case-insensitively) to the
+ * first argument, read as UTF-16, and sets the normalized UTF-16 text as the
+ * function result.
+ *
+ * If the first argument has no text value (sqlite3_value_text16() returns
+ * NULL) the function returns without setting a result. A wrong argument
+ * count, a missing or unknown form name, an allocation failure, or an ICU
+ * failure is reported as an SQL error.
+ */
+static void
+function_sparql_normalize (sqlite3_context *context,
+                           int              argc,
+                           sqlite3_value   *argv[])
+{
+       const gchar *nfstr;
+       const uint16_t *zInput;
+       uint16_t *zOutput;
+       int nInput;
+       int nOutput;
+       int nWritten;
+       UNormalizationMode nf;
+       UErrorCode status = U_ZERO_ERROR;
+
+       if (argc != 2) {
+               sqlite3_result_error (context, "Invalid argument count", -1);
+               return;
+       }
+
+       zInput = sqlite3_value_text16 (argv[0]);
+
+       if (!zInput) {
+               return;
+       }
+
+       /* sqlite3_value_text() returns NULL for an SQL NULL argument and
+        * g_ascii_strcasecmp() must not be given NULL; an empty name matches
+        * no form, so it falls through to the error branch below. */
+       nfstr = (const gchar *) sqlite3_value_text (argv[1]);
+       if (!nfstr)
+               nfstr = "";
+
+       if (g_ascii_strcasecmp (nfstr, "nfc") == 0)
+               nf = UNORM_NFC;
+       else if (g_ascii_strcasecmp (nfstr, "nfd") == 0)
+               nf = UNORM_NFD;
+       else if (g_ascii_strcasecmp (nfstr, "nfkc") == 0)
+               nf = UNORM_NFKC;
+       else if (g_ascii_strcasecmp (nfstr, "nfkd") == 0)
+               nf = UNORM_NFKD;
+       else {
+               sqlite3_result_error (context, "Invalid normalization specified", -1);
+               return;
+       }
+
+       nInput = sqlite3_value_bytes16 (argv[0]);
+
+       /* First guess at the output size, in bytes: twice the input plus
+        * room for one terminating UTF-16 unit. */
+       nOutput = nInput * 2 + 2;
+       zOutput = sqlite3_malloc (nOutput);
+
+       if (!zOutput) {
+               sqlite3_result_error_nomem (context);
+               return;
+       }
+
+       /* Lengths passed to ICU are in UTF-16 units, hence the divisions by
+        * two. The return value is the length of the normalized text. */
+       nWritten = unorm_normalize (zInput, nInput/2, nf, 0, zOutput, nOutput/2, &status);
+
+       if (status == U_BUFFER_OVERFLOW_ERROR) {
+               /* Decompositions can expand past the first guess. On overflow
+                * ICU returns the length it needs, so retry once with a
+                * buffer of that size plus a terminator. */
+               sqlite3_free (zOutput);
+               nOutput = nWritten * 2 + 2;
+               zOutput = sqlite3_malloc (nOutput);
+
+               if (!zOutput) {
+                       sqlite3_result_error_nomem (context);
+                       return;
+               }
+
+               status = U_ZERO_ERROR;
+               nWritten = unorm_normalize (zInput, nInput/2, nf, 0, zOutput, nOutput/2, &status);
+       }
+
+       if (!U_SUCCESS (status)) {
+               char zBuf[128];
+               sqlite3_snprintf (128, zBuf, "ICU error: unorm_normalize: %s", u_errorName (status));
+               zBuf[127] = '\0';
+               sqlite3_free (zOutput);
+               sqlite3_result_error (context, zBuf, -1);
+               return;
+       }
+
+       /* Pass the length explicitly: ICU leaves the output unterminated when
+        * it exactly fills the buffer (a warning status, not an error), so
+        * letting SQLite search for a terminator could read past zOutput. */
+       sqlite3_result_text16 (context, zOutput, nWritten * 2, sqlite3_free);
+}
+
 #endif
 
 static inline int
@@ -825,6 +929,10 @@ open_database (TrackerDBInterface  *db_interface,
                                 db_interface, &function_sparql_case_fold,
                                 NULL, NULL);
 
+       sqlite3_create_function (db_interface->db, "SparqlNormalize", 2, SQLITE_ANY,
+                                db_interface, &function_sparql_normalize,
+                                NULL, NULL);
+
        sqlite3_create_function (db_interface->db, "SparqlFormatTime", 1, SQLITE_ANY,
                                 db_interface, &function_sparql_format_time,
                                 NULL, NULL);
diff --git a/src/libtracker-data/tracker-sparql-expression.vala b/src/libtracker-data/tracker-sparql-expression.vala
index aa59b02..a2981af 100644
--- a/src/libtracker-data/tracker-sparql-expression.vala
+++ b/src/libtracker-data/tracker-sparql-expression.vala
@@ -487,6 +487,15 @@ class Tracker.Sparql.Expression : Object {
                        translate_expression_as_string (sql);
                        sql.append (")");
                        return PropertyType.STRING;
+               } else if (uri == TRACKER_NS + "normalize") {
+                       // conversion to string
+                       sql.append ("SparqlNormalize (");
+                       translate_expression_as_string (sql);
+                       sql.append (", ");
+                       expect (SparqlTokenType.COMMA);
+                       translate_expression_as_string (sql);
+                       sql.append (")");
+                       return PropertyType.STRING;
                } else if (uri == FN_NS + "contains") {
                        // fn:contains('A','B') => 'A' GLOB '*B*'
                        sql.append ("(");


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]