[tracker] libtracker-data: Don't rely on spaces as separators on title collation



commit 0dab83673b48e20747f00cabc2bfac7cfc3fab56
Author: Carlos Garnacho <carlosg gnome org>
Date:   Wed Jul 19 23:05:04 2017 +0200

    libtracker-data: Don't rely on spaces as separators on title collation
    
    Skip non alphanumeric characters both at the beginning of titles, and after
    the prefix match. Of course, require at least one such non alphanumeric
    character after the prefix match, in order to avoid matching beginnings of
    words.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=785146

 src/libtracker-data/tracker-collation.c |   60 ++++++++++++++++++++++++++-----
 1 files changed, 51 insertions(+), 9 deletions(-)
---
diff --git a/src/libtracker-data/tracker-collation.c b/src/libtracker-data/tracker-collation.c
index d7d2b52..8bd3e6d 100644
--- a/src/libtracker-data/tracker-collation.c
+++ b/src/libtracker-data/tracker-collation.c
@@ -242,6 +242,36 @@ tracker_collation_utf8 (gpointer      collator,
 #endif
 
 static gboolean
+skip_non_alphanumeric (const gchar **str,
+                       gint         *len)
+{
+       const gchar *remaining = *str, *end = &remaining[*len];
+       gboolean found = FALSE;
+
+       /* Advance *str (and shrink *len, in bytes) past any leading
+        * non-alphanumeric characters. Returns TRUE only if something was
+        * skipped and text remains. Bounds are checked before decoding.
+        */
+       while (remaining < end &&
+              !g_unichar_isalnum (g_utf8_get_char (remaining))) {
+               found = TRUE;
+               remaining = g_utf8_next_char (remaining);
+       }
+
+       /* The string must not be left empty. Using ">=" also rejects
+        * *len <= 0 and a final character stepping past "end".
+        */
+       if (remaining >= end)
+               return FALSE;
+
+       if (found) {
+               *len = end - remaining;
+               *str = remaining;
+       }
+       return found;
+}
+
+static gboolean
 check_remove_prefix (const gchar  *str,
                      gint          len,
                      const gchar  *prefix,
@@ -249,22 +279,33 @@ check_remove_prefix (const gchar  *str,
                      const gchar **str_out,
                      gint         *len_out)
 {
-       gboolean substituted = FALSE;
+       const gchar *remaining;
        gchar *strstart;
+       gint remaining_len;
 
        if (len <= prefix_len)
                return FALSE;
 
+       /* Check that the case-folded start of str equals prefix */
        strstart = g_utf8_casefold (str, prefix_len);
-       if (strcmp (strstart, prefix) == 0) {
-               *str_out = str + prefix_len;
-               *len_out = len - prefix_len;
-               substituted = TRUE;
+       if (strcmp (strstart, prefix) != 0) {
+               g_free (strstart);
+               return FALSE;
        }
 
+       /* The prefix must be followed by at least one non-alphanumeric
+        * character, with some text left after those are skipped.
+        */
        g_free (strstart);
+       remaining = &str[prefix_len];
+       remaining_len = len - prefix_len;
 
-       return substituted;
+       if (!skip_non_alphanumeric (&remaining, &remaining_len))
+               return FALSE;
+
+       *len_out = remaining_len;
+       *str_out = remaining;
+       return TRUE;
 }
 
 /* Helper function valid for all implementations */
@@ -280,6 +321,9 @@ tracker_collation_utf8_title (gpointer      collator,
        const gchar *res1 = NULL, *res2 = NULL;
        gint i;
 
+       skip_non_alphanumeric ((const gchar **) &str1, &len1);
+       skip_non_alphanumeric ((const gchar **) &str2, &len2);
+
        /* Translators: this is a '|' (U+007C) separated list of common
         * title beginnings. Meant to be skipped for sorting purposes,
         * case doesn't matter. Given English media is quite common, it is
@@ -295,8 +339,7 @@ tracker_collation_utf8_title (gpointer      collator,
                gchar *prefix, *str;
                gint prefix_len;
 
-               str = g_strdup_printf ("%s ", title_beginnings[i]);
-               prefix = g_utf8_casefold (str, -1);
+               prefix = g_utf8_casefold (title_beginnings[i], -1);
                prefix_len = strlen (prefix);
 
                if (!res1)
@@ -306,7 +349,6 @@ tracker_collation_utf8_title (gpointer      collator,
                        check_remove_prefix (str2, len2, prefix, prefix_len,
                                             &res2, &len2);
                g_free (prefix);
-               g_free (str);
        }
 
        if (!res1)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]