[tracker] libtracker-data: Don't rely on spaces as separators on title collation
- From: Carlos Garnacho <carlosg src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] libtracker-data: Don't rely on spaces as separators on title collation
- Date: Mon, 24 Jul 2017 08:22:49 +0000 (UTC)
commit 0dab83673b48e20747f00cabc2bfac7cfc3fab56
Author: Carlos Garnacho <carlosg gnome org>
Date: Wed Jul 19 23:05:04 2017 +0200
libtracker-data: Don't rely on spaces as separators on title collation
Skip non-alphanumeric characters both at the beginning of titles and after
the prefix match. At least one such non-alphanumeric character is required
after the prefix match, so that a prefix is not stripped from the beginning
of a longer word.
https://bugzilla.gnome.org/show_bug.cgi?id=785146
src/libtracker-data/tracker-collation.c | 60 ++++++++++++++++++++++++++-----
1 files changed, 51 insertions(+), 9 deletions(-)
---
diff --git a/src/libtracker-data/tracker-collation.c b/src/libtracker-data/tracker-collation.c
index d7d2b52..8bd3e6d 100644
--- a/src/libtracker-data/tracker-collation.c
+++ b/src/libtracker-data/tracker-collation.c
@@ -242,6 +242,36 @@ tracker_collation_utf8 (gpointer collator,
#endif
static gboolean
+skip_non_alphanumeric (const gchar **str,
+ gint *len)
+{
+ GUnicodeBreakType break_type;
+ const gchar *remaining = *str, *end = &remaining[*len];
+ gboolean found = FALSE, is_alnum;
+ gunichar unichar;
+
+ do {
+ unichar = g_utf8_get_char (remaining);
+ is_alnum = g_unichar_isalnum (unichar);
+ if (!is_alnum) {
+ found = TRUE;
+ remaining = g_utf8_next_char (remaining);
+ }
+ } while (!is_alnum && remaining < end);
+
+ /* The string must not be left empty */
+ if (remaining == end)
+ return FALSE;
+
+ if (found) {
+ *len = end - remaining;
+ *str = remaining;
+ }
+
+ return found;
+}
+
+static gboolean
check_remove_prefix (const gchar *str,
gint len,
const gchar *prefix,
@@ -249,22 +279,33 @@ check_remove_prefix (const gchar *str,
const gchar **str_out,
gint *len_out)
{
- gboolean substituted = FALSE;
+ const gchar *remaining;
gchar *strstart;
+ gint remaining_len;
if (len <= prefix_len)
return FALSE;
+ /* Check that the prefix matches */
strstart = g_utf8_casefold (str, prefix_len);
- if (strcmp (strstart, prefix) == 0) {
- *str_out = str + prefix_len;
- *len_out = len - prefix_len;
- substituted = TRUE;
+ if (strcmp (strstart, prefix) != 0) {
+ g_free (strstart);
+ return FALSE;
}
+ /* Check that the following letter is a break
+ * character.
+ */
g_free (strstart);
+ remaining = &str[prefix_len];
+ remaining_len = len - prefix_len;
- return substituted;
+ if (!skip_non_alphanumeric (&remaining, &remaining_len))
+ return FALSE;
+
+ *len_out = remaining_len;
+ *str_out = remaining;
+ return TRUE;
}
/* Helper function valid for all implementations */
@@ -280,6 +321,9 @@ tracker_collation_utf8_title (gpointer collator,
const gchar *res1 = NULL, *res2 = NULL;
gint i;
+ skip_non_alphanumeric ((const gchar **) &str1, &len1);
+ skip_non_alphanumeric ((const gchar **) &str2, &len2);
+
/* Translators: this is a '|' (U+007C) separated list of common
* title beginnings. Meant to be skipped for sorting purposes,
* case doesn't matter. Given English media is quite common, it is
@@ -295,8 +339,7 @@ tracker_collation_utf8_title (gpointer collator,
gchar *prefix, *str;
gint prefix_len;
- str = g_strdup_printf ("%s ", title_beginnings[i]);
- prefix = g_utf8_casefold (str, -1);
+ prefix = g_utf8_casefold (title_beginnings[i], -1);
prefix_len = strlen (prefix);
if (!res1)
@@ -306,7 +349,6 @@ tracker_collation_utf8_title (gpointer collator,
check_remove_prefix (str2, len2, prefix, prefix_len,
&res2, &len2);
g_free (prefix);
- g_free (str);
}
if (!res1)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]