[gnome-documents] search, utils: Unaccent and normalize strings when searching

From: Cosimo Cecchi <cosimoc src gnome org>
To: commits-list gnome org
Cc:
Subject: [gnome-documents] search, utils: Unaccent and normalize strings when searching
Date: Wed, 24 Dec 2014 03:17:50 +0000 (UTC)
commit df39e35d716c4864b835358fb40866bac5d45f64
Author: Debarshi Ray <debarshir gnome org>
Date:   Thu Jan 16 08:32:50 2014 +0100

    search, utils: Unaccent and normalize strings when searching
    
    This will ensure that searching for "podminky" will match with
    "podmínky" and vice versa.
    
    This requires the new tracker:unaccent function in Tracker 0.17.1.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=722246

 src/lib/gd-utils.c |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lib/gd-utils.h |    2 +
 src/search.js      |    9 +++--
 3 files changed, 95 insertions(+), 3 deletions(-)
---
diff --git a/src/lib/gd-utils.c b/src/lib/gd-utils.c
index bbf10a7..6a4a574 100644
--- a/src/lib/gd-utils.c
+++ b/src/lib/gd-utils.c
@@ -266,6 +266,93 @@ gd_filename_to_rdf_type (const gchar *filename_with_extension)
   return type;
 }
 
+/* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under LGPLv2+
+ * And then from gnome-shell/src/shell-util.c under GPLv2+
+ *
+ * Originally written by Aleksander Morgado <aleksander gnu org>
+ */
+
+/* Non-zero when code point @c is a combining diacritical mark, i.e. lies in:
+ *  Basic range: [0x0300,0x036F]
+ *  Supplement:  [0x1DC0,0x1DFF]
+ *  For Symbols: [0x20D0,0x20FF]
+ *  Half marks:  [0xFE20,0xFE2F]
+ * @c is expanded several times, so pass an expression without side effects. */
+#define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F)  || \
+                        ((c) >= 0x1DC0 && (c) <= 0x1DFF)  || \
+                        ((c) >= 0x20D0 && (c) <= 0x20FF)  || \
+                        ((c) >= 0xFE20 && (c) <= 0xFE2F))
+
+/**
+ * gd_normalize_casefold_and_unaccent:
+ * @str: UTF-8 string to transform, or %NULL
+ *
+ * NFKD-normalizes and case-folds @str, then drops every code point matched
+ * by IS_CDM_UCS4() so accented and unaccented spellings compare equal.
+ *
+ * Returns: (transfer full): a new string, or %NULL if @str is %NULL or not valid UTF-8
+ */
+char *
+gd_normalize_casefold_and_unaccent (const char *str)
+{
+  char *normalized, *tmp;
+  int i = 0, j = 0, ilen;
+
+  if (str == NULL)
+    return NULL;
+
+  normalized = g_utf8_normalize (str, -1, G_NORMALIZE_NFKD);
+  /* NULL means @str was not valid UTF-8; casefold/strlen must not see NULL */
+  if (normalized == NULL)
+    return NULL;
+
+  tmp = g_utf8_casefold (normalized, -1);
+  g_free (normalized);
+
+  ilen = strlen (tmp);
+
+  while (i < ilen)
+    {
+      gunichar unichar;
+      char *next_utf8;
+      int utf8_len;
+
+      /* Get next character of the word as UCS4 */
+      unichar = g_utf8_get_char_validated (&tmp[i], -1);
+
+      /* Invalid UTF-8 character or end of original string. */
+      if (unichar == (gunichar) -1 || unichar == (gunichar) -2)
+        break;
+
+      /* Find next UTF-8 character */
+      next_utf8 = g_utf8_next_char (&tmp[i]);
+      utf8_len = next_utf8 - &tmp[i];
+
+      if (IS_CDM_UCS4 ((guint32) unichar))
+        {
+          /* If the given unichar is a combining diacritical mark,
+           * just update the original index, not the output one */
+          i += utf8_len;
+          continue;
+        }
+
+      /* Once a mark has been skipped the read and write indexes
+       * differ, so the character must be copied down. Source and
+       * destination may overlap, hence memmove instead of memcpy */
+      if (i != j)
+        memmove (&tmp[j], &tmp[i], utf8_len);
+
+      /* Update both indexes */
+      i += utf8_len;
+      j += utf8_len;
+    }
+
+  /* Force proper string end */
+  tmp[j] = '\0';
+
+  return tmp;
+}
+
 /**
  * gd_iso8601_from_timestamp:
  * @timestamp:
diff --git a/src/lib/gd-utils.h b/src/lib/gd-utils.h
index d7a58c1..e334553 100644
--- a/src/lib/gd-utils.h
+++ b/src/lib/gd-utils.h
@@ -39,6 +39,8 @@ const char *gd_filename_to_mime_type (const gchar *filename_with_extension);
 
 const char *gd_filename_to_rdf_type (const gchar *filename_with_extension);
 
+char *gd_normalize_casefold_and_unaccent (const char *str);
+
 gchar *gd_iso8601_from_timestamp (gint64 timestamp);
 
 GIcon *gd_create_collection_icon (gint base_size,
diff --git a/src/search.js b/src/search.js
index 3dc93b4..0b5e8b2 100644
--- a/src/search.js
+++ b/src/search.js
@@ -26,6 +26,7 @@ const Query = imports.query;
 const Lang = imports.lang;
 const Signals = imports.signals;
 
+const GdPrivate = imports.gi.GdPrivate;
 const Gio = imports.gi.Gio;
 const GLib = imports.gi.GLib;
 const Tracker = imports.gi.Tracker;
@@ -64,7 +65,7 @@ const SearchController = new Lang.Class({
 
     getTerms: function() {
         let escaped_str = Tracker.sparql_escape_string(this._string);
-        let str = GLib.utf8_casefold(escaped_str, -1);
+        let str = GdPrivate.normalize_casefold_and_unaccent(escaped_str);
         return str.replace(/ +/g, ' ').split(' ');
     }
 });
@@ -266,11 +267,13 @@ const SearchMatch = new Lang.Class({
     getFilter: function() {
         if (this.id == SearchMatchStock.TITLE)
             return ('fn:contains ' +
-                    '(tracker:case-fold(tracker:coalesce(nie:title(?urn), nfo:fileName(?urn))), ' +
+                    '(tracker:unaccent(tracker:case-fold' +
+                    '(tracker:coalesce(nie:title(?urn), nfo:fileName(?urn)))), ' +
                     '"%s")').format(this._term);
         if (this.id == SearchMatchStock.AUTHOR)
             return ('fn:contains ' +
-                    '(tracker:case-fold(tracker:coalesce(nco:fullname(?creator), nco:fullname(?publisher))), ' +
+                    '(tracker:unaccent(tracker:case-fold' +
+                    '(tracker:coalesce(nco:fullname(?creator), nco:fullname(?publisher)))), ' +
                     '"%s")').format(this._term);
         return '';
     }
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]