[gnome-documents] search: use g_str_tokenize_and_fold()



commit a6ba5fc8495d69c29ee357dc4eed6dc2ba400fbf
Author: Cosimo Cecchi <cosimo endlesm com>
Date:   Wed Dec 24 13:04:48 2014 +0800

    search: use g_str_tokenize_and_fold()
    
    Use the GLib-provided g_str_tokenize_and_fold() function to tokenize our
    string, instead of a hand-rolled function.
    This changes the behavior slightly: the search terms we use in the
    filters are no longer unaccented. We therefore duplicate each filter so
    that Tracker matches a term against both the unaccented and the
    accented form of the indexed string.
    
    This means that e.g. "podminky" will return results that include both
    "podminky" and "podmínky", but "podmínky" will only return results for
    "podmínky". We assume that if the user typed an accent, it is in fact
    intentional.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=722246

 src/lib/gd-utils.c |   87 ----------------------------------------------------
 src/lib/gd-utils.h |    2 -
 src/search.js      |   20 ++++++++---
 3 files changed, 14 insertions(+), 95 deletions(-)
---
diff --git a/src/lib/gd-utils.c b/src/lib/gd-utils.c
index 6a4a574..bbf10a7 100644
--- a/src/lib/gd-utils.c
+++ b/src/lib/gd-utils.c
@@ -266,93 +266,6 @@ gd_filename_to_rdf_type (const gchar *filename_with_extension)
   return type;
 }
 
-/* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under LGPLv2+
- * And then from gnome-shell/src/shell-util.c under GPLv2+
- *
- * Originally written by Aleksander Morgado <aleksander gnu org>
- */
-
-/* Combining diacritical mark?
- *  Basic range: [0x0300,0x036F]
- *  Supplement:  [0x1DC0,0x1DFF]
- *  For Symbols: [0x20D0,0x20FF]
- *  Half marks:  [0xFE20,0xFE2F]
- */
-#define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F)  || \
-                        ((c) >= 0x1DC0 && (c) <= 0x1DFF)  || \
-                        ((c) >= 0x20D0 && (c) <= 0x20FF)  || \
-                        ((c) >= 0xFE20 && (c) <= 0xFE2F))
-
-/**
- * gd_normalize_casefold_and_unaccent:
- * @str:
- *
- * Returns: (transfer full):
- */
-char *
-gd_normalize_casefold_and_unaccent (const char *str)
-{
-  char *normalized, *tmp;
-  int i = 0, j = 0, ilen;
-
-  if (str == NULL)
-    return NULL;
-
-  normalized = g_utf8_normalize (str, -1, G_NORMALIZE_NFKD);
-  tmp = g_utf8_casefold (normalized, -1);
-  g_free (normalized);
-
-  ilen = strlen (tmp);
-
-  while (i < ilen)
-    {
-      gunichar unichar;
-      char *next_utf8;
-      int utf8_len;
-
-      /* Get next character of the word as UCS4 */
-      unichar = g_utf8_get_char_validated (&tmp[i], -1);
-
-      /* Invalid UTF-8 character or end of original string. */
-      if (unichar == (gunichar) -1 ||
-          unichar == (gunichar) -2)
-        {
-          break;
-        }
-
-      /* Find next UTF-8 character */
-      next_utf8 = g_utf8_next_char (&tmp[i]);
-      utf8_len = next_utf8 - &tmp[i];
-
-      if (IS_CDM_UCS4 ((guint32) unichar))
-        {
-          /* If the given unichar is a combining diacritical mark,
-           * just update the original index, not the output one */
-          i += utf8_len;
-          continue;
-        }
-
-      /* If already found a previous combining
-       * diacritical mark, indexes are different so
-       * need to copy characters. As output and input
-       * buffers may overlap, need to use memmove
-       * instead of memcpy */
-      if (i != j)
-        {
-          memmove (&tmp[j], &tmp[i], utf8_len);
-        }
-
-      /* Update both indexes */
-      i += utf8_len;
-      j += utf8_len;
-    }
-
-  /* Force proper string end */
-  tmp[j] = '\0';
-
-  return tmp;
-}
-
 /**
  * gd_iso8601_from_timestamp:
  * @timestamp:
diff --git a/src/lib/gd-utils.h b/src/lib/gd-utils.h
index e334553..d7a58c1 100644
--- a/src/lib/gd-utils.h
+++ b/src/lib/gd-utils.h
@@ -39,8 +39,6 @@ const char *gd_filename_to_mime_type (const gchar *filename_with_extension);
 
 const char *gd_filename_to_rdf_type (const gchar *filename_with_extension);
 
-char *gd_normalize_casefold_and_unaccent (const char *str);
-
 gchar *gd_iso8601_from_timestamp (gint64 timestamp);
 
 GIcon *gd_create_collection_icon (gint base_size,
diff --git a/src/search.js b/src/search.js
index 0b5e8b2..8e8751c 100644
--- a/src/search.js
+++ b/src/search.js
@@ -64,9 +64,9 @@ const SearchController = new Lang.Class({
     },
 
     getTerms: function() {
-        let escaped_str = Tracker.sparql_escape_string(this._string);
-        let str = GdPrivate.normalize_casefold_and_unaccent(escaped_str);
-        return str.replace(/ +/g, ' ').split(' ');
+        let escapedStr = Tracker.sparql_escape_string(this._string);
+        let [tokens, ] = GLib.str_tokenize_and_fold(escapedStr, null);
+        return tokens;
     }
 });
 Signals.addSignalMethods(SearchController.prototype);
@@ -269,12 +269,20 @@ const SearchMatch = new Lang.Class({
             return ('fn:contains ' +
                     '(tracker:unaccent(tracker:case-fold' +
                     '(tracker:coalesce(nie:title(?urn), nfo:fileName(?urn)))), ' +
-                    '"%s")').format(this._term);
+                    '"%s") || ' +
+                    'fn:contains ' +
+                    '(tracker:case-fold' +
+                    '(tracker:coalesce(nie:title(?urn), nfo:fileName(?urn))), ' +
+                    '"%s")').format(this._term, this._term);
         if (this.id == SearchMatchStock.AUTHOR)
             return ('fn:contains ' +
                     '(tracker:unaccent(tracker:case-fold' +
                     '(tracker:coalesce(nco:fullname(?creator), nco:fullname(?publisher)))), ' +
-                    '"%s")').format(this._term);
+                    '"%s") || ' +
+                    'fn:contains ' +
+                    '(tracker:case-fold' +
+                    '(tracker:coalesce(nco:fullname(?creator), nco:fullname(?publisher))), ' +
+                    '"%s")').format(this._term, this._term);
         return '';
     }
 });
@@ -310,7 +318,7 @@ const SearchMatchManager = new Lang.Class({
             });
             filters.push(this.parent());
         }
-        return filters.length ? '( ' + filters.join(' && ') + ')' : '';
+        return filters.length ? '( ' + filters.join(' && ') + ')' : '(true)';
     }
 });
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]