[nautilus/wip/oholy/tracker] search-engine-tracker: Do not lose filename results due to stop words



commit 7126b77cce37c2487b2cbd5e6062fe09856a9759
Author: Ondrej Holy <oholy redhat com>
Date:   Thu Dec 5 13:45:37 2019 +0100

    search-engine-tracker: Do not lose filename results due to stop words
    
    Tracker uses a list of stop words which are not indexed. Consequently,
    fts:match doesn't provide any results for such words, which also affects
    filenames. I am convinced that this is not a crucial issue for content
    search, but it is a real problem in the case of filename search. We should
    always find files regardless of stop words in their names. This can be
    fixed on the Nautilus side by splitting the search string and using
    fts:match only for content search.
    
    For example, it is currently not possible to find the "file-name.txt"
    file using the "file-n" search string, because "name" is a stop word,
    but it works nicely with this fix.
    
    Just note that /org/freedesktop/tracker/fts/ignore-stop-words setting
    needs to be changed to fix this issue for content search as well.

 src/nautilus-search-engine-tracker.c | 46 ++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 12 deletions(-)
---
diff --git a/src/nautilus-search-engine-tracker.c b/src/nautilus-search-engine-tracker.c
index 66494cae8..ad2091104 100644
--- a/src/nautilus-search-engine-tracker.c
+++ b/src/nautilus-search-engine-tracker.c
@@ -286,6 +286,12 @@ search_finished_idle (gpointer user_data)
     return FALSE;
 }
 
+/* This is used to compensate rank if fts:rank is not set (resp. fts:match is
+ * not used). The value was determined experimentally. I am convinced that
+ * fts:rank is currently always set to 5.0 in case of filename match.
+ */
+#define FILENAME_RANK 5.0
+
 static void
 nautilus_search_engine_tracker_start (NautilusSearchProvider *provider)
 {
@@ -327,11 +333,15 @@ nautilus_search_engine_tracker_start (NautilusSearchProvider *provider)
     location_uri = location ? g_file_get_uri (location) : NULL;
     mimetypes = nautilus_query_get_mime_types (tracker->query);
 
-    sparql = g_string_new ("SELECT DISTINCT nie:url(?urn) fts:rank(?urn) nfo:fileLastModified(?urn) 
nfo:fileLastAccessed(?urn)");
+    sparql = g_string_new ("SELECT DISTINCT"
+                           " nie:url(?urn)"
+                           " xsd:double(COALESCE(?rank2, ?rank1)) AS ?rank"
+                           " nfo:fileLastModified(?urn)"
+                           " nfo:fileLastAccessed(?urn)");
 
     if (tracker->fts_enabled)
     {
-        g_string_append (sparql, " fts:snippet(?urn)");
+        g_string_append (sparql, " COALESCE(?snippet2, ?snippet1)");
     }
 
     g_string_append (sparql,
@@ -342,16 +352,33 @@ nautilus_search_engine_tracker_start (NautilusSearchProvider *provider)
                      "  tracker:available true;"
                      "  nie:url ?url");
 
-    if (*search_text)
+    if (mimetypes->len > 0)
     {
-        g_string_append_printf (sparql, "; fts:match '\"%s\"*'", search_text);
+        g_string_append (sparql, "; nie:mimeType ?mime");
     }
 
-    if (mimetypes->len > 0)
+    if (tracker->fts_enabled)
     {
-        g_string_append (sparql, "; nie:mimeType ?mime");
+        /* Use fts:match only for content search to not lose some filename results due to stop words. */
+        g_string_append_printf (sparql,
+                                " {"
+                                " ?urn fts:match '\"nie:plainTextContent\" : \"%s\"*' ."
+                                " BIND(fts:rank(?urn) AS ?rank1) ."
+                                " BIND(fts:snippet(?urn) AS ?snippet1)"
+                                " } UNION",
+                                search_text);
     }
 
+    g_string_append_printf (sparql,
+                            " {"
+                            " ?urn nfo:fileName ?filename ."
+                            " FILTER(fn:contains(fn:lower-case(?filename), '%s')) ."
+                            " BIND(%f AS ?rank2) ."
+                            " BIND(?filename AS ?snippet2)"
+                            " }",
+                            search_text,
+                            FILENAME_RANK);
+
     g_string_append_printf (sparql, " . FILTER( ");
 
     if (!tracker->recursive)
@@ -363,11 +390,6 @@ nautilus_search_engine_tracker_start (NautilusSearchProvider *provider)
         g_string_append_printf (sparql, "tracker:uri-is-descendant('%s', ?url)", location_uri);
     }
 
-    if (!tracker->fts_enabled)
-    {
-        g_string_append_printf (sparql, " && fn:contains(fn:lower-case(nfo:fileName(?urn)), '%s')", 
search_text);
-    }
-
     date_range = nautilus_query_get_date_range (tracker->query);
     if (date_range)
     {
@@ -424,7 +446,7 @@ nautilus_search_engine_tracker_start (NautilusSearchProvider *provider)
         g_string_append (sparql, ")\n");
     }
 
-    g_string_append (sparql, ")} ORDER BY DESC (fts:rank(?urn))");
+    g_string_append (sparql, ")} ORDER BY DESC (?rank)");
 
     tracker->cancellable = g_cancellable_new ();
     tracker_sparql_connection_query_async (tracker->connection,


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]