[tracker-miners/sam/plain-text: 36/36] tracker-extract: Add filename allowlist for text files



commit 0ae0e7da5b1b05b7d277737069072859cb61585a
Author: Sam Thursfield <sam afuera me uk>
Date:   Thu Jun 18 02:05:18 2020 +0200

    tracker-extract: Add filename allowlist for text files
    
    The MIME type detection in shared-mime-info gives a rich range of text/*
    MIME types, which allows us to ignore most forms of source code.
    However, it's not exhaustive and if we index all text/plain content we
    will always risk including some unwanted content.
    
    We now have an allowlist specific to the text extractor, which provides
    an extra line of defense. Attempting to extract files with names that
    don't match the allowlist will produce the usual 'No metadata or
    extractor modules found to handle this file' result.

 data/org.freedesktop.Tracker.Extract.gschema.xml |  7 +++
 src/tracker-extract/tracker-config.c             | 68 ++++++++++++++++++++++-
 src/tracker-extract/tracker-config.h             | 14 +++++
 src/tracker-extract/tracker-extract-text.c       | 70 ++++++++++++++++--------
 4 files changed, 133 insertions(+), 26 deletions(-)
---
diff --git a/data/org.freedesktop.Tracker.Extract.gschema.xml b/data/org.freedesktop.Tracker.Extract.gschema.xml
index 3249e2135..e1a418686 100644
--- a/data/org.freedesktop.Tracker.Extract.gschema.xml
+++ b/data/org.freedesktop.Tracker.Extract.gschema.xml
@@ -26,6 +26,13 @@ Boston, MA  02110-1301, USA.
       <default>1048576</default>
     </key>
 
+    <key name="text-allowlist" type="as">
+      <summary>Text file allowlist</summary>
+      <description>Filename patterns for plain text documents that should be indexed</description>
+      <default>[ '*.txt', '*.md', '*.mdwn' ]</default>
+    </key>
+
+
     <key name="wait-for-miner-fs" type="b">
       <summary>Wait for FS miner to be done before extracting</summary>
       <description>When true, tracker-extract will wait for tracker-miner-fs to be done crawling before extracting meta-data. This option is useful on constrained environment where it is important to list files as fast as possible and can wait to get meta-data later.</description>
diff --git a/src/tracker-extract/tracker-config.c b/src/tracker-extract/tracker-config.c
index dd7e3c0f2..426bba67c 100644
--- a/src/tracker-extract/tracker-config.c
+++ b/src/tracker-extract/tracker-config.c
@@ -44,6 +44,7 @@ static void     config_constructed          (GObject       *object);
 enum {
        PROP_0,
        PROP_MAX_BYTES,
+       PROP_TEXT_ALLOWLIST,
        PROP_WAIT_FOR_MINER_FS,
 };
 
@@ -69,6 +70,13 @@ tracker_config_class_init (TrackerConfigClass *klass)
                                                           1024 * 1024,
                                                           G_PARAM_READWRITE));
 
+       g_object_class_install_property (object_class,
+                                        PROP_TEXT_ALLOWLIST,
+                                        g_param_spec_boxed ("text-allowlist",
+                                                            "Text file allowlist",
+                                                            "Filename patterns for plain text documents that should be indexed",
+                                                            G_TYPE_STRV,
+                                                            G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
        g_object_class_install_property (object_class,
                                         PROP_WAIT_FOR_MINER_FS,
                                         g_param_spec_boolean ("wait-for-miner-fs",
@@ -92,6 +100,7 @@ config_set_property (GObject      *object,
        switch (param_id) {
        /* We don't care about these... we don't save anyway. */
        case PROP_MAX_BYTES:
+       case PROP_TEXT_ALLOWLIST:
        case PROP_WAIT_FOR_MINER_FS:
                break;
 
@@ -115,6 +124,10 @@ config_get_property (GObject    *object,
                                 tracker_config_get_max_bytes (config));
                break;
 
+       case PROP_TEXT_ALLOWLIST:
+               g_value_take_boxed (value, tracker_gslist_to_string_list (config->text_allowlist));
+               break;
+
        case PROP_WAIT_FOR_MINER_FS:
                g_value_set_boolean (value,
                                     tracker_config_get_wait_for_miner_fs (config));
@@ -126,14 +139,42 @@ config_get_property (GObject    *object,
        };
 }
 
+static void
+config_set_text_allowlist_conveniences (TrackerConfig *config)
+{
+       GSList *l;
+       GSList *patterns = NULL;
+
+       g_slist_foreach (config->text_allowlist_patterns,
+                        (GFunc) g_pattern_spec_free,
+                        NULL);
+       g_slist_free (config->text_allowlist_patterns);
+
+       for (l = config->text_allowlist; l; l = l->next) {
+               GPatternSpec *spec;
+               const gchar *str = l->data;
+
+               if (str) {
+                       spec = g_pattern_spec_new (l->data);
+                       patterns = g_slist_prepend (patterns, spec);
+               }
+       }
+
+       config->text_allowlist_patterns = g_slist_reverse (patterns);
+}
+
 static void
 config_finalize (GObject *object)
 {
-       /* For now we do nothing here, we left this override in for
-        * future expansion.
-        */
+       TrackerConfig *config = TRACKER_CONFIG (object);
+
+       /* Free the compiled specs with their list nodes, then the allowlist
+        * and its strings (assumed owned by this object - confirm). */
+       g_slist_free_full (config->text_allowlist_patterns,
+                          (GDestroyNotify) g_pattern_spec_free);
+       g_slist_free_full (config->text_allowlist, g_free);
 
        (G_OBJECT_CLASS (tracker_config_parent_class)->finalize) (object);
 }
 
 static void
@@ -166,6 +207,9 @@ config_constructed (GObject *object)
         * unintended open() calls.
         */
        TRACKER_CONFIG (settings)->max_bytes = g_settings_get_int (settings, "max-bytes");
+       TRACKER_CONFIG (settings)->text_allowlist = tracker_string_list_to_gslist (g_settings_get_strv (settings, "text-allowlist"), -1);
+
+       config_set_text_allowlist_conveniences (TRACKER_CONFIG (settings));
 }
 
 TrackerConfig *
@@ -218,6 +262,14 @@ tracker_config_get_max_bytes (TrackerConfig *config)
        return config->max_bytes;
 }
 
+GSList *
+tracker_config_get_text_allowlist (TrackerConfig *config)
+{
+       g_return_val_if_fail (TRACKER_IS_CONFIG (config), NULL);
+
+       return config->text_allowlist;
+}
+
 gboolean
 tracker_config_get_wait_for_miner_fs (TrackerConfig *config)
 {
@@ -225,3 +277,13 @@ tracker_config_get_wait_for_miner_fs (TrackerConfig *config)
 
        return g_settings_get_boolean (G_SETTINGS (config), "wait-for-miner-fs");
 }
+
+
+/*
+ * Convenience functions
+ */
+GSList *
+tracker_config_get_text_allowlist_patterns (TrackerConfig *config)
+{
+       return config->text_allowlist_patterns;
+}
diff --git a/src/tracker-extract/tracker-config.h b/src/tracker-extract/tracker-config.h
index dd752bdb7..18dc292b0 100644
--- a/src/tracker-extract/tracker-config.h
+++ b/src/tracker-extract/tracker-config.h
@@ -37,6 +37,10 @@ typedef struct TrackerConfigClass TrackerConfigClass;
 struct TrackerConfig {
        GSettings parent;
        gint max_bytes;
+       GSList *text_allowlist;
+
+       /* Convenience data */
+       GSList *text_allowlist_patterns;
 };
 
 struct TrackerConfigClass {
@@ -47,8 +51,18 @@ GType          tracker_config_get_type                (void) G_GNUC_CONST;
 
 TrackerConfig *tracker_config_new                     (void);
 gint           tracker_config_get_max_bytes           (TrackerConfig *config);
+GSList *       tracker_config_get_text_allowlist      (TrackerConfig *config);
 gboolean       tracker_config_get_wait_for_miner_fs   (TrackerConfig *config);
 
+/*
+ * Convenience functions:
+ */
+
+/* The _patterns() APIs return GPatternSpec pointers for basename
+ * pattern matching.
+ */
+GSList *       tracker_config_get_text_allowlist_patterns        (TrackerConfig *config);
+
 G_END_DECLS
 
 #endif /* __TRACKER_EXTRACT_CONFIG_H__ */
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 2d60551f8..b1d6e5611 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -40,6 +40,24 @@
 #include "tracker-extract.h"
 #include "tracker-read.h"
 
+static gboolean
+allow_file (GSList      *text_allowlist_patterns,
+            GFile       *file)
+{
+       GSList *l;
+       g_autofree gchar *basename = NULL;
+
+       basename = g_file_get_basename (file);
+
+       for (l = text_allowlist_patterns; l; l = l->next) {
+               if (g_pattern_match_string (l->data, basename)) {
+                       return TRUE;
+               }
+       }
+
+       return FALSE;
+}
+
 static gchar *
 get_file_content (GFile   *file,
                   gsize    n_bytes,
@@ -81,34 +99,40 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
 {
        TrackerResource *metadata;
        TrackerConfig *config;
+       GFile *file;
+       GSList *text_allowlist_patterns;
        gchar *content = NULL;
        GError *error = NULL;
 
        config = tracker_main_get_config ();
-
-       content = get_file_content (tracker_extract_info_get_file (info),
-                                   tracker_config_get_max_bytes (config),
-                                   &error);
-
-       if (error != NULL) {
-               /* An error occurred, perhaps the file was deleted. */
-               g_message ("Error extracting content: %s", error->message);
-               g_error_free (error);
-               return FALSE;
-       }
-
-       metadata = tracker_resource_new (NULL);
-       tracker_resource_add_uri (metadata, "rdf:type", "nfo:PlainTextDocument");
-
-       if (content) {
-               tracker_resource_set_string (metadata, "nie:plainTextContent", content);
-               g_free (content);
-       } else {
-               tracker_resource_set_string (metadata, "nie:plainTextContent", "");
+       text_allowlist_patterns = tracker_config_get_text_allowlist_patterns (config);
+       file = tracker_extract_info_get_file (info);
+
+       if (allow_file (text_allowlist_patterns, file)) {
+               content = get_file_content (tracker_extract_info_get_file (info),
+                                           tracker_config_get_max_bytes (config),
+                                           &error);
+
+               if (error != NULL) {
+                       /* An error occurred, perhaps the file was deleted. */
+                       g_message ("Error extracting content: %s", error->message);
+                       g_error_free (error);
+                       return FALSE;
+               }
+
+               metadata = tracker_resource_new (NULL);
+               tracker_resource_add_uri (metadata, "rdf:type", "nfo:PlainTextDocument");
+
+               if (content) {
+                       tracker_resource_set_string (metadata, "nie:plainTextContent", content);
+                       g_free (content);
+               } else {
+                       tracker_resource_set_string (metadata, "nie:plainTextContent", "");
+               }
+
+               tracker_extract_info_set_resource (info, metadata);
+               g_object_unref (metadata);
        }
 
-       tracker_extract_info_set_resource (info, metadata);
-       g_object_unref (metadata);
-
        return TRUE;
 }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]