[tracker-miners/sam/plain-text: 36/36] tracker-extract: Add filename allowlist for text files
- From: Sam Thursfield <sthursfield src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker-miners/sam/plain-text: 36/36] tracker-extract: Add filename allowlist for text files
- Date: Sun, 21 Jun 2020 12:26:20 +0000 (UTC)
commit 0ae0e7da5b1b05b7d277737069072859cb61585a
Author: Sam Thursfield <sam afuera me uk>
Date: Thu Jun 18 02:05:18 2020 +0200
tracker-extract: Add filename allowlist for text files
The MIME type detection in shared-mime-info gives a rich range of text/*
MIME types, which allows us to ignore most forms of source code.
However, it's not exhaustive and if we index all text/plain content we
will always risk including some unwanted content.
We now have an allowlist specific to the text extractor, which provides
an extra line of defense. Attempting to extract files with names that
don't match the allowlist will produce the usual 'No metadata or
extractor modules found to handle this file' result.
data/org.freedesktop.Tracker.Extract.gschema.xml | 7 +++
src/tracker-extract/tracker-config.c | 68 ++++++++++++++++++++++-
src/tracker-extract/tracker-config.h | 14 +++++
src/tracker-extract/tracker-extract-text.c | 70 ++++++++++++++++--------
4 files changed, 133 insertions(+), 26 deletions(-)
---
diff --git a/data/org.freedesktop.Tracker.Extract.gschema.xml b/data/org.freedesktop.Tracker.Extract.gschema.xml
index 3249e2135..e1a418686 100644
--- a/data/org.freedesktop.Tracker.Extract.gschema.xml
+++ b/data/org.freedesktop.Tracker.Extract.gschema.xml
@@ -26,6 +26,13 @@ Boston, MA 02110-1301, USA.
<default>1048576</default>
</key>
+ <key name="text-allowlist" type="as">
+ <summary>Text file allowlist</summary>
+ <description>Filename patterns for plain text documents that should be indexed</description>
+ <default>[ '*.txt', '*.md', '*.mdwn' ]</default>
+ </key>
+
+
<key name="wait-for-miner-fs" type="b">
<summary>Wait for FS miner to be done before extracting</summary>
<description>When true, tracker-extract will wait for tracker-miner-fs to be done crawling before
extracting meta-data. This option is useful on constrained environment where it is important to list files as
fast as possible and can wait to get meta-data later.</description>
diff --git a/src/tracker-extract/tracker-config.c b/src/tracker-extract/tracker-config.c
index dd7e3c0f2..426bba67c 100644
--- a/src/tracker-extract/tracker-config.c
+++ b/src/tracker-extract/tracker-config.c
@@ -44,6 +44,7 @@ static void config_constructed (GObject *object);
enum {
PROP_0,
PROP_MAX_BYTES,
+ PROP_TEXT_ALLOWLIST,
PROP_WAIT_FOR_MINER_FS,
};
@@ -69,6 +70,13 @@ tracker_config_class_init (TrackerConfigClass *klass)
1024 * 1024,
G_PARAM_READWRITE));
+ g_object_class_install_property (object_class,
+ PROP_TEXT_ALLOWLIST,
+ g_param_spec_boxed ("text-allowlist",
+ "Text file allowlist",
+ "Filename patterns for plain text documents that
should be indexed",
+ G_TYPE_STRV,
+ G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
g_object_class_install_property (object_class,
PROP_WAIT_FOR_MINER_FS,
g_param_spec_boolean ("wait-for-miner-fs",
@@ -92,6 +100,7 @@ config_set_property (GObject *object,
switch (param_id) {
/* We don't care about these... we don't save anyway. */
case PROP_MAX_BYTES:
+ case PROP_TEXT_ALLOWLIST:
case PROP_WAIT_FOR_MINER_FS:
break;
@@ -115,6 +124,10 @@ config_get_property (GObject *object,
tracker_config_get_max_bytes (config));
break;
+ case PROP_TEXT_ALLOWLIST:
+ g_value_take_boxed (value, tracker_gslist_to_string_list (config->text_allowlist));
+ break;
+
case PROP_WAIT_FOR_MINER_FS:
g_value_set_boolean (value,
tracker_config_get_wait_for_miner_fs (config));
@@ -126,14 +139,42 @@ config_get_property (GObject *object,
};
}
+/*
+ * Rebuilds config->text_allowlist_patterns from config->text_allowlist.
+ *
+ * Any previously compiled patterns are released first. One GPatternSpec is
+ * then compiled for every non-NULL allowlist entry, in allowlist order.
+ */
+static void
+config_set_text_allowlist_conveniences (TrackerConfig *config)
+{
+	GSList *l;
+	GSList *patterns = NULL;
+
+	/* Free the old GPatternSpec objects and their list nodes in one call,
+	 * instead of casting a one-argument function to the two-argument
+	 * GFunc type and freeing the nodes separately.
+	 */
+	g_slist_free_full (config->text_allowlist_patterns,
+	                   (GDestroyNotify) g_pattern_spec_free);
+
+	for (l = config->text_allowlist; l; l = l->next) {
+		const gchar *str = l->data;
+
+		if (str) {
+			patterns = g_slist_prepend (patterns,
+			                            g_pattern_spec_new (str));
+		}
+	}
+
+	/* Entries were prepended, so reverse to restore allowlist order. */
+	config->text_allowlist_patterns = g_slist_reverse (patterns);
+}
+
static void
config_finalize (GObject *object)
{
- /* For now we do nothing here, we left this override in for
- * future expansion.
- */
+ TrackerConfig *config = TRACKER_CONFIG (object);
+
+ g_slist_foreach (config->text_allowlist_patterns,
+ (GFunc) g_pattern_spec_free,
+ NULL);
+ g_slist_free (config->text_allowlist);
(G_OBJECT_CLASS (tracker_config_parent_class)->finalize) (object);
+
}
static void
@@ -166,6 +207,9 @@ config_constructed (GObject *object)
* unintended open() calls.
*/
TRACKER_CONFIG (settings)->max_bytes = g_settings_get_int (settings, "max-bytes");
+ TRACKER_CONFIG (settings)->text_allowlist = tracker_string_list_to_gslist (g_settings_get_strv
(settings, "text-allowlist"), -1);
+
+ config_set_text_allowlist_conveniences (TRACKER_CONFIG (settings));
}
TrackerConfig *
@@ -218,6 +262,14 @@ tracker_config_get_max_bytes (TrackerConfig *config)
return config->max_bytes;
}
+/*
+ * Returns the text_allowlist list stored in @config, or NULL when @config
+ * is not a TrackerConfig. The stored list itself is returned, not a copy.
+ */
+GSList *
+tracker_config_get_text_allowlist (TrackerConfig *config)
+{
+	GSList *allowlist;
+
+	g_return_val_if_fail (TRACKER_IS_CONFIG (config), NULL);
+
+	allowlist = config->text_allowlist;
+
+	return allowlist;
+}
+
gboolean
tracker_config_get_wait_for_miner_fs (TrackerConfig *config)
{
@@ -225,3 +277,13 @@ tracker_config_get_wait_for_miner_fs (TrackerConfig *config)
return g_settings_get_boolean (G_SETTINGS (config), "wait-for-miner-fs");
}
+
+
+/*
+ * Convenience functions
+ */
+/*
+ * Returns the list of compiled patterns stored in @config
+ * (text_allowlist_patterns), or NULL when @config is not a TrackerConfig.
+ * The stored list itself is returned, not a copy.
+ */
+GSList *
+tracker_config_get_text_allowlist_patterns (TrackerConfig *config)
+{
+	/* Same argument check as tracker_config_get_text_allowlist(). */
+	g_return_val_if_fail (TRACKER_IS_CONFIG (config), NULL);
+
+	return config->text_allowlist_patterns;
+}
diff --git a/src/tracker-extract/tracker-config.h b/src/tracker-extract/tracker-config.h
index dd752bdb7..18dc292b0 100644
--- a/src/tracker-extract/tracker-config.h
+++ b/src/tracker-extract/tracker-config.h
@@ -37,6 +37,10 @@ typedef struct TrackerConfigClass TrackerConfigClass;
struct TrackerConfig {
GSettings parent;
gint max_bytes;
+ GSList *text_allowlist; /* Filename patterns read from the "text-allowlist" settings key */
+
+ /* Convenience data: GPatternSpec objects compiled from text_allowlist */
+ GSList *text_allowlist_patterns;
};
struct TrackerConfigClass {
@@ -47,8 +51,18 @@ GType tracker_config_get_type (void) G_GNUC_CONST;
TrackerConfig *tracker_config_new (void);
gint tracker_config_get_max_bytes (TrackerConfig *config);
+GSList * tracker_config_get_text_allowlist (TrackerConfig *config);
gboolean tracker_config_get_wait_for_miner_fs (TrackerConfig *config);
+/*
+ * Convenience functions:
+ */
+
+/* The _patterns() APIs return GPatternSpec pointers for basename
+ * pattern matching.
+ */
+GSList * tracker_config_get_text_allowlist_patterns (TrackerConfig *config);
+
G_END_DECLS
#endif /* __TRACKER_EXTRACT_CONFIG_H__ */
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 2d60551f8..b1d6e5611 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -40,6 +40,24 @@
#include "tracker-extract.h"
#include "tracker-read.h"
+/*
+ * Checks @file against the text extractor's filename allowlist.
+ *
+ * Returns TRUE when the basename of @file matches at least one GPatternSpec
+ * in @text_allowlist_patterns. Returns FALSE when nothing matches, when the
+ * list is empty, or when no basename can be obtained for @file.
+ */
+static gboolean
+allow_file (GSList *text_allowlist_patterns,
+            GFile  *file)
+{
+	GSList *l;
+	g_autofree gchar *basename = NULL;
+
+	basename = g_file_get_basename (file);
+
+	/* g_file_get_basename() may return NULL; g_pattern_match_string()
+	 * must not be called with a NULL string.
+	 */
+	if (basename == NULL) {
+		return FALSE;
+	}
+
+	for (l = text_allowlist_patterns; l; l = l->next) {
+		if (g_pattern_match_string (l->data, basename)) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
static gchar *
get_file_content (GFile *file,
gsize n_bytes,
@@ -81,34 +99,40 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
{
TrackerResource *metadata;
TrackerConfig *config;
+ GFile *file;
+ GSList *text_allowlist_patterns;
gchar *content = NULL;
GError *error = NULL;
config = tracker_main_get_config ();
-
- content = get_file_content (tracker_extract_info_get_file (info),
- tracker_config_get_max_bytes (config),
- &error);
-
- if (error != NULL) {
- /* An error occurred, perhaps the file was deleted. */
- g_message ("Error extracting content: %s", error->message);
- g_error_free (error);
- return FALSE;
- }
-
- metadata = tracker_resource_new (NULL);
- tracker_resource_add_uri (metadata, "rdf:type", "nfo:PlainTextDocument");
-
- if (content) {
- tracker_resource_set_string (metadata, "nie:plainTextContent", content);
- g_free (content);
- } else {
- tracker_resource_set_string (metadata, "nie:plainTextContent", "");
+ text_allowlist_patterns = tracker_config_get_text_allowlist_patterns (config);
+ file = tracker_extract_info_get_file (info);
+
+ if (allow_file (text_allowlist_patterns, file)) {
+ content = get_file_content (tracker_extract_info_get_file (info),
+ tracker_config_get_max_bytes (config),
+ &error);
+
+ if (error != NULL) {
+ /* An error occurred, perhaps the file was deleted. */
+ g_message ("Error extracting content: %s", error->message);
+ g_error_free (error);
+ return FALSE;
+ }
+
+ metadata = tracker_resource_new (NULL);
+ tracker_resource_add_uri (metadata, "rdf:type", "nfo:PlainTextDocument");
+
+ if (content) {
+ tracker_resource_set_string (metadata, "nie:plainTextContent", content);
+ g_free (content);
+ } else {
+ tracker_resource_set_string (metadata, "nie:plainTextContent", "");
+ }
+
+ tracker_extract_info_set_resource (info, metadata);
+ g_object_unref (metadata);
}
- tracker_extract_info_set_resource (info, metadata);
- g_object_unref (metadata);
-
return TRUE;
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]