[epiphany] lib: Add helper to remove tracking from URIs



commit 7f09ee3763add068fc715da677568ffbbd2273df
Author: Bastien Nocera <hadess hadess net>
Date:   Mon Dec 16 11:11:21 2013 +0100

    lib: Add helper to remove tracking from URIs
    
    This function, based on the PureURL Firefox add-on, will strip
    tracking information from URLs, such as analytics information.
    
    See https://addons.mozilla.org/fr/firefox/addon/pure-url/
    
    https://bugzilla.gnome.org/show_bug.cgi?id=720520

 lib/Makefile.am               |    2 +
 lib/ephy-uri-helpers.c        |  258 +++++++++++++++++++++++++++++++++++++++++
 lib/ephy-uri-helpers.h        |   38 ++++++
 tests/Makefile.am             |    4 +
 tests/ephy-uri-helpers-test.c |   82 +++++++++++++
 5 files changed, 384 insertions(+), 0 deletions(-)
---
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 1c555be..3770bf2 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -24,6 +24,7 @@ NOINST_H_FILES = \
        ephy-string.h                           \
        ephy-snapshot-service.h                 \
        ephy-time-helpers.h                     \
+       ephy-uri-helpers.h                      \
        ephy-web-app-utils.h                    \
        ephy-web-dom-utils.h                    \
        ephy-zoom.h
@@ -67,6 +68,7 @@ libephymisc_la_SOURCES = \
        ephy-sqlite-statement.c                 \
        ephy-string.c                           \
        ephy-time-helpers.c                     \
+       ephy-uri-helpers.c                      \
        ephy-web-app-utils.c                    \
        ephy-web-dom-utils.c                    \
        ephy-zoom.c                             \
diff --git a/lib/ephy-uri-helpers.c b/lib/ephy-uri-helpers.c
new file mode 100644
index 0000000..acd46aa
--- /dev/null
+++ b/lib/ephy-uri-helpers.c
@@ -0,0 +1,258 @@
+/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/*
+ *  Copyright © 2013 Bastien Nocera <hadess hadess net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "config.h"
+#include "ephy-uri-helpers.h"
+
+#include <glib.h>
+#include <libsoup/soup.h>
+#include <string.h>
+
+/**
+ * SECTION:ephy-uri-helpers
+ * @short_description: miscellaneous URI related utility functions
+ *
+ * URI related functions, including functions to clean up URI.
+ */
+
+/* One decoded "name=value" pair taken from a URI query string. */
+typedef struct {
+  char *name;   /* decoded field name; start of the allocation freed by query_item_free() */
+  char *value;  /* decoded value; points into the same allocation as name */
+} QueryItem;
+
+/* Release a QueryItem built by query_decode(): free the string backing
+ * both fields, then the item itself. */
+static void
+query_item_free (QueryItem *item)
+{
+  g_free (item->name);
+  /* value is actually part of the name allocation,
+   * see query_decode() */
+  g_slice_free (QueryItem, item);
+}
+
+/* Numeric value of the hex digit c. Masking with 0x4F folds 'a'..'f'
+ * onto 'A'..'F'; c must already be known to be a hex digit. */
+#define XDIGIT(c) ((c) <= '9' ? (c) - '0' : ((c) & 0x4F) - 'A' + 10)
+/* Byte encoded by the two hex digits that follow the '%' s points at. */
+#define HEXCHAR(s) ((XDIGIT (s[1]) << 4) + XDIGIT (s[2]))
+
+/* From libsoup, in libsoup/soup-form.c
+ *
+ * Decode part in place: each "%XX" escape becomes the byte it encodes
+ * and each '+' becomes a space. Returns FALSE as soon as a '%' is not
+ * followed by two hex digits (part may then be partially rewritten);
+ * otherwise the terminating NUL is copied too and TRUE is returned. */
+static gboolean
+form_decode (char *part)
+{
+  unsigned char *s, *d;
+
+  /* s reads, d writes; d never gets ahead of s because decoding only
+   * ever shrinks the string */
+  s = d = (unsigned char *)part;
+  do {
+    if (*s == '%') {
+      if (!g_ascii_isxdigit (s[1]) ||
+          !g_ascii_isxdigit (s[2]))
+        return FALSE;
+      *d++ = HEXCHAR (s);
+      /* step onto the second hex digit; the s++ in the loop condition
+       * then moves past it */
+      s += 2;
+    } else if (*s == '+')
+      *d++ = ' ';
+    else
+      *d++ = *s;
+  } while (*s++);
+
+  return TRUE;
+}
+
+/* Append in to str in form-encoded shape: a space becomes '+', ASCII
+ * letters and digits as well as '-', '_' and '.' are copied verbatim
+ * (the same set libsoup's form encoder leaves alone), and every other
+ * byte is written as an uppercase "%XX" escape. */
+static void
+append_form_encoded (GString *str, const char *in)
+{
+  const unsigned char *s;
+
+  for (s = (const unsigned char *)in; *s; s++) {
+    if (*s == ' ')
+      g_string_append_c (str, '+');
+    else if (g_ascii_isalnum (*s) || *s == '-' || *s == '_' || *s == '.')
+      /* escaping these would needlessly alter otherwise untouched fields */
+      g_string_append_c (str, *s);
+    else
+      g_string_append_printf (str, "%%%02X", (int)*s);
+  }
+}
+
+/* Append "name=value" to str with both sides form-encoded, preceded by
+ * '&' when str already holds earlier pairs. */
+static void
+encode_pair (GString *str, const char *name, const char *value)
+{
+  g_return_if_fail (name != NULL);
+  g_return_if_fail (value != NULL);
+
+  if (str->len != 0) {
+    g_string_append_c (str, '&');
+  }
+
+  append_form_encoded (str, name);
+  g_string_append_c (str, '=');
+  append_form_encoded (str, value);
+}
+
+/* Adapted from soup_form_decode in libsoup
+ *
+ * Split query on '&' and decode each "name=value" pair into a
+ * QueryItem, keeping the original order. Pairs without '=' and pairs
+ * whose name or value fails form_decode() are dropped. Returns a newly
+ * allocated list, or NULL when no pair survives; release it with
+ * g_list_free_full (list, (GDestroyNotify) query_item_free). */
+static GList *
+query_decode (const char *query)
+{
+  GList *items;
+  char **pairs, *eq, *name, *value;
+  int i;
+
+  items = NULL;
+  pairs = g_strsplit (query, "&", -1);
+  for (i = 0; pairs[i]; i++) {
+    QueryItem *item;
+
+    name = pairs[i];
+    eq = strchr (name, '=');
+    if (eq) {
+      /* split in place: name and value share the pairs[i] allocation */
+      *eq = '\0';
+      value = eq + 1;
+    } else
+      value = NULL;
+    if (!value || !form_decode (name) || !form_decode (value)) {
+      /* only the vector is freed after the loop, so a rejected string
+       * has to be released here */
+      g_free (name);
+      continue;
+    }
+
+    item = g_slice_new0 (QueryItem);
+    item->name = name;
+    item->value = value;
+    items = g_list_prepend (items, item);
+  }
+  /* not g_strfreev(): every string was either freed above or is now
+   * owned by an item */
+  g_free (pairs);
+
+  return g_list_reverse (items);
+}
+
+/* Serialize a list of QueryItem back into "a=b&c=d" form.
+ * Returns a newly allocated string, or NULL for an empty list. */
+static char *
+query_encode (GList *items)
+{
+  GString *buffer;
+  GList *node;
+
+  if (items == NULL)
+    return NULL;
+
+  buffer = g_string_new (NULL);
+  node = items;
+  while (node != NULL) {
+    QueryItem *pair = node->data;
+
+    encode_pair (buffer, pair->name, pair->value);
+    node = node->next;
+  }
+
+  /* FALSE: hand the character data over to the caller */
+  return g_string_free (buffer, FALSE);
+}
+
+/* Tell whether the query field name is known tracking noise.
+ * Table entries with a NULL host apply to every site; the others only
+ * apply when host ends with the listed domain. A NULL name is never
+ * garbage, and a NULL host can only match the site-independent entries.
+ *
+ * NOTE(review): the suffix test also accepts hosts such as
+ * "notyoutube.com"; confirm whether a domain-boundary check is wanted. */
+static gboolean
+is_garbage (const char *name,
+            const char *host)
+{
+  /* static: the table is constant, no need to rebuild it on each call */
+  static const struct {
+    const char *field;
+    const char *host;
+  } fields[] = {
+    /* analytics.google.com */
+    { "utm_source",        NULL },
+    { "utm_medium",        NULL },
+    { "utm_term",          NULL },
+    { "utm_content",       NULL },
+    { "utm_campaign",      NULL },
+    /* metrika.yandex.ru */
+    { "yclid",             NULL },
+    /* youtube.com */
+    { "feature",           "youtube.com" },
+    /* facebook.com */
+    { "fb_action_ids",     NULL },
+    { "fb_action_types",   NULL },
+    { "fb_ref",            NULL },
+    { "fb_source",         NULL },
+    { "action_object_map", NULL },
+    { "action_type_map",   NULL },
+    { "action_ref_map",    NULL },
+    { "ref",               "facebook.com" },
+    { "fref",              "facebook.com" },
+    { "hc_location",       "facebook.com" },
+    /* imdb.com */
+    { "ref_",              "imdb.com" }
+  };
+  guint i;
+
+  if (name == NULL)
+    return FALSE;
+
+  for (i = 0; i < G_N_ELEMENTS (fields); i++) {
+    /* Host-specific entries cannot match a URI without a host; testing
+     * for NULL first also keeps it away from g_str_has_suffix(). */
+    if (fields[i].host != NULL &&
+        (host == NULL || !g_str_has_suffix (host, fields[i].host)))
+      continue;
+    if (g_str_equal (fields[i].field, name))
+      return TRUE;
+  }
+
+  return FALSE;
+}
+
+/**
+ * ephy_remove_tracking_from_uri:
+ * @uri_string: a uri
+ *
+ * Sanitize @uri_string to make sure it does not contain analytics tracking
+ * information. Inspired by the Firefox PureURL add-on:
+ * https://addons.mozilla.org/fr/firefox/addon/pure-url/
+ *
+ * The query is only rewritten when at least one tracking field is
+ * found, so a URI without tracking fields is never re-encoded.
+ *
+ * Returns: the sanitized uri as a newly allocated string (free with
+ * g_free()), or %NULL on error or when the URI did not change.
+ */
+char *
+ephy_remove_tracking_from_uri (const char *uri_string)
+{
+  SoupURI *uri;
+  GList *items, *new_items, *l;
+  const char *query, *host;
+  gboolean has_garbage = FALSE;
+  char *new_query;
+  char *ret = NULL;
+
+  uri = soup_uri_new (uri_string);
+  if (!uri)
+    return ret;
+
+  host = soup_uri_get_host (uri);
+  query = soup_uri_get_query (uri);
+  if (!query)
+    goto bail;
+
+  items = query_decode (query);
+  if (!items)
+    goto bail;
+
+  /* new_items borrows the QueryItems owned by items */
+  new_items = NULL;
+  for (l = items; l != NULL; l = l->next) {
+    QueryItem *item = l->data;
+
+    if (is_garbage (item->name, host))
+      has_garbage = TRUE;
+    else
+      new_items = g_list_prepend (new_items, item);
+  }
+
+  /* Leave ret at NULL when nothing was stripped: the URI did not change
+   * and must not be altered by a decode/encode round trip. */
+  if (has_garbage) {
+    new_items = g_list_reverse (new_items);
+
+    /* NULL when every field was tracking noise */
+    new_query = query_encode (new_items);
+    soup_uri_set_query (uri, new_query);
+    g_free (new_query);
+
+    ret = soup_uri_to_string (uri, FALSE);
+  }
+
+  g_list_free (new_items);
+  g_list_free_full (items, (GDestroyNotify) query_item_free);
+
+bail:
+  soup_uri_free (uri);
+  return ret;
+}
+/* vim: set sw=2 ts=2 sts=2 et: */
diff --git a/lib/ephy-uri-helpers.h b/lib/ephy-uri-helpers.h
new file mode 100644
index 0000000..0d7f4b6
--- /dev/null
+++ b/lib/ephy-uri-helpers.h
@@ -0,0 +1,38 @@
+/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/*
+ *  Copyright © 2013 Bastien Nocera <hadess hadess net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#if !defined (__EPHY_EPIPHANY_H_INSIDE__) && !defined (EPIPHANY_COMPILATION)
+#error "Only <epiphany/epiphany.h> can be included directly."
+#endif
+
+#ifndef EPHY_URI_HELPERS_H
+#define EPHY_URI_HELPERS_H
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+char *ephy_remove_tracking_from_uri (const char *uri);
+
+G_END_DECLS
+
+#endif /* EPHY_URI_HELPERS_H */
+
+/* vim: set sw=2 ts=2 sts=2 et: */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index aca37d7..2ce0538 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -16,6 +16,7 @@ noinst_PROGRAMS = \
        test-ephy-snapshot-service \
        test-ephy-sqlite \
        test-ephy-string \
+       test-ephy-uri-helpers \
        test-ephy-web-app-utils \
        test-ephy-web-view \
        $(NULL)
@@ -174,6 +175,9 @@ test_ephy_sqlite_SOURCES = \
 test_ephy_string_SOURCES = \
        ephy-string-test.c
 
+test_ephy_uri_helpers_SOURCES = \
+       ephy-uri-helpers-test.c
+
 test_ephy_web_app_utils_SOURCES = \
        ephy-web-app-utils-test.c
 
diff --git a/tests/ephy-uri-helpers-test.c b/tests/ephy-uri-helpers-test.c
new file mode 100644
index 0000000..1d44b64
--- /dev/null
+++ b/tests/ephy-uri-helpers-test.c
@@ -0,0 +1,82 @@
+/* vim: set sw=2 ts=2 sts=2 et: */
+/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/*
+ * ephy-uri-helpers-test.c
+ * This file is part of Epiphany
+ *
+ * Copyright © 2013 Bastien Nocera <hadess hadess net>
+ *
+ * Epiphany is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Epiphany is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Epiphany; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA  02110-1301  USA
+ */
+
+#include "config.h"
+#include "ephy-debug.h"
+#include "ephy-uri-helpers.h"
+#include "ephy-settings.h"
+
+#include <glib.h>
+#include <gtk/gtk.h>
+
+/* Run ephy_remove_tracking_from_uri() over a table of input/expected
+ * URI pairs. A NULL result is compared as if the input had been
+ * returned unchanged. */
+static void
+test_ephy_uri_helpers_remove_tracking (void)
+{
+  struct {
+    const char *input;
+    const char *output;
+  } const items[] = {
+    { "http://www.test.com/", "http://www.test.com/" },
+    { "http://www.test.com/?key=foo", "http://www.test.com/?key=foo" },
+    /* From the description in https://addons.mozilla.org/fr/firefox/addon/pure-url/ */
+    /* NOTE(review): the "%%" sequences are kept exactly as found; confirm
+     * they are not meant to be single '%' escapes. */
+    { "http://bigpicture.ru/?p=431513&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%%3A+bigpictures+%%28%%D0%%9D%%D0%%9E%%D0%%92%%D0%%9E%%D0%%A1%%D0%%A2%%D0%%98+%%D0%%92+%%D0%%A4%%D0%%9E%%D0%%A2%%D0%%9E%%D0%%93%%D0%%A0%%D0%%90%%D0%%A4%%D0%%98%%D0%%AF%%D0%%A5%%29",
+      "http://bigpicture.ru/?p=431513" },
+    { "http://www.test.com/?utm_source=feedburner", "http://www.test.com/" },
+    /* "feature" is only stripped on youtube.com hosts */
+    { "http://www.test.com/?feature=foo", "http://www.test.com/?feature=foo" },
+    { "http://foo.youtube.com/?feature=foo", "http://foo.youtube.com/" },
+  };
+  guint i;
+
+  for (i = 0; i < G_N_ELEMENTS (items); i++) {
+    char *result;
+
+    g_test_message ("TRACKING: uri: %s; expected: %s;",
+                    items[i].input, items[i].output);
+    result = ephy_remove_tracking_from_uri (items[i].input);
+    if (result == NULL)
+      result = g_strdup (items[i].input);
+    g_assert_cmpstr (items[i].output, ==, result);
+    g_free (result);
+  }
+}
+
+/* Test entry point: registers the tracking-removal test and runs it. */
+int
+main (int argc, char *argv[])
+{
+  /* Select the "memory" GSettings backend for this process
+   * (TRUE: replace any value already present in the environment). */
+  g_setenv ("GSETTINGS_BACKEND", "memory", TRUE);
+
+  gtk_test_init (&argc, &argv);
+  ephy_debug_init ();
+
+  g_test_add_func ("/lib/ephy-uri-helpers/remove_tracking",
+                   test_ephy_uri_helpers_remove_tracking);
+
+  return g_test_run ();
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]