[epiphany/wip/google-safe-browsing: 7/22] gsb-utils: Add function to compute URL hashes



commit 4cee4f8e0b08cf1fa7b176549122b34dd0b99548
Author: Gabriel Ivascu <gabrielivascu gnome org>
Date:   Fri Sep 15 23:58:17 2017 +0300

    gsb-utils: Add function to compute URL hashes

 lib/safe-browsing/ephy-gsb-utils.c |  146 +++++++++++++++++++++++++++++++++++-
 lib/safe-browsing/ephy-gsb-utils.h |    6 +-
 tests/ephy-gsb-utils-test.c        |    2 +-
 3 files changed, 151 insertions(+), 3 deletions(-)
---
diff --git a/lib/safe-browsing/ephy-gsb-utils.c b/lib/safe-browsing/ephy-gsb-utils.c
index 360dde5..8627cab 100644
--- a/lib/safe-browsing/ephy-gsb-utils.c
+++ b/lib/safe-browsing/ephy-gsb-utils.c
@@ -28,6 +28,8 @@
 #include <libsoup/soup.h>
 #include <string.h>
 
+#define MAX_HOST_SUFFIXES 5
+#define MAX_PATH_PREFIXES 6
 #define MAX_UNESCAPE_STEP 1024
 
 EphyGSBThreatList *
@@ -232,7 +234,10 @@ ephy_gsb_utils_canonicalize_host (const char *host)
  * https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
  */
 char *
-ephy_gsb_utils_canonicalize (const char *url)
+ephy_gsb_utils_canonicalize (const char  *url,
+                             char       **host_out,
+                             char       **path_out,
+                             char       **query_out)
 {
   SoupURI *uri;
   char *tmp;
@@ -291,6 +296,13 @@ ephy_gsb_utils_canonicalize (const char *url)
                               host_canonical, path_canonical);
   }
 
+  if (host_out)
+    *host_out = g_strdup (host_canonical);
+  if (path_out)
+    *path_out = g_strdup (path_canonical);
+  if (query_out)
+    *query_out = g_strdup (query);
+
   g_free (host);
   g_free (path);
   g_free (host_canonical);
@@ -299,3 +311,135 @@ ephy_gsb_utils_canonicalize (const char *url)
 
   return retval;
 }
+
+/* Returns @host followed by up to MAX_HOST_SUFFIXES - 1 suffixes made of at least
+ * two trailing components, as newly-allocated strings in a new list. See
+ * https://developers.google.com/safe-browsing/v4/urls-hashing#suffixprefix-expressions
+ */
+static GList *
+ephy_gsb_utils_compute_host_suffixes (const char *host)
+{
+  struct in_addr addr;
+  char **tokens;
+  guint start;
+  guint num_tokens;
+  GList *retval = NULL;
+
+  g_assert (host);
+
+  retval = g_list_prepend (retval, g_strdup (host));
+
+  /* IP addresses have no suffixes; "" splits into zero tokens, wrapping num_tokens - 1. */
+  if (*host == '\0' || inet_aton (host, &addr) != 0)
+    return retval;
+
+  tokens = g_strsplit (host, ".", -1);
+  num_tokens = g_strv_length (tokens);
+  start = num_tokens > MAX_HOST_SUFFIXES ? num_tokens - MAX_HOST_SUFFIXES : 1;
+  for (guint i = start; i < num_tokens - 1 && i < start + MAX_HOST_SUFFIXES - 1; i++)
+    retval = g_list_prepend (retval, g_strjoinv (".", tokens + i));
+  g_strfreev (tokens);
+
+  return g_list_reverse (retval);
+}
+
+/* Returns new strings: "path?query" (if @query is non-NULL), @path, then up to
+ * MAX_PATH_PREFIXES - 2 directory prefixes ("/", "/a/", ...) short of @path itself.
+ * "/" and "" (the latter would index before @path) get no prefixes, @path first. See
+ * https://developers.google.com/safe-browsing/v4/urls-hashing#suffixprefix-expressions
+ */
+static GList *
+ephy_gsb_utils_compute_path_prefixes (const char *path,
+                                      const char *query)
+{
+  char *no_trailing;
+  char **tokens;
+  guint num_tokens;
+  guint no_trailing_len;
+  gboolean has_trailing;
+  GList *retval = NULL;
+
+  g_assert (path);
+
+  if (query)
+    retval = g_list_prepend (retval, g_strjoin ("?", path, query, NULL));
+  retval = g_list_prepend (retval, g_strdup (path));
+
+  if (*path == '\0' || !g_strcmp0 (path, "/"))
+    return retval;
+
+  has_trailing = path[strlen (path) - 1] == '/';
+  no_trailing = ephy_string_remove_trailing (g_strdup (path), '/');
+  no_trailing_len = strlen (no_trailing);
+  tokens = g_strsplit (no_trailing, "/", -1);
+  num_tokens = g_strv_length (tokens);
+
+  /* Each prefix extends the previous one, which is at the head of retval. */
+  for (guint i = 0; i < num_tokens && i < MAX_PATH_PREFIXES - 2; i++) {
+    char *value = g_strconcat (i > 0 ? retval->data : "", tokens[i], "/", NULL);
+
+    if ((has_trailing && !g_strcmp0 (value, path)) ||
+        (!has_trailing && !strncmp (value, no_trailing, no_trailing_len))) {
+      g_free (value);
+      break;
+    }
+
+    retval = g_list_prepend (retval, value);
+  }
+
+  g_free (no_trailing);
+  g_strfreev (tokens);
+
+  return g_list_reverse (retval);
+}
+
+/* Returns a list of g_malloc()ed SHA256 digests, one per host-suffix/path-prefix
+ * expression of the canonicalized @url, or NULL if canonicalization fails.
+ */
+GList *
+ephy_gsb_utils_compute_hashes (const char *url)
+{
+  GChecksum *sha256;
+  GList *hashes = NULL;
+  GList *suffixes;
+  GList *prefixes;
+  char *canonical;
+  char *host = NULL;
+  char *path = NULL;
+  char *query = NULL;
+  gsize digest_len = g_checksum_type_get_length (G_CHECKSUM_SHA256);
+
+  g_assert (url);
+
+  canonical = ephy_gsb_utils_canonicalize (url, &host, &path, &query);
+  if (canonical == NULL)
+    return NULL;
+
+  suffixes = ephy_gsb_utils_compute_host_suffixes (host);
+  prefixes = ephy_gsb_utils_compute_path_prefixes (path, query);
+  sha256 = g_checksum_new (G_CHECKSUM_SHA256);
+
+  /* Hash each suffix+prefix pair: at most MAX_HOST_SUFFIXES * MAX_PATH_PREFIXES digests. */
+  for (GList *h = suffixes; h != NULL && h->data != NULL; h = h->next) {
+    for (GList *p = prefixes; p != NULL && p->data != NULL; p = p->next) {
+      char *expression = g_strconcat (h->data, p->data, NULL);
+      guint8 *digest = g_malloc (digest_len);
+
+      g_checksum_reset (sha256);
+      g_checksum_update (sha256, (const guint8 *)expression, strlen (expression));
+      g_checksum_get_digest (sha256, digest, &digest_len);
+      g_free (expression);
+      hashes = g_list_prepend (hashes, digest);
+    }
+  }
+
+  g_list_free_full (prefixes, g_free);
+  g_list_free_full (suffixes, g_free);
+  g_checksum_free (sha256);
+  g_free (canonical);
+  g_free (query);
+  g_free (path);
+  g_free (host);
+
+  return g_list_reverse (hashes);
+}
diff --git a/lib/safe-browsing/ephy-gsb-utils.h b/lib/safe-browsing/ephy-gsb-utils.h
index 49bfa4a..7f66af9 100644
--- a/lib/safe-browsing/ephy-gsb-utils.h
+++ b/lib/safe-browsing/ephy-gsb-utils.h
@@ -42,6 +42,10 @@ void               ephy_gsb_threat_list_free  (EphyGSBThreatList *list);
 
 JsonObject        *ephy_gsb_utils_make_list_updates_request (GList *threat_lists);
 
-char              *ephy_gsb_utils_canonicalize              (const char *url);
+char              *ephy_gsb_utils_canonicalize              (const char  *url,
+                                                             char       **host_out,
+                                                             char       **path_out,
+                                                             char       **query_out);
+GList             *ephy_gsb_utils_compute_hashes            (const char *url);
 
 G_END_DECLS
diff --git a/tests/ephy-gsb-utils-test.c b/tests/ephy-gsb-utils-test.c
index b8b16e6..982f814 100644
--- a/tests/ephy-gsb-utils-test.c
+++ b/tests/ephy-gsb-utils-test.c
@@ -75,7 +75,7 @@ test_ephy_gsb_utils_canonicalize (void)
     CanonicalizeTest test = canonicalize_tests[i];
     char *url_canonical;
 
-    url_canonical = ephy_gsb_utils_canonicalize (test.url_raw);
+    url_canonical = ephy_gsb_utils_canonicalize (test.url_raw, NULL, NULL, NULL);
     g_assert_cmpstr (url_canonical, ==, test.url_canonical);
 
     g_free (url_canonical);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]