[epiphany/wip/google-safe-browsing: 5/37] gsb-utils: Add function to canonicalize URL



commit 6f7e6d7764ad1f98a8d44e21f4d91a79aab51626
Author: Gabriel Ivascu <gabrielivascu gnome org>
Date:   Fri Sep 15 18:46:56 2017 +0300

    gsb-utils: Add function to canonicalize URL
    
    https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization

 lib/ephy-string.c                  |   76 ++++++++++++---
 lib/ephy-string.h                  |    5 +
 lib/safe-browsing/ephy-gsb-utils.c |  183 ++++++++++++++++++++++++++++++++++++
 lib/safe-browsing/ephy-gsb-utils.h |    2 +
 4 files changed, 250 insertions(+), 16 deletions(-)
---
diff --git a/lib/ephy-string.c b/lib/ephy-string.c
index 497c7f8..a1ea6d4 100644
--- a/lib/ephy-string.c
+++ b/lib/ephy-string.c
@@ -248,27 +248,71 @@ ephy_string_find_and_replace (const char *haystack,
                               const char *to_find,
                               const char *to_repl)
 {
-  GString *string;
-  const char *needle;
+  GString *str;
+  const char *tmp;
   gsize to_find_len;
-  gsize to_repl_len;
-  gsize pos = 0;
-  gsize i = 0;
+  gsize pos;
 
-  string = g_string_new (haystack);
+  g_assert (haystack);
+  g_assert (to_find);
+  g_assert (to_repl);
+
+  str = g_string_new (haystack);
   to_find_len = strlen (to_find);
-  to_repl_len = strlen (to_repl);
-
-  while ((needle = strstr (haystack, to_find)) != NULL) {
-    pos += needle - haystack;
-    g_string_erase (string, pos + i * (to_repl_len - to_find_len), to_find_len);
-    g_string_insert (string, pos + i * (to_repl_len - to_find_len), to_repl);
-    haystack = needle + to_find_len;
-    pos += to_find_len;
-    i++;
+
+  while ((tmp = strstr (str->str, to_find)) != NULL) {
+    pos = tmp - str->str;
+    g_string_erase (str, pos, to_find_len);
+    g_string_insert (str, pos, to_repl);
   }
 
-  return g_string_free (string, FALSE);
+  return g_string_free (str, FALSE);
+}
+
+/*
+ * Adapted from GLib's g_strchug()
+ *
+ * This function doesn't allocate or reallocate any memory;
+ * it modifies @string in place. Therefore, it cannot be used on
+ * statically allocated strings.
+ *
+ * The pointer to @string is returned to allow the nesting of functions.
+ */
+char *
+ephy_string_remove_leading (char *string,
+                            char  ch)
+{
+  char *start;
+
+  g_assert (string);
+
+  for (start = string; *start && *start == ch; start++)
+    ;
+
+  memmove (string, start, strlen (start) + 1);
+
+  return string;
+}
+
+/*
+ * Adapted from GLib's g_strchomp()
+ *
+ * This function doesn't allocate or reallocate any memory;
+ * it modifies @string in place. Therefore, it cannot be used on
+ * statically allocated strings.
+ *
+ * The pointer to @string is returned to allow the nesting of functions.
+ */
+char *
+ephy_string_remove_trailing (char *string,
+                             char  ch)
+{
+  g_assert (string);
+
+  for (gssize i = strlen (string) - 1; i >= 0 && string[i] == ch; i--)
+    string[i] = '\0';
+
+  return string;
 }
 
 char **
diff --git a/lib/ephy-string.h b/lib/ephy-string.h
index 44913f3..2ad0a0c 100644
--- a/lib/ephy-string.h
+++ b/lib/ephy-string.h
@@ -44,6 +44,11 @@ char     *ephy_string_find_and_replace         (const char *string,
                                                 const char *to_find,
                                                 const char *to_repl);
 
+char     *ephy_string_remove_leading           (char *string,
+                                                char  ch);
+char     *ephy_string_remove_trailing          (char *string,
+                                                char  ch);
+
 char    **ephy_strv_append                     (const char * const *strv,
                                                 const char *str);
 char    **ephy_strv_remove                     (const char * const *strv,
diff --git a/lib/safe-browsing/ephy-gsb-utils.c b/lib/safe-browsing/ephy-gsb-utils.c
index 8cef2e8..a21eb35 100644
--- a/lib/safe-browsing/ephy-gsb-utils.c
+++ b/lib/safe-browsing/ephy-gsb-utils.c
@@ -21,7 +21,15 @@
 #include "config.h"
 #include "ephy-gsb-utils.h"
 
+#include "ephy-debug.h"
+#include "ephy-string.h"
+
+#include <arpa/inet.h>
 #include <json-glib/json-glib.h>
+#include <libsoup/soup.h>
+#include <string.h>
+
+#define MAX_UNESCAPE_STEP 1024
 
 EphyGSBThreatList *
 ephy_gsb_threat_list_new (const char *threat_type,
@@ -126,3 +134,178 @@ ephy_gsb_utils_make_list_updates_request (GList *threat_lists)
 
   return retval;
 }
+
+static char *
+ephy_gsb_utils_full_unescape (const char *part)
+{
+  char *prev;
+  char *prev_prev;
+  char *retval;
+  int attempts = 0;
+
+  g_assert (part);
+
+  prev = g_strdup (part);
+  retval = soup_uri_decode (part);
+
+  /* Iteratively unescape the string until it cannot be unescaped anymore.
+   * This is useful for strings that have been escaped multiple times.
+   */
+  while (g_strcmp0 (prev, retval) != 0 && attempts++ < MAX_UNESCAPE_STEP) {
+    prev_prev = prev;
+    prev = retval;
+    retval = soup_uri_decode (retval);
+    g_free (prev_prev);
+  }
+
+  g_free (prev);
+
+  return retval;
+}
+
+static char *
+ephy_gsb_utils_escape (const char *part)
+{
+  const guchar *s = (const guchar *)part;
+  GString *str;
+
+  g_assert (part);
+
+  str = g_string_new (NULL);
+
+  /* Use this instead of soup_uri_encode() because that escapes other
+   * characters that we don't want to be escaped.
+   */
+  while (*s) {
+    if (*s < 0x20 || *s >= 0x7f || *s == ' ' || *s == '#' || *s == '%')
+      g_string_append_printf (str, "%%%02X", *s++);
+    else
+      g_string_append_c (str, *s++);
+  }
+
+  return g_string_free (str, FALSE);
+}
+
+static char *
+ephy_gsb_utils_normalize_escape (const char *part)
+{
+  char *tmp;
+  char *retval;
+
+  g_assert (part);
+
+  /* Perform a full unescape and then escape the string exactly once. */
+  tmp = ephy_gsb_utils_full_unescape (part);
+  retval = ephy_gsb_utils_escape (tmp);
+
+  g_free (tmp);
+
+  return retval;
+}
+
+static char *
+ephy_gsb_utils_canonicalize_host (const char *host)
+{
+  struct in_addr addr;
+  char *tmp;
+  char *trimmed;
+  char *retval;
+
+  g_assert (host);
+
+  trimmed = g_strdup (host);
+  ephy_string_remove_leading (trimmed, '.');
+  ephy_string_remove_trailing (trimmed, '.');
+
+  /* This actually replaces groups of consecutive dots with a single dot. */
+  tmp = ephy_string_find_and_replace (trimmed, "..", ".");
+
+  /* If host is an IP address, normalize it to 4 dot-separated decimal values.
+   * If host is not an IP address, then it's a string and needs to be lowercased.
+   *
+   * inet_aton() handles octal, hex and fewer than 4 components addresses.
+   * See https://linux.die.net/man/3/inet_network
+   */
+  if (inet_aton (tmp, &addr) != 0) {
+    retval = g_strdup (inet_ntoa (addr));
+  } else {
+    retval = g_ascii_strdown (tmp, -1);
+  }
+
+  g_free (trimmed);
+  g_free (tmp);
+
+  return retval;
+}
+
+/*
+ * https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
+ */
+char *
+ephy_gsb_utils_canonicalize (const char *url)
+{
+  SoupURI *uri;
+  char *tmp;
+  char *host;
+  char *path;
+  char *host_canonical;
+  char *path_canonical;
+  char *retval;
+  const char *query;
+
+  g_assert (url);
+
+  /* Handle URLs with no scheme. */
+  if (g_str_has_prefix (url, "//"))
+    tmp = g_strdup_printf ("http:%s", url);
+  else if (g_str_has_prefix (url, "://"))
+    tmp = g_strdup_printf ("http%s", url);
+  else if (!strstr (url, "://"))
+    tmp = g_strdup_printf ("http://%s", url);
+  else
+    tmp = g_strdup (url);
+
+  /* soup_uri_new() prepares the URL for us:
+   * 1. Strips trailing and leading whitespaces.
+   * 2. Includes the path component if missing.
+   * 3. Removes tab (0x09), CR (0x0d), LF (0x0a) characters.
+   */
+  uri = soup_uri_new (tmp);
+  g_free (tmp);
+  if (!uri) {
+    LOG ("Cannot make SoupURI from URL %s", url);
+    return NULL;
+  }
+
+  /* Remove fragment. */
+  soup_uri_set_fragment (uri, NULL);
+
+  /* Canonicalize host. */
+  host = ephy_gsb_utils_normalize_escape (soup_uri_get_host (uri));
+  host_canonical = ephy_gsb_utils_canonicalize_host (host);
+
+  /* Canonicalize path. "/../" and "/./" have already been resolved by soup_uri_new(). */
+  path = ephy_gsb_utils_normalize_escape (soup_uri_get_path (uri));
+  path_canonical = ephy_string_find_and_replace (path, "//", "/");
+
+  /* Combine all parts. */
+  query = soup_uri_get_query (uri);
+  if (query) {
+    retval = g_strdup_printf ("%s://%s%s?%s",
+                              soup_uri_get_scheme (uri),
+                              host_canonical, path_canonical,
+                              query);
+  } else {
+    retval = g_strdup_printf ("%s://%s%s",
+                              soup_uri_get_scheme (uri),
+                              host_canonical, path_canonical);
+  }
+
+  g_free (host);
+  g_free (path);
+  g_free (host_canonical);
+  g_free (path_canonical);
+  soup_uri_free (uri);
+
+  return retval;
+}
diff --git a/lib/safe-browsing/ephy-gsb-utils.h b/lib/safe-browsing/ephy-gsb-utils.h
index e7906fb..9a72e90 100644
--- a/lib/safe-browsing/ephy-gsb-utils.h
+++ b/lib/safe-browsing/ephy-gsb-utils.h
@@ -41,4 +41,6 @@ void               ephy_gsb_threat_list_free  (EphyGSBThreatList *list);
 
 char              *ephy_gsb_utils_make_list_updates_request (GList *threat_lists);
 
+char              *ephy_gsb_utils_canonicalize              (const char *url);
+
 G_END_DECLS


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]