[epiphany/wip/google-safe-browsing: 1/4] gsb-utils: Add function to canonicalize URLs
- From: Gabriel Ivașcu <gabrielivascu src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [epiphany/wip/google-safe-browsing: 1/4] gsb-utils: Add function to canonicalize URLs
- Date: Fri, 15 Sep 2017 22:24:32 +0000 (UTC)
commit 616a4e3aae6599c8807b7f6759ebffca6bbe328b
Author: Gabriel Ivascu <gabrielivascu gnome org>
Date: Fri Sep 15 18:46:56 2017 +0300
gsb-utils: Add function to canonicalize URLs
https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
lib/ephy-string.c | 76 ++++++++++++---
lib/ephy-string.h | 5 +
lib/safe-browsing/ephy-gsb-utils.c | 183 ++++++++++++++++++++++++++++++++++++
lib/safe-browsing/ephy-gsb-utils.h | 2 +
4 files changed, 250 insertions(+), 16 deletions(-)
---
diff --git a/lib/ephy-string.c b/lib/ephy-string.c
index 497c7f8..a1ea6d4 100644
--- a/lib/ephy-string.c
+++ b/lib/ephy-string.c
@@ -248,27 +248,71 @@ ephy_string_find_and_replace (const char *haystack,
const char *to_find,
const char *to_repl)
{
- GString *string;
- const char *needle;
+ GString *str;
+ const char *tmp;
gsize to_find_len;
- gsize to_repl_len;
- gsize pos = 0;
- gsize i = 0;
+ gsize pos;
- string = g_string_new (haystack);
+ g_assert (haystack);
+ g_assert (to_find);
+ g_assert (to_repl);
+
+ str = g_string_new (haystack);
to_find_len = strlen (to_find);
- to_repl_len = strlen (to_repl);
-
- while ((needle = strstr (haystack, to_find)) != NULL) {
- pos += needle - haystack;
- g_string_erase (string, pos + i * (to_repl_len - to_find_len), to_find_len);
- g_string_insert (string, pos + i * (to_repl_len - to_find_len), to_repl);
- haystack = needle + to_find_len;
- pos += to_find_len;
- i++;
+
+ while ((tmp = strstr (str->str, to_find)) != NULL) {
+ pos = tmp - str->str;
+ g_string_erase (str, pos, to_find_len);
+ g_string_insert (str, pos, to_repl);
}
- return g_string_free (string, FALSE);
+ return g_string_free (str, FALSE);
+}
+
+/*
+ * ephy_string_remove_leading:
+ * @string: a writable, NUL-terminated string; must not be NULL
+ * @ch: the character to strip from the beginning of @string
+ *
+ * Adapted from GLib's g_strchug()
+ *
+ * Removes every consecutive occurrence of @ch found at the start of
+ * @string by shifting the rest of the string (terminator included) to
+ * the front.
+ *
+ * This function doesn't allocate or reallocate any memory;
+ * it modifies @string in place. Therefore, it cannot be used on
+ * statically allocated strings.
+ *
+ * Returns: @string itself, to allow the nesting of functions.
+ */
+char *
+ephy_string_remove_leading (char *string,
+                            char  ch)
+{
+  gsize skip = 0;
+
+  g_assert (string);
+
+  /* Count the leading characters that are equal to @ch. */
+  while (string[skip] != '\0' && string[skip] == ch)
+    skip++;
+
+  /* Source and destination may overlap, hence memmove(). */
+  memmove (string, string + skip, strlen (string + skip) + 1);
+
+  return string;
+}
+
+/*
+ * ephy_string_remove_trailing:
+ * @string: a writable, NUL-terminated string; must not be NULL
+ * @ch: the character to strip from the end of @string
+ *
+ * Adapted from GLib's g_strchomp()
+ *
+ * Overwrites every consecutive occurrence of @ch found at the end of
+ * @string with a NUL terminator.
+ *
+ * This function doesn't allocate or reallocate any memory;
+ * it modifies @string in place. Therefore, it cannot be used on
+ * statically allocated strings.
+ *
+ * Returns: @string itself, to allow the nesting of functions.
+ */
+char *
+ephy_string_remove_trailing (char *string,
+                             char  ch)
+{
+  gsize len;
+
+  g_assert (string);
+
+  /* Walk backwards from the last character while it matches @ch. */
+  len = strlen (string);
+  while (len > 0 && string[len - 1] == ch)
+    string[--len] = '\0';
+
+  return string;
}
char **
diff --git a/lib/ephy-string.h b/lib/ephy-string.h
index 44913f3..2ad0a0c 100644
--- a/lib/ephy-string.h
+++ b/lib/ephy-string.h
@@ -44,6 +44,11 @@ char *ephy_string_find_and_replace (const char *string,
const char *to_find,
const char *to_repl);
+char *ephy_string_remove_leading (char *string,
+ char ch);
+char *ephy_string_remove_trailing (char *string,
+ char ch);
+
char **ephy_strv_append (const char * const *strv,
const char *str);
char **ephy_strv_remove (const char * const *strv,
diff --git a/lib/safe-browsing/ephy-gsb-utils.c b/lib/safe-browsing/ephy-gsb-utils.c
index 46cede1..9351eda 100644
--- a/lib/safe-browsing/ephy-gsb-utils.c
+++ b/lib/safe-browsing/ephy-gsb-utils.c
@@ -21,6 +21,14 @@
#include "config.h"
#include "ephy-gsb-utils.h"
+#include "ephy-string.h"
+
+#include <arpa/inet.h>
+#include <libsoup/soup.h>
+#include <string.h>
+
+#define MAX_UNESCAPE_STEP 1024
+
EphyGSBThreatList *
ephy_gsb_threat_list_new (const char *threat_type,
const char *platform_type,
@@ -106,3 +114,178 @@ ephy_gsb_utils_make_list_update_request (EphyGSBThreatList *list)
return lur;
}
+
+/*
+ * Percent-decodes @part repeatedly until decoding no longer changes the
+ * string, or until MAX_UNESCAPE_STEP additional passes have been made.
+ * This is useful for strings that have been escaped multiple times.
+ *
+ * @part must not be NULL. The returned string is newly allocated and is
+ * owned by the caller.
+ */
+static char *
+ephy_gsb_utils_full_unescape (const char *part)
+{
+  char *previous;
+  char *current;
+
+  g_assert (part);
+
+  previous = g_strdup (part);
+  current = soup_uri_decode (part);
+
+  /* Each pass decodes the result of the previous one; stop as soon as a
+   * pass leaves the string untouched.
+   */
+  for (int step = 0;
+       step < MAX_UNESCAPE_STEP && g_strcmp0 (previous, current) != 0;
+       step++) {
+    g_free (previous);
+    previous = current;
+    current = soup_uri_decode (previous);
+  }
+
+  g_free (previous);
+
+  return current;
+}
+
+/*
+ * Percent-escapes @part exactly once: every byte that is <= 0x20 (control
+ * characters and space), >= 0x7f, '#' or '%' is written as "%XX" with
+ * uppercase hex digits; all other bytes are copied unchanged.
+ *
+ * @part must not be NULL. The returned string is newly allocated and is
+ * owned by the caller.
+ */
+static char *
+ephy_gsb_utils_escape (const char *part)
+{
+  GString *escaped;
+
+  g_assert (part);
+
+  escaped = g_string_new (NULL);
+
+  /* Use this instead of soup_uri_encode() because that escapes other
+   * characters that we don't want to be escaped.
+   */
+  for (const guchar *p = (const guchar *)part; *p != '\0'; p++) {
+    guchar c = *p;
+    gboolean needs_escape = c <= 0x20 || c >= 0x7f || c == '#' || c == '%';
+
+    if (needs_escape)
+      g_string_append_printf (escaped, "%%%02X", c);
+    else
+      g_string_append_c (escaped, c);
+  }
+
+  return g_string_free (escaped, FALSE);
+}
+
+/*
+ * Brings @part to a form that is percent-escaped exactly once, no matter
+ * how many times it had been escaped before: the string is first fully
+ * unescaped and then escaped a single time.
+ *
+ * @part must not be NULL. The returned string is newly allocated and is
+ * owned by the caller.
+ */
+static char *
+ephy_gsb_utils_normalize_escape (const char *part)
+{
+  char *unescaped;
+  char *escaped_once;
+
+  g_assert (part);
+
+  unescaped = ephy_gsb_utils_full_unescape (part);
+  escaped_once = ephy_gsb_utils_escape (unescaped);
+  g_free (unescaped);
+
+  return escaped_once;
+}
+
+/*
+ * Canonicalizes the host component of a URL:
+ * 1. Removes leading and trailing dots.
+ * 2. Replaces groups of consecutive dots with a single dot.
+ * 3. If the result parses as an IPv4 address, rewrites it as 4
+ *    dot-separated decimal values; otherwise lowercases it (ASCII only).
+ *
+ * @host must not be NULL. The returned string is newly allocated and is
+ * owned by the caller.
+ */
+static char *
+ephy_gsb_utils_canonicalize_host (const char *host)
+{
+  struct in_addr addr;
+  char addr_str[INET_ADDRSTRLEN];
+  char *trimmed;
+  char *collapsed;
+  char *retval;
+
+  g_assert (host);
+
+  trimmed = g_strdup (host);
+  ephy_string_remove_leading (trimmed, '.');
+  ephy_string_remove_trailing (trimmed, '.');
+
+  /* This actually replaces groups of consecutive dots with a single dot. */
+  collapsed = ephy_string_find_and_replace (trimmed, "..", ".");
+  g_free (trimmed);
+
+  /* If host is an IP address, normalize it to 4 dot-separated decimal values.
+   * If host is not an IP address, then it's a string and needs to be lowercased.
+   *
+   * inet_aton() handles octal, hex and fewer than 4 components addresses.
+   * See https://linux.die.net/man/3/inet_network
+   *
+   * inet_ntop() is used instead of inet_ntoa() because the latter returns a
+   * pointer to a shared static buffer and is therefore not thread-safe.
+   */
+  if (inet_aton (collapsed, &addr) != 0 &&
+      inet_ntop (AF_INET, &addr, addr_str, sizeof (addr_str)) != NULL) {
+    retval = g_strdup (addr_str);
+  } else {
+    retval = g_ascii_strdown (collapsed, -1);
+  }
+
+  g_free (collapsed);
+
+  return retval;
+}
+
+/*
+ * ephy_gsb_utils_canonicalize:
+ * @url: the URL to canonicalize; must not be NULL
+ *
+ * Canonicalizes @url according to
+ * https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
+ *
+ * A scheme ("http") is added when @url has none, the fragment is dropped,
+ * host and path are normalized to be percent-escaped exactly once, the host
+ * is canonicalized and runs of slashes in the path are collapsed. The query,
+ * if any, is appended as returned by soup_uri_get_query().
+ *
+ * Returns: a newly allocated string owned by the caller, or NULL if @url
+ * cannot be parsed as a URI or has no host component.
+ */
+char *
+ephy_gsb_utils_canonicalize (const char *url)
+{
+  SoupURI *uri;
+  char *tmp;
+  char *host;
+  char *path;
+  char *host_canonical;
+  char *path_canonical;
+  char *retval;
+  const char *raw_host;
+  const char *query;
+
+  g_assert (url);
+
+  /* Handle URLs with no scheme. */
+  if (g_str_has_prefix (url, "//"))
+    tmp = g_strdup_printf ("http:%s", url);
+  else if (g_str_has_prefix (url, "://"))
+    tmp = g_strdup_printf ("http%s", url);
+  else if (!strstr (url, "://"))
+    tmp = g_strdup_printf ("http://%s", url);
+  else
+    tmp = g_strdup (url);
+
+  /* soup_uri_new() prepares the URL for us:
+   * 1. Strips trailing and leading whitespaces.
+   * 2. Includes the path component if missing.
+   * 3. Removes tab (0x09), CR (0x0d), LF (0x0a) characters.
+   */
+  uri = soup_uri_new (tmp);
+  g_free (tmp);
+  if (!uri)
+    return NULL;
+
+  /* A successfully parsed URI may still have no host (e.g. blob or data
+   * URIs). Such URLs cannot be canonicalized, and passing a NULL host on
+   * would trip the assertion in ephy_gsb_utils_normalize_escape().
+   */
+  raw_host = soup_uri_get_host (uri);
+  if (!raw_host) {
+    soup_uri_free (uri);
+    return NULL;
+  }
+
+  /* Remove fragment. */
+  soup_uri_set_fragment (uri, NULL);
+
+  /* Canonicalize host. */
+  host = ephy_gsb_utils_normalize_escape (raw_host);
+  host_canonical = ephy_gsb_utils_canonicalize_host (host);
+
+  /* Canonicalize path.
+   * "/../" and "/./" have already been resolved by soup_uri_new().
+   */
+  path = ephy_gsb_utils_normalize_escape (soup_uri_get_path (uri));
+  path_canonical = ephy_string_find_and_replace (path, "//", "/");
+
+  /* Combine all parts. */
+  query = soup_uri_get_query (uri);
+  if (query) {
+    retval = g_strdup_printf ("%s://%s%s?%s",
+                              soup_uri_get_scheme (uri),
+                              host_canonical, path_canonical,
+                              query);
+  } else {
+    retval = g_strdup_printf ("%s://%s%s",
+                              soup_uri_get_scheme (uri),
+                              host_canonical, path_canonical);
+  }
+
+  g_free (host);
+  g_free (path);
+  g_free (host_canonical);
+  g_free (path_canonical);
+  soup_uri_free (uri);
+
+  return retval;
+}
diff --git a/lib/safe-browsing/ephy-gsb-utils.h b/lib/safe-browsing/ephy-gsb-utils.h
index e1a8a29..8042a69 100644
--- a/lib/safe-browsing/ephy-gsb-utils.h
+++ b/lib/safe-browsing/ephy-gsb-utils.h
@@ -43,4 +43,6 @@ void ephy_gsb_threat_list_free (EphyGSBThreatList *list);
JsonObject *ephy_gsb_utils_make_client_info (void);
JsonObject *ephy_gsb_utils_make_list_update_request (EphyGSBThreatList *list);
+char *ephy_gsb_utils_canonicalize (const char *url);
+
G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]