[epiphany/mcatanzaro/uri-decode] uri-helpers: Use webkit_uri_for_display()



commit 353072b415ffa1d51bdded8ca345c7a5def1b954
Author: Michael Catanzaro <mcatanzaro igalia com>
Date:   Sat Feb 9 10:27:08 2019 -0600

    uri-helpers: Use webkit_uri_for_display()
    
    We'll use WebKit's code for processing punycode and IDN. This also means
    we'll adopt the same IDN homograph mitigation strategy as Safari, rather
    than the current implementation based on Firefox.

 lib/ephy-uri-helpers.c | 223 +------------------------------------------------
 lib/meson.build        |   1 -
 meson.build            |   3 +-
 3 files changed, 4 insertions(+), 223 deletions(-)
---
diff --git a/lib/ephy-uri-helpers.c b/lib/ephy-uri-helpers.c
index 32b3ef3a4..3d9014e17 100644
--- a/lib/ephy-uri-helpers.c
+++ b/lib/ephy-uri-helpers.c
@@ -25,7 +25,6 @@
 #include <glib.h>
 #include <libsoup/soup.h>
 #include <string.h>
-#include <unicode/uidna.h>
 #include <webkit2/webkit2.h>
 
 /**
@@ -253,237 +252,21 @@ ephy_remove_tracking_from_uri (const char *uri_string)
   return ret;
 }
 
-static inline void
-script_table_update (GHashTable     *table,
-                     GUnicodeScript  script)
-{
-  gpointer value;
-  gpointer new_value;
-
-  value = g_hash_table_lookup (table, GINT_TO_POINTER (script));
-  new_value = GINT_TO_POINTER (GPOINTER_TO_INT (value) + 1);
-  g_hash_table_replace (table, GINT_TO_POINTER (script), new_value);
-}
-
-static inline int
-script_table_get (GHashTable     *table,
-                  GUnicodeScript  script)
-{
-  gpointer value;
-
-  value = g_hash_table_lookup (table, GINT_TO_POINTER (script));
-  return GPOINTER_TO_INT (value);
-}
-
-/**
- * validate_unicode_label:
- * @label: a domain label, UTF-8 encoded
- *
- * Verifies whether @label is safe to be displayed as Unicode characters, as per
- * this algorithm: https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm. If
- * %FALSE is returned, then @label should be displayed as Punycode text.
- *
- * Return value: %TRUE if @label is considered safe, %FALSE otherwise
- **/
-static gboolean
-validate_unicode_label (const char *label)
-{
-  GHashTable *table;
-  GUnicodeScript script;
-  gunichar *unichars;
-  gunichar saved_zero_char = 0;
-  gboolean retval = FALSE;
-  long num;
-
-  g_assert (label);
-
-  if (!g_utf8_validate (label, -1, NULL))
-    return FALSE;
-
-  /* Use a hash table to count the occurrences of every script,
-   * except Common and Inherited. */
-  table = g_hash_table_new (g_direct_hash, g_direct_equal);
-  unichars = g_utf8_to_ucs4_fast (label, -1, &num);
-
-  for (gunichar *u = unichars; u && *u; u++) {
-    script = g_unichar_get_script (*u);
-
-    if (script != G_UNICODE_SCRIPT_COMMON && script != G_UNICODE_SCRIPT_INHERITED)
-      script_table_update (table, script);
-    else
-      num--;
-
-    /* Check for mixed numbering systems. */
-    if (g_unichar_isdigit (*u)) {
-      gunichar zero_char = *u - g_unichar_digit_value (*u);
-      if (saved_zero_char == 0)
-        saved_zero_char = zero_char;
-      else if (zero_char != saved_zero_char)
-        goto out;
-    }
-  }
-
-  /* Single script, allow. */
-  if (g_hash_table_size (table) < 2) {
-    retval = TRUE;
-    goto out;
-  }
-
-  /* Chinese scripts. */
-  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) +
-      script_table_get (table, G_UNICODE_SCRIPT_HAN) +
-      script_table_get (table, G_UNICODE_SCRIPT_BOPOMOFO) == num) {
-    retval = TRUE;
-    goto out;
-  }
-
-  /* Korean scripts. */
-  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) +
-      script_table_get (table, G_UNICODE_SCRIPT_HAN) +
-      script_table_get (table, G_UNICODE_SCRIPT_HANGUL) == num) {
-    retval = TRUE;
-    goto out;
-  }
-
-  /* Japanese scripts. */
-  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) +
-      script_table_get (table, G_UNICODE_SCRIPT_HAN) +
-      script_table_get (table, G_UNICODE_SCRIPT_HIRAGANA) +
-      script_table_get (table, G_UNICODE_SCRIPT_KATAKANA) == num) {
-    retval = TRUE;
-    goto out;
-  }
-
-  /* Ban mixes of more than two scripts. */
-  if (g_hash_table_size (table) > 2)
-    goto out;
-
-  /* Ban any mix of two scrips that doesn't contain Latin. */
-  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) == 0)
-    goto out;
-
-  /* Ban Latin + Cyrillic or Latin + Greek. */
-  if (script_table_get (table, G_UNICODE_SCRIPT_CYRILLIC) > 0 ||
-      script_table_get (table, G_UNICODE_SCRIPT_GREEK) > 0)
-    goto out;
-
-  /* Allow Latin + any other single script. */
-  retval = TRUE;
-
-out:
-  g_hash_table_unref (table);
-  g_free (unichars);
-
-  return retval;
-}
-
-static char *
-evaluate_host_for_display (const char *original_host,
-                           const char *unicode_host)
-{
-  char **original_labels;
-  char **unicode_labels;
-  char *retval;
-
-  g_assert (original_host);
-  g_assert (unicode_host);
-
-  /* These arrays will have the same length. */
-  original_labels = g_strsplit (original_host, ".", -1);
-  unicode_labels = g_strsplit (unicode_host, ".", -1);
-
-  for (guint i = 0; i < g_strv_length (unicode_labels); i++) {
-    if (!validate_unicode_label (unicode_labels[i])) {
-      g_free (unicode_labels[i]);
-      unicode_labels[i] = g_strdup (original_labels[i]);
-    }
-  }
-
-  retval = g_strjoinv (".", unicode_labels);
-  g_strfreev (original_labels);
-  g_strfreev (unicode_labels);
-
-  return retval;
-}
-
-
 /* Use this function to format a URI for display. The URIs used
  * internally by WebKit may contain percent-encoded characters or
  * punycode, which we do not want the user to see.
- *
- * Note this should probably be handled by WebKit instead.
  */
 char *
 ephy_uri_decode (const char *uri_string)
 {
-  static const guint MAX_DOMAIN_LENGTH = 255; /* RFC 1034, section 3.1 */
-  static UIDNA *idna = NULL;
-  static GMutex idna_creation_mutex;
-  SoupURI *uri;
-  char *percent_encoded_uri;
-  char *percent_decoded_host;
-  char *idna_decoded_name;
-  char *fully_decoded_uri;
-  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-  UErrorCode error = U_ZERO_ERROR;
+  char *decoded_uri;
 
   /* This function is not null-safe since it is mostly used in scenarios where
    * passing or returning null would typically lead to a security issue. */
   g_assert (uri_string);
 
-  /* This object is threadsafe to *use*, but need to create it exactly once. */
-  g_mutex_lock (&idna_creation_mutex);
-  if (idna == NULL) {
-    /* These flags should be synced with URLParser::internationalDomainNameTranscoder
-     * in WebKit's URLParser.cpp. */
-    idna = uidna_openUTS46 (UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error);
-    if (U_FAILURE (error))
-      g_error ("ICU error opening UTS #46 context: %d", error);
-  }
-  g_mutex_unlock (&idna_creation_mutex);
-
-  uri = soup_uri_new (uri_string);
-  if (uri == NULL)
-    return g_strdup (uri_string);
-
-  /* Process any punycode in the host portion of the URI. */
-  if (uri->host != NULL) {
-    /* +1 so there is space for the trailing NUL with the longest-possible
-     * domain name. +2 because ICU has this rather terrible behavior of
-     * sometimes returning a result that's not NUL-terminated if the buffer
-     * capacity exactly matches the output length, indicating that with a
-     * warning code that's not caught by U_FAILURE. Our buffer is large enough
-     * for any valid domain, but this function may receive invalid domains as
-     * input. */
-    idna_decoded_name = g_malloc0 (MAX_DOMAIN_LENGTH + 2);
-    uidna_nameToUnicodeUTF8 (idna, uri->host, -1, idna_decoded_name, MAX_DOMAIN_LENGTH + 1, &info, &error);
-
-    if (U_FAILURE (error)) {
-      g_warning ("ICU error converting domain %s for display: %d", uri->host, error);
-      soup_uri_free (uri);
-      g_free (idna_decoded_name);
-      return g_strdup (uri_string);
-    }
-
-    percent_decoded_host = soup_uri_decode (uri->host);
-    g_free (uri->host);
-    uri->host = evaluate_host_for_display (percent_decoded_host, idna_decoded_name);
-    g_free (percent_decoded_host);
-    g_free (idna_decoded_name);
-  }
-
-  /* Note: this also strips passwords from the display URI. */
-  percent_encoded_uri = soup_uri_to_string (uri, FALSE);
-  soup_uri_free (uri);
-
-  /* Now, decode any percent-encoded characters in the URI. If there are null
-   * characters or escaped slashes, this returns NULL, so just display the
-   * encoded URI in that case. */
-  fully_decoded_uri = g_uri_unescape_string (percent_encoded_uri, "/");
-  if (fully_decoded_uri == NULL)
-    return percent_encoded_uri;
-  g_free (percent_encoded_uri);
-  return fully_decoded_uri;
+  decoded_uri = webkit_uri_for_display (uri_string);
+  return decoded_uri ? decoded_uri : g_strdup (uri_string);
 }
 
 char *
diff --git a/lib/meson.build b/lib/meson.build
index be5af33ef..0bef938f2 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -62,7 +62,6 @@ libephymisc_deps = [
   glib_dep,
   gsettings_desktop_schemas,
   gtk_dep,
-  icu_uc_dep,
   json_glib_dep,
   libdazzle_dep,
   libsecret_dep,
diff --git a/meson.build b/meson.build
index 8db3f3ea8..301251c3a 100644
--- a/meson.build
+++ b/meson.build
@@ -82,7 +82,7 @@ endif
 glib_requirement = '>= 2.56.0'
 gtk_requirement = '>= 3.22.13'
 nettle_requirement = '>= 3.4'
-webkitgtk_requirement = '>= 2.21.92'
+webkitgtk_requirement = '>= 2.23.90'
 
 cairo_dep = dependency('cairo', version: '>= 1.2')
 gcr_dep = dependency('gcr-3', version: '>= 3.5.5')
@@ -95,7 +95,6 @@ gsettings_desktop_schemas = dependency('gsettings-desktop-schemas')
 gtk_dep = dependency('gtk+-3.0', version: gtk_requirement)
 gtk_unix_print_dep = dependency('gtk+-unix-print-3.0', version: gtk_requirement)
 hogweed_dep = dependency('hogweed', version: nettle_requirement)
-icu_uc_dep = dependency('icu-uc', version: '>= 4.6')
 iso_codes_dep = dependency('iso-codes', version: '>= 0.35')
 json_glib_dep = dependency('json-glib-1.0', version: '>= 1.2.4')
 libdazzle_dep = dependency('libdazzle-1.0', version: '>= 3.29.4', required: false)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]