[epiphany/wip/idn-display: 12/12] uri-helpers: Implement Mozilla's IDN display algorithm



commit ae47ccbe3f31931d70b97bd13a3f74a36187a817
Author: Gabriel Ivascu <gabrielivascu gnome org>
Date:   Sat Dec 16 19:28:14 2017 +0200

    uri-helpers: Implement Mozilla's IDN display algorithm
    
    https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm

 lib/ephy-uri-helpers.c |  120 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 118 insertions(+), 2 deletions(-)
---
diff --git a/lib/ephy-uri-helpers.c b/lib/ephy-uri-helpers.c
index 7b00593..68ddc89 100644
--- a/lib/ephy-uri-helpers.c
+++ b/lib/ephy-uri-helpers.c
@@ -252,6 +252,118 @@ ephy_remove_tracking_from_uri (const char *uri_string)
   return ret;
 }
 
+/* Increment the occurrence count kept for @script in @table. A script that is
+ * not in the table yet is treated as having a count of 0. */
+static inline void
+script_table_update (GHashTable     *table,
+                     GUnicodeScript  script)
+{
+  gpointer key = GINT_TO_POINTER (script);
+  int count = GPOINTER_TO_INT (g_hash_table_lookup (table, key));
+
+  g_hash_table_replace (table, key, GINT_TO_POINTER (count + 1));
+}
+
+/* Return the occurrence count stored for @script in @table (0 if absent). */
+static inline int
+script_table_get (GHashTable     *table,
+                  GUnicodeScript  script)
+{
+  gpointer key = GINT_TO_POINTER (script);
+
+  return GPOINTER_TO_INT (g_hash_table_lookup (table, key));
+}
+
+/*
+ * Decide whether an IDN-decoded host label may be shown in Unicode form, per
+ * https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm
+ *
+ * Common and Inherited characters are ignored when counting scripts. Returns
+ * TRUE for at most one script, for the Chinese (Latin, Han, Bopomofo), Korean
+ * (Latin, Han, Hangul) and Japanese (Latin, Han, Hiragana, Katakana) sets, and
+ * for Latin plus one other script except Cyrillic or Greek. Returns FALSE for
+ * invalid UTF-8, digits from several numbering systems, and any other mix.
+ */
+static gboolean
+label_is_safe_to_display (const char *label)
+{
+  GHashTable *table;
+  gunichar *unichars;
+  gunichar saved_zero_char = 0;
+  gboolean retval = FALSE;
+  long num;
+
+  g_assert (label);
+
+  if (!g_utf8_validate (label, -1, NULL))
+    return FALSE;
+
+  table = g_hash_table_new (g_direct_hash, g_direct_equal);
+  unichars = g_utf8_to_ucs4_fast (label, -1, &num);
+
+  /* Count characters per script; num ends up as the number of counted characters. */
+  for (gunichar *u = unichars; u && *u; u++) {
+    GUnicodeScript script = g_unichar_get_script (*u);
+
+    if (script != G_UNICODE_SCRIPT_COMMON && script != G_UNICODE_SCRIPT_INHERITED)
+      script_table_update (table, script);
+    else
+      num--;
+
+    /* Check for mixed numbering systems. */
+    if (g_unichar_isdigit (*u)) {
+      gunichar zero_char = *u - g_unichar_digit_value (*u);
+      if (saved_zero_char == 0)
+        saved_zero_char = zero_char;
+      else if (zero_char != saved_zero_char)
+        goto out;
+    }
+  }
+
+  /* Single script, allow. */
+  if (g_hash_table_size (table) < 2) {
+    retval = TRUE;
+    goto out;
+  }
+
+  /* Chinese scripts. */
+  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) +
+      script_table_get (table, G_UNICODE_SCRIPT_HAN) +
+      script_table_get (table, G_UNICODE_SCRIPT_BOPOMOFO) == num) {
+    retval = TRUE;
+    goto out;
+  }
+
+  /* Korean scripts. */
+  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) +
+      script_table_get (table, G_UNICODE_SCRIPT_HAN) +
+      script_table_get (table, G_UNICODE_SCRIPT_HANGUL) == num) {
+    retval = TRUE;
+    goto out;
+  }
+
+  /* Japanese scripts. */
+  if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) +
+      script_table_get (table, G_UNICODE_SCRIPT_HAN) +
+      script_table_get (table, G_UNICODE_SCRIPT_HIRAGANA) +
+      script_table_get (table, G_UNICODE_SCRIPT_KATAKANA) == num) {
+    retval = TRUE;
+    goto out;
+  }
+
+  /* Of the remaining mixes, allow only Latin + one script other than Cyrillic or Greek. */
+  retval = g_hash_table_size (table) == 2 &&
+           script_table_get (table, G_UNICODE_SCRIPT_LATIN) > 0 &&
+           script_table_get (table, G_UNICODE_SCRIPT_CYRILLIC) == 0 &&
+           script_table_get (table, G_UNICODE_SCRIPT_GREEK) == 0;
+
+out:
+  g_hash_table_unref (table);
+  g_free (unichars);
+
+  return retval;
+}
+
 /* Use this function to format a URI for display. The URIs used
  * internally by WebKit may contain percent-encoded characters or
  * punycode, which we do not want the user to see.
@@ -307,8 +419,12 @@ ephy_uri_decode (const char *uri_string)
       return g_strdup (uri_string);
     }
 
-    g_free (uri->host);
-    uri->host = idna_decoded_name;
+    if (label_is_safe_to_display (idna_decoded_name)) {
+      g_free (uri->host);
+      uri->host = idna_decoded_name;
+    } else {
+      g_free (idna_decoded_name);
+    }
   }
 
   /* Note: this also strips passwords from the display URI. */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]