[glib] ghostutils: Convert non-ASCII dots to '.' when converting hostnames

From: Dan Winship <danw src gnome org>
To: commits-list gnome org
Cc:
Subject: [glib] ghostutils: Convert non-ASCII dots to '.' when converting hostnames
Date: Wed, 15 Dec 2010 08:58:47 +0000 (UTC)
commit 7ee902a3d05cc74a4edaf0197e076611401c029c
Author: Dan Winship <danw gnome org>
Date:   Fri Dec 10 11:42:56 2010 +0100

    ghostutils: Convert non-ASCII dots to '.' when converting hostnames
    
    Also add test cases to glib/tests/hostutils for this and a few other
    things (uppercase names, unicode that normalizes to ASCII). When the
    test program is run with a hostname argument, it now acts as an
    ASCII/unicode hostname converter instead of running the test suite.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=633350

 glib/ghostutils.c      |   53 +++++++++++++++++++++++++++++++++--------
 glib/tests/hostutils.c |   61 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 11 deletions(-)
---
diff --git a/glib/ghostutils.c b/glib/ghostutils.c
index c036195..99afe9a 100644
--- a/glib/ghostutils.c
+++ b/glib/ghostutils.c
@@ -303,7 +303,8 @@ idna_is_prohibited (gunichar ch)
 /* RFC 3491 IDN cleanup algorithm. */
 static gchar *
 nameprep (const gchar *hostname,
-          gint         len)
+          gint         len,
+          gboolean    *is_unicode)
 {
   gchar *name, *tmp = NULL, *p;
 
@@ -336,12 +337,15 @@ nameprep (const gchar *hostname,
   /* If there are no UTF8 characters, we're done. */
   if (!contains_non_ascii (name, len))
     {
+      *is_unicode = FALSE;
       if (name == (gchar *)hostname)
         return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len);
       else
         return name;
     }
 
+  *is_unicode = TRUE;
+
   /* Normalize */
   name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC);
   g_free (tmp);
@@ -383,6 +387,26 @@ nameprep (const gchar *hostname,
   return name;
 }
 
+/* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as
+ * label-separating dots. @str must be '\0'-terminated.
+ */
+#define idna_is_dot(str) ( \
+  ((guchar)(str)[0] == '.') ||                                                 \
+  ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \
+  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \
+  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) )
+
+static const gchar *
+idna_end_of_label (const gchar *str)
+{
+  for (; *str; str = g_utf8_next_char (str))
+    {
+      if (idna_is_dot (str))
+        return str;
+    }
+  return str;
+}
+
 /**
  * g_hostname_to_ascii:
  * @hostname: a valid UTF-8 or ASCII hostname
@@ -404,16 +428,16 @@ g_hostname_to_ascii (const gchar *hostname)
   gssize llen, oldlen;
   gboolean unicode;
 
-  label = name = nameprep (hostname, -1);
-  if (!name)
-    return NULL;
+  label = name = nameprep (hostname, -1, &unicode);
+  if (!name || !unicode)
+    return name;
 
   out = g_string_new (NULL);
 
   do
     {
       unicode = FALSE;
-      for (p = label; *p && *p != '.'; p++)
+      for (p = label; *p && !idna_is_dot (p); p++)
 	{
 	  if ((guchar)*p > 0x80)
 	    unicode = TRUE;
@@ -437,7 +461,9 @@ g_hostname_to_ascii (const gchar *hostname)
 	goto fail;
 
       label += llen;
-      if (*label && *++label)
+      if (*label)
+        label = g_utf8_next_char (label);
+      if (*label)
         g_string_append_c (out, '.');
     }
   while (*label);
@@ -585,7 +611,7 @@ g_hostname_to_unicode (const gchar *hostname)
 
   do
     {
-      llen = strcspn (hostname, ".");
+      llen = idna_end_of_label (hostname) - hostname;
       if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
 	{
 	  hostname += IDNA_ACE_PREFIX_LEN;
@@ -598,7 +624,8 @@ g_hostname_to_unicode (const gchar *hostname)
 	}
       else
         {
-          gchar *canonicalized = nameprep (hostname, llen);
+          gboolean unicode;
+          gchar *canonicalized = nameprep (hostname, llen, &unicode);
 
           if (!canonicalized)
             {
@@ -610,7 +637,9 @@ g_hostname_to_unicode (const gchar *hostname)
         }
 
       hostname += llen;
-      if (*hostname && *++hostname)
+      if (*hostname)
+        hostname = g_utf8_next_char (hostname);
+      if (*hostname)
         g_string_append_c (out, '.');
     }
   while (*hostname);
@@ -643,8 +672,10 @@ g_hostname_is_ascii_encoded (const gchar *hostname)
     {
       if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
 	return TRUE;
-      hostname = strchr (hostname, '.');
-      if (!hostname++)
+      hostname = idna_end_of_label (hostname);
+      if (*hostname)
+        hostname = g_utf8_next_char (hostname);
+      if (!*hostname)
 	return FALSE;
     }
 }
diff --git a/glib/tests/hostutils.c b/glib/tests/hostutils.c
index 622a0ce..218f516 100644
--- a/glib/tests/hostutils.c
+++ b/glib/tests/hostutils.c
@@ -19,6 +19,7 @@
 
 #include <glib/glib.h>
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -48,6 +49,23 @@ static const struct {
 };
 static const gint num_idn_test_domains = G_N_ELEMENTS (idn_test_domains);
 
+static const struct {
+  const gchar *orig_name, *ascii_name;
+  gboolean orig_is_unicode, ascii_is_encoded;
+} non_round_trip_names[] = {
+  /* uppercase characters */
+  { "EXAMPLE.COM", "example.com", FALSE, FALSE },
+  { "\xc3\x89XAMPLE.COM", "xn--xample-9ua.com", TRUE, TRUE },
+
+  /* unicode that decodes to ascii */
+  { "\xe2\x93\x94\xe2\x93\xa7\xe2\x93\x90\xe2\x93\x9c\xe2\x93\x9f\xe2\x93\x9b\xe2\x93\x94.com", "example.com", TRUE, FALSE },
+
+  /* non-standard dot characters */
+  { "example\xe3\x80\x82" "com", "example.com", TRUE, FALSE },
+  { "\xc3\xa9xample\xe3\x80\x82" "com", "xn--xample-9ua.com", TRUE, TRUE }
+};
+static const gint num_non_round_trip_names = G_N_ELEMENTS (non_round_trip_names);
+
 static const gchar *bad_names[] = {
   "disallowed\xef\xbf\xbd" "character",
   "non-utf\x88",
@@ -73,6 +91,27 @@ test_to_ascii (void)
       g_free (ascii);
     }
 
+  for (i = 0; i < num_non_round_trip_names; i++)
+    {
+      if (non_round_trip_names[i].orig_is_unicode)
+	g_assert (g_hostname_is_non_ascii (non_round_trip_names[i].orig_name));
+      else
+	g_assert (!g_hostname_is_non_ascii (non_round_trip_names[i].orig_name));
+
+      if (non_round_trip_names[i].ascii_is_encoded)
+	g_assert (g_hostname_is_ascii_encoded (non_round_trip_names[i].ascii_name));
+      else
+	g_assert (!g_hostname_is_ascii_encoded (non_round_trip_names[i].ascii_name));
+
+      ascii = g_hostname_to_ascii (non_round_trip_names[i].orig_name);
+      g_assert_cmpstr (non_round_trip_names[i].ascii_name, ==, ascii);
+      g_free (ascii);
+
+      ascii = g_hostname_to_ascii (non_round_trip_names[i].ascii_name);
+      g_assert_cmpstr (non_round_trip_names[i].ascii_name, ==, ascii);
+      g_free (ascii);
+    }
+
   for (i = 0; i < num_bad_names; i++)
     {
       ascii = g_hostname_to_ascii (bad_names[i]);
@@ -278,6 +317,28 @@ main (int   argc,
 {
   g_test_init (&argc, &argv, NULL);
   
+  if (argc == 2 && argv[1][0] != '-')
+    {
+      const gchar *hostname = argv[1];
+      gchar *converted;
+
+      if (g_hostname_is_non_ascii (hostname))
+	{
+	  converted = g_hostname_to_ascii (hostname);
+	  printf ("to_ascii: %s\n", converted);
+	  g_free (converted);
+	}
+      else if (g_hostname_is_ascii_encoded (hostname))
+	{
+	  converted = g_hostname_to_unicode (hostname);
+	  printf ("to_unicode: %s\n", converted);
+	  g_free (converted);
+	}
+      else
+	printf ("hostname is neither unicode nor ACE encoded\n");
+      return 0;
+    }
+
   g_test_add_func ("/hostutils/to_ascii", test_to_ascii);
   g_test_add_func ("/hostutils/to_unicode", test_to_unicode);
   g_test_add_func ("/hostutils/is_ip_addr", test_is_ip_addr);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]