[libsoup] soup-tld: accept ASCII-encoded hostnames too



commit 4c2d1ac3daed3940217da2477546db31fbdb1b43
Author: Dan Winship <danw gnome org>
Date:   Wed Dec 18 17:06:16 2013 -0500

    soup-tld: accept ASCII-encoded hostnames too
    
    It's inconvenient for callers to have to ensure that the hostname they
    pass in is UTF-8 (since they themselves may not need to care). So
    accept ASCII-encoded hostnames too, and add the corresponding punycode
    tests from the publicsuffix.org test list that we were ignoring
    before.

 libsoup/soup-tld.c |   67 ++++++++++++++++++++++++++++++++++++++++++++-------
 tests/tld-test.c   |   26 +++++++++++++++++++-
 2 files changed, 82 insertions(+), 11 deletions(-)
---
diff --git a/libsoup/soup-tld.c b/libsoup/soup-tld.c
index c6faed1..2e3da62 100644
--- a/libsoup/soup-tld.c
+++ b/libsoup/soup-tld.c
@@ -57,7 +57,7 @@ soup_tld_ensure_rules_hash_table (void)
 
 /**
  * soup_tld_get_base_domain:
- * @hostname: a UTF-8 hostname in its canonical representation form
+ * @hostname: a hostname
  * @error: return location for a #GError, or %NULL to ignore
  *   errors. See #SoupTLDError for the available error codes
  *
@@ -70,9 +70,10 @@ soup_tld_ensure_rules_hash_table (void)
  * with any well known TLD) because choosing a base domain for them
  * would be totally arbitrary.
  *
- * This method only works for valid UTF-8 hostnames in their canonical
- * representation form, so you should use g_hostname_to_unicode() to
- * get the canonical representation if that is not the case.
+ * Prior to libsoup 2.46, this function required that @hostname be in
+ * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
+ * UTF-8 or ASCII format (and the return value will be in the same
+ * format).
  *
  * Returns: a pointer to the start of the base domain in @hostname. If
  * an error occurs, %NULL will be returned and @error set.
@@ -83,21 +84,21 @@ const char *
 soup_tld_get_base_domain (const char *hostname, GError **error)
 {
        g_return_val_if_fail (hostname, NULL);
-       g_return_val_if_fail (!g_hostname_is_ascii_encoded (hostname), FALSE);
 
        return soup_tld_get_base_domain_internal (hostname, 1, error);
 }
 
 /**
  * soup_tld_domain_is_public_suffix:
- * @domain: a UTF-8 domain in its canonical representation form
+ * @domain: a domain name
  *
  * Looks whether the @domain passed as argument is a public domain
  * suffix (.org, .com, .co.uk, etc) or not.
  *
- * This method only works for valid UTF-8 domains in their canonical
- * representation form, so you should use g_hostname_to_unicode() to
- * get the canonical representation if that is not the case.
+ * Prior to libsoup 2.46, this function required that @domain be in
+ * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
+ * UTF-8 or ASCII format; an ASCII-encoded name is converted to UTF-8
+ * internally before it is checked.
  *
  * Returns: %TRUE if it is a public domain, %FALSE otherwise.
  *
@@ -174,8 +175,10 @@ soup_tld_error_quark (void)
 static const char *
 soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
 {
-       char *prev_domain, *cur_domain, *tld, *next_dot;
+       char *prev_domain, *cur_domain, *next_dot;
        gint add_domains;
+       const char *orig_hostname = NULL, *tld;
+       char *utf8_hostname = NULL;
 
        soup_tld_ensure_rules_hash_table ();
 
@@ -186,6 +189,17 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
                return NULL;
        }
 
+       if (g_hostname_is_ascii_encoded (hostname)) {
+               orig_hostname = hostname;
+               hostname = utf8_hostname = g_hostname_to_unicode (hostname);
+               if (!hostname) {
+                       g_set_error_literal (error, SOUP_TLD_ERROR,
+                                            SOUP_TLD_ERROR_INVALID_HOSTNAME,
+                                            _("Invalid hostname"));
+                       return NULL;
+               }
+       }
+
        cur_domain = (char *) hostname;
        tld = cur_domain;
        prev_domain = NULL;
@@ -204,6 +218,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
                        g_set_error_literal (error, SOUP_TLD_ERROR,
                                             SOUP_TLD_ERROR_INVALID_HOSTNAME,
                                             _("Invalid hostname"));
+                       g_free (utf8_hostname);
                        return NULL;
                }
 
@@ -233,6 +248,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
                        g_set_error_literal (error, SOUP_TLD_ERROR,
                                             SOUP_TLD_ERROR_NO_BASE_DOMAIN,
                                             _("Hostname has no base domain"));
+                       g_free (utf8_hostname);
                        return NULL;
                }
 
@@ -240,6 +256,37 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
                cur_domain = next_dot + 1;
        }
 
+       if (orig_hostname) {
+               int dots;
+               const char *p;
+
+               /* Count the number of dots that appear after tld in
+                * utf8_hostname, and then find the corresponding spot
+                * in orig_hostname.
+                */
+               for (p = tld, dots = 0; *p; p++) {
+                       if (*p == '.')
+                               dots++;
+               }
+
+               for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
+                       if (*(p - 1) == '.') {
+                               if (dots)
+                                       dots--;
+                               else
+                                       break;
+                       }
+               }
+               /* It's not possible for utf8_hostname to have had
+                * more dots than orig_hostname.
+                */
+               g_assert (dots == 0);
+
+               tld = p;
+               g_free (utf8_hostname);
+               hostname = orig_hostname;
+       }
+
        /* Include the additional number of domains requested. */
        add_domains = additional_domains;
        while (tld != hostname) {
diff --git a/tests/tld-test.c b/tests/tld-test.c
index 2b6b5dd..d1f1de1 100644
--- a/tests/tld-test.c
+++ b/tests/tld-test.c
@@ -83,8 +83,30 @@ static struct {
   { "www.食狮.中国", "食狮.中国" },
   { "shishi.中国", "shishi.中国" },
   { "中国", NULL },
-  /* This is not in http://publicsuffix.org/list/test.txt but we want to check it anyway. */
+  /* Same as above, but punycoded. */
+  { "xn--85x722f.com.cn", "xn--85x722f.com.cn" },
+  { "xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn" },
+  { "www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn" },
+  { "shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn" },
+  { "xn--55qx5d.cn", NULL },
+  { "xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s" },
+  { "www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s" },
+  { "shishi.xn--fiqs8s", "shishi.xn--fiqs8s" },
+  { "xn--fiqs8s", NULL },
+  /* End of publicsuffix.org tests */
+
+  /* Let's just double-check this one... */
   { "co.uk", NULL },
+  { "test.co.uk", "test.co.uk" },
+  { "www.test.co.uk", "test.co.uk" },
+
+  /* Two levels of non-ASCII */
+  { "våler.østfold.no", NULL },
+  { "test.våler.østfold.no", "test.våler.østfold.no" },
+  { "www.test.våler.østfold.no", "test.våler.østfold.no" },
+  { "xn--vler-qoa.xn--stfold-9xa.no", NULL },
+  { "test.xn--vler-qoa.xn--stfold-9xa.no", "test.xn--vler-qoa.xn--stfold-9xa.no" },
+  { "www.test.xn--vler-qoa.xn--stfold-9xa.no", "test.xn--vler-qoa.xn--stfold-9xa.no" },
 },
 /* Non Internet TLDs have NULL as expected result
  */
@@ -135,6 +157,8 @@ main (int argc, char **argv)
                g_clear_error(&error);
        }
 
+       debug_printf (1, "\n");
+
        for (i = 0; i < G_N_ELEMENTS (non_inet_tld_tests); ++i) {
                gboolean is_public = soup_tld_domain_is_public_suffix (non_inet_tld_tests[i].hostname);
                const char *base_domain = soup_tld_get_base_domain (non_inet_tld_tests[i].hostname, NULL);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]