[libsoup] soup-tld: use libpsl instead of our own copy of the public suffix list



commit 9d7559df1d8e9abf135663ccedb8f47c6e137415
Author: Claudio Saavedra <csaavedra igalia com>
Date:   Thu Feb 22 15:23:15 2018 +0200

    soup-tld: use libpsl instead of our own copy of the public suffix list
    
    This adds a dependency on libpsl. For compatibility with our API, we
    depend on libpsl 0.20.0, which is the first version to provide all
    the features we need to pass all our tests as expected.
    
    All existing TLD tests are passing as expected so there shouldn't be
    any regressions.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=769650

 Makefile.am                  |    1 -
 configure.ac                 |    4 +
 data/effective_tld_names.dat |12370 ------------------------------------------
 libsoup/Makefile.am          |   18 +-
 libsoup/soup-tld-private.h   |   26 -
 libsoup/soup-tld.c           |  192 +-
 libsoup/tld-parser.py        |   45 -
 7 files changed, 43 insertions(+), 12613 deletions(-)
---
diff --git a/Makefile.am b/Makefile.am
index 38d2872..141ef38 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -4,7 +4,6 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
 SUBDIRS = libsoup po tests examples docs win32
 
 EXTRA_DIST =                           \
-       data/effective_tld_names.dat    \
        libsoup-2.4.pc.in               \
        libsoup-gnome-2.4.pc.in         \
        gtk-doc.make                    \
diff --git a/configure.ac b/configure.ac
index 2db0990..668c0fd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -98,6 +98,10 @@ PKG_CHECK_MODULES(SQLITE, sqlite3)
 AC_SUBST(SQLITE_CFLAGS)
 AC_SUBST(SQLITE_LIBS)
 
+PKG_CHECK_MODULES(LIBPSL, libpsl >= 0.20.0)
+AC_SUBST(LIBPSL_CFLAGS)
+AC_SUBST(LIBPSL_LIBS)
+
 dnl ***********************
 dnl *** Check for Win32 ***
 dnl ***********************
diff --git a/libsoup/Makefile.am b/libsoup/Makefile.am
index 4232be6..575aa66 100644
--- a/libsoup/Makefile.am
+++ b/libsoup/Makefile.am
@@ -24,7 +24,8 @@ AM_CPPFLAGS =                                 \
        $(XML_CFLAGS)                   \
        $(SQLITE_CFLAGS)                \
        $(CODE_COVERAGE_CFLAGS)         \
-       $(KRB5_CFLAGS)
+       $(KRB5_CFLAGS)                  \
+       $(LIBPSL_CFLAGS)
 
 libsoupincludedir = $(includedir)/libsoup-2.4/libsoup
 
@@ -103,7 +104,8 @@ libsoup_2_4_la_LIBADD =                     \
        $(LIBWS2_32)                    \
        $(XML_LIBS)                     \
        $(SQLITE_LIBS)                  \
-       $(KRB5_LIBS)
+       $(KRB5_LIBS)                    \
+       $(LIBPSL_LIBS)
 
 libsoup_2_4_la_SOURCES =               \
        gconstructor.h                  \
@@ -200,7 +202,6 @@ libsoup_2_4_la_SOURCES =            \
        soup-socket-properties.c        \
        soup-status.c                   \
        soup-tld.c                      \
-       soup-tld-private.h              \
        soup-uri.c                      \
        soup-value-utils.c              \
        soup-version.c                  \
@@ -209,14 +210,6 @@ libsoup_2_4_la_SOURCES =           \
        soup-xmlrpc.c                   \
        soup-xmlrpc-old.c
 
-# TLD rules
-EXTRA_DIST += tld-parser.py
-
-TLD_DATA_FILE=$(top_srcdir)/data/effective_tld_names.dat
-
-tld_data.inc: tld-parser.py $(TLD_DATA_FILE)
-       $(srcdir)/tld-parser.py $(TLD_DATA_FILE) tld_data.inc
-
 if BUILD_LIBSOUP_GNOME
 
 libsoupgnomeincludedir = $(includedir)/libsoup-gnome-2.4/libsoup
@@ -251,8 +244,7 @@ endif
 
 GLIB_GENERATED = soup-enum-types.c soup-enum-types.h
 BUILT_SOURCES = \
-       $(GLIB_GENERATED)   \
-       tld_data.inc
+       $(GLIB_GENERATED)
 
 soup_enum_types_sources = $(libsoupinclude_HEADERS) $(libsoupgnomeinclude_HEADERS)
 soup_enum_types_MKENUMS_C_FLAGS = --fhead "\#define LIBSOUP_USE_UNSTABLE_REQUEST_API"
diff --git a/libsoup/soup-tld.c b/libsoup/soup-tld.c
index f598a80..61bfbe0 100644
--- a/libsoup/soup-tld.c
+++ b/libsoup/soup-tld.c
@@ -12,10 +12,10 @@
 #include <string.h>
 
 #include <glib/gi18n-lib.h>
+#include <libpsl.h>
 
 #include "soup-tld.h"
 #include "soup.h"
-#include "soup-tld-private.h"
 
 /**
  * SECTION:soup-tld
@@ -26,35 +26,9 @@
  * simply a "public suffix" such as ".com".
  */
 
-static void soup_tld_ensure_rules_hash_table (void);
 static const char *soup_tld_get_base_domain_internal (const char *hostname,
-                                                     guint       additional_domains,
                                                      GError    **error);
 
-static GHashTable *rules = NULL;
-static SoupTLDEntry tld_entries[] = {
-#include "tld_data.inc"
-};
-
-/* Stores the entries data in a hash table to ease and speed up
- * searches.
- */
-static void
-soup_tld_ensure_rules_hash_table (void)
-{
-       static gsize init = 0;
-
-       if (g_once_init_enter (&init)) {
-               int i;
-
-               rules = g_hash_table_new (g_str_hash, g_str_equal);
-               for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i)
-                       g_hash_table_insert (rules, tld_entries[i].domain,
-                                            &(tld_entries[i].flags));
-               g_once_init_leave (&init, 1);
-       }
-}
-
 /**
  * soup_tld_get_base_domain:
  * @hostname: a hostname
@@ -85,7 +59,7 @@ soup_tld_get_base_domain (const char *hostname, GError **error)
 {
        g_return_val_if_fail (hostname, NULL);
 
-       return soup_tld_get_base_domain_internal (hostname, 1, error);
+       return soup_tld_get_base_domain_internal (hostname, error);
 }
 
 /**
@@ -106,35 +80,14 @@ soup_tld_get_base_domain (const char *hostname, GError **error)
 gboolean
 soup_tld_domain_is_public_suffix (const char *domain)
 {
-       const char *base_domain;
-       GError *error = NULL;
+       const psl_ctx_t* psl = psl_builtin ();
 
        g_return_val_if_fail (domain, FALSE);
 
-       /* Skip the leading '.' if present */
-       if (*domain == '.' && !*(++domain))
-               g_return_val_if_reached (FALSE);
-
-       base_domain = soup_tld_get_base_domain_internal (domain, 0, &error);
-       if (g_strcmp0 (domain, base_domain)) {
-               g_clear_error (&error);
-               return FALSE;
-       }
-
-       if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) {
-               g_error_free (error);
-               return FALSE;
-       }
+       /* This will fail if libpsl's built-in data was disabled during compilation. */
+       g_assert (psl);
 
-       if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS) ||
-           g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME)) {
-               g_error_free (error);
-               g_return_val_if_reached (FALSE);
-       }
-
-       g_clear_error (&error);
-
-       return TRUE;
+       return psl_is_public_suffix2 (psl, domain, PSL_TYPE_ANY | PSL_TYPE_NO_STAR_RULE);
 }
 
 /**
@@ -172,14 +125,24 @@ soup_tld_error_quark (void)
 }
 
 static const char *
-soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
+soup_tld_get_base_domain_internal (const char *hostname, GError **error)
 {
-       char *prev_domain, *cur_domain, *next_dot;
-       gint add_domains;
-       const char *orig_hostname = NULL, *tld;
        char *utf8_hostname = NULL;
+       const psl_ctx_t* psl = psl_builtin ();
+       const char *registrable_domain, *unregistrable_domain;
 
-       soup_tld_ensure_rules_hash_table ();
+       /* This will fail if libpsl's built-in data was disabled during compilation. */
+       g_assert (psl);
+
+       /* Valid hostnames neither start with a dot nor have more than one
+        * dot together.
+        */
+       if (*hostname == '.') {
+               g_set_error_literal (error, SOUP_TLD_ERROR,
+                                    SOUP_TLD_ERROR_INVALID_HOSTNAME,
+                                    _("Invalid hostname"));
+               return NULL;
+       }
 
        if (g_hostname_is_ip_address (hostname)) {
                g_set_error_literal (error, SOUP_TLD_ERROR,
@@ -189,120 +152,33 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
        }
 
        if (g_hostname_is_ascii_encoded (hostname)) {
-               orig_hostname = hostname;
-               hostname = utf8_hostname = g_hostname_to_unicode (hostname);
-               if (!hostname) {
-                       g_set_error_literal (error, SOUP_TLD_ERROR,
-                                            SOUP_TLD_ERROR_INVALID_HOSTNAME,
-                                            _("Invalid hostname"));
-                       return NULL;
-               }
-       }
-
-       cur_domain = (char *) hostname;
-       tld = cur_domain;
-       prev_domain = NULL;
-       /* Process matching rules from longest to shortest. Logic
-        * based on Mozilla's implementation of nsEffectiveTLDService.
-        */
-       while (TRUE) {
-               char *orig_domain;
-               gboolean domain_found;
-               int *flags;
-
-               /* Valid hostnames neither start with a dot nor have more than one
-                * dot together.
-                */
-               if (*cur_domain == '.') {
+               utf8_hostname = g_hostname_to_unicode (hostname);
+               if (!utf8_hostname) {
                        g_set_error_literal (error, SOUP_TLD_ERROR,
                                             SOUP_TLD_ERROR_INVALID_HOSTNAME,
                                             _("Invalid hostname"));
-                       g_free (utf8_hostname);
-                       return NULL;
-               }
-
-               next_dot = strchr (cur_domain, '.');
-               domain_found = g_hash_table_lookup_extended (rules, cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags);
-               /* We compare the keys just to be sure that we haven't hit a collision */
-               if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) {
-                       if (*flags & SOUP_TLD_RULE_MATCH_ALL) {
-                               /* If we match a *. rule and there were no previous exceptions
-                                * nor previous domains then treat it as an exact match.
-                                */
-                               tld = prev_domain ? prev_domain : cur_domain;
-                               break;
-                       } else if (*flags == SOUP_TLD_RULE_NORMAL) {
-                               tld = cur_domain;
-                               break;
-                       } else if (*flags & SOUP_TLD_RULE_EXCEPTION) {
-                               tld = next_dot + 1;
-                               break;
-                       }
-               }
-
-               /* If we hit the top and haven't matched yet, then it
-                * has no public suffix.
-                */
-               if (!next_dot) {
-                       g_set_error_literal (error, SOUP_TLD_ERROR,
-                                            SOUP_TLD_ERROR_NO_BASE_DOMAIN,
-                                            _("Hostname has no base domain"));
-                       g_free (utf8_hostname);
                        return NULL;
                }
-
-               prev_domain = cur_domain;
-               cur_domain = next_dot + 1;
-       }
-
-       if (orig_hostname) {
-               int dots;
-               const char *p;
-
-               /* Count the number of dots that appear after tld in
-                * utf8_hostname, and then find the corresponding spot
-                * in orig_hostname;
-                */
-               for (p = tld, dots = 0; *p; p++) {
-                       if (*p == '.')
-                               dots++;
-               }
-
-               for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
-                       if (*(p - 1) == '.') {
-                               if (dots)
-                                       dots--;
-                               else
-                                       break;
-                       }
-               }
-               /* It's not possible for utf8_hostname to have had
-                * more dots than orig_hostname.
-                */
-               g_assert (dots == 0);
-
-               tld = p;
                g_free (utf8_hostname);
-               hostname = orig_hostname;
        }
 
-       /* Include the additional number of domains requested. */
-       add_domains = additional_domains;
-       while (tld != hostname) {
-               if (*(--tld) == '.' && (!(add_domains--))) {
-                       ++add_domains;
-                       ++tld;
-                       break;
-               }
+       /* Fetch the domain portion of the hostname and check whether
+        * it's a public domain. */
+       unregistrable_domain = psl_unregistrable_domain (psl, hostname);
+       if (!psl_is_public_suffix2 (psl, unregistrable_domain, PSL_TYPE_ANY | PSL_TYPE_NO_STAR_RULE)) {
+               g_set_error_literal (error, SOUP_TLD_ERROR,
+                                    SOUP_TLD_ERROR_NO_BASE_DOMAIN,
+                                    _("Hostname has no base domain"));
+               return NULL;
        }
 
-       /* If additional_domains > 0 then we haven't found enough additional domains. */
-       if (add_domains) {
+       registrable_domain = psl_registrable_domain (psl, hostname);
+       if (!registrable_domain) {
                g_set_error_literal (error, SOUP_TLD_ERROR,
                                     SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
                                     _("Not enough domains"));
                return NULL;
        }
 
-       return tld;
+       return registrable_domain;
 }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]