[gnome-terminal/gnome-3-18] screen: Rewrite URL regexes



commit 3bd692f5d63f0bdfb526d570c92da3270601c155
Author: Christian Persch <chpe gnome org>
Date:   Sun Feb 21 15:46:05 2016 +0100

    screen: Rewrite URL regexes
    
    Rewrite the URL match regex to be more correct.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=756038
    (cherry picked from commit 1a94499aca8f9a06479f5149e70e246bf9249ac0)

 src/Makefile.am       |   27 +++++
 src/terminal-regex.c  |  313 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/terminal-regex.h  |  145 +++++++++++++++++++++++
 src/terminal-screen.c |   27 ++---
 4 files changed, 493 insertions(+), 19 deletions(-)
---
diff --git a/src/Makefile.am b/src/Makefile.am
index 0af81d7..dfc41d1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -6,6 +6,8 @@ bin_PROGRAMS = gnome-terminal
 libexec_PROGRAMS = gnome-terminal-server
 noinst_PROGRAMS =
 
+check_PROGRAMS = terminal-regex
+
 if WITH_NAUTILUS_EXTENSION
 nautilusextension_LTLIBRARIES = libterminal-nautilus.la
 endif # WITH_NAUTILUS_EXTENSION
@@ -60,6 +62,7 @@ gnome_terminal_server_SOURCES = \
        terminal-prefs.h \
        terminal-profiles-list.c \
        terminal-profiles-list.h \
+       terminal-regex.h \
        terminal-schemas.h \
        terminal-settings-list.c \
        terminal-settings-list.h \
@@ -149,6 +152,30 @@ terminal-gdbus-generated.c terminal-gdbus-generated.h: org.gnome.Terminal.xml Ma
 terminal-resources.h terminal-resources.c: terminal.gresource.xml Makefile $(shell $(GLIB_COMPILE_RESOURCES) 
--generate-dependencies --sourcedir $(srcdir) $(srcdir)/terminal.gresource.xml)
        $(AM_V_GEN) XMLLINT=$(XMLLINT) $(GLIB_COMPILE_RESOURCES) --target $@ --sourcedir $(srcdir) --generate 
--c-name terminal $<
 
+# Checks
+
+TESTS = \
+       terminal-regex \
+       $(NULL)
+
+# Check programmes
+
+terminal_regex_CPPFLAGS = \
+       $(AM_CPPFLAGS)
+terminal_regex_SOURCES = \
+       terminal-regex.c \
+       terminal-regex.h \
+       $(NULL)
+terminal_regex_CFLAGS = \
+       -DTERMINAL_REGEX_MAIN \
+       $(TERM_CFLAGS) \
+       $(WARN_CFLAGS) \
+       $(AM_CFLAGS)
+terminal_regex_LDFLAGS = \
+       $(AM_LDFLAGS)
+terminal_regex_LDADD = \
+       $(TERM_LIBS)
+
 # Terminal client
 
 if ENABLE_GTERMINAL
diff --git a/src/terminal-regex.c b/src/terminal-regex.c
new file mode 100644
index 0000000..b18b432
--- /dev/null
+++ b/src/terminal-regex.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+#include <stdio.h>
+
+#include "terminal-regex.h"
+
+#ifdef TERMINAL_REGEX_MAIN
+
+/* Shorthand for expecting the pattern to match the entire input string */
+#define ENTIRE ((char *) 1)
+
+/* Returns a newly allocated copy of the text PATTERN matches in STRING (g_free() it), or NULL if no match. */
+static char*
+get_match (const char *pattern, const char *string, GRegexMatchFlags match_flags)
+{
+  GMatchInfo *match_info = NULL;
+  gchar *match = NULL;
+  GRegex *regex = g_regex_new (pattern, 0, 0, NULL);
+
+  g_assert (regex != NULL); /* a pattern that fails to compile must not be reported as "no match" */
+  if (g_regex_match (regex, string, match_flags, &match_info))
+    match = g_match_info_fetch (match_info, 0);
+  g_match_info_free (match_info); /* GMatchInfo and GRegex own internal data; g_free() would leak it */
+  g_regex_unref (regex);
+  return match;
+}
+
+/* Macros rather than functions to report useful line numbers on failure.
+ * ENTIRE as the expected value means the match must equal the whole input string. */
+#define assert_match_flags(pattern, string, expected, flags) do { \
+  const gchar *input_ = (string); \
+  const gchar *expected_ = (expected); \
+  gchar *actual_ = get_match ((pattern), input_, (flags)); \
+  if (expected_ == ENTIRE) expected_ = input_; \
+  g_assert_cmpstr (actual_, ==, expected_); \
+  g_free (actual_); \
+} while (0)
+
+/* Unanchored: the match may start anywhere in the input string. */
+#define assert_match(pattern, string, expected) assert_match_flags (pattern, string, expected, 0)
+
+/* Anchored: the match must start at the beginning of the input string. */
+#define assert_match_anchored(pattern, string, expected) assert_match_flags (pattern, string, expected, G_REGEX_MATCH_ANCHORED)
+
+/* Runs every regex unit test in order; a failing g_assert_cmpstr() aborts, otherwise prints a success line and returns 0. */
+int
+main (int argc, char **argv)
+{
+  /* SCHEME is case insensitive */
+  assert_match_anchored (SCHEME, "http",  ENTIRE);
+  assert_match_anchored (SCHEME, "HTTPS", ENTIRE);
+
+  /* USER is nonempty, alphanumeric, dot, plus and dash */
+  assert_match_anchored (USER, "",              NULL);
+  assert_match_anchored (USER, "dr.john-smith", ENTIRE);
+  assert_match_anchored (USER, "abc+def@ghi",   "abc+def");
+
+  /* PASS is optional colon-prefixed value, allowing quite some characters, but definitely not @ */
+  assert_match_anchored (PASS, "",          ENTIRE);
+  assert_match_anchored (PASS, "nocolon",   "");
+  assert_match_anchored (PASS, ":s3cr3T",   ENTIRE);
+  assert_match_anchored (PASS, ":$?#@host", ":$?#");
+
+  /* Hostname of at least 1 component, containing at least one non-digit in at least one of the segments */
+  assert_match_anchored (HOSTNAME1, "example.com",       ENTIRE);
+  assert_match_anchored (HOSTNAME1, "a-b.c-d",           ENTIRE);
+  assert_match_anchored (HOSTNAME1, "a_b",               "a");    /* TODO: can/should we totally abort here? */
+  assert_match_anchored (HOSTNAME1, "déjà-vu.com",       ENTIRE);
+  assert_match_anchored (HOSTNAME1, "➡.ws",              ENTIRE);
+  assert_match_anchored (HOSTNAME1, "cömbining-áccents", ENTIRE);
+  assert_match_anchored (HOSTNAME1, "12",                NULL);
+  assert_match_anchored (HOSTNAME1, "12.34",             NULL);
+  assert_match_anchored (HOSTNAME1, "12.ab",             ENTIRE);
+//  assert_match_anchored (HOSTNAME1, "ab.12",             NULL);  /* errr... could we fail here?? */
+
+  /* Hostname of at least 2 components, containing at least one non-digit in at least one of the segments */
+  assert_match_anchored (HOSTNAME2, "example.com",       ENTIRE);
+  assert_match_anchored (HOSTNAME2, "example",           NULL);
+  assert_match_anchored (HOSTNAME2, "12",                NULL);
+  assert_match_anchored (HOSTNAME2, "12.34",             NULL);
+  assert_match_anchored (HOSTNAME2, "12.ab",             ENTIRE);
+  assert_match_anchored (HOSTNAME2, "ab.12",             NULL);
+//  assert_match_anchored (HOSTNAME2, "ab.cd.12",          NULL);  /* errr... could we fail here?? */
+
+  /* IPv4 segment (number between 0 and 255) */
+  assert_match_anchored (DEFS "(?&S4)", "0",    ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "1",    ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "9",    ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "10",   ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "99",   ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "100",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "200",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "250",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "255",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "256",  NULL);
+  assert_match_anchored (DEFS "(?&S4)", "260",  NULL);
+  assert_match_anchored (DEFS "(?&S4)", "300",  NULL);
+  assert_match_anchored (DEFS "(?&S4)", "1000", NULL);
+  assert_match_anchored (DEFS "(?&S4)", "",     NULL);
+  assert_match_anchored (DEFS "(?&S4)", "a1b",  NULL);
+
+  /* IPv4 addresses */
+  assert_match_anchored (DEFS "(?&IPV4)", "11.22.33.44",    ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV4)", "0.1.254.255",    ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV4)", "75.150.225.300", NULL);
+  assert_match_anchored (DEFS "(?&IPV4)", "1.2.3.4.5",      "1.2.3.4");  /* we could also bail out and not match at all */
+
+  /* IPv6 addresses */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:::22",                           NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:44::55:66",               NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "dead::beef",                        ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "faded::bee",                        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "live::pork",                        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::1",                               ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "11::22:33::44",                     NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:::33",                        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "dead:beef::192.168.1.1",            ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "192.168.1.1",                       NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:87654",        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:45678",                   NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.12345", NULL);
+
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77",              NULL);   /* no :: */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88",           ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88:99",        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77",            ENTIRE); /* :: at the start */
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77:88",         NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77",             ENTIRE); /* :: in the middle */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77:88",          NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77::",            ENTIRE); /* :: at the end */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88::",         NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::",                                ENTIRE); /* :: only */
+
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:192.168.1.1",        NULL);   /* no :: */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.1",     ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:192.168.1.1",  NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:192.168.1.1",      ENTIRE); /* :: at the start */
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:192.168.1.1",   NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:192.168.1.1",       ENTIRE); /* :: in the middle */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:192.168.1.1",    NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55::192.168.1.1",       ENTIRE); /* :: at the end(ish) */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66::192.168.1.1",    NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::192.168.1.1",                     ENTIRE); /* :: only(ish) */
+
+  /* URL_HOST is either a hostname, or an IPv4 address, or a bracket-enclosed IPv6 address */
+  assert_match_anchored (DEFS URL_HOST, "example",       ENTIRE);
+  assert_match_anchored (DEFS URL_HOST, "example.com",   ENTIRE);
+  assert_match_anchored (DEFS URL_HOST, "11.22.33.44",   ENTIRE);
+  assert_match_anchored (DEFS URL_HOST, "[11.22.33.44]", NULL);
+  assert_match_anchored (DEFS URL_HOST, "dead::be:ef",   "dead");  /* TODO: can/should we totally abort here? */
+  assert_match_anchored (DEFS URL_HOST, "[dead::be:ef]", ENTIRE);
+
+  /* EMAIL_HOST is either an at least two-component hostname, or a bracket-enclosed IPv[46] address */
+  assert_match_anchored (DEFS EMAIL_HOST, "example",        NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "example.com",    ENTIRE);
+  assert_match_anchored (DEFS EMAIL_HOST, "11.22.33.44",    NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.44]",  ENTIRE);
+  assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.456]", NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "dead::be:ef",    NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "[dead::be:ef]",  ENTIRE);
+
+  /* Number between 1 and 65535 (helper for port) */
+  assert_match_anchored (N_1_65535, "0",      NULL);
+  assert_match_anchored (N_1_65535, "1",      ENTIRE);
+  assert_match_anchored (N_1_65535, "10",     ENTIRE);
+  assert_match_anchored (N_1_65535, "100",    ENTIRE);
+  assert_match_anchored (N_1_65535, "1000",   ENTIRE);
+  assert_match_anchored (N_1_65535, "10000",  ENTIRE);
+  assert_match_anchored (N_1_65535, "60000",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65000",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65500",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65530",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65535",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65536",  NULL);
+  assert_match_anchored (N_1_65535, "65540",  NULL);
+  assert_match_anchored (N_1_65535, "65600",  NULL);
+  assert_match_anchored (N_1_65535, "66000",  NULL);
+  assert_match_anchored (N_1_65535, "70000",  NULL);
+  assert_match_anchored (N_1_65535, "100000", NULL);
+  assert_match_anchored (N_1_65535, "",       NULL);
+  assert_match_anchored (N_1_65535, "a1b",    NULL);
+
+  /* PORT is an optional colon-prefixed value */
+  assert_match_anchored (PORT, "",       ENTIRE);
+  assert_match_anchored (PORT, ":1",     ENTIRE);
+  assert_match_anchored (PORT, ":65535", ENTIRE);
+  assert_match_anchored (PORT, ":65536", "");     /* TODO: can/should we totally abort here? */
+
+  /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
+  assert_match_anchored (URLPATH, "/ab/cd",       ENTIRE);
+  assert_match_anchored (URLPATH, "/ab/cd.html.", "/ab/cd.html");
+
+  /* Put the components together and test the big picture */
+
+  assert_match (REGEX_URL_AS_IS, "There's no URL here http:/foo",               NULL);
+  assert_match (REGEX_URL_AS_IS, "Visit http://example.com for details",        "http://example.com");
+  assert_match (REGEX_URL_AS_IS, "Trailing dot http://foo/bar.html.",           "http://foo/bar.html");
+  assert_match (REGEX_URL_AS_IS, "Trailing ellipsis http://foo/bar.html...",    "http://foo/bar.html");
+  assert_match (REGEX_URL_AS_IS, "See <http://foo/bar>",                        "http://foo/bar");
+  assert_match (REGEX_URL_AS_IS, "<http://foo.bar/asdf.qwer.html>",             "http://foo.bar/asdf.qwer.html");
+  assert_match (REGEX_URL_AS_IS, "Go to http://192.168.1.1.",                   "http://192.168.1.1");
+  assert_match (REGEX_URL_AS_IS, "If not, see <http://www.gnu.org/licenses/>.", "http://www.gnu.org/licenses/");
+
+  assert_match (REGEX_URL_AS_IS, "http://",          NULL);
+  assert_match (REGEX_URL_AS_IS, "http://a",         ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.",       "http://aa");
+  assert_match (REGEX_URL_AS_IS, "http://aa.b",      ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb",     ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb/c",   ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc",  ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc/", ENTIRE);
+
+  assert_match (REGEX_URL_AS_IS, "HtTp://déjà-vu.com:10000/déjà/vu", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "HTTP://joe:sEcReT@➡.ws:1080",      ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "https://cömbining-áccents",        ENTIRE);
+
+  assert_match (REGEX_URL_AS_IS, "http://111.222.33.44",                ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/",               ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/foo",            ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:5555/xyz",             ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "https://[dead::beef]:12345/ipv6",     ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "https://[dead::beef:11.22.33.44]",    ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:",                     "http://1.2.3.4");  /* TODO: can/should we totally abort here? */
+  assert_match (REGEX_URL_AS_IS, "https://dead::beef/no-brackets-ipv6", "https://dead");    /* ditto */
+  assert_match (REGEX_URL_AS_IS, "http://111.222.333.444/",             NULL);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000",                "http://1.2.3.4");  /* TODO: can/should we totally abort here? */
+  assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]", NULL);
+
+  /* Username, password */
+  assert_match (REGEX_URL_AS_IS, "http://joe@example.com",                 ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://user.name:sec.ret@host.name",     ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://joe:secret@[::1]",                ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://dudewithnopassword:@example.com", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://safeguy:!#$%^&*@host",            ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://invalidusername!@host",           "http://invalidusername");
+
+  assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http:///foo",                         NULL);
+
+  /* No scheme */
+  assert_match (REGEX_URL_HTTP, "www.foo.bar/baz",     ENTIRE);
+  assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz",    ENTIRE);
+  assert_match (REGEX_URL_HTTP, "FTP.FOO.BAR/BAZ",     ENTIRE);  /* FIXME if no scheme is given and url starts with ftp, can we make the protocol ftp instead of http? */
+  assert_match (REGEX_URL_HTTP, "ftpxy.foo.bar/baz",   ENTIRE);
+//  assert_match (REGEX_URL_HTTP, "ftp.123/baz",         NULL);  /* errr... could we fail here?? */
+  assert_match (REGEX_URL_HTTP, "foo.bar/baz",         NULL);
+  assert_match (REGEX_URL_HTTP, "abc.www.foo.bar/baz", NULL);
+  assert_match (REGEX_URL_HTTP, "uvwww.foo.bar/baz",   NULL);
+  assert_match (REGEX_URL_HTTP, "xftp.foo.bar/baz",    NULL);
+
+  /* file:/ or file://(hostname)?/ */
+  assert_match (REGEX_URL_FILE, "file:",                NULL);
+  assert_match (REGEX_URL_FILE, "file:/",               ENTIRE);
+  assert_match (REGEX_URL_FILE, "file://",              NULL);
+  assert_match (REGEX_URL_FILE, "file:///",             ENTIRE);
+  assert_match (REGEX_URL_FILE, "file:////",            NULL);
+  assert_match (REGEX_URL_FILE, "file:etc/passwd",      NULL);
+  assert_match (REGEX_URL_FILE, "File:/etc/passwd",     ENTIRE);
+  assert_match (REGEX_URL_FILE, "FILE:///etc/passwd",   ENTIRE);
+  assert_match (REGEX_URL_FILE, "file:////etc/passwd",  NULL);
+  assert_match (REGEX_URL_FILE, "file://host.name",     NULL);
+  assert_match (REGEX_URL_FILE, "file://host.name/",    ENTIRE);
+  assert_match (REGEX_URL_FILE, "file://host.name/etc", ENTIRE);
+
+  assert_match (REGEX_URL_FILE, "See file:/.",             "file:/");
+  assert_match (REGEX_URL_FILE, "See file:///.",           "file:///");
+  assert_match (REGEX_URL_FILE, "See file:/lost+found.",   "file:/lost+found");
+  assert_match (REGEX_URL_FILE, "See file:///lost+found.", "file:///lost+found");
+
+  /* Email */
+  assert_match (REGEX_EMAIL, "Write to foo@bar.com.",        "foo@bar.com");
+  assert_match (REGEX_EMAIL, "Write to <foo@bar.com>",       "foo@bar.com");
+  assert_match (REGEX_EMAIL, "Write to mailto:foo@bar.com.", "mailto:foo@bar.com");
+  assert_match (REGEX_EMAIL, "Write to MAILTO:FOO@BAR.COM.", "MAILTO:FOO@BAR.COM");
+  assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.4]",       "foo@[1.2.3.4]");
+  assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.456]",     NULL);
+  assert_match (REGEX_EMAIL, "Write to foo@[1::2345]",       "foo@[1::2345]");
+
+  /* Sip, examples from rfc 3261 */
+  assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15",           ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com",                                      ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:alice:secretword@atlanta.com;transport=tcp",             ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sips:alice@atlanta.com?subject=project%20x&priority=urgent", ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:+1-212-555-1212:1234@gateway.com;user=phone",            ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sips:1212@gateway.com",                                      ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:alice@192.0.2.4",                                        ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:atlanta.com;method=REGISTER?to=alice%40atlanta.com",     ENTIRE);
+  assert_match (REGEX_URL_VOIP, "SIP:alice;day=tuesday@atlanta.com",                          ENTIRE);
+  assert_match (REGEX_URL_VOIP, "Dial sip:alice@192.0.2.4.",                                  "sip:alice@192.0.2.4");
+
+  printf("terminal-regex tests passed :)\n");
+  return 0;
+}
+
+#endif /* TERMINAL_REGEX_MAIN */
diff --git a/src/terminal-regex.h b/src/terminal-regex.h
new file mode 100644
index 0000000..f53bdce
--- /dev/null
+++ b/src/terminal-regex.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Mini style-guide:
+ *
+ * #define'd fragments should preferably have an outermost group, for the
+ * exact same reason as why usually in C/C++ #define's the values are enclosed
+ * in parentheses: that is, so that you don't get surprised when you use the
+ * macro and append a quantifier.
+ *
+ * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
+ * as (?&NAME), so that the regex string and the compiled regex object is
+ * smaller.
+ *
+ * Build small blocks, comment and unittest them heavily.
+ *
+ * Use free-spacing mode for improved readability. The hardest to read is
+ * which additional characters belong to a "(?" prefix. To improve
+ * readability, place a space after this, and for symmetry, before the closing
+ * parenthesis. Also place a space around "|" characters. No space before
+ * quantifiers. Try to be consistent with the existing style (yes I know the
+ * existing style is not consistent either, but please do your best).
+ *
+ * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
+ * syntaxes.
+ */
+
+#ifndef TERMINAL_REGEX_H
+#define TERMINAL_REGEX_H
+
+#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
+
+#define USERCHARS "-+.[:alnum:]"
+/* Nonempty username, e.g. "john.smith" */
+#define USER "[" USERCHARS "]+"
+
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", "" */
+#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
+
+/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
+#define USERPASS "(?:" USER PASS "@)?"
+
+/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256".
+   The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
+#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] ) )))"
+
+/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */
+#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
+
+/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
+ * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
+ * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
+ * TODO: more strict check (right number of colons, etc.)
+ * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
+
+/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
+#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
+
+/* No :: shorthand */
+#define IPV6_FULL  "(?x: (?&S6C){7} (?&S6) )"
+/* Begins with :: */
+#define IPV6_LEFT  "(?x: : (?&CS6){1,7} )"
+/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
+#define IPV6_MID   "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
+/* Ends with :: */
+#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
+/* Is "::" and nothing more */
+#define IPV6_NULL  "(?x: :: )"
+
+/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
+#define IPV6V4_FULL  "(?x: (?&S6C){6} )"
+#define IPV6V4_LEFT  "(?x: :: (?&S6C){0,5} )"  /* includes "::<ipv4>" */
+#define IPV6V4_MID   "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
+#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
+
+/* IPV6: An IPv6 address (possibly with an embedded IPv4).
+ * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
+#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
+
+/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
+ * then any graphical Unicode character.
+ * A segment can consist entirely of numbers.
+ * (Note: PCRE doesn't support character class subtraction/intersection.) */
+#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
+
+/* A hostname of at least 1 component. The last component cannot be entirely numbers.
+ * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
+#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] ) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
+
+/* A hostname of at least 2 components. The last component cannot be entirely numbers.
+ * E.g. "example.com", "1234.com", but not "1234.56" */
+#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
+
+/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
+#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
+
+/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]".
+ * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */
+#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
+
+/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
+   and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
+#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
+
+/* Optional colon-prefixed port, e.g. ":1080", "" */
+#define PORT "(?x: \\:" N_1_65535 " )?"
+
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%\\E]"
+/* Chars not to end a URL */
+#define PATHNONTERM_CLASS "[\\Q.!,?\\E]"
+
+/* Lookbehind at the end, so that the last character (if we matched a character at all) is not from PATHNONTERM_CLASS */
+#define URLPATH "(?x: /" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+#define VOIP_PATH "(?x: [;?]" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+
+/* Now let's put these fragments together */
+
+#define DEFS IP_DEF
+
+#define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
+/* file:/path or file://[hostname]/path, e.g. "file:/etc/passwd", "file:///etc/passwd", "file://host.name/etc" */
+#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?x: " PATHCHARS_CLASS "+ (?<! " PATHNONTERM_CLASS " ) )?"
+/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
+#define REGEX_URL_HTTP   DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH
+#define REGEX_URL_VOIP   DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
+#define REGEX_EMAIL      DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
+#define REGEX_NEWS_MAN   "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
+
+#endif /* !TERMINAL_REGEX_H */
diff --git a/src/terminal-screen.c b/src/terminal-screen.c
index a48de10..051f7be 100644
--- a/src/terminal-screen.c
+++ b/src/terminal-screen.c
@@ -19,6 +19,7 @@
 #include "config.h"
 #define _GNU_SOURCE /* for dup3 */
 
+#include "terminal-regex.h"
 #include "terminal-screen.h"
 
 #include <errno.h>
@@ -158,30 +159,18 @@ static void terminal_screen_set_override_command (TerminalScreen  *screen,
 
 static guint signals[LAST_SIGNAL];
 
-#define USERCHARS "-[:alnum:]"
-#define USERCHARS_CLASS "[" USERCHARS "]"
-#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
-#define HOSTCHARS_CLASS "[-[:alnum:]]"
-#define HOST HOSTCHARS_CLASS "+(\\." HOSTCHARS_CLASS "+)*"
-#define PORT "(?:\\:[[:digit:]]{1,5})?"
-#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#%\\E]"
-#define PATHTERM_CLASS "[^\\Q]'.:}>) \t\r\n,\"\\E]"
-#define SCHEME "(?:news:|telnet:|nntp:|file:\\/|https?:|ftps?:|sftp:|webcal:)"
-#define USERPASS USERCHARS_CLASS "+(?:" PASSCHARS_CLASS "+)?"
-#define URLPATH   
"(?:(/"PATHCHARS_CLASS"+(?:[(]"PATHCHARS_CLASS"*[)])*"PATHCHARS_CLASS"*)*"PATHTERM_CLASS")?"
-
 typedef struct {
   const char *pattern;
   TerminalURLFlavour flavor;
-  GRegexCompileFlags flags;
 } TerminalRegexPattern;
 
 static const TerminalRegexPattern url_regex_patterns[] = {
-  { SCHEME "//(?:" USERPASS "\\@)?" HOST PORT URLPATH, FLAVOR_AS_IS, G_REGEX_CASELESS },
-  { "(?:www|ftp)" HOSTCHARS_CLASS "*\\." HOST PORT URLPATH , FLAVOR_DEFAULT_TO_HTTP, G_REGEX_CASELESS  },
-  { "(?:callto:|h323:|sip:)" USERCHARS_CLASS "[" USERCHARS ".]*(?:" PORT "/[a-z0-9]+)?\\@" HOST, 
FLAVOR_VOIP_CALL, G_REGEX_CASELESS  },
-  { "(?:mailto:)?" USERCHARS_CLASS "[" USERCHARS ".]*\\@" HOSTCHARS_CLASS "+\\." HOST, FLAVOR_EMAIL, 
G_REGEX_CASELESS  },
-  { "(?:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+", FLAVOR_AS_IS, G_REGEX_CASELESS  },
+  { REGEX_URL_AS_IS, FLAVOR_AS_IS },
+  { REGEX_URL_HTTP,  FLAVOR_DEFAULT_TO_HTTP },
+  { REGEX_URL_FILE,  FLAVOR_AS_IS },
+  { REGEX_URL_VOIP,  FLAVOR_VOIP_CALL },
+  { REGEX_EMAIL,     FLAVOR_EMAIL },
+  { REGEX_NEWS_MAN,  FLAVOR_AS_IS },
 };
 
 static GRegex **url_regexes;
@@ -544,7 +533,7 @@ terminal_screen_class_init (TerminalScreenClass *klass)
       GError *error = NULL;
 
       url_regexes[i] = g_regex_new (url_regex_patterns[i].pattern,
-                                    url_regex_patterns[i].flags | G_REGEX_OPTIMIZE,
+                                    G_REGEX_OPTIMIZE,
                                     0, &error);
       g_assert_no_error (error);
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]