[gnome-terminal/gnome-3-18] screen: Rewrite URL regexes
- From: Egmont Koblinger <egmontkob src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-terminal/gnome-3-18] screen: Rewrite URL regexes
- Date: Fri, 4 Mar 2016 17:40:52 +0000 (UTC)
commit 3bd692f5d63f0bdfb526d570c92da3270601c155
Author: Christian Persch <chpe gnome org>
Date: Sun Feb 21 15:46:05 2016 +0100
screen: Rewrite URL regexes
Rewrite the URL match regex to be more correct.
https://bugzilla.gnome.org/show_bug.cgi?id=756038
(cherry picked from commit 1a94499aca8f9a06479f5149e70e246bf9249ac0)
src/Makefile.am | 27 +++++
src/terminal-regex.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++
src/terminal-regex.h | 145 +++++++++++++++++++++++
src/terminal-screen.c | 27 ++---
4 files changed, 493 insertions(+), 19 deletions(-)
---
diff --git a/src/Makefile.am b/src/Makefile.am
index 0af81d7..dfc41d1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -6,6 +6,8 @@ bin_PROGRAMS = gnome-terminal
libexec_PROGRAMS = gnome-terminal-server
noinst_PROGRAMS =
+check_PROGRAMS = terminal-regex
+
if WITH_NAUTILUS_EXTENSION
nautilusextension_LTLIBRARIES = libterminal-nautilus.la
endif # WITH_NAUTILUS_EXTENSION
@@ -60,6 +62,7 @@ gnome_terminal_server_SOURCES = \
terminal-prefs.h \
terminal-profiles-list.c \
terminal-profiles-list.h \
+ terminal-regex.h \
terminal-schemas.h \
terminal-settings-list.c \
terminal-settings-list.h \
@@ -149,6 +152,30 @@ terminal-gdbus-generated.c terminal-gdbus-generated.h: org.gnome.Terminal.xml Ma
terminal-resources.h terminal-resources.c: terminal.gresource.xml Makefile $(shell $(GLIB_COMPILE_RESOURCES)
--generate-dependencies --sourcedir $(srcdir) $(srcdir)/terminal.gresource.xml)
$(AM_V_GEN) XMLLINT=$(XMLLINT) $(GLIB_COMPILE_RESOURCES) --target $@ --sourcedir $(srcdir) --generate
--c-name terminal $<
+# Checks
+
+TESTS = \
+ terminal-regex \
+ $(NULL)
+
+# Check programmes
+
+terminal_regex_CPPFLAGS = \
+ $(AM_CPPFLAGS)
+terminal_regex_SOURCES = \
+ terminal-regex.c \
+ terminal-regex.h \
+ $(NULL)
+terminal_regex_CFLAGS = \
+ -DTERMINAL_REGEX_MAIN \
+ $(TERM_CFLAGS) \
+ $(WARN_CFLAGS) \
+ $(AM_CFLAGS)
+terminal_regex_LDFLAGS = \
+ $(AM_LDFLAGS)
+terminal_regex_LDADD = \
+ $(TERM_LIBS)
+
# Terminal client
if ENABLE_GTERMINAL
diff --git a/src/terminal-regex.c b/src/terminal-regex.c
new file mode 100644
index 0000000..b18b432
--- /dev/null
+++ b/src/terminal-regex.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+#include <stdio.h>
+
+#include "terminal-regex.h"
+
+#ifdef TERMINAL_REGEX_MAIN
+
+/* Shorthand for expecting the pattern to match the entire input string */
+#define ENTIRE ((char *) 1)
+/* Returns a newly allocated copy of the text PATTERN matches in STRING (whole match, group 0); the caller g_free()s it. */
+static char*
+get_match (const char *pattern, const char *string, GRegexMatchFlags match_flags)
+{
+ GRegex *regex;
+ GMatchInfo *match_info;
+ gchar *match;
+
+ regex = g_regex_new (pattern, 0, 0, NULL);
+ g_regex_match (regex, string, match_flags, &match_info);
+ match = g_match_info_fetch (match_info, 0);
+ /* GRegex is reference-counted and GMatchInfo has a dedicated destructor: plain g_free() would leak what they own. */
+ g_regex_unref (regex);
+ g_match_info_free (match_info);
+ return match;
+}
+
+/* Macros rather than functions to report useful line numbers on failure. */
+#define assert_match(__pattern, __string, __expected) do { \
+ gchar *__actual_match = get_match(__pattern, __string, 0); \
+ const gchar *__expected_match = __expected; \
+ if (__expected_match == ENTIRE) __expected_match = __string; \
+ g_assert_cmpstr(__actual_match, ==, __expected_match); \
+ g_free (__actual_match); \
+} while (0)
+
+#define assert_match_anchored(__pattern, __string, __expected) do { \
+ gchar *__actual_match = get_match(__pattern, __string, G_REGEX_MATCH_ANCHORED); \
+ const gchar *__expected_match = __expected; \
+ if (__expected_match == ENTIRE) __expected_match = __string; \
+ g_assert_cmpstr(__actual_match, ==, __expected_match); \
+ g_free (__actual_match); \
+} while (0)
+
+int
+main (int argc, char **argv)
+{
+ /* SCHEME is case insensitive */
+ assert_match_anchored (SCHEME, "http", ENTIRE);
+ assert_match_anchored (SCHEME, "HTTPS", ENTIRE);
+
+ /* USER is nonempty, alphanumeric, dot, plus and dash */
+ assert_match_anchored (USER, "", NULL);
+ assert_match_anchored (USER, "dr.john-smith", ENTIRE);
+ assert_match_anchored (USER, "abc+def ghi", "abc+def");
+
+ /* PASS is optional colon-prefixed value, allowing quite some characters, but definitely not @ */
+ assert_match_anchored (PASS, "", ENTIRE);
+ assert_match_anchored (PASS, "nocolon", "");
+ assert_match_anchored (PASS, ":s3cr3T", ENTIRE);
+ assert_match_anchored (PASS, ":$?# host", ":$?#");
+
+ /* Hostname of at least 1 component, containing at least one non-digit in at least one of the segments */
+ assert_match_anchored (HOSTNAME1, "example.com", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "a-b.c-d", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "a_b", "a"); /* TODO: can/should we totally abort here?
*/
+ assert_match_anchored (HOSTNAME1, "déjà-vu.com", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "➡.ws", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "cömbining-áccents", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "12", NULL);
+ assert_match_anchored (HOSTNAME1, "12.34", NULL);
+ assert_match_anchored (HOSTNAME1, "12.ab", ENTIRE);
+// assert_match_anchored (HOSTNAME1, "ab.12", NULL); /* errr... could we fail here?? */
+
+ /* Hostname of at least 2 components, containing at least one non-digit in at least one of the segments */
+ assert_match_anchored (HOSTNAME2, "example.com", ENTIRE);
+ assert_match_anchored (HOSTNAME2, "example", NULL);
+ assert_match_anchored (HOSTNAME2, "12", NULL);
+ assert_match_anchored (HOSTNAME2, "12.34", NULL);
+ assert_match_anchored (HOSTNAME2, "12.ab", ENTIRE);
+ assert_match_anchored (HOSTNAME2, "ab.12", NULL);
+// assert_match_anchored (HOSTNAME2, "ab.cd.12", NULL); /* errr... could we fail here?? */
+
+ /* IPv4 segment (number between 0 and 255) */
+ assert_match_anchored (DEFS "(?&S4)", "0", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "1", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "9", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "10", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "99", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "100", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "200", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "250", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "255", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "256", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "260", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "300", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "1000", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "a1b", NULL);
+
+ /* IPv4 addresses */
+ assert_match_anchored (DEFS "(?&IPV4)", "11.22.33.44", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV4)", "0.1.254.255", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV4)", "75.150.225.300", NULL);
+ assert_match_anchored (DEFS "(?&IPV4)", "1.2.3.4.5", "1.2.3.4"); /* we could also bail out and not
match at all */
+
+ /* IPv6 addresses */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:::22", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:44::55:66", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "dead::beef", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "faded::bee", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "live::pork", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11::22:33::44", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:::33", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "dead:beef::192.168.1.1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:87654", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:45678", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.12345", NULL);
+
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77", NULL); /* no :: */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88:99", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77", ENTIRE); /* :: at the start */
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77:88", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77", ENTIRE); /* :: in the middle
*/
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77:88", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77::", ENTIRE); /* :: at the end */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88::", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::", ENTIRE); /* :: only */
+
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:192.168.1.1", NULL); /* no :: */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:192.168.1.1", ENTIRE); /* :: at the start */
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:192.168.1.1", ENTIRE); /* :: in the middle
*/
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55::192.168.1.1", ENTIRE); /* :: at the
end(ish) */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66::192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::192.168.1.1", ENTIRE); /* :: only(ish) */
+
+ /* URL_HOST is either a hostname, or an IPv4 address, or a bracket-enclosed IPv6 address */
+ assert_match_anchored (DEFS URL_HOST, "example", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "example.com", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "11.22.33.44", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "[11.22.33.44]", NULL);
+ assert_match_anchored (DEFS URL_HOST, "dead::be:ef", "dead"); /* TODO: can/should we totally abort
here? */
+ assert_match_anchored (DEFS URL_HOST, "[dead::be:ef]", ENTIRE);
+
+ /* EMAIL_HOST is either an at least two-component hostname, or a bracket-enclosed IPv[46] address */
+ assert_match_anchored (DEFS EMAIL_HOST, "example", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "example.com", ENTIRE);
+ assert_match_anchored (DEFS EMAIL_HOST, "11.22.33.44", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.44]", ENTIRE);
+ assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.456]", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "dead::be:ef", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "[dead::be:ef]", ENTIRE);
+
+ /* Number between 1 and 65535 (helper for port) */
+ assert_match_anchored (N_1_65535, "0", NULL);
+ assert_match_anchored (N_1_65535, "1", ENTIRE);
+ assert_match_anchored (N_1_65535, "10", ENTIRE);
+ assert_match_anchored (N_1_65535, "100", ENTIRE);
+ assert_match_anchored (N_1_65535, "1000", ENTIRE);
+ assert_match_anchored (N_1_65535, "10000", ENTIRE);
+ assert_match_anchored (N_1_65535, "60000", ENTIRE);
+ assert_match_anchored (N_1_65535, "65000", ENTIRE);
+ assert_match_anchored (N_1_65535, "65500", ENTIRE);
+ assert_match_anchored (N_1_65535, "65530", ENTIRE);
+ assert_match_anchored (N_1_65535, "65535", ENTIRE);
+ assert_match_anchored (N_1_65535, "65536", NULL);
+ assert_match_anchored (N_1_65535, "65540", NULL);
+ assert_match_anchored (N_1_65535, "65600", NULL);
+ assert_match_anchored (N_1_65535, "66000", NULL);
+ assert_match_anchored (N_1_65535, "70000", NULL);
+ assert_match_anchored (N_1_65535, "100000", NULL);
+ assert_match_anchored (N_1_65535, "", NULL);
+ assert_match_anchored (N_1_65535, "a1b", NULL);
+
+ /* PORT is an optional colon-prefixed value */
+ assert_match_anchored (PORT, "", ENTIRE);
+ assert_match_anchored (PORT, ":1", ENTIRE);
+ assert_match_anchored (PORT, ":65535", ENTIRE);
+ assert_match_anchored (PORT, ":65536", ""); /* TODO: can/should we totally abort here? */
+
+ /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
+ assert_match_anchored (URLPATH, "/ab/cd", ENTIRE);
+ assert_match_anchored (URLPATH, "/ab/cd.html.", "/ab/cd.html");
+
+
+ /* Put the components together and test the big picture */
+
+ assert_match (REGEX_URL_AS_IS, "There's no URL here http:/foo", NULL);
+ assert_match (REGEX_URL_AS_IS, "Visit http://example.com for details", "http://example.com");
+ assert_match (REGEX_URL_AS_IS, "Trailing dot http://foo/bar.html.", "http://foo/bar.html");
+ assert_match (REGEX_URL_AS_IS, "Trailing ellipsis http://foo/bar.html...", "http://foo/bar.html");
+ assert_match (REGEX_URL_AS_IS, "See <http://foo/bar>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<http://foo.bar/asdf.qwer.html>",
"http://foo.bar/asdf.qwer.html");
+ assert_match (REGEX_URL_AS_IS, "Go to http://192.168.1.1.", "http://192.168.1.1");
+ assert_match (REGEX_URL_AS_IS, "If not, see <http://www.gnu.org/licenses/>.",
"http://www.gnu.org/licenses/");
+
+ assert_match (REGEX_URL_AS_IS, "http://", NULL);
+ assert_match (REGEX_URL_AS_IS, "http://a", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.", "http://aa");
+ assert_match (REGEX_URL_AS_IS, "http://aa.b", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/c", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc/", ENTIRE);
+
+ assert_match (REGEX_URL_AS_IS, "HtTp://déjà-vu.com:10000/déjà/vu", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "HTTP://joe:sEcReT@➡.ws:1080", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://cömbining-áccents", ENTIRE);
+
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/foo", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:5555/xyz", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://[dead::beef]:12345/ipv6", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://[dead::beef:11.22.33.44]", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:", "http://1.2.3.4"); /* TODO:
can/should we totally abort here? */
+ assert_match (REGEX_URL_AS_IS, "https://dead::beef/no-brackets-ipv6", "https://dead"); /* ditto */
+ assert_match (REGEX_URL_AS_IS, "http://111.222.333.444/", NULL);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000", "http://1.2.3.4"); /* TODO:
can/should we totally abort here? */
+ assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]", NULL);
+
+ /* Username, password */
+ assert_match (REGEX_URL_AS_IS, "http://joe@example.com", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://user.name:sec.ret@host.name", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://joe:secret@[::1]", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://dudewithnopassword:@example.com", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://safeguy:!#$%^&*@host", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://invalidusername!@host", "http://invalidusername");
+
+ assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http:///foo", NULL);
+
+ /* No scheme */
+ assert_match (REGEX_URL_HTTP, "www.foo.bar/baz", ENTIRE);
+ assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz", ENTIRE);
+ assert_match (REGEX_URL_HTTP, "FTP.FOO.BAR/BAZ", ENTIRE); /* FIXME if no scheme is given and url
starts with ftp, can we make the protocol ftp instead of http? */
+ assert_match (REGEX_URL_HTTP, "ftpxy.foo.bar/baz", ENTIRE);
+// assert_match (REGEX_URL_HTTP, "ftp.123/baz", NULL); /* errr... could we fail here?? */
+ assert_match (REGEX_URL_HTTP, "foo.bar/baz", NULL);
+ assert_match (REGEX_URL_HTTP, "abc.www.foo.bar/baz", NULL);
+ assert_match (REGEX_URL_HTTP, "uvwww.foo.bar/baz", NULL);
+ assert_match (REGEX_URL_HTTP, "xftp.foo.bar/baz", NULL);
+
+ /* file:/ or file://(hostname)?/ */
+ assert_match (REGEX_URL_FILE, "file:", NULL);
+ assert_match (REGEX_URL_FILE, "file:/", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file://", NULL);
+ assert_match (REGEX_URL_FILE, "file:///", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file:////", NULL);
+ assert_match (REGEX_URL_FILE, "file:etc/passwd", NULL);
+ assert_match (REGEX_URL_FILE, "File:/etc/passwd", ENTIRE);
+ assert_match (REGEX_URL_FILE, "FILE:///etc/passwd", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file:////etc/passwd", NULL);
+ assert_match (REGEX_URL_FILE, "file://host.name", NULL);
+ assert_match (REGEX_URL_FILE, "file://host.name/", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file://host.name/etc", ENTIRE);
+
+ assert_match (REGEX_URL_FILE, "See file:/.", "file:/");
+ assert_match (REGEX_URL_FILE, "See file:///.", "file:///");
+ assert_match (REGEX_URL_FILE, "See file:/lost+found.", "file:/lost+found");
+ assert_match (REGEX_URL_FILE, "See file:///lost+found.", "file:///lost+found");
+
+ /* Email */
+ assert_match (REGEX_EMAIL, "Write to foo@bar.com.", "foo@bar.com");
+ assert_match (REGEX_EMAIL, "Write to <foo@bar.com>", "foo@bar.com");
+ assert_match (REGEX_EMAIL, "Write to mailto:foo@bar.com.", "mailto:foo@bar.com");
+ assert_match (REGEX_EMAIL, "Write to MAILTO:FOO@BAR.COM.", "MAILTO:FOO@BAR.COM");
+ assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.4]", "foo@[1.2.3.4]");
+ assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.456]", NULL);
+ assert_match (REGEX_EMAIL, "Write to foo@[1::2345]", "foo@[1::2345]");
+
+ /* Sip, examples from rfc 3261 */
+ assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice:secretword@atlanta.com;transport=tcp", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sips:alice@atlanta.com?subject=project%20x&priority=urgent", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:+1-212-555-1212:1234@gateway.com;user=phone", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sips:1212@gateway.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice@192.0.2.4", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:atlanta.com;method=REGISTER?to=alice%40atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "SIP:alice;day=tuesday@atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "Dial sip:alice@192.0.2.4.", "sip:alice@192.0.2.4");
+
+ printf("terminal-regex tests passed :)\n");
+ return 0;
+}
+
+#endif /* TERMINAL_REGEX_MAIN */
diff --git a/src/terminal-regex.h b/src/terminal-regex.h
new file mode 100644
index 0000000..f53bdce
--- /dev/null
+++ b/src/terminal-regex.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Mini style-guide:
+ *
+ * #define'd fragments should preferably have an outermost group, for the
+ * exact same reason as why usually in C/C++ #define's the values are enclosed
+ * in parentheses: that is, so that you don't get surprised when you use the
+ * macro and append a quantifier.
+ *
+ * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
+ * as (?&NAME), so that the regex string and the compiled regex object are
+ * smaller.
+ *
+ * Build small blocks, comment and unittest them heavily.
+ *
+ * Use free-spacing mode for improved readability. The hardest to read is
+ * which additional characters belong to a "(?" prefix. To improve
+ * readability, place a space after this, and for symmetry, before the closing
+ * parenthesis. Also place a space around "|" characters. No space before
+ * quantifiers. Try to be consistent with the existing style (yes I know the
+ * existing style is not consistent either, but please do your best).
+ *
+ * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
+ * syntaxes.
+ */
+
+#ifndef TERMINAL_REGEX_H
+#define TERMINAL_REGEX_H
+
+#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
+
+#define USERCHARS "-+.[:alnum:]"
+/* Nonempty username, e.g. "john.smith" */
+#define USER "[" USERCHARS "]+"
+
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":",
"" */
+#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
+
+/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
+#define USERPASS "(?:" USER PASS "@)?"
+
+/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the
string "256".
+ The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
+#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9]
) )))"
+
+/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match
"192.168.1.123" in the string "192.168.1.1234". */
+#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
+
+/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
+ * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in
"dead:beef::1.2.3.4".
+ * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer
unittesting.
+ * TODO: more strict check (right number of colons, etc.)
+ * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
+
+/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
+#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
+
+/* No :: shorthand */
+#define IPV6_FULL "(?x: (?&S6C){7} (?&S6) )"
+/* Begins with :: */
+#define IPV6_LEFT "(?x: : (?&CS6){1,7} )"
+/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
+#define IPV6_MID "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
+/* Ends with :: */
+#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
+/* Is "::" and nothing more */
+#define IPV6_NULL "(?x: :: )"
+
+/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
+#define IPV6V4_FULL "(?x: (?&S6C){6} )"
+#define IPV6V4_LEFT "(?x: :: (?&S6C){0,5} )" /* includes "::<ipv4>" */
+#define IPV6V4_MID "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
+#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
+
+/* IPV6: An IPv6 address (possibly with an embedded IPv4).
+ * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
+#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | "
IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " )
(?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
+
+/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
+ * then any graphical Unicode character.
+ * A segment can consist entirely of numbers.
+ * (Note: PCRE doesn't support character class subtraction/intersection.) */
+#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
+
+/* A hostname of at least 1 component. The last component cannot be entirely numbers.
+ * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
+#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9]
) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
+
+/* A hostname of at least 2 components. The last component cannot be entirely numbers.
+ * E.g. "example.com", "1234.com", but not "1234.56" */
+#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
+
+/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
+#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
+
+/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com",
"[1.2.3.4]", "[::1]".
+ * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid
tons of false positives (e.g. in a typical shell prompt). */
+#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
+
+/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
+ and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
+#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} |
655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
+
+/* Optional colon-prefixed port, e.g. ":1080", "" */
+#define PORT "(?x: \\:" N_1_65535 " )?"
+
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%\\E]"
+/* Chars not to end a URL */
+#define PATHNONTERM_CLASS "[\\Q.!,?\\E]"
+
+/* Lookbehind at the end, so that the last character (if we matched a character at all) is not from
PATHNONTERM_CLASS */
+#define URLPATH "(?x: /" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+#define VOIP_PATH "(?x: [;?]" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+
+/* Now let's put these fragments together */
+
+#define DEFS IP_DEF
+
+#define REGEX_URL_AS_IS DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
+/* TODO: also support file:/etc/passwd */
+#define REGEX_URL_FILE DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?x: " PATHCHARS_CLASS
"+ (?<! " PATHNONTERM_CLASS " ) )?"
+/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience
(so that we can reuse HOSTNAME1). */
+#define REGEX_URL_HTTP DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT
URLPATH
+#define REGEX_URL_VOIP DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
+#define REGEX_EMAIL DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
+#define REGEX_NEWS_MAN "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
+
+#endif /* !TERMINAL_REGEX_H */
diff --git a/src/terminal-screen.c b/src/terminal-screen.c
index a48de10..051f7be 100644
--- a/src/terminal-screen.c
+++ b/src/terminal-screen.c
@@ -19,6 +19,7 @@
#include "config.h"
#define _GNU_SOURCE /* for dup3 */
+#include "terminal-regex.h"
#include "terminal-screen.h"
#include <errno.h>
@@ -158,30 +159,18 @@ static void terminal_screen_set_override_command (TerminalScreen *screen,
static guint signals[LAST_SIGNAL];
-#define USERCHARS "-[:alnum:]"
-#define USERCHARS_CLASS "[" USERCHARS "]"
-#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
-#define HOSTCHARS_CLASS "[-[:alnum:]]"
-#define HOST HOSTCHARS_CLASS "+(\\." HOSTCHARS_CLASS "+)*"
-#define PORT "(?:\\:[[:digit:]]{1,5})?"
-#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#%\\E]"
-#define PATHTERM_CLASS "[^\\Q]'.:}>) \t\r\n,\"\\E]"
-#define SCHEME "(?:news:|telnet:|nntp:|file:\\/|https?:|ftps?:|sftp:|webcal:)"
-#define USERPASS USERCHARS_CLASS "+(?:" PASSCHARS_CLASS "+)?"
-#define URLPATH
"(?:(/"PATHCHARS_CLASS"+(?:[(]"PATHCHARS_CLASS"*[)])*"PATHCHARS_CLASS"*)*"PATHTERM_CLASS")?"
-
typedef struct {
const char *pattern;
TerminalURLFlavour flavor;
- GRegexCompileFlags flags;
} TerminalRegexPattern;
static const TerminalRegexPattern url_regex_patterns[] = {
- { SCHEME "//(?:" USERPASS "\\@)?" HOST PORT URLPATH, FLAVOR_AS_IS, G_REGEX_CASELESS },
- { "(?:www|ftp)" HOSTCHARS_CLASS "*\\." HOST PORT URLPATH , FLAVOR_DEFAULT_TO_HTTP, G_REGEX_CASELESS },
- { "(?:callto:|h323:|sip:)" USERCHARS_CLASS "[" USERCHARS ".]*(?:" PORT "/[a-z0-9]+)?\\@" HOST,
FLAVOR_VOIP_CALL, G_REGEX_CASELESS },
- { "(?:mailto:)?" USERCHARS_CLASS "[" USERCHARS ".]*\\@" HOSTCHARS_CLASS "+\\." HOST, FLAVOR_EMAIL,
G_REGEX_CASELESS },
- { "(?:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+", FLAVOR_AS_IS, G_REGEX_CASELESS },
+ { REGEX_URL_AS_IS, FLAVOR_AS_IS },
+ { REGEX_URL_HTTP, FLAVOR_DEFAULT_TO_HTTP },
+ { REGEX_URL_FILE, FLAVOR_AS_IS },
+ { REGEX_URL_VOIP, FLAVOR_VOIP_CALL },
+ { REGEX_EMAIL, FLAVOR_EMAIL },
+ { REGEX_NEWS_MAN, FLAVOR_AS_IS },
};
static GRegex **url_regexes;
@@ -544,7 +533,7 @@ terminal_screen_class_init (TerminalScreenClass *klass)
GError *error = NULL;
url_regexes[i] = g_regex_new (url_regex_patterns[i].pattern,
- url_regex_patterns[i].flags | G_REGEX_OPTIMIZE,
+ G_REGEX_OPTIMIZE,
0, &error);
g_assert_no_error (error);
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]