[gnome-terminal] regex: Allow balanced pairs of parentheses in URLs
- From: Egmont Koblinger <egmontkob src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-terminal] regex: Allow balanced pairs of parentheses in URLs
- Date: Sun, 23 Apr 2017 21:44:29 +0000 (UTC)
commit 27275a3463fcba079337a55877ea4170c1419129
Author: Egmont Koblinger <egmont gmail com>
Date: Sun Apr 23 23:42:06 2017 +0200
regex: Allow balanced pairs of parentheses in URLs
https://bugzilla.gnome.org/show_bug.cgi?id=763980
src/terminal-regex.c | 25 +++++++++++++++++++++++--
src/terminal-regex.h | 19 ++++++++++++-------
2 files changed, 35 insertions(+), 9 deletions(-)
---
diff --git a/src/terminal-regex.c b/src/terminal-regex.c
index e184414..250c0eb 100644
--- a/src/terminal-regex.c
+++ b/src/terminal-regex.c
@@ -203,9 +203,23 @@ main (int argc, char **argv)
assert_match_anchored (PORT, ":65535", ENTIRE);
assert_match_anchored (PORT, ":65536", ""); /* TODO: can/should we totally abort here? */
+ /* Parentheses are only allowed in matching pairs, see bug 763980. */
/* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
- assert_match_anchored (URLPATH, "/ab/cd", ENTIRE);
- assert_match_anchored (URLPATH, "/ab/cd.html.", "/ab/cd.html");
+ assert_match_anchored (DEFS URLPATH, "/ab/cd", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/ab/cd.html.", "/ab/cd.html");
+ assert_match_anchored (DEFS URLPATH, "/The_Offspring_(album)", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/The_Offspring)", "/The_Offspring");
+ assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f))", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f)))", "/a((b(c)d)e(f))");
+ assert_match_anchored (DEFS URLPATH, "/a(b).(c).", "/a(b).(c)");
+ assert_match_anchored (DEFS URLPATH, "/a.(b.(c.).).(d.(e.).).)", "/a.(b.(c.).).(d.(e.).)");
+ assert_match_anchored (DEFS URLPATH, "/a)b(c", "/a");
+ assert_match_anchored (DEFS URLPATH, "/.", "/");
+ assert_match_anchored (DEFS URLPATH, "/(.", "/");
+ assert_match_anchored (DEFS URLPATH, "/).", "/");
+ assert_match_anchored (DEFS URLPATH, "/().", "/()");
+ assert_match_anchored (DEFS URLPATH, "/", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "", ENTIRE);
/* Put the components together and test the big picture */
@@ -258,6 +272,13 @@ main (int argc, char **argv)
assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
assert_match (REGEX_URL_AS_IS, "http:///foo", NULL);
+ /* Parentheses are only allowed in matching pairs, see bug 763980. */
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/The_Offspring_(album)", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring)",
"https://en.wikipedia.org/wiki/The_Offspring");
+ assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring_(album))",
"https://en.wikipedia.org/wiki/The_Offspring_(album)");
+ assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d");
+ assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a");
+
/* No scheme */
assert_match (REGEX_URL_HTTP, "www.foo.bar/baz", ENTIRE);
assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz", ENTIRE);
diff --git a/src/terminal-regex.h b/src/terminal-regex.h
index f53bdce..ddf75e0 100644
--- a/src/terminal-regex.h
+++ b/src/terminal-regex.h
@@ -121,21 +121,26 @@
/* Optional colon-prefixed port, e.g. ":1080", "" */
#define PORT "(?x: \\:" N_1_65535 " )?"
+/* Omit the parentheses, see below */
#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%\\E]"
-/* Chars not to end a URL */
-#define PATHNONTERM_CLASS "[\\Q.!,?\\E]"
+/* Chars to end a URL */
+#define PATHTERM_CLASS "[-[:alnum:]\\Q_$+*:;@&=/~#|%\\E]"
-/* Lookbehind at the end, so that the last character (if we matched a character at all) is not from
PATHTERM_CLASS */
-#define URLPATH "(?x: /" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
-#define VOIP_PATH "(?x: [;?]" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+/* Recursive definition of PATH that allows parentheses only if balanced, see bug 763980. */
+#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* \\( (?&PATH_INNER) \\) )* "
PATHCHARS_CLASS "* )))"
+/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
+#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* \\( (?&PATH_INNER) \\) )* (?: "
PATHCHARS_CLASS "* " PATHTERM_CLASS " )? )))"
+
+#define URLPATH "(?x: /(?&PATH) )?"
+#define VOIP_PATH "(?x: [;?](?&PATH) )?"
/* Now let's put these fragments together */
-#define DEFS IP_DEF
+#define DEFS IP_DEF PATH_INNER_DEF PATH_DEF
#define REGEX_URL_AS_IS DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
/* TODO: also support file:/etc/passwd */
-#define REGEX_URL_FILE DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?x: " PATHCHARS_CLASS
"+ (?<! " PATHNONTERM_CLASS " ) )?"
+#define REGEX_URL_FILE DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience
(so that we can reuse HOSTNAME1). */
#define REGEX_URL_HTTP DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT
URLPATH
#define REGEX_URL_VOIP DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]