[gnome-terminal] regex: Allow balanced pairs of parentheses in URLs



commit 27275a3463fcba079337a55877ea4170c1419129
Author: Egmont Koblinger <egmont gmail com>
Date:   Sun Apr 23 23:42:06 2017 +0200

    regex: Allow balanced pairs of parentheses in URLs
    
    https://bugzilla.gnome.org/show_bug.cgi?id=763980

 src/terminal-regex.c |   25 +++++++++++++++++++++++--
 src/terminal-regex.h |   19 ++++++++++++-------
 2 files changed, 35 insertions(+), 9 deletions(-)
---
diff --git a/src/terminal-regex.c b/src/terminal-regex.c
index e184414..250c0eb 100644
--- a/src/terminal-regex.c
+++ b/src/terminal-regex.c
@@ -203,9 +203,23 @@ main (int argc, char **argv)
   assert_match_anchored (PORT, ":65535", ENTIRE);
   assert_match_anchored (PORT, ":65536", "");     /* TODO: can/should we totally abort here? */
 
+  /* Parentheses are only allowed in matching pairs, see bug 763980. */
   /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
-  assert_match_anchored (URLPATH, "/ab/cd",       ENTIRE);
-  assert_match_anchored (URLPATH, "/ab/cd.html.", "/ab/cd.html");
+  assert_match_anchored (DEFS URLPATH, "/ab/cd",       ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/ab/cd.html.", "/ab/cd.html");
+  assert_match_anchored (DEFS URLPATH, "/The_Offspring_(album)", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/The_Offspring)", "/The_Offspring");
+  assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f))", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f)))", "/a((b(c)d)e(f))");
+  assert_match_anchored (DEFS URLPATH, "/a(b).(c).", "/a(b).(c)");
+  assert_match_anchored (DEFS URLPATH, "/a.(b.(c.).).(d.(e.).).)", "/a.(b.(c.).).(d.(e.).)");
+  assert_match_anchored (DEFS URLPATH, "/a)b(c", "/a");
+  assert_match_anchored (DEFS URLPATH, "/.", "/");
+  assert_match_anchored (DEFS URLPATH, "/(.", "/");
+  assert_match_anchored (DEFS URLPATH, "/).", "/");
+  assert_match_anchored (DEFS URLPATH, "/().", "/()");
+  assert_match_anchored (DEFS URLPATH, "/", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "", ENTIRE);
 
 
   /* Put the components together and test the big picture */
@@ -258,6 +272,13 @@ main (int argc, char **argv)
   assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
   assert_match (REGEX_URL_AS_IS, "http:///foo",                         NULL);
 
+  /* Parentheses are only allowed in matching pairs, see bug 763980. */
+  assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/The_Offspring_(album)", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring)", "https://en.wikipedia.org/wiki/The_Offspring");
+  assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring_(album))", "https://en.wikipedia.org/wiki/The_Offspring_(album)");
+  assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d");
+  assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a");
+
   /* No scheme */
   assert_match (REGEX_URL_HTTP, "www.foo.bar/baz",     ENTIRE);
   assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz",    ENTIRE);
diff --git a/src/terminal-regex.h b/src/terminal-regex.h
index f53bdce..ddf75e0 100644
--- a/src/terminal-regex.h
+++ b/src/terminal-regex.h
@@ -121,21 +121,26 @@
 /* Optional colon-prefixed port, e.g. ":1080", "" */
 #define PORT "(?x: \\:" N_1_65535 " )?"
 
+/* Omit the parentheses, see below */
 #define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%\\E]"
-/* Chars not to end a URL */
-#define PATHNONTERM_CLASS "[\\Q.!,?\\E]"
+/* Chars to end a URL */
+#define PATHTERM_CLASS "[-[:alnum:]\\Q_$+*:;@&=/~#|%\\E]"
 
-/* Lookbehind at the end, so that the last character (if we matched a character at all) is not from PATHTERM_CLASS */
-#define URLPATH "(?x: /" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
-#define VOIP_PATH "(?x: [;?]" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+/* Recursive definition of PATH that allows parentheses only if balanced, see bug 763980. */
+#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* \\( (?&PATH_INNER) \\) )* " PATHCHARS_CLASS "* )))"
+/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
+#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* \\( (?&PATH_INNER) \\) )* (?: " PATHCHARS_CLASS "* " PATHTERM_CLASS " )? )))"
+
+#define URLPATH "(?x: /(?&PATH) )?"
+#define VOIP_PATH "(?x: [;?](?&PATH) )?"
 
 /* Now let's put these fragments together */
 
-#define DEFS IP_DEF
+#define DEFS IP_DEF PATH_INNER_DEF PATH_DEF
 
 #define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
 /* TODO: also support file:/etc/passwd */
-#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?x: " PATHCHARS_CLASS "+ (?<! " PATHNONTERM_CLASS " ) )?" 
+#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
 /* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
 #define REGEX_URL_HTTP   DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH
 #define REGEX_URL_VOIP   DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]