[gnome-terminal] regex: Allow apostrophes in URLs, except when enclosed between them



commit 51cb07f3244f8999dece9332d928b7529d9a63f9
Author: Egmont Koblinger <egmont gmail com>
Date:   Mon Jan 1 15:57:10 2018 +0100

    regex: Allow apostrophes in URLs, except when enclosed between them
    
    https://bugzilla.gnome.org/show_bug.cgi?id=448044

 src/terminal-regex.c |    7 +++++++
 src/terminal-regex.h |   14 +++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)
---
diff --git a/src/terminal-regex.c b/src/terminal-regex.c
index cf06b76..cf51453 100644
--- a/src/terminal-regex.c
+++ b/src/terminal-regex.c
@@ -288,6 +288,13 @@ main (int argc, char **argv)
   assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d");
   assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a");
 
+  /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see bug 448044. */
+  assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">", "https://en.wikipedia.org/wiki/Moore's_law");
+  assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">", "https://en.wikipedia.org/wiki/Cryin'");
+  assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>", "https://en.wikipedia.org/wiki/Aerosmith");
+
   /* No scheme */
   assert_match (REGEX_URL_HTTP, "www.foo.bar/baz",     ENTIRE);
   assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz",    ENTIRE);
diff --git a/src/terminal-regex.h b/src/terminal-regex.h
index 7136f95..3a3e89a 100644
--- a/src/terminal-regex.h
+++ b/src/terminal-regex.h
@@ -43,6 +43,9 @@
 #ifndef TERMINAL_REGEX_H
 #define TERMINAL_REGEX_H
 
+/* Lookbehind to see if there's a preceding apostrophe */
+#define APOS_START_DEF "(?<APOS_START>(?<='))?"
+
 #define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
 
 #define USERCHARS "-+.[:alnum:]"
@@ -122,21 +125,22 @@
 #define PORT "(?x: \\:" N_1_65535 " )?"
 
 /* Omit the parentheses, see below */
-#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%\\E]"
-/* Chars to end a URL */
-#define PATHTERM_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]"
+/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
+#define PATHTERM_CLASS        "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]"
+#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
 
 /* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 763980. */
 #define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
 /* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
-#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* " PATHTERM_CLASS " )? )))"
+#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))"
 
 #define URLPATH "(?x: /(?&PATH) )?"
 #define VOIP_PATH "(?x: [;?](?&PATH) )?"
 
 /* Now let's put these fragments together */
 
-#define DEFS IP_DEF PATH_INNER_DEF PATH_DEF
+#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF
 
 #define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
 /* TODO: also support file:/etc/passwd */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]