[gnome-terminal/gnome-41] regex: Fix path-less URL recognition



commit 5caf874a311b49ebe7ecba46a021b8023c41f7e9
Author: Egmont Koblinger <egmont gmail com>
Date:   Thu Feb 3 22:19:57 2022 +0100

    regex: Fix path-less URL recognition
    
    URLs are allowed to contain the query ('?') or fragment ('#') directly
    after the hostname without a path or even a path separator ('/'), so fix
    the regexes to recognise these URLs.
    
    Fixes: https://gitlab.gnome.org/GNOME/gnome-terminal/-/issues/7888
    (cherry picked from commit 17ee0e43be8bb41e429d5d1d9d4a21fad57c3222)

 src/terminal-regex.cc | 16 ++++++++++++++++
 src/terminal-regex.hh |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)
---
diff --git a/src/terminal-regex.cc b/src/terminal-regex.cc
index 9b1d2529..3856c646 100644
--- a/src/terminal-regex.cc
+++ b/src/terminal-regex.cc
@@ -220,6 +220,10 @@ main (int argc, char **argv)
   assert_match_anchored (DEFS URLPATH, "/().", "/()");
   assert_match_anchored (DEFS URLPATH, "/", ENTIRE);
   assert_match_anchored (DEFS URLPATH, "", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "?", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "?param=value", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "#", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "#anchor", ENTIRE);
   assert_match_anchored (DEFS URLPATH, "/php?param[]=value1&param[]=value2", ENTIRE);
   assert_match_anchored (DEFS URLPATH, "/foo?param1[index1]=value1&param2[index2]=value2", ENTIRE);
   assert_match_anchored (DEFS URLPATH, "/[[[]][]]", ENTIRE);
@@ -270,6 +274,18 @@ main (int argc, char **argv)
   assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000",                "http://1.2.3.4");  /* TODO: can/should we totally abort here? */
   assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]", nullptr);
 
+  /* '?' or '#' without '/', #7888 */
+  assert_match (REGEX_URL_AS_IS, "http://foo.bar?",                  ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://foo.bar?param=value",       ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://foo.bar:12345?param=value", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4?param=value",       ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://[dead::beef]?param=value",  ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://foo.bar#",                  ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://foo.bar#anchor",            ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://foo.bar:12345#anchor",      ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4#anchor",            ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://[dead::beef]#anchor",       ENTIRE);
+
   /* Username, password */
   assert_match (REGEX_URL_AS_IS, "http://joe@example.com",                 ENTIRE);
   assert_match (REGEX_URL_AS_IS, "http://user.name:sec.ret@host.name",     ENTIRE);
diff --git a/src/terminal-regex.hh b/src/terminal-regex.hh
index 868b017d..da1d6d6b 100644
--- a/src/terminal-regex.hh
+++ b/src/terminal-regex.hh
@@ -139,7 +139,7 @@
 /* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
 #define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))"
 
-#define URLPATH "(?x: /(?&PATH) )?"
+#define URLPATH "(?x: [/?#](?&PATH) )?"
 #define VOIP_PATH "(?x: [;?](?&PATH) )?"
 
 /* Now let's put these fragments together */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]