[vte/wip/regex-builtins: 3/3] lib: Add builtin regexes



commit 4b0f3828e0754b63e09d5f705c545fba43d09bb7
Author: Christian Persch <chpe src gnome org>
Date:   Wed May 8 17:35:07 2019 +0200

    lib: Add builtin regexes
    
    Add builtin regexes to recognise URLs, copied from gnome-terminal.
    
    https://gitlab.gnome.org/GNOME/vte/issues/114

 doc/reference/vte-sections.txt |   5 +
 src/app/app.cc                 |  13 +-
 src/fwd.hh                     |  28 +++
 src/meson.build                |  24 +-
 src/regex-builtins-patterns.hh | 151 ++++++++++++
 src/regex-builtins.cc          | 104 +++++++++
 src/regex-builtins.hh          |  79 +++++++
 src/regex-test.cc              | 507 +++++++++++++++++++++++++++++++++++++++++
 src/vte.cc                     |  39 +++-
 src/vte/vteenums.h             |  14 ++
 src/vte/vteterminal.h          |   4 +
 src/vtedefines.hh              |   1 +
 src/vtegtk.cc                  |  78 ++++++-
 src/vteinternal.hh             |   6 +
 src/vteregex.cc                |   6 +-
 15 files changed, 1035 insertions(+), 24 deletions(-)
---
diff --git a/doc/reference/vte-sections.txt b/doc/reference/vte-sections.txt
index 9ab873c1..ad2c6475 100644
--- a/doc/reference/vte-sections.txt
+++ b/doc/reference/vte-sections.txt
@@ -6,6 +6,7 @@ VteCursorBlinkMode
 VteCursorShape
 VteEraseBinding
 VteTextBlinkMode
+VteBuiltinMatchTags
 VteFormat
 VteWriteFlags
 VteSelectionFunc
@@ -70,8 +71,10 @@ vte_terminal_get_text_range
 vte_terminal_get_cursor_position
 vte_terminal_hyperlink_check_event
 vte_terminal_match_add_regex
+vte_terminal_match_add_builtins
 vte_terminal_match_remove
 vte_terminal_match_remove_all
+vte_terminal_match_remove_builtins
 vte_terminal_match_check
 vte_terminal_match_check_event
 vte_terminal_match_set_cursor_name
@@ -115,6 +118,8 @@ VTE_TYPE_ERASE_BINDING
 vte_erase_binding_get_type
 VTE_TYPE_TEXT_BLINK_MODE
 vte_text_blink_mode_get_type
+VTE_TYPE_BUILTIN_MATCH_TAGS
+vte_builtin_match_tags_get_type
 VTE_TYPE_FORMAT
 vte_format_get_type
 VTE_TYPE_WRITE_FLAGS
diff --git a/src/app/app.cc b/src/app/app.cc
index 529b584d..b3940ffc 100644
--- a/src/app/app.cc
+++ b/src/app/app.cc
@@ -991,8 +991,7 @@ struct _VteappWindowClass {
 static GType vteapp_window_get_type(void);
 
 static char const* const builtin_dingus[] = {
-        
"(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?",
-        
"(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\)
 ,\\\"]",
+        "(foo|bar|baz)+",
         nullptr,
 };
 
@@ -1032,6 +1031,12 @@ vteapp_window_add_dingus(VteappWindow* window,
         }
 }
 
+static void
+vteapp_window_add_builtin_dingus(VteappWindow* window)
+{ /* Thin wrapper: forwards to vte_terminal_match_add_builtins() for this window's terminal. */
+        vte_terminal_match_add_builtins(window->terminal);
+}
+
 static void
 vteapp_window_update_geometry(VteappWindow* window)
 {
@@ -1955,8 +1960,10 @@ vteapp_window_constructed(GObject *object)
                 gtk_widget_set_opacity (GTK_WIDGET (window), options.get_alpha());
 
         /* Dingus */
-        if (!options.no_builtin_dingus)
+        if (!options.no_builtin_dingus) {
                 vteapp_window_add_dingus(window, builtin_dingus);
+                vteapp_window_add_builtin_dingus(window);
+        }
         if (options.dingus != nullptr)
                 vteapp_window_add_dingus(window, options.dingus);
 
diff --git a/src/fwd.hh b/src/fwd.hh
new file mode 100644
index 00000000..4414f021
--- /dev/null
+++ b/src/fwd.hh
@@ -0,0 +1,28 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace vte {
+
+namespace base {
+
+class RegexBuiltins;
+
+} // namespace base
+
+} // namespace vte
diff --git a/src/meson.build b/src/meson.build
index 2ce7d6e2..33d70725 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -51,7 +51,12 @@ parser_sources = files(
 
 regex_sources = files(
   'regex.cc',
-  'regex.hh'
+  'regex.hh',
+  'regex-builtins.cc',
+  'regex-builtins.hh',
+  'regex-builtins-patterns.hh',
+  'vteregex.cc',
+  'vteregexinternal.hh',
 )
 
 utf8_sources = files(
@@ -67,6 +72,7 @@ libvte_common_sources = debug_sources + modes_sources + parser_sources + regex_s
   'chunk.cc',
   'chunk.hh',
   'color-triple.hh',
+  'fwd.hh',
   'keymap.cc',
   'keymap.h',
   'pty.cc',
@@ -88,8 +94,6 @@ libvte_common_sources = debug_sources + modes_sources + parser_sources + regex_s
   'vteinternal.hh',
   'vtepcre2.h',
   'vtepty-private.h',
-  'vteregex.cc',
-  'vteregexinternal.hh',
   'vterowdata.cc',
   'vterowdata.hh',
   'vteseq.cc',
@@ -353,6 +357,19 @@ test_refptr = executable(
   install: false,
 )
 
+test_regex_sources = regex_sources + files(
+  'regex-test.cc',
+)
+
+test_regex = executable(
+  'test-regex',
+  sources: test_regex_sources,
+  dependencies: [glib_dep, gobject_dep, pcre2_dep,],
+  cpp_args: ['-DVTE_COMPILATION',],
+  include_directories: top_inc,
+  install: false,
+)
+
 test_tabstops_sources = files(
   'tabstops-test.cc',
   'tabstops.hh'
@@ -420,6 +437,7 @@ test_units = [
   ['parser', test_parser],
   ['reaper', test_reaper],
   ['refptr', test_refptr],
+  ['regex', test_regex],
   ['stream', test_stream],
   ['tabstops', test_tabstops],
   ['utf8', test_utf8],
diff --git a/src/regex-builtins-patterns.hh b/src/regex-builtins-patterns.hh
new file mode 100644
index 00000000..e4586ee9
--- /dev/null
+++ b/src/regex-builtins-patterns.hh
@@ -0,0 +1,151 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Mini style-guide:
+ *
+ * #define'd fragments should preferably have an outermost group, for the
+ * exact same reason as why usually in C/C++ #define's the values are enclosed
+ * in parentheses: that is, so that you don't get surprised when you use the
+ * macro and append a quantifier.
+ *
+ * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
+ * as (?&NAME), so that the regex string and the compiled regex object is
+ * smaller.
+ *
+ * Build small blocks, comment and unittest them heavily.
+ *
+ * Use free-spacing mode for improved readability. The hardest to read is
+ * which additional characters belong to a "(?" prefix. To improve
+ * readability, place a space after this, and for symmetry, before the closing
+ * parenthesis. Also place a space around "|" characters. No space before
+ * quantifiers. Try to be consistent with the existing style (yes I know the
+ * existing style is not consistent either, but please do your best).
+ *
+ * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
+ * syntaxes.
+ */
+
+#pragma once
+
+/* Lookbehind to see if there's a preceding apostrophe */
+#define APOS_START_DEF "(?<APOS_START>(?<='))?"
+
+#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
+
+#define USERCHARS "-+.[:alnum:]"
+/* Nonempty username, e.g. "john.smith" */
+#define USER "[" USERCHARS "]+"
+
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", 
"" */
+#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
+
+/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
+#define USERPASS "(?:" USER PASS "@)?"
+
+/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the 
string "256".
+   The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
+#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] 
) )))"
+
+/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match 
"192.168.1.123" in the string "192.168.1.1234". */
+#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
+
+/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
+ * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
+ * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
+ * TODO: more strict check (right number of colons, etc.)
+ * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
+
+/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
+#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
+
+/* No :: shorthand */
+#define IPV6_FULL  "(?x: (?&S6C){7} (?&S6) )"
+/* Begins with :: */
+#define IPV6_LEFT  "(?x: : (?&CS6){1,7} )"
+/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
+#define IPV6_MID   "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
+/* Ends with :: */
+#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
+/* Is "::" and nothing more */
+#define IPV6_NULL  "(?x: :: )"
+
+/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
+#define IPV6V4_FULL  "(?x: (?&S6C){6} )"
+#define IPV6V4_LEFT  "(?x: :: (?&S6C){0,5} )"  /* includes "::<ipv4>" */
+#define IPV6V4_MID   "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
+#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
+
+/* IPV6: An IPv6 address (possibly with an embedded IPv4).
+ * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
+#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " 
IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) 
(?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
+
+/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
+ * then any graphical Unicode character.
+ * A segment can consist entirely of numbers.
+ * (Note: PCRE doesn't support character class subtraction/intersection.) */
+#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
+
+/* A hostname of at least 1 component. The last component cannot be entirely numbers.
+ * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
+#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] 
) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
+
+/* A hostname of at least 2 components. The last component cannot be entirely numbers.
+ * E.g. "example.com", "1234.com", but not "1234.56" */
+#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
+
+/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
+#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
+
+/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", 
"[1.2.3.4]", "[::1]".
+ * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid 
tons of false positives (e.g. in a typical shell prompt). */
+#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
+
+/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
+   and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
+#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 
655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
+
+/* Optional colon-prefixed port, e.g. ":1080", "" */
+#define PORT "(?x: \\:" N_1_65535 " )?"
+
+/* Omit the parentheses, see below */
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]"
+/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
+#define PATHTERM_CLASS        "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]"
+#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
+
+/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 
763980. */
+#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | 
\\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
+/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
+#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ 
(?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS 
") )? )))"
+
+#define URLPATH "(?x: /(?&PATH) )?"
+#define VOIP_PATH "(?x: [;?](?&PATH) )?"
+
+/* Now let's put these fragments together */
+
+#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF
+
+#define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
+/* TODO: also support file:/etc/passwd */
+#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
+/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience 
(so that we can reuse HOSTNAME1). */
+#define REGEX_URL_HTTP   DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT 
URLPATH
+#define REGEX_URL_VOIP   DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
+#define REGEX_EMAIL      DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
+#define REGEX_NEWS_MAN   "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
diff --git a/src/regex-builtins.cc b/src/regex-builtins.cc
new file mode 100644
index 00000000..2952aca6
--- /dev/null
+++ b/src/regex-builtins.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+
+#include "regex.hh"
+#include "regex-builtins.hh"
+#include "regex-builtins-patterns.hh"
+
+#include "vtepcre2.h"
+
+namespace vte::base {
+
+/*
+ * Compiles every builtin pattern from regex-builtins-patterns.hh and stores
+ * it together with its internal tag. A pattern that fails to compile is
+ * reported by compile_builtin() and left out of the list.
+ */
+RegexBuiltins::RegexBuiltins()
+{
+        /* Room for all builtins up front (six are added below). */
+        m_builtins.reserve(8);
+
+        compile_builtin(REGEX_URL_AS_IS, InternalBuiltinsTags::eURL);
+        compile_builtin(REGEX_URL_HTTP,  InternalBuiltinsTags::eHTTP);
+        compile_builtin(REGEX_URL_FILE,  InternalBuiltinsTags::eFILE);
+        compile_builtin(REGEX_URL_VOIP,  InternalBuiltinsTags::eVOIP);
+        compile_builtin(REGEX_EMAIL,     InternalBuiltinsTags::eEMAIL);
+        compile_builtin(REGEX_NEWS_MAN,  InternalBuiltinsTags::eNEWS_MAN);
+}
+
+/*
+ * Compiles @pattern for matching and, on success, appends it to m_builtins
+ * paired with int(@tag).
+ *
+ * A compile failure is logged to stderr and the pattern is skipped. JIT
+ * compilation failures are only logged; the regex is still added.
+ *
+ * The in-class declaration is noexcept, so the definition has to carry the
+ * same exception specification.
+ */
+void
+RegexBuiltins::compile_builtin(char const* pattern,
+                               InternalBuiltinsTags tag) noexcept
+{
+        GError* error{nullptr};
+        auto regex = Regex::compile(Regex::Purpose::eMatch,
+                                    pattern,
+                                    PCRE2_ZERO_TERMINATED,
+                                    PCRE2_UTF | PCRE2_UCP | PCRE2_NO_UTF_CHECK | PCRE2_MULTILINE,
+                                    &error);
+        /* Also bail out on a null regex without an error set, so that the
+         * jit() calls below never dereference a null pointer. */
+        if (regex == nullptr || error != nullptr) {
+                g_printerr("Failed to compile builtin regex %d: %s\n", int(tag),
+                           error != nullptr ? error->message : "unknown error");
+                g_clear_error(&error);
+                return;
+        }
+
+        regex->jit(PCRE2_JIT_COMPLETE, &error);
+        if (error) {
+                g_printerr("Failed to complete JIT compile builtin regex %d: %s\n", int(tag), error->message);
+                g_clear_error(&error);
+        }
+
+        regex->jit(PCRE2_JIT_PARTIAL_SOFT, &error);
+        if (error) {
+                g_printerr("Failed to partial-soft JIT compile builtin regex %d: %s\n", int(tag), error->message);
+                g_clear_error(&error);
+        }
+
+        m_builtins.emplace_back(take_ref(regex), int(tag));
+}
+
+/*
+ * Maps the internal tag of a builtin regex to the public tag, rewriting
+ * @match into a string with an explicit scheme where needed.
+ *
+ * @match: the matched text; for eHTTP and for eEMAIL without a "mailto:"
+ *   prefix it is replaced by a newly allocated string and the previous one
+ *   is released with g_free()
+ * @tag: an InternalBuiltinsTags value, as stored alongside each regex
+ *
+ * Returns: int(BuiltinsTags::eURI) for every known tag, -1 otherwise
+ */
+int
+RegexBuiltins::transform_match(char*& match,
+                               int tag) const noexcept
+{
+        switch (InternalBuiltinsTags(tag)) {
+        case InternalBuiltinsTags::eURL:
+        case InternalBuiltinsTags::eFILE:
+        case InternalBuiltinsTags::eNEWS_MAN:
+        case InternalBuiltinsTags::eVOIP:
+                /* No transformation */
+                return int(BuiltinsTags::eURI);
+
+        case InternalBuiltinsTags::eHTTP: {
+                /* REGEX_URL_HTTP matches a bare host name, so prepend a scheme. */
+                auto v = match;
+                match = g_strdup_printf("http://%s", match);
+                g_free(v);
+                return int(BuiltinsTags::eURI);
+        }
+
+        case InternalBuiltinsTags::eEMAIL:
+                /* The "mailto:" prefix is optional in REGEX_EMAIL; add it if absent. */
+                if (g_ascii_strncasecmp ("mailto:", match, 7) != 0) {
+                        auto v = match;
+                        match = g_strdup_printf ("mailto:%s", match);
+                        g_free(v);
+                }
+                return int(BuiltinsTags::eURI);
+        }
+
+        return -1;
+}
+
+} // namespace vte::base
diff --git a/src/regex-builtins.hh b/src/regex-builtins.hh
new file mode 100644
index 00000000..fcef325e
--- /dev/null
+++ b/src/regex-builtins.hh
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "regex.hh"
+#include "refptr.hh"
+
+namespace vte {
+
+namespace base {
+
+class RegexBuiltins { // holds the builtin regexes, each paired with an int tag; instances are handed out by get()
+private:
+        static inline std::weak_ptr<RegexBuiltins> s_weak_ptr{}; // non-owning reference to the instance last created by get()
+
+        std::vector<std::pair<RefPtr<Regex>, int>> m_builtins{}; // (regex, int(InternalBuiltinsTags)) pairs
+
+        enum class InternalBuiltinsTags : int { // one tag per builtin pattern; transform_match() maps them to BuiltinsTags
+                eURL      = -2,
+                eHTTP     = -3,
+                eFILE     = -4,
+                eVOIP     = -5,
+                eEMAIL    = -6,
+                eNEWS_MAN = -7
+        };
+
+        void compile_builtin(char const* pattern,
+                             InternalBuiltinsTags tag) noexcept; // compiles pattern and stores it with tag
+
+public:
+        // these must have the same values as the public VteBuiltinMatchTags
+        enum class BuiltinsTags : int {
+                eURI = -2
+        };
+
+        RegexBuiltins();
+        ~RegexBuiltins() { }
+        RegexBuiltins(RegexBuiltins const&) = delete; // neither copyable nor movable
+        RegexBuiltins(RegexBuiltins&&) = delete;
+
+        RegexBuiltins& operator= (RegexBuiltins const&) = delete;
+        RegexBuiltins& operator= (RegexBuiltins&&) = delete;
+
+        auto const& builtins() const noexcept { return m_builtins; } // read-only view of the (regex, tag) list
+
+        int transform_match(char*& match,
+                            int tag) const noexcept; // may replace match; returns a BuiltinsTags value as int, or -1
+
+        static std::shared_ptr<RegexBuiltins> get() // returns the live instance, or creates one if none is alive
+        {
+                auto inst = s_weak_ptr.lock(); // empty when no shared_ptr to a previous instance remains
+                if (!inst)
+                        s_weak_ptr = inst = std::make_shared<RegexBuiltins>(); // NOTE(review): no locking visible here - confirm single-threaded use
+                return inst;
+        }
+};
+
+} // namespace base
+
+} // namespace vte
diff --git a/src/regex-test.cc b/src/regex-test.cc
new file mode 100644
index 00000000..94526ba5
--- /dev/null
+++ b/src/regex-test.cc
@@ -0,0 +1,507 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ * Copyright © 2019 Christian Persch
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <locale.h>
+#include <glib.h>
+
+#include <cstdint>
+
+#include "regex.hh"
+#include "regex-builtins-patterns.hh"
+
+/* Shorthand for expecting the pattern to match the entire input string */
+#define ENTIRE ((char *) 1)
+
+/* Creates a PCRE2 match context with the match and recursion limits used
+ * by these tests. The caller frees it with pcre2_match_context_free_8(). */
+static pcre2_match_context_8*
+create_match_context()
+{
+        auto* const context = pcre2_match_context_create_8(nullptr /* general context */);
+
+        pcre2_set_match_limit_8(context, 65536); /* should be plenty */
+        pcre2_set_recursion_limit_8(context, 64); /* should be plenty */
+
+        return context;
+}
+
+/*
+ * Runs @match_fn with @regex on @subject from offset 0 and returns a newly
+ * allocated copy of the overall match.
+ *
+ * Returns nullptr when nothing matched, when PCRE2 reported an error (which
+ * is also printed to stderr), or when the match offsets are unset. The
+ * caller releases the result with g_free().
+ */
+static char*
+get_match(decltype(&pcre2_match_8) match_fn,
+          vte::base::Regex const* regex,
+          uint32_t match_flags,
+          char const* subject)
+{
+        /* Scratch objects for this single attempt; both are freed below. */
+        auto context = create_match_context();
+        auto data = pcre2_match_data_create_8(256 /* should be plenty */,
+                                              nullptr /* general context */);
+
+        auto const rv = match_fn(regex->code(),
+                                 (PCRE2_SPTR8)subject,
+                                 strlen(subject),
+                                 0, /* start offset */
+                                 match_flags | PCRE2_NO_UTF_CHECK,
+                                 data,
+                                 context);
+
+        /* Stays nullptr unless a usable match span is found. */
+        char* result = nullptr;
+        if (rv >= 0) {
+                /* has match */
+                auto const* offsets = pcre2_get_ovector_pointer_8(data);
+                auto const start = offsets[0];
+                auto const end = offsets[1];
+                if (start != PCRE2_UNSET && end != PCRE2_UNSET)
+                        result = g_strndup(subject + start, end - start);
+        } else if (rv != PCRE2_ERROR_NOMATCH) {
+                /* Error */
+                PCRE2_UCHAR8 message[256];
+                int len = pcre2_get_error_message_8(rv, message, sizeof(message));
+                g_assert_true(len >= 0);
+                g_printerr("PCRE2 error %d: %s\n", rv, message);
+        }
+
+        pcre2_match_data_free_8(data);
+        pcre2_match_context_free_8(context);
+
+        return result;
+}
+
+struct TestData { // one registered test case; filled in by assert_match()
+        char const* pattern;   // regex source handed to Regex::compile()
+        char const* string;    // subject text to match against
+        char const* expected;  // expected match text, or nullptr when no match is expected
+        uint32_t match_flags;  // extra PCRE2 match options, e.g. PCRE2_ANCHORED
+};
+
+static void
+assert_match_test(void const* ptr) // GTest data callback; ptr is the TestData registered by assert_match()
+{
+        TestData const* data = (TestData*)ptr;
+
+        GError *error{nullptr};
+        auto regex = vte::base::Regex::compile(vte::base::Regex::Purpose::eMatch,
+                                               data->pattern,
+                                               PCRE2_ZERO_TERMINATED,
+                                               PCRE2_UTF | PCRE2_NO_UTF_CHECK |
+                                               PCRE2_UCP |
+                                               PCRE2_MULTILINE,
+                                               &error);
+        g_assert_no_error(error);
+        g_assert_nonnull(regex);
+
+        auto match = get_match(&pcre2_match_8, regex, data->match_flags, data->string); // first pass: pcre2_match_8
+
+        g_assert_cmpstr(match, ==, data->expected); // a mismatch here fails the test
+        g_free(match);
+
+        if (vte::base::Regex::check_pcre_config_jit()) { // second pass via pcre2_jit_match_8, only when this check passes
+                regex->jit(PCRE2_JIT_COMPLETE, &error);
+                g_assert_no_error(error);
+                regex->jit(PCRE2_JIT_PARTIAL_SOFT, &error);
+                g_assert_no_error(error);
+
+#if 1
+                // FIXME: some JIT matches are wrong, why?
+                match = get_match(&pcre2_jit_match_8, regex, data->match_flags, data->string);
+                #if 1
+                if (match != data->expected && // JIT mismatches are only printed, not asserted (see FIXME above)
+                    g_strcmp0(match, data->expected) != 0)
+                        g_printerr("JIT match: pattern: \"%s\"\n"
+                                   "           flags:   %08x\n"
+                                   "           subject: \"%s\"\n"
+                                   "           match:   \"%s\"\n"
+                                   "           expected: \"%s\"\n\n",
+                                   data->pattern,
+                                   data->match_flags,
+                                   data->string,
+                                   match ? match : "(nil)",
+                                   data->expected ? data->expected : "(nil)");
+                #else
+                g_assert_cmpstr(match, ==, data->expected);
+                #endif
+                g_free(match);
+#endif
+        }
+
+        regex->unref(); // drop the reference obtained from Regex::compile()
+}
+
+/*
+ * Registers a test case under "/vte/regex/builtins/<line>" that checks what
+ * @pattern matches in @string.
+ *
+ * @expected is the text the match must equal, NULL for "no match", or the
+ * ENTIRE sentinel for "the whole of @string". @match_flags are extra PCRE2
+ * match options; @line defaults to the line of the call.
+ */
+static void
+assert_match(char const* pattern,
+             char const* string,
+             char const* expected,
+             uint32_t match_flags = 0u,
+             int line = __builtin_LINE())
+{
+        /* Resolve the ENTIRE sentinel to the subject itself. */
+        char const* const wanted = (expected == ENTIRE) ? string : expected;
+
+        /* Allocated with g_new; the g_free destroy notify below releases it. */
+        auto* const test_case = g_new(TestData, 1);
+        *test_case = TestData{pattern, string, wanted, match_flags};
+
+        char* const test_path = g_strdup_printf("/vte/regex/builtins/%d", line);
+        g_test_add_data_func_full(test_path, test_case, assert_match_test, (GDestroyNotify)g_free);
+        g_free(test_path);
+}
+
+static void
+assert_match_anchored(char const* pattern,
+                      char const* string,
+                      char const* expected,
+                      int line = __builtin_LINE())
+{ /* Same as assert_match() with PCRE2_ANCHORED as the match flags; line is forwarded for the test path. */
+        assert_match(pattern, string, expected, PCRE2_ANCHORED, line);
+}
+
+static void
+setup_regex_builtins_tests(void)
+{
+  /* SCHEME is case insensitive */
+  assert_match_anchored (SCHEME, "http",  ENTIRE);
+  assert_match_anchored (SCHEME, "HTTPS", ENTIRE);
+
+  /* USER is nonempty, alphanumeric, dot, plus and dash */
+  assert_match_anchored (USER, "",              NULL);
+  assert_match_anchored (USER, "dr.john-smith", ENTIRE);
+  assert_match_anchored (USER, "abc+def@ghi",   "abc+def");
+
+  /* PASS is optional colon-prefixed value, allowing quite some characters, but definitely not @ */
+  assert_match_anchored (PASS, "",          ENTIRE);
+  assert_match_anchored (PASS, "nocolon",   "");
+  assert_match_anchored (PASS, ":s3cr3T",   ENTIRE);
+  assert_match_anchored (PASS, ":$?#@host", ":$?#");
+
+  /* Hostname of at least 1 component, containing at least one non-digit in at least one of the segments */
+  assert_match_anchored (HOSTNAME1, "example.com",       ENTIRE);
+  assert_match_anchored (HOSTNAME1, "a-b.c-d",           ENTIRE);
+  assert_match_anchored (HOSTNAME1, "a_b",               "a");    /* TODO: can/should we totally abort here? 
*/
+  assert_match_anchored (HOSTNAME1, "déjà-vu.com",       ENTIRE);
+  assert_match_anchored (HOSTNAME1, "➡.ws",              ENTIRE);
+  assert_match_anchored (HOSTNAME1, "cömbining-áccents", ENTIRE);
+  assert_match_anchored (HOSTNAME1, "12",                NULL);
+  assert_match_anchored (HOSTNAME1, "12.34",             NULL);
+  assert_match_anchored (HOSTNAME1, "12.ab",             ENTIRE);
+//  assert_match_anchored (HOSTNAME1, "ab.12",             NULL);  /* errr... could we fail here?? */
+
+  /* Hostname of at least 2 components, containing at least one non-digit in at least one of the segments */
+  assert_match_anchored (HOSTNAME2, "example.com",       ENTIRE);
+  assert_match_anchored (HOSTNAME2, "example",           NULL);
+  assert_match_anchored (HOSTNAME2, "12",                NULL);
+  assert_match_anchored (HOSTNAME2, "12.34",             NULL);
+  assert_match_anchored (HOSTNAME2, "12.ab",             ENTIRE);
+  assert_match_anchored (HOSTNAME2, "ab.12",             NULL);
+//  assert_match_anchored (HOSTNAME2, "ab.cd.12",          NULL);  /* errr... could we fail here?? */
+
+  /* IPv4 segment (number between 0 and 255) */
+  assert_match_anchored (DEFS "(?&S4)", "0",    ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "1",    ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "9",    ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "10",   ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "99",   ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "100",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "200",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "250",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "255",  ENTIRE);
+  assert_match_anchored (DEFS "(?&S4)", "256",  NULL);
+  assert_match_anchored (DEFS "(?&S4)", "260",  NULL);
+  assert_match_anchored (DEFS "(?&S4)", "300",  NULL);
+  assert_match_anchored (DEFS "(?&S4)", "1000", NULL);
+  assert_match_anchored (DEFS "(?&S4)", "",     NULL);
+  assert_match_anchored (DEFS "(?&S4)", "a1b",  NULL);
+
+  /* IPv4 addresses */
+  assert_match_anchored (DEFS "(?&IPV4)", "11.22.33.44",    ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV4)", "0.1.254.255",    ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV4)", "75.150.225.300", NULL);
+  assert_match_anchored (DEFS "(?&IPV4)", "1.2.3.4.5",      "1.2.3.4");  /* we could also bail out and not 
match at all */
+
+  /* IPv6 addresses */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:::22",                           NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:44::55:66",               NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "dead::beef",                        ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "faded::bee",                        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "live::pork",                        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::1",                               ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "11::22:33::44",                     NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:::33",                        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "dead:beef::192.168.1.1",            ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "192.168.1.1",                       NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:87654",        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:45678",                   NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.12345", NULL);
+
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77",              NULL);   /* no :: */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88",           ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88:99",        NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77",            ENTIRE); /* :: at the start */
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77:88",         NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77",             ENTIRE); /* :: in the middle 
*/
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77:88",          NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77::",            ENTIRE); /* :: at the end */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88::",         NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::",                                ENTIRE); /* :: only */
+
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:192.168.1.1",        NULL);   /* no :: */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.1",     ENTIRE);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:192.168.1.1",  NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:192.168.1.1",      ENTIRE); /* :: at the start */
+  assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:192.168.1.1",   NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:192.168.1.1",       ENTIRE); /* :: in the middle 
*/
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:192.168.1.1",    NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55::192.168.1.1",       ENTIRE); /* :: at the 
end(ish) */
+  assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66::192.168.1.1",    NULL);
+  assert_match_anchored (DEFS "(?&IPV6)", "::192.168.1.1",                     ENTIRE); /* :: only(ish) */
+
+  /* URL_HOST is either a hostname, or an IPv4 address, or a bracket-enclosed IPv6 address */
+  assert_match_anchored (DEFS URL_HOST, "example",       ENTIRE);
+  assert_match_anchored (DEFS URL_HOST, "example.com",   ENTIRE);
+  assert_match_anchored (DEFS URL_HOST, "11.22.33.44",   ENTIRE);
+  assert_match_anchored (DEFS URL_HOST, "[11.22.33.44]", NULL);
+  assert_match_anchored (DEFS URL_HOST, "dead::be:ef",   "dead");  /* TODO: can/should we totally abort 
here? */
+  assert_match_anchored (DEFS URL_HOST, "[dead::be:ef]", ENTIRE);
+
+  /* EMAIL_HOST is either an at least two-component hostname, or a bracket-enclosed IPv[46] address */
+  assert_match_anchored (DEFS EMAIL_HOST, "example",        NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "example.com",    ENTIRE);
+  assert_match_anchored (DEFS EMAIL_HOST, "11.22.33.44",    NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.44]",  ENTIRE);
+  assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.456]", NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "dead::be:ef",    NULL);
+  assert_match_anchored (DEFS EMAIL_HOST, "[dead::be:ef]",  ENTIRE);
+
+  /* Number between 1 and 65535 (helper for port) */
+  assert_match_anchored (N_1_65535, "0",      NULL);
+  assert_match_anchored (N_1_65535, "1",      ENTIRE);
+  assert_match_anchored (N_1_65535, "10",     ENTIRE);
+  assert_match_anchored (N_1_65535, "100",    ENTIRE);
+  assert_match_anchored (N_1_65535, "1000",   ENTIRE);
+  assert_match_anchored (N_1_65535, "10000",  ENTIRE);
+  assert_match_anchored (N_1_65535, "60000",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65000",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65500",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65530",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65535",  ENTIRE);
+  assert_match_anchored (N_1_65535, "65536",  NULL);
+  assert_match_anchored (N_1_65535, "65540",  NULL);
+  assert_match_anchored (N_1_65535, "65600",  NULL);
+  assert_match_anchored (N_1_65535, "66000",  NULL);
+  assert_match_anchored (N_1_65535, "70000",  NULL);
+  assert_match_anchored (N_1_65535, "100000", NULL);
+  assert_match_anchored (N_1_65535, "",       NULL);
+  assert_match_anchored (N_1_65535, "a1b",    NULL);
+
+  /* PORT is an optional colon-prefixed value */
+  assert_match_anchored (PORT, "",       ENTIRE);
+  assert_match_anchored (PORT, ":1",     ENTIRE);
+  assert_match_anchored (PORT, ":65535", ENTIRE);
+  assert_match_anchored (PORT, ":65536", "");     /* TODO: can/should we totally abort here? */
+
+  /* Parentheses are only allowed in matching pairs, see bug 763980. */
+  /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
+  assert_match_anchored (DEFS URLPATH, "/ab/cd",       ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/ab/cd.html.", "/ab/cd.html");
+  assert_match_anchored (DEFS URLPATH, "/The_Offspring_(album)", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/The_Offspring)", "/The_Offspring");
+  assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f))", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f)))", "/a((b(c)d)e(f))");
+  assert_match_anchored (DEFS URLPATH, "/a(b).(c).", "/a(b).(c)");
+  assert_match_anchored (DEFS URLPATH, "/a.(b.(c.).).(d.(e.).).)", "/a.(b.(c.).).(d.(e.).)");
+  assert_match_anchored (DEFS URLPATH, "/a)b(c", "/a");
+  assert_match_anchored (DEFS URLPATH, "/.", "/");
+  assert_match_anchored (DEFS URLPATH, "/(.", "/");
+  assert_match_anchored (DEFS URLPATH, "/).", "/");
+  assert_match_anchored (DEFS URLPATH, "/().", "/()");
+  assert_match_anchored (DEFS URLPATH, "/", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/php?param[]=value1&param[]=value2", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/foo?param1[index1]=value1&param2[index2]=value2", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/[[[]][]]", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/[([])]([()])", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/([()])[([])]", ENTIRE);
+  assert_match_anchored (DEFS URLPATH, "/[(])", "/");
+  assert_match_anchored (DEFS URLPATH, "/([)]", "/");
+
+
+  /* Put the components together and test the big picture */
+
+  assert_match (REGEX_URL_AS_IS, "There's no URL here http:/foo",               NULL);
+  assert_match (REGEX_URL_AS_IS, "Visit http://example.com for details",        "http://example.com";);
+  assert_match (REGEX_URL_AS_IS, "Trailing dot http://foo/bar.html.";,           "http://foo/bar.html";);
+  assert_match (REGEX_URL_AS_IS, "Trailing ellipsis http://foo/bar.html...";,    "http://foo/bar.html";);
+  assert_match (REGEX_URL_AS_IS, "Trailing comma http://foo/bar,baz,";,          "http://foo/bar,baz";);
+  assert_match (REGEX_URL_AS_IS, "Trailing semicolon http://foo/bar;baz;";,      "http://foo/bar;baz";);
+  assert_match (REGEX_URL_AS_IS, "See <http://foo/bar>",                        "http://foo/bar";);
+  assert_match (REGEX_URL_AS_IS, "<http://foo.bar/asdf.qwer.html>",             
"http://foo.bar/asdf.qwer.html";);
+  assert_match (REGEX_URL_AS_IS, "Go to http://192.168.1.1.";,                   "http://192.168.1.1";);
+  assert_match (REGEX_URL_AS_IS, "If not, see <http://www.gnu.org/licenses/>.", 
"http://www.gnu.org/licenses/";);
+  assert_match (REGEX_URL_AS_IS, "<a href=\"http://foo/bar\";>foo</a>",          "http://foo/bar";);
+  assert_match (REGEX_URL_AS_IS, "<a href='http://foo/bar'>foo</a>",            "http://foo/bar";);
+  assert_match (REGEX_URL_AS_IS, "<url>http://foo/bar</url>",                   "http://foo/bar";);
+
+  assert_match (REGEX_URL_AS_IS, "http://";,          NULL);
+  assert_match (REGEX_URL_AS_IS, "http://a";,         ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.";,       "http://aa";);
+  assert_match (REGEX_URL_AS_IS, "http://aa.b";,      ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb";,     ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb/c";,   ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc";,  ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc/";, ENTIRE);
+
+  assert_match (REGEX_URL_AS_IS, "HtTp://déjà-vu.com:10000/déjà/vu", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "HTTP://joe:sEcReT@➡.ws:1080",      ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "https://cömbining-áccents";,        ENTIRE);
+
+  assert_match (REGEX_URL_AS_IS, "http://111.222.33.44";,                ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/";,               ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/foo";,            ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:5555/xyz";,             ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "https://[dead::beef]:12345/ipv6";,     ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "https://[dead::beef:11.22.33.44]";,    ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:";,                     "http://1.2.3.4";);  /* TODO: 
can/should we totally abort here? */
+  assert_match (REGEX_URL_AS_IS, "https://dead::beef/no-brackets-ipv6";, "https://dead";);    /* ditto */
+  assert_match (REGEX_URL_AS_IS, "http://111.222.333.444/";,             NULL);
+  assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000";,                "http://1.2.3.4";);  /* TODO: 
can/should we totally abort here? */
+  assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]";, NULL);
+
+  /* Username, password */
+  assert_match (REGEX_URL_AS_IS, "http://joe example com",                 ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://user.name:sec ret host name",     ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://joe:secret@[::1]";,                ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://dudewithnopassword:@example.com";, ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://safeguy:!#$%^&*@host";,            ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http://invalidusername!@host";,           "http://invalidusername";);
+
+  assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "http:///foo";,                         NULL);
+
+  /* Parentheses are only allowed in matching pairs, see bug 763980. */
+  assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/The_Offspring_(album)", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring)", 
"https://en.wikipedia.org/wiki/The_Offspring";);
+  assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring_(album))", 
"https://en.wikipedia.org/wiki/The_Offspring_(album)");
+  assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d");
+  assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a";);
+
+  /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see bug 
448044. */
+  assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">", 
"https://en.wikipedia.org/wiki/Moore's_law");
+  assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE);
+  assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">", 
"https://en.wikipedia.org/wiki/Cryin'");
+  assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>", 
"https://en.wikipedia.org/wiki/Aerosmith";);
+
+  /* No scheme */
+  assert_match (REGEX_URL_HTTP, "www.foo.bar/baz",     ENTIRE);
+  assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz",    ENTIRE);
+  assert_match (REGEX_URL_HTTP, "FTP.FOO.BAR/BAZ",     ENTIRE);  /* FIXME if no scheme is given and url 
starts with ftp, can we make the protocol ftp instead of http? */
+  assert_match (REGEX_URL_HTTP, "ftpxy.foo.bar/baz",   ENTIRE);
+//  assert_match (REGEX_URL_HTTP, "ftp.123/baz",         NULL);  /* errr... could we fail here?? */
+  assert_match (REGEX_URL_HTTP, "foo.bar/baz",         NULL);
+  assert_match (REGEX_URL_HTTP, "abc.www.foo.bar/baz", NULL);
+  assert_match (REGEX_URL_HTTP, "uvwww.foo.bar/baz",   NULL);
+  assert_match (REGEX_URL_HTTP, "xftp.foo.bar/baz",    NULL);
+
+  /* file:/ or file://(hostname)?/ */
+  assert_match (REGEX_URL_FILE, "file:",                NULL);
+  assert_match (REGEX_URL_FILE, "file:/",               ENTIRE);
+  assert_match (REGEX_URL_FILE, "file://",              NULL);
+  assert_match (REGEX_URL_FILE, "file:///",             ENTIRE);
+  assert_match (REGEX_URL_FILE, "file:////",            NULL);
+  assert_match (REGEX_URL_FILE, "file:etc/passwd",      NULL);
+  assert_match (REGEX_URL_FILE, "File:/etc/passwd",     ENTIRE);
+  assert_match (REGEX_URL_FILE, "FILE:///etc/passwd",   ENTIRE);
+  assert_match (REGEX_URL_FILE, "file:////etc/passwd",  NULL);
+  assert_match (REGEX_URL_FILE, "file://host.name",     NULL);
+  assert_match (REGEX_URL_FILE, "file://host.name/",    ENTIRE);
+  assert_match (REGEX_URL_FILE, "file://host.name/etc", ENTIRE);
+
+  assert_match (REGEX_URL_FILE, "See file:/.",             "file:/");
+  assert_match (REGEX_URL_FILE, "See file:///.",           "file:///");
+  assert_match (REGEX_URL_FILE, "See file:/lost+found.",   "file:/lost+found");
+  assert_match (REGEX_URL_FILE, "See file:///lost+found.", "file:///lost+found");
+
+  /* Email */
+  assert_match (REGEX_EMAIL, "Write to foo bar com.",        "foo bar com");
+  assert_match (REGEX_EMAIL, "Write to <foo bar com>",       "foo bar com");
+  assert_match (REGEX_EMAIL, "Write to mailto:foo bar com.", "mailto:foo bar com");
+  assert_match (REGEX_EMAIL, "Write to MAILTO:FOO BAR COM.", "MAILTO:FOO BAR COM");
+  assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.4]",       "foo@[1.2.3.4]");
+  assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.456]",     NULL);
+  assert_match (REGEX_EMAIL, "Write to foo@[1::2345]",       "foo@[1::2345]");
+  assert_match (REGEX_EMAIL, "Write to foo@[dead::beef]",    "foo@[dead::beef]");
+  assert_match (REGEX_EMAIL, "Write to foo@1.2.3.4",         NULL);
+  assert_match (REGEX_EMAIL, "Write to foo@1.2.3.456",       NULL);
+  assert_match (REGEX_EMAIL, "Write to foo@1::2345",         NULL);
+  assert_match (REGEX_EMAIL, "Write to foo@dead::beef",      NULL);
+  assert_match (REGEX_EMAIL, "<baz email=\"foo bar com\"/>", "foo bar com");
+  assert_match (REGEX_EMAIL, "<baz email='foo bar com'/>",   "foo bar com");
+  assert_match (REGEX_EMAIL, "<email>foo bar com</email>",   "foo bar com");
+
+  /* Sip, examples from rfc 3261 */
+  assert_match (REGEX_URL_VOIP, "sip:alice atlanta com;maddr=239.255.255.1;ttl=15",           ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:alice atlanta com",                                      ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:alice:secretword atlanta com;transport=tcp",             ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sips:alice atlanta com?subject=project%20x&priority=urgent", ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:+1-212-555-1212:1234 gateway com;user=phone",            ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sips:1212 gateway com",                                      ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:alice@192.0.2.4",                                        ENTIRE);
+  assert_match (REGEX_URL_VOIP, "sip:atlanta.com;method=REGISTER?to=alice%40atlanta.com",     ENTIRE);
+  assert_match (REGEX_URL_VOIP, "SIP:alice;day=tuesday atlanta com",                          ENTIRE);
+  assert_match (REGEX_URL_VOIP, "Dial sip:alice@192.0.2.4.",                                  
"sip:alice@192.0.2.4");
+
+  /* Extremely long match, bug 770147 */
+  assert_match (REGEX_URL_AS_IS, "http://www.example.com/ThisPathConsistsOfMoreThan1024Characters";
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+                                 
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890", 
ENTIRE);
+}
+
+static void
+test_regex_unicode(void)
+{
+        /* Verifies that vte::base::Regex::check_pcre_config_unicode() succeeds
+         * and leaves no error set. */
+        GError* error{nullptr};
+        auto const ok = vte::base::Regex::check_pcre_config_unicode(&error);
+
+        /* Check the GError first: g_assert_no_error() includes the error's
+         * message in the failure output, whereas asserting on the result first
+         * would abort before that message is ever shown. */
+        g_assert_no_error(error);
+        g_assert_true(ok);
+}
+
+int
+main(int argc,
+     char* argv[])
+{
+        setlocale(LC_ALL, ""); /* select the locale from the environment */
+
+        g_test_init(&argc, &argv, nullptr); /* may consume test-framework options from argv */
+
+        g_test_add_func("/vte/regex/unicode", test_regex_unicode); /* register the unicode config check */
+
+        setup_regex_builtins_tests(); /* defined elsewhere; presumably registers the builtin-regex tests */
+
+        return g_test_run(); /* exit status is the result of running the registered tests */
+}
diff --git a/src/vte.cc b/src/vte.cc
index e0930bd2..fa2137da 100644
--- a/src/vte.cc
+++ b/src/vte.cc
@@ -65,6 +65,9 @@
 #include "vtepty-private.h"
 #include "vtegtk.hh"
 
+#include "regex-builtins.hh"
+
+#include <algorithm>
 #include <new> /* placement new */
 
 #ifndef HAVE_ROUND
@@ -816,6 +819,28 @@ Terminal::regex_match_remove(int tag) noexcept
         match_regexes_writable().erase(i);
 }
 
+void
+Terminal::regex_match_add_builtins() noexcept /* appends every builtin regex to the match list */
+{
+        auto& match_regexes = match_regexes_writable();
+        if (!m_match_regex_builtins) /* acquire the builtins object on first use only */
+                m_match_regex_builtins = vte::base::RegexBuiltins::get();
+        for (auto const& [regex, tag] : m_match_regex_builtins->builtins()) { /* NOTE(review): no check for already-added builtins; a second call appends them again */
+                match_regexes.emplace_back(make_ref(regex.get()),
+                                           0 /* match flags */,
+                                           VTE_MATCH_BUILTINS_CURSOR,
+                                           tag); /* tag comes from the builtins table, not from regex_match_next_tag() */
+        }
+}
+
+/*
+ * Terminal::regex_match_remove_builtins:
+ *
+ * Removes every match regex that has a negative tag from the list of
+ * match regexes, leaving the other entries in their original order.
+ */
+void
+Terminal::regex_match_remove_builtins() noexcept
+{
+        auto& match_regexes = match_regexes_writable();
+        /* std::remove_if() only moves the entries to keep to the front and
+         * returns the new logical end; the leftover tail must be erase()d,
+         * otherwise the container keeps its size and nothing is removed.
+         */
+        match_regexes.erase(std::remove_if(std::begin(match_regexes), std::end(match_regexes),
+                                           [](MatchRegex const& rem) { return rem.tag() < 0; }),
+                            std::end(match_regexes));
+}
+
 /*
  * match_rowcol_to_offset:
  * @terminal:
@@ -1195,7 +1220,7 @@ Terminal::match_check_internal(vte::grid::column_t column,
 char*
 Terminal::regex_match_check(vte::grid::column_t column,
                             vte::grid::row_t row,
-                            int* tag)
+                            int* tag_ptr)
 {
        long delta = m_screen->scroll_delta;
        _vte_debug_print(VTE_DEBUG_EVENTS | VTE_DEBUG_REGEX,
@@ -1218,8 +1243,16 @@ Terminal::regex_match_check(vte::grid::column_t column,
        _VTE_DEBUG_IF(VTE_DEBUG_EVENTS | VTE_DEBUG_REGEX) {
                if (ret != NULL) g_printerr("Matched `%s'.\n", ret);
        }
-        if (tag != nullptr)
-                *tag = (match != nullptr) ? match->tag() : -1;
+
+        int tag = -1;
+        if (match != nullptr) {
+                tag = match->tag();
+                if (tag < -1 && m_match_regex_builtins)
+                        tag = m_match_regex_builtins->transform_match(ret, tag);
+        }
+
+        if (tag_ptr != nullptr)
+                *tag_ptr = tag;
 
        return ret;
 }
diff --git a/src/vte/vteenums.h b/src/vte/vteenums.h
index 84d07a0d..a288bc5d 100644
--- a/src/vte/vteenums.h
+++ b/src/vte/vteenums.h
@@ -177,6 +177,20 @@ typedef enum {
         VTE_FORMAT_HTML = 2
 } VteFormat;
 
+/**
+ * VteBuiltinMatchTags:
+ * @VTE_BUILTIN_MATCH_TAG_URI: the match is a URI as recognised by
+ *   the expressions added with vte_terminal_match_add_builtins()
+ *
+ * An enumeration that will be returned from vte_terminal_match_check_event()
+ * if a builtin expression matched.
+ *
+ * Since: 0.58
+ */
+typedef enum {
+        VTE_BUILTIN_MATCH_TAG_URI = -2
+} VteBuiltinMatchTags;
+
 G_END_DECLS
 
 #endif /* __VTE_VTE_ENUMS_H__ */
diff --git a/src/vte/vteterminal.h b/src/vte/vteterminal.h
index 26ac236e..e5e9708c 100644
--- a/src/vte/vteterminal.h
+++ b/src/vte/vteterminal.h
@@ -383,6 +383,8 @@ _VTE_PUBLIC
 int vte_terminal_match_add_regex(VteTerminal *terminal,
                                  VteRegex *regex,
                                  guint32 flags) _VTE_GNUC_NONNULL(1) _VTE_GNUC_NONNULL(2);
+_VTE_PUBLIC
+void vte_terminal_match_add_builtins(VteTerminal *terminal) _VTE_GNUC_NONNULL(1);
 /* Set the cursor to be used when the pointer is over a given match. */
 _VTE_PUBLIC
 void vte_terminal_match_set_cursor_name(VteTerminal *terminal,
@@ -392,6 +394,8 @@ _VTE_PUBLIC
 void vte_terminal_match_remove(VteTerminal *terminal,
                                int tag) _VTE_GNUC_NONNULL(1);
 _VTE_PUBLIC
+void vte_terminal_match_remove_builtins(VteTerminal *terminal) _VTE_GNUC_NONNULL(1);
+_VTE_PUBLIC
 void vte_terminal_match_remove_all(VteTerminal *terminal) _VTE_GNUC_NONNULL(1);
 
 /* Check if a given cell on the screen contains part of a matched string.  If
diff --git a/src/vtedefines.hh b/src/vtedefines.hh
index 71896a76..a3f70dc1 100644
--- a/src/vtedefines.hh
+++ b/src/vtedefines.hh
@@ -79,6 +79,7 @@
 #define VTE_MOUSING_CURSOR             GDK_LEFT_PTR
 #define VTE_HYPERLINK_CURSOR           GDK_HAND2
 #define VTE_HYPERLINK_CURSOR_DEBUG     GDK_SPIDER
+#define VTE_MATCH_BUILTINS_CURSOR       GDK_HAND2
 #define VTE_CHILD_INPUT_PRIORITY       G_PRIORITY_DEFAULT_IDLE
 #define VTE_CHILD_OUTPUT_PRIORITY      G_PRIORITY_HIGH
 #define VTE_MAX_INPUT_READ             0x1000
diff --git a/src/vtegtk.cc b/src/vtegtk.cc
index 3243e10a..2a289b9e 100644
--- a/src/vtegtk.cc
+++ b/src/vtegtk.cc
@@ -1913,6 +1913,9 @@ vte_terminal_paste_primary(VteTerminal *terminal)
  * user moves the mouse cursor over a section of displayed text which matches
  * this expression, the text will be highlighted.
  *
+ * When vte_terminal_match_check_event() returns a match for this regex, the
+ * returned tag will be the return value of this function.
+ *
  * Returns: an integer associated with this expression, or -1 if @gregex could not be
  *   transformed into a #VteRegex or @gflags were incompatible
  *
@@ -1939,13 +1942,20 @@ vte_terminal_match_add_gregex(VteTerminal *terminal,
  * vte_terminal_match_add_regex:
  * @terminal: a #VteTerminal
  * @regex: (transfer none): a #VteRegex
- * @flags: PCRE2 match flags, or 0
+ * @flags: PCRE2 match flags, or 0 to use the default flags
  *
  * Adds the regular expression @regex to the list of matching expressions.  When the
  * user moves the mouse cursor over a section of displayed text which matches
  * this expression, the text will be highlighted.
  *
- * Returns: an integer associated with this expression
+ * When vte_terminal_match_check_event() returns a match for this regex, the
+ * returned tag will be the return value of this function.
+ *
+ * Note that the default flags only contain PCRE2_UTF (and some flags for internal use);
+ * if you want to match unicode properties, you need to pass PCRE2_UCP in @flags.
+ * See man:pcre2_compile(3) for more information on available flags.
+ *
+ * Returns: a nonnegative integer associated with this expression
  *
  * Since: 0.46
  */
@@ -1966,6 +1976,30 @@ vte_terminal_match_add_regex(VteTerminal *terminal,
                                      impl->regex_match_next_tag()).tag();
 }
 
+/**
+ * vte_terminal_match_add_builtins:
+ * @terminal: a #VteTerminal
+ *
+ * Adds regular expressions to recognise URIs to the list of matching expressions.
+ * When the user moves the mouse cursor over a section of displayed text which matches
+ * one of these expressions, the text will be highlighted.
+ *
+ * When vte_terminal_match_check_event() returns a match for one of these regexes, the
+ * returned tag will be a value from #VteBuiltinMatchTags.
+ *
+ * Use vte_terminal_match_remove_builtins() or vte_terminal_match_remove_all() to remove
+ * the matching expressions added by this function.
+ *
+ * Since: 0.58
+ */
+void
+vte_terminal_match_add_builtins(VteTerminal *terminal)
+{
+       g_return_if_fail(VTE_IS_TERMINAL(terminal));
+
+        IMPL(terminal)->regex_match_add_builtins();
+}
+
 /**
  * vte_terminal_match_check:
  * @terminal: a #VteTerminal
@@ -2006,20 +2040,24 @@ vte_terminal_match_check(VteTerminal *terminal,
  *
  * Checks if the text in and around the position of the event matches any of the
  * regular expressions previously set using vte_terminal_match_add().  If a
- * match exists, the text string is returned and if @tag is not %NULL, the number
- * associated with the matched regular expression will be stored in @tag.
+ * match exists, the text string is returned.
  *
- * If more than one regular expression has been set with
- * vte_terminal_match_add(), then expressions are checked in the order in
- * which they were added.
+ * If @tag is not %NULL, it will store the nonnegative integer associated with the
+ * matched regular expression if it was added with vte_terminal_match_add_regex(),
+ * or a negative number from #VteBuiltinMatchTags if the matching regular expression
+ * is one added with vte_terminal_match_add_builtins(), or -1 if there is
+ * no match.
+ *
+ * Expressions are checked in the order in which they were added, returning the
+ * first match.
  *
  * Returns: (transfer full): a newly allocated string which matches one of the previously
  *   set regular expressions
  */
 char *
-vte_terminal_match_check_event(VteTerminal *terminal,
-                               GdkEvent *event,
-                               int *tag)
+vte_terminal_match_check_event(VteTerminal* terminal,
+                               GdkEvent* event,
+                               int* tag)
 {
         g_return_val_if_fail(VTE_IS_TERMINAL(terminal), FALSE);
         return IMPL(terminal)->regex_match_check(event, tag);
@@ -2194,20 +2232,36 @@ vte_terminal_match_set_cursor_name(VteTerminal *terminal,
 /**
  * vte_terminal_match_remove:
  * @terminal: a #VteTerminal
- * @tag: the tag of the regex to remove
+ * @tag: the nonnegative tag of the regex to remove
  *
  * Removes the regular expression which is associated with the given @tag from
  * the list of expressions which the terminal will highlight when the user
  * moves the mouse cursor over matching text.
  */
 void
-vte_terminal_match_remove(VteTerminal *terminal, int tag)
+vte_terminal_match_remove(VteTerminal *terminal,
+                          int tag)
 {
        g_return_if_fail(VTE_IS_TERMINAL(terminal));
         g_return_if_fail(tag >= 0);
         IMPL(terminal)->regex_match_remove(tag);
 }
 
+/**
+ * vte_terminal_match_remove_builtins:
+ * @terminal: a #VteTerminal
+ *
+ * Removes the regular expressions added with vte_terminal_match_add_builtins().
+ *
+ * Since: 0.58
+ */
+void
+vte_terminal_match_remove_builtins(VteTerminal *terminal)
+{
+       g_return_if_fail(VTE_IS_TERMINAL(terminal));
+        IMPL(terminal)->regex_match_remove_builtins();
+}
+
 /**
  * vte_terminal_match_remove_all:
  * @terminal: a #VteTerminal
diff --git a/src/vteinternal.hh b/src/vteinternal.hh
index 8ebd99c9..cca7b210 100644
--- a/src/vteinternal.hh
+++ b/src/vteinternal.hh
@@ -37,8 +37,10 @@
 
 #include "chunk.hh"
 #include "utf8.hh"
+#include "fwd.hh"
 
 #include <list>
+#include <memory>
 #include <queue>
 #include <string>
 #include <variant>
@@ -537,6 +539,10 @@ public:
                 return match_regexes_writable().emplace_back(std::forward<Args>(args)...);
         }
 
+        std::shared_ptr<vte::base::RegexBuiltins> m_match_regex_builtins{};
+        void regex_match_add_builtins() noexcept;
+        void regex_match_remove_builtins() noexcept;
+
         char* m_match_contents;
         GArray* m_match_attributes;
         char* m_match;
diff --git a/src/vteregex.cc b/src/vteregex.cc
index d95c77aa..39fc588b 100644
--- a/src/vteregex.cc
+++ b/src/vteregex.cc
@@ -24,9 +24,9 @@
 
 #include "config.h"
 
-#include "vtemacros.h"
-#include "vteenums.h"
-#include "vteregex.h"
+#include "vte/vtemacros.h"
+#include "vte/vteenums.h"
+#include "vte/vteregex.h"
 #include "vtepcre2.h"
 
 #include "regex.hh"


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]