[california] Improved URI parsing



commit 7ee08ee545768b5d74100113670266b271431694
Author: Jim Nelson <jim yorba org>
Date:   Thu Nov 13 17:26:01 2014 -0800

    Improved URI parsing
    
    John Gruber updated the URI-parsing regex we were using in February
    2014, so moved to it.  This new regex also allows us to catch mailto:
    URIs.  The linkify code now allows transformation of the URI itself
    when it's linked, so (for example) we could in the future ellipsize
    long URIs and use a tooltip to give the user the full string.

 src/host/host-show-event.vala |    6 ++++--
 src/util/util-markup.vala     |   37 +++++++++++++++++++++----------------
 2 files changed, 25 insertions(+), 18 deletions(-)
---
diff --git a/src/host/host-show-event.vala b/src/host/host-show-event.vala
index 721156d..f5cd1d0 100644
--- a/src/host/host-show-event.vala
+++ b/src/host/host-show-event.vala
@@ -211,17 +211,19 @@ public class ShowEvent : Gtk.Grid, Toolkit.Card {
     }
     
     private bool linkify_delegate(string uri, bool known_protocol, out string? pre_markup,
-        out string? post_markup) {
+        out string? markup, out string? post_markup) {
         // preserve but don't linkify if unknown protocol
         if (!known_protocol) {
             pre_markup = null;
+            markup = null;
             post_markup = null;
             
             return true;
         }
         
-        // anchor it
+        // anchor it and preserve uri (i.e. markup = null)
         pre_markup = "<a href=\"%s\">".printf(uri);
+        markup = null;
         post_markup = "</a>";
         
         return true;
diff --git a/src/util/util-markup.vala b/src/util/util-markup.vala
index e633cf1..b5b0c69 100644
--- a/src/util/util-markup.vala
+++ b/src/util/util-markup.vala
@@ -11,27 +11,32 @@ namespace California.Markup {
  *
  * known_protocol indicates the URI has a well-known protocol (i.e. http:// or ftp://, etc.)
  *
+ * markup can hold a new string that is placed in between the pre- and post-markup strings.  If
+ * null or an empty string is returned, uri will be used.
+ *
  * Returns false if the uri should not be included in the string returned by {@link linkify}.  To
- * leave a URI bare, return null for both strings and return true.
+ * leave a URI bare, return null for pre_markup, markup, and post_markup, and return true.
  */
 public delegate bool LinkifyDelegate(string uri, bool known_protocol, out string? pre_markup,
-    out string? post_markup);
+    out string? markup, out string? post_markup);
 
-// Regex to detect URLs.
-// Originally from http://daringfireball.net/2010/07/improved_regex_for_matching_urls
-private const string URL_REGEX = 
"(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))";
+// Regex to detect URIs.
+// Originally from https://gist.github.com/gruber/249502
+// See http://daringfireball.net/2010/07/improved_regex_for_matching_urls for note on earlier version
+// of this regex.
+private const string URI_REGEX = 
"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""";
 
-// Regex to determine if a URL has a known protocol.
-private const string PROTOCOL_REGEX = 
"^(aim|apt|bitcoin|cvs|ed2k|ftp|file|finger|git|gtalk|http|https|irc|ircs|irc6|lastfm|ldap|ldaps|magnet|news|nntp|rsync|sftp|skype|smb|sms|svn|telnet|tftp|ssh|webcal|xmpp):";
+// Regex to determine if a URI has a well-known protocol.
+private const string PROTOCOL_REGEX = 
"^(aim|apt|bitcoin|cvs|ed2k|ftp|file|finger|git|gtalk|http|https|irc|ircs|irc6|lastfm|ldap|ldaps|magnet|mailto|news|nntp|rsync|sftp|skype|smb|sms|svn|telnet|tftp|ssh|webcal|xmpp):";
 
-private Regex url_regex;
+private Regex uri_regex;
 private Regex protocol_regex;
 
 /**
  * Called by Util.init()
  */
 internal void init() throws Error {
-    url_regex = new Regex(URL_REGEX, RegexCompileFlags.CASELESS | RegexCompileFlags.OPTIMIZE);
+    uri_regex = new Regex(URI_REGEX, RegexCompileFlags.CASELESS | RegexCompileFlags.OPTIMIZE);
     protocol_regex = new Regex(PROTOCOL_REGEX, RegexCompileFlags.CASELESS | RegexCompileFlags.OPTIMIZE);
 }
 
@@ -39,7 +44,7 @@ internal void init() throws Error {
  * Called by Util.terminate()
  */
 internal void terminate() {
-    url_regex = null;
+    uri_regex = null;
     protocol_regex = null;
 }
 
@@ -53,21 +58,21 @@ public string? linkify(string? unlinked, LinkifyDelegate linkify_cb) {
         return unlinked;
     
     try {
-        return url_regex.replace_eval(unlinked, -1, 0, 0, (match_info, result) => {
+        return uri_regex.replace_eval(unlinked, -1, 0, 0, (match_info, result) => {
             // match zero is the only match we're interested in
-            string? url = match_info.fetch(0);
-            if (String.is_empty(url))
+            string? uri = match_info.fetch(0);
+            if (String.is_empty(uri))
                 return false;
             
             // have original caller provide markup (or drop the URL)
-            string? pre_markup, post_markup;
-            if (!linkify_cb(url, protocol_regex.match(url), out pre_markup, out post_markup))
+            string? pre_markup, markup, post_markup;
+            if (!linkify_cb(uri, protocol_regex.match(uri), out pre_markup, out markup, out post_markup))
                 return false;
             
             // put it all together
             result.append_printf("%s%s%s",
                 (pre_markup != null) ? pre_markup : "",
-                url,
+                String.is_empty(markup) ? uri : markup,
                 (post_markup != null) ? post_markup : ""
             );
             


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]