[smuxi: 146/179] Engine(-Tests): support ignoring leading characters in message patterns

From: Mirco M. M. Bauer <mmmbauer src gnome org>
To: commits-list gnome org
Cc:
Subject: [smuxi: 146/179] Engine(-Tests): support ignoring leading characters in message patterns
Date: Sat, 4 Nov 2017 05:49:58 +0000 (UTC)
commit bc56465a3658794817178c7f5283395e4035be77
Author: Mirco Bauer <meebey meebey net>
Date:   Sun May 21 14:24:13 2017 +0800

    Engine(-Tests): support ignoring leading characters in message patterns
    
    A message pattern may need to match a specific pattern, such as a word, while
    removing/ignoring leading characters. Removing and ignoring trailing characters
    from the match was already possible by using the DelimiterForEndOfPattern capture
    group, but there was no equivalent for leading characters. This feature is now
    extended with a capture group named DelimiterForStartOfPattern.
    
    So for example a Smuxi message pattern with a regex of:
    
        (?<DelimiterForStartOfPattern>\s|\W)([a-fA-F0-9]{64})(?<DelimiterForEndOfPattern>\s|\W)
    
    This will match bitcoin transaction hashes consisting of 64 hex digits, but
    the whitespace (\s) and non-word characters (\W) in front of or behind the hash
    won't be part of the linked result. This avoids false positives when the pattern
    is likely to be found within words.
    
    There was also a bug in the existing DelimiterForEndOfPattern implementation
    that would remove text from the {0} capture group which should contain the
    complete text of the matched part. This is now fixed by retaining {0} to contain
    the originally matched value and by using an explicit capture group {1} in the
    heuristic domain regex.

 src/Engine-Tests/MessageBuilderTests.cs     |   27 ++++++++++++++++++++++++++-
 src/Engine/Config/MessageBuilderSettings.cs |   10 ++++++----
 src/Engine/Messages/MessageBuilder.cs       |   27 +++++++++++++++++----------
 3 files changed, 49 insertions(+), 15 deletions(-)
---
diff --git a/src/Engine-Tests/MessageBuilderTests.cs b/src/Engine-Tests/MessageBuilderTests.cs
index 2b9807e..1f04985 100644
--- a/src/Engine-Tests/MessageBuilderTests.cs
+++ b/src/Engine-Tests/MessageBuilderTests.cs
@@ -1,6 +1,6 @@
 // Smuxi - Smart MUltipleXed Irc
 //
-// Copyright (c) 2013-2015 Mirco Bauer <meebey meebey net>
+// Copyright (c) 2013-2017 Mirco Bauer <meebey meebey net>
 //
 // Full GPL License: <http://www.gnu.org/licenses/gpl.txt>
 // 
@@ -626,6 +626,23 @@ namespace Smuxi.Engine
         }
 
         [Test]
+        public void AppendMessageWithHeuristicDomainAndLeadingAndTrailingParanthesis()
+        {
+            var msg = "leading text (example.com) trailing text";
+            var builder = new MessageBuilder();
+            builder.TimeStamp = DateTime.MinValue;
+            builder.Append(new TextMessagePartModel("leading text ("));
+            builder.Append(
+                new UrlMessagePartModel(
+                    "http://example.com",
+                    "example.com"
+                )
+            );
+            builder.Append(new TextMessagePartModel(") trailing text"));
+            TestMessage(msg, builder.ToMessage());
+        }
+
+        [Test]
         public void AppendMessageWithIdnLink()
         {
             var msg = "http://www.brasileirão.com";
@@ -649,6 +666,14 @@ namespace Smuxi.Engine
             builder.TimeStamp = DateTime.MinValue;
             builder.Append(new UrlMessagePartModel("http://www.ietf.org/rfc/rfc2812.txt", "RFC2812"));
             TestMessage(msg, builder.ToMessage());
+
+            msg = "(CVE-2017-0144)";
+            builder = new MessageBuilder();
+            builder.TimeStamp = DateTime.MinValue;
+            builder.Append(new TextMessagePartModel("("));
+            builder.Append(new UrlMessagePartModel("http://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2017-0144", "CVE-2017-0144"));
+            builder.Append(new TextMessagePartModel(")"));
+            TestMessage(msg, builder.ToMessage());
         }
 
         [Test]
diff --git a/src/Engine/Config/MessageBuilderSettings.cs b/src/Engine/Config/MessageBuilderSettings.cs
index 06bec4a..7177d99 100644
--- a/src/Engine/Config/MessageBuilderSettings.cs
+++ b/src/Engine/Config/MessageBuilderSettings.cs
@@ -1,6 +1,6 @@
 // Smuxi - Smart MUltipleXed Irc
 //
-// Copyright (c) 2011, 2014-2015 Mirco Bauer <meebey meebey net>
+// Copyright (c) 2011, 2014-2017 Mirco Bauer <meebey meebey net>
 //
 // Full GPL License: <http://www.gnu.org/licenses/gpl.txt>
 //
@@ -68,6 +68,7 @@ namespace Smuxi.Engine
             HighlightWords = settings.HighlightWords;
         }
 
+        internal const string StartDelimiterGroupName = "DelimiterForStartOfPattern";
         internal const string EndDelimiterGroupName = "DelimiterForEndOfPattern";
 
         static void InitBuiltinSmartLinks()
@@ -94,6 +95,8 @@ namespace Smuxi.Engine
             string path = @"/(?:["+ path_chars +"]*["+ path_last_chars +"]+)?";
             string protocol = @"[a-z][a-z0-9\-+]*://";
             string protocol_user_host_port_path = protocol + user_host_port + "(?:" + path + ")?";
+            string start_delimiter = String.Format(@"(?<{0}>^|\s|\W)", StartDelimiterGroupName);
+            string end_delimiter = String.Format(@"(?<{0}>$|\s|\W)", EndDelimiterGroupName);
 
             // facebook attachment
             var regex = new Regex(
@@ -125,14 +128,13 @@ namespace Smuxi.Engine
             // include well known TLDs to prevent autogen.sh, configure.ac or
             // Gst.Buffer.Unref() from matching
             string heuristic_domain = @"(?:(?:" + subdomain + ")+(?:" + common_tld + ")|localhost)";
-            string end_delimiter = String.Format(@"(?<{0}>$|\s|\W)", EndDelimiterGroupName);
-            string heuristic_address = heuristic_domain + "(?:" + path + ")?" + end_delimiter;
+            string heuristic_address = @"(" +heuristic_domain + ")(?:" + path + ")?" + end_delimiter;
             regex = new Regex(
                 heuristic_address,
                 RegexOptions.IgnoreCase | RegexOptions.Compiled
             );
             BuiltinPatterns.Add(new MessagePatternModel(regex) {
-                LinkFormat = "http://{0}"
+                LinkFormat = "http://{1}"
             });
 
             // Smuxi bugtracker
diff --git a/src/Engine/Messages/MessageBuilder.cs b/src/Engine/Messages/MessageBuilder.cs
index 739c2cc..512004e 100644
--- a/src/Engine/Messages/MessageBuilder.cs
+++ b/src/Engine/Messages/MessageBuilder.cs
@@ -1,7 +1,8 @@
 // Smuxi - Smart MUltipleXed Irc
 // 
-// Copyright (c) 2010-2014 Mirco Bauer <meebey meebey net>
+// Copyright (c) 2010-2017 Mirco Bauer <meebey meebey net>
 // Copyright (c) 2013 Oliver Schneider <mail oli-obk de>
+// Copyright (c) 2016 Andres G. Aragoneses <knocte gmail com>
 // 
 // Full GPL License: <http://www.gnu.org/licenses/gpl.txt>
 // 
@@ -892,41 +893,47 @@ namespace Smuxi.Engine
             
             int lastindex = 0;
             do {
-                var delimiterLength = 0;
+                var startDelimiterLength = 0;
+                var regexDelimiterForStartOfPatternValue = match.Groups[MessageBuilderSettings.StartDelimiterGroupName];
+                if (regexDelimiterForStartOfPatternValue != null) {
+                    startDelimiterLength = regexDelimiterForStartOfPatternValue.Value.Length;
+                }
+                var endDelimiterLength = 0;
                 var regexDelimiterForEndOfPatternValue = match.Groups[MessageBuilderSettings.EndDelimiterGroupName];
                 if (regexDelimiterForEndOfPatternValue != null) {
-                    delimiterLength = regexDelimiterForEndOfPatternValue.Value.Length;
+                    endDelimiterLength = regexDelimiterForEndOfPatternValue.Value.Length;
                 }
 
                 var groupValues = match.Groups.Cast<Group>()
 
                     // don't get the delimiter because it only determines
-                    // the end of pattern, which is not part of the pattern
-                    .Where(g => g != regexDelimiterForEndOfPatternValue)
+                    // the start or end of pattern, which is not part of the pattern
+                    .Where(g => g != regexDelimiterForStartOfPatternValue &&
+                                g != regexDelimiterForEndOfPatternValue)
 
                     .Select(g => g.Value).ToArray();
 
                 string url;
                 if (String.IsNullOrEmpty(pattern.LinkFormat)) {
                     url = match.Value;
+                    url = url.Substring(0 + startDelimiterLength, url.Length - (startDelimiterLength - endDelimiterLength));
                 } else {
                     url = String.Format(pattern.LinkFormat, groupValues);
                 }
-                url = url.Substring(0, url.Length - delimiterLength);
                 string text;
                 if (String.IsNullOrEmpty(pattern.TextFormat)) {
                     text = match.Value;
                 } else {
                     text = String.Format(pattern.TextFormat, groupValues);
                 }
-                text = text.Substring(0, text.Length - delimiterLength);
+                text = text.Substring(0 + startDelimiterLength, text.Length - (startDelimiterLength + endDelimiterLength));
 
                 if (lastindex != match.Index) {
                     // there were some non-matching-chars before the match
                     // copy that to a TextMessagePartModel
                     var notMatchPart = new TextMessagePartModel(textPart);
                     // only take the proper chunk of text
-                    notMatchPart.Text = textPart.Text.Substring(lastindex, match.Index - lastindex);
+                    notMatchPart.Text = textPart.Text.Substring(lastindex, match.Index + startDelimiterLength - lastindex);
                     // and try other patterns on this part
                     var parts = ParsePatterns(notMatchPart, remainingPatterns);
                     foreach (var part in parts) {
@@ -945,12 +952,12 @@ namespace Smuxi.Engine
                     msgPart = new TextMessagePartModel(text);
                 }
                 msgParts.Add(msgPart);
-                lastindex = match.Index + match.Length - delimiterLength;
+                lastindex = match.Index + match.Length - endDelimiterLength;
                 match = match.NextMatch();
             } while (match.Success);
             
             if (lastindex != textPart.Text.Length) {
-                // there were some non-url-chars before this url
+                // there were some non-matching-chars after the last match
                 // copy TextMessagePartModel
                 var notMatchPart = new TextMessagePartModel(textPart);
                 // only take the proper chunk of text
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]