[epiphany/gnome-3-22] uri-tester: Ensure regexps are properly constructed



commit 30a34f2a0a97d951dea8de923feafca6d0846011
Author: Adrian Perez de Castro <aperez igalia com>
Date:   Fri Feb 3 00:54:51 2017 +0200

    uri-tester: Ensure regexps are properly constructed
    
    This adds a few more cases to the escaping done when converting an AdBlock
    non-regexp "simple pattern" from a rule into a GRegex. This patch does the
    following:
    
    - Adds escaping to some of the regexp metacharacters which were not being
      handled: (){}+.|\
    - Adds support for using a vertical bar at the end of a pattern to anchor the
      match at the end.
    - Adds support for using ^ to match a "separator character" (a non-letter,
      non-number, or one of _-.%).
    
    This also adds as many comment lines as code, which in this particular case
    is probably a good thing, so reading the code in the future does not require
    checking each case against the GRegex documentation.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=777714

 embed/web-extension/ephy-uri-tester.c |   56 +++++++++++++++++++++++---------
 1 files changed, 40 insertions(+), 16 deletions(-)
---
diff --git a/embed/web-extension/ephy-uri-tester.c b/embed/web-extension/ephy-uri-tester.c
index 2e270cc..8840975 100644
--- a/embed/web-extension/ephy-uri-tester.c
+++ b/embed/web-extension/ephy-uri-tester.c
@@ -374,7 +374,6 @@ static GString *
 ephy_uri_tester_fixup_regexp (const char *prefix, char *src)
 {
   GString *str;
-  int len = 0;
 
   if (!src)
     return NULL;
@@ -386,26 +385,56 @@ ephy_uri_tester_fixup_regexp (const char *prefix, char *src)
     (void)*src++;
   }
 
+  /* NOTE: The '$' is used as separator for the rule options, so rule patterns
+     cannot ever contain them. If a rule needs to match it, it uses "%24".
+     Splitting the option is done in ephy_uri_tester_add_url_pattern().
+
+     The loop below always escapes square brackets. This way there is no chance
+     that they get interpreted as a character class, and it is NOT needed to
+     escape '-' because it's only special inside a character class. */
   do {
     switch (*src) {
       case '*':
         g_string_append (str, ".*");
         break;
-      /*case '.':
-         g_string_append (str, "\\.");
-         break;*/
+      case '^':
+      /* Matches a separator character, defined as:
+       * "anything but a letter, a digit, or one of the following: _ - . %" */
+        g_string_append (str, "([^a-zA-Z\\d]|[_\\-\\.%])");
+        break;
+      case '|':
+      /* If at the end of the pattern, the match is anchored at the end. In
+       * the middle of a pattern it matches a literal vertical bar and the
+       * character must be escaped. */
+        if (src[1] == '\0')
+          g_string_append (str, "$");
+        else
+          g_string_append (str, "\\|");
+        break;
+      /* The following characters are escaped as they have a meaning in
+       * regular expressions:
+       *   - '.' matches any character.
+       *   - '+' matches the preceding pattern one or more times.
+       *   - '?' matches the preceding pattern zero or one times.
+       *   - '[' ']' are used to define a character class.
+       *   - '{' '}' are used to define a min/max quantifier.
+       *   - '(' ')' are used to define a submatch expression.
+       *   - '\' has several uses in regexps (shortcut character classes,
+       *     matching non-printing characters, using octal/hex, octal
+       *     constants, backreferences... they must be escaped to
+       *     match a literal backslash and prevent wreaking havoc!). */
+      case '.':
+      case '+':
       case '?':
       case '[':
       case ']':
+      case '{':
+      case '}':
+      case '(':
+      case ')':
+      case '\\':
         g_string_append_printf (str, "\\%c", *src);
         break;
-      case '|':
-      /* FIXME: We actually need to match :[0-9]+ or '/'. Sign means
-         "here could be port number or nothing". So bla.com^ will match
-         bla.com/ or bla.com:8080/ but not bla.com.au/ */
-      case '^':
-      case '+':
-        break;
       default:
         g_string_append_printf (str, "%c", *src);
         break;
@@ -413,11 +442,6 @@ ephy_uri_tester_fixup_regexp (const char *prefix, char *src)
     src++;
   } while (*src);
 
-  len = str->len;
-  /* We dont need .* in the end of url. Thats stupid */
-  if (str->str && str->str[len - 1] == '*' && str->str[len - 2] == '.')
-    g_string_erase (str, len - 2, 2);
-
   return str;
 }
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]