[glib] regex: if PCRE is 8.34 or later, disable auto-possessification for DFA



commit bf181a3ac78e824ca7e67ecfb2ba957e740594d7
Author: Simon McVittie <simon mcvittie collabora co uk>
Date:   Mon Apr 27 14:38:41 2015 +0100

    regex: if PCRE is 8.34 or later, disable auto-possessification for DFA
    
    Normally, recent PCRE behaves as if certain patterns were replaced
    by a more "possessive" pattern that gives the same answer for normal
    regex matching, but is more efficient. However, the modified pattern
    produces fewer results under DFA. If we want the full set of results
    we have to apply PCRE_NO_AUTO_POSSESS, and that's a compile-time flag.
    
    This currently only affects a system PCRE, but would also work fine for
    an internal PCRE 8.34 or later if the embedded copy is updated.
    
    Bug: https://bugzilla.gnome.org/show_bug.cgi?id=733325
    Reviewed-by: Christian Persch <chpe gnome org>

 glib/gregex.c |  128 ++++++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 95 insertions(+), 33 deletions(-)
---
diff --git a/glib/gregex.c b/glib/gregex.c
index 41bf67e..1c141b2 100644
--- a/glib/gregex.c
+++ b/glib/gregex.c
@@ -1267,6 +1267,15 @@ g_regex_unref (GRegex *regex)
     }
 }
 
+/*
+ * @match_options: (inout) (optional):
+ */
+static pcre *regex_compile (const gchar         *pattern,
+                            GRegexCompileFlags   compile_options,
+                            GRegexCompileFlags  *compile_options_out,
+                            GRegexMatchFlags    *match_options,
+                            GError             **error);
+
 /**
  * g_regex_new:
  * @pattern: the regular expression
@@ -1291,12 +1300,8 @@ g_regex_new (const gchar         *pattern,
   GRegex *regex;
   pcre *re;
   const gchar *errmsg;
-  gint erroffset;
-  gint errcode;
   gboolean optimize = FALSE;
   static volatile gsize initialised = 0;
-  unsigned long int pcre_compile_options;
-  GRegexCompileFlags nonpcre_compile_options;
 
   g_return_val_if_fail (pattern != NULL, NULL);
   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
@@ -1325,13 +1330,61 @@ g_regex_new (const gchar         *pattern,
       return NULL;
     }
 
-  nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
-
   /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
    * as we do not need to wrap PCRE_NO_UTF8_CHECK. */
   if (compile_options & G_REGEX_OPTIMIZE)
     optimize = TRUE;
 
+  re = regex_compile (pattern, compile_options, &compile_options,
+                      &match_options, error);
+
+  if (re == NULL)
+    return NULL;
+
+  regex = g_new0 (GRegex, 1);
+  regex->ref_count = 1;
+  regex->pattern = g_strdup (pattern);
+  regex->pcre_re = re;
+  regex->compile_opts = compile_options;
+  regex->match_opts = match_options;
+
+  if (optimize)
+    {
+      regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
+      if (errmsg != NULL)
+        {
+          GError *tmp_error = g_error_new (G_REGEX_ERROR,
+                                           G_REGEX_ERROR_OPTIMIZE,
+                                           _("Error while optimizing "
+                                             "regular expression %s: %s"),
+                                           regex->pattern,
+                                           errmsg);
+          g_propagate_error (error, tmp_error);
+
+          g_regex_unref (regex);
+          return NULL;
+        }
+    }
+
+  return regex;
+}
+
+static pcre *
+regex_compile (const gchar         *pattern,
+               GRegexCompileFlags   compile_options,
+               GRegexCompileFlags  *compile_options_out,
+               GRegexMatchFlags    *match_options,
+               GError             **error)
+{
+  pcre *re;
+  const gchar *errmsg;
+  gint erroffset;
+  gint errcode;
+  GRegexCompileFlags nonpcre_compile_options;
+  unsigned long int pcre_compile_options;
+
+  nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
+
   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
    * instead uses UTF-8 only if required with PCRE_UTF8. */
   if (compile_options & G_REGEX_RAW)
@@ -1343,7 +1396,9 @@ g_regex_new (const gchar         *pattern,
     {
       /* enable utf-8 */
       compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
-      match_options |= PCRE_NO_UTF8_CHECK;
+
+      if (match_options != NULL)
+        *match_options |= PCRE_NO_UTF8_CHECK;
     }
 
   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
@@ -1408,32 +1463,10 @@ g_regex_new (const gchar         *pattern,
         compile_options |= G_REGEX_DUPNAMES;
     }
 
-  regex = g_new0 (GRegex, 1);
-  regex->ref_count = 1;
-  regex->pattern = g_strdup (pattern);
-  regex->pcre_re = re;
-  regex->compile_opts = compile_options;
-  regex->match_opts = match_options;
-
-  if (optimize)
-    {
-      regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
-      if (errmsg != NULL)
-        {
-          GError *tmp_error = g_error_new (G_REGEX_ERROR,
-                                           G_REGEX_ERROR_OPTIMIZE,
-                                           _("Error while optimizing "
-                                             "regular expression %s: %s"),
-                                           regex->pattern,
-                                           errmsg);
-          g_propagate_error (error, tmp_error);
-
-          g_regex_unref (regex);
-          return NULL;
-        }
-    }
+  if (compile_options_out != 0)
+    *compile_options_out = compile_options;
 
-  return regex;
+  return re;
 }
 
 /**
@@ -1873,6 +1906,8 @@ g_regex_match_all_full (const GRegex      *regex,
 {
   GMatchInfo *info;
   gboolean done;
+  pcre *pcre_re;
+  pcre_extra *extra;
 
   g_return_val_if_fail (regex != NULL, FALSE);
   g_return_val_if_fail (string != NULL, FALSE);
@@ -1880,6 +1915,29 @@ g_regex_match_all_full (const GRegex      *regex,
   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
 
+#ifdef PCRE_NO_AUTO_POSSESS
+  /* For PCRE >= 8.34 we need to disable auto-possessification (by setting
+   * PCRE_NO_AUTO_POSSESS): it is an optimization for normal regex matching,
+   * but omits some shorter matches here, an observable behaviour change.
+   *
+   * DFA matching is rather niche, and very rarely used according to
+   * codesearch.debian.net, so don't bother caching the recompiled RE. */
+  pcre_re = regex_compile (regex->pattern,
+                           regex->compile_opts | PCRE_NO_AUTO_POSSESS,
+                           NULL, NULL, error);
+
+  if (pcre_re == NULL)
+    return FALSE;
+
+  /* Not bothering to cache the optimization data either, with similar
+   * reasoning */
+  extra = NULL;
+#else
+  /* For PCRE < 8.34 the precompiled regex is fine. */
+  pcre_re = regex->pcre_re;
+  extra = regex->extra;
+#endif
+
   info = match_info_new (regex, string, string_len, start_position,
                          match_options, TRUE);
 
@@ -1887,7 +1945,7 @@ g_regex_match_all_full (const GRegex      *regex,
   while (!done)
     {
       done = TRUE;
-      info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
+      info->matches = pcre_dfa_exec (pcre_re, extra,
                                      info->string, info->string_len,
                                      info->pos,
                                      regex->match_opts | match_options,
@@ -1917,6 +1975,10 @@ g_regex_match_all_full (const GRegex      *regex,
         }
     }
 
+#ifdef PCRE_NO_AUTO_POSSESS
+  pcre_free (pcre_re);
+#endif
+
   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
   info->pos = -1;
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]