[gtksourceview/wip/chergert/pcre2] pcre2: start on pcre2 implementation




commit 9a7f7d62d0ef7490efd130b281b8c43fce4b0403
Author: Christian Hergert <chergert redhat com>
Date:   Fri Sep 25 10:23:18 2020 -0700

    pcre2: start on pcre2 implementation

 gtksourceview/gtksourceregex.c    |   4 +-
 gtksourceview/implregex-private.h |   6 +-
 gtksourceview/implregex.c         | 490 ++++++++++++++++++++++++++++++--------
 3 files changed, 403 insertions(+), 97 deletions(-)
---
diff --git a/gtksourceview/gtksourceregex.c b/gtksourceview/gtksourceregex.c
index 80d334a2..dc36c1ac 100644
--- a/gtksourceview/gtksourceregex.c
+++ b/gtksourceview/gtksourceregex.c
@@ -354,8 +354,8 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex,
                                   gint           *start_pos_p, /* byte offsets */
                                   gint           *end_pos_p)   /* byte offsets */
 {
-       gint start_pos;
-       gint end_pos;
+       gint start_pos = -1;
+       gint end_pos = -1;
 
        g_assert (regex->resolved);
 
diff --git a/gtksourceview/implregex-private.h b/gtksourceview/implregex-private.h
index da52474e..2b8424bc 100644
--- a/gtksourceview/implregex-private.h
+++ b/gtksourceview/implregex-private.h
@@ -41,6 +41,7 @@ gboolean    impl_regex_match                (const ImplRegex        *regex,
                                              const char             *string,
                                              GRegexMatchFlags        match_options,
                                              ImplMatchInfo         **match_info);
+ImplRegex  *impl_regex_ref                  (ImplRegex              *regex);
 void        impl_regex_unref                (ImplRegex              *regex);
 void        impl_match_info_free            (ImplMatchInfo          *match_info);
 char       *impl_match_info_fetch           (const ImplMatchInfo    *match_info,
@@ -58,7 +59,7 @@ char       *impl_regex_replace_eval         (const ImplRegex        *regex,
 gboolean    impl_regex_match_full           (const ImplRegex        *regex,
                                              const char             *string,
                                              gssize                  string_len,
-                                             int                     start_position,
+                                             gsize                   start_position,
                                              GRegexMatchFlags        match_options,
                                              ImplMatchInfo         **match_info,
                                              GError                **error);
@@ -70,6 +71,9 @@ gboolean    impl_match_info_fetch_named_pos (const ImplMatchInfo    *match_info,
                                              const char             *name,
                                              int                    *start_pos,
                                              int                    *end_pos);
+gboolean    impl_match_info_matches         (const ImplMatchInfo    *match_info);
+gboolean    impl_match_info_next            (ImplMatchInfo          *match_info,
+                                             GError                **error);
 const char *impl_regex_get_pattern          (const ImplRegex        *regex);
 
 G_END_DECLS
diff --git a/gtksourceview/implregex.c b/gtksourceview/implregex.c
index 56a12799..89808c79 100644
--- a/gtksourceview/implregex.c
+++ b/gtksourceview/implregex.c
@@ -21,46 +21,100 @@
 
 #include "config.h"
 
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <pcre2.h>
+#include <string.h>
+
 #include "implregex-private.h"
 
+/* Evaluates to true when @ret is below PCRE2_ERROR_NOMATCH and is not
+ * PCRE2_ERROR_PARTIAL; any value at or above PCRE2_ERROR_NOMATCH is not
+ * classified as an error by this macro. */
+#define IS_PCRE_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
+
+/* Reference-counted compiled pattern; each ImplMatchInfo holds a reference to one. */
 struct _ImplRegex
 {
-       int         ref_count;
-       char       *pattern;
-       GRegex     *re;
+       int                    ref_count;       /* bumped by impl_regex_ref(); struct is freed when it reaches 0 */
+       char                  *pattern;         /* g_strdup() copy of the pattern text */
+       gsize                  compile_flags;   /* result of translate_compile_flags() */
+       gsize                  match_flags;     /* result of translate_match_flags(); OR-ed into every match */
+       pcre2_compile_context *context;         /* created in impl_regex_new(), freed on last unref */
+       pcre2_code            *code;            /* result of pcre2_compile(), freed on last unref */
+       PCRE2_SPTR             name_table;      /* table walked by impl_match_info_fetch_named_pos() */
+       int                    name_count;      /* number of entries iterated in name_table */
+       int                    name_entry_size; /* stride used to step from one name_table entry to the next */
 };
 
+/* State for matching one ImplRegex against one string, advanced by impl_match_info_next(). */
 struct _ImplMatchInfo
 {
-       GMatchInfo *match_info;
+       gsize             compile_flags; /* NOTE(review): only ever zeroed in the visible code -- confirm it is needed */
+       gsize             match_flags;   /* regex match flags OR-ed with the per-call flags; passed to pcre2_match() */
+       ImplRegex        *regex;         /* owned reference, dropped in impl_match_info_free() */
+       const char       *string;        /* subject text; the pointer is stored, the text is not copied */
+       gsize             string_len;    /* length handed to pcre2_match() */
+       pcre2_match_data *match_data;    /* created from the pattern; freed in impl_match_info_free() */
+       PCRE2_SIZE       *offsets;       /* pcre2_get_ovector_pointer() of match_data */
+       int               n_groups;      /* last pcre2_match() return value, or -1 when there is no match */
+       gsize             start_pos;     /* offset at which the next pcre2_match() call starts */
 };
 
-#if 0
-static void
-set_regex_error (GError **error,
-                 int      errnum)
+/*
+ * translate_compile_flags:
+ * @flags: GRegex compile flags
+ *
+ * Builds the value stored in ImplRegex.compile_flags from @flags.
+ *
+ * NOTE(review): the accumulator starts at PCRE2_ZERO_TERMINATED, which looks
+ * like a length sentinel rather than an option bit, and impl_regex_new()
+ * passes the result to pcre2_compile() as its second argument. Presumably
+ * the |= operations below cannot change an all-ones seed -- confirm against
+ * pcre2.h before relying on any individual bit of the return value.
+ */
+static gsize
+translate_compile_flags (GRegexCompileFlags flags)
 {
-       guchar errstr[128];
+       gsize ret = PCRE2_ZERO_TERMINATED;
+
+       /* Raw mode skips the UTF validity check; otherwise UTF mode is requested. */
+       if (flags & G_REGEX_RAW)
+               ret |= PCRE2_NO_UTF_CHECK;
+       else
+               ret |= PCRE2_UTF;
+
+       if (flags & G_REGEX_ANCHORED)
+               ret |= PCRE2_ANCHORED;
+
+       if (flags & G_REGEX_CASELESS)
+               ret |= PCRE2_CASELESS;
+
+       /* NOTE(review): PCRE2_NEWLINE_LF/CR are also the values impl_regex_new()
+        * gives to pcre2_set_newline(); presumably they are not option bits and
+        * may collide with real options -- TODO confirm. */
+       if (flags & G_REGEX_NEWLINE_LF)
+               ret |= PCRE2_NEWLINE_LF;
 
-       pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
-       errstr[sizeof errstr - 1] = 0;
+       if (flags & G_REGEX_NEWLINE_CR)
+               ret |= PCRE2_NEWLINE_CR;
 
-       g_set_error_literal (error,
-                            G_REGEX_ERROR,
-                            G_REGEX_ERROR_COMPILE,
-                            (const gchar *)errstr);
+       return ret;
 }
-#endif
 
-static ImplMatchInfo *
-impl_match_info_new (const ImplRegex *regex)
+/*
+ * translate_match_flags:
+ * @flags: GRegex match flags
+ *
+ * Returns PCRE2_ANCHORED when @flags contains G_REGEX_MATCH_ANCHORED and 0
+ * otherwise; every other bit of @flags is ignored.
+ */
+static gsize
+translate_match_flags (GRegexMatchFlags flags)
 {
-       ImplMatchInfo *match_info;
+       gsize ret = 0;
 
-       match_info = g_slice_new0 (ImplMatchInfo);
-       match_info->match_info = NULL;
+       if (flags & G_REGEX_MATCH_ANCHORED)
+               ret |= PCRE2_ANCHORED;
 
-       return match_info;
+       return ret;
+}
+
+/*
+ * set_regex_error:
+ * @error: (nullable): return location for a #GError, or %NULL to only
+ *   classify @errnum
+ * @errnum: a return code from a PCRE2 call
+ *
+ * Classifies @errnum with IS_PCRE_ERROR() and, when it is an error and
+ * @error is non-%NULL, fills @error with the PCRE2 message for that code.
+ *
+ * The error is always reported as G_REGEX_ERROR_COMPILE, including when the
+ * code comes from pcre2_match().
+ *
+ * NOTE(review): IS_PCRE_ERROR() only flags negative codes; pcre2_compile()
+ * presumably reports failures as positive codes, which this helper would
+ * not catch -- confirm at the call site.
+ *
+ * Returns: %TRUE if @errnum is an error, %FALSE otherwise.
+ */
+static gboolean
+set_regex_error (GError **error,
+                 int      errnum)
+{
+       if (!IS_PCRE_ERROR (errnum))
+       {
+               return FALSE;
+       }
+
+       if (error != NULL)
+       {
+               /* Zero-filled so the text is terminated whatever PCRE2 writes. */
+               guchar errstr[128] = { 0 };
+               int len;
+
+               len = pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
+
+               /* A negative result with nothing written means PCRE2 has no text
+                * for this code; report the number instead of an empty message. */
+               if (len < 0 && errstr[0] == 0)
+               {
+                       g_snprintf ((gchar *)errstr, sizeof errstr,
+                                   "Unknown PCRE2 error %d", errnum);
+               }
+
+               g_set_error_literal (error,
+                                    G_REGEX_ERROR,
+                                    G_REGEX_ERROR_COMPILE,
+                                    (const gchar *)errstr);
+       }
+
+       return TRUE;
 }
 
 ImplRegex *
@@ -69,22 +123,52 @@ impl_regex_new (const char          *pattern,
                 GRegexMatchFlags     match_options,
                 GError             **error)
 {
-       GRegex *re;
+       pcre2_compile_context *context;
+       pcre2_code *code;
        ImplRegex *regex;
+       PCRE2_SIZE erroffset;
+       int errnumber = 0;
 
        g_return_val_if_fail (pattern != NULL, NULL);
 
-       re = g_regex_new (pattern, compile_options, match_options, error);
+       context = pcre2_compile_context_create (NULL);
 
-       if (re == NULL)
+       regex = g_slice_new0 (ImplRegex);
+       regex->ref_count = 1;
+       regex->context = context;
+       regex->pattern = g_strdup (pattern);
+       regex->compile_flags = translate_compile_flags (compile_options);
+       regex->match_flags = translate_match_flags (match_options);
+
+       if (regex->compile_flags & PCRE2_NEWLINE_LF)
+               pcre2_set_newline (context, PCRE2_NEWLINE_LF);
+       else if (regex->compile_flags & PCRE2_NEWLINE_CR)
+               pcre2_set_newline (context, PCRE2_NEWLINE_CR);
+
+       regex->code = pcre2_compile ((PCRE2_SPTR)pattern,
+                                    regex->compile_flags,
+                                    regex->match_flags,
+                                    &errnumber,
+                                    &erroffset,
+                                    context);
+
+       if (set_regex_error (error, errnumber))
        {
+               impl_regex_unref (regex);
                return NULL;
        }
 
-       regex = g_slice_new0 (ImplRegex);
-       regex->ref_count = 1;
-       regex->pattern = g_strdup (pattern);
-       regex->re = re;
+       pcre2_pattern_info (code, PCRE2_INFO_NAMECOUNT, &regex->name_count);
+
+       if (regex->name_count > 0)
+       {
+               (void)pcre2_pattern_info (code,
+                                         PCRE2_INFO_NAMEENTRYSIZE,
+                                         &regex->name_entry_size);
+               (void)pcre2_pattern_info (code,
+                                         PCRE2_INFO_NAMETABLE,
+                                         &regex->name_table);
+       }
 
        return regex;
 }
@@ -97,6 +181,17 @@ impl_regex_get_pattern (const ImplRegex *regex)
        return regex->pattern;
 }
 
+/*
+ * impl_regex_ref:
+ * @regex: an #ImplRegex with at least one outstanding reference
+ *
+ * Takes one additional reference on @regex.
+ *
+ * Returns: @regex, or %NULL when a precondition check fails.
+ */
+ImplRegex *
+impl_regex_ref (ImplRegex *regex)
+{
+       g_return_val_if_fail (regex != NULL, NULL);
+       g_return_val_if_fail (regex->ref_count > 0, NULL);
+
+       ++regex->ref_count;
+       return regex;
+}
+
 void
 impl_regex_unref (ImplRegex *regex)
 {
@@ -108,16 +203,64 @@ impl_regex_unref (ImplRegex *regex)
        if (regex->ref_count == 0)
        {
                g_clear_pointer (&regex->pattern, g_free);
-               g_clear_pointer (&regex->re, g_regex_unref);
+               g_clear_pointer (&regex->code, pcre2_code_free);
+               g_clear_pointer (&regex->context, pcre2_compile_context_free);
                g_slice_free (ImplRegex, regex);
        }
 }
 
+/*
+ * impl_match_info_new:
+ * @regex: the compiled pattern; a new reference is taken
+ * @match_options: extra match flags, combined with those stored on @regex
+ * @string: the subject text; the pointer is stored, the text is not copied
+ * @string_len: length of @string in bytes, or a negative value to use strlen()
+ *
+ * Allocates the match state used to run @regex against @string. Failure to
+ * allocate the PCRE2 match data is reported through g_error().
+ *
+ * Returns: a new #ImplMatchInfo, released with impl_match_info_free().
+ */
+static ImplMatchInfo *
+impl_match_info_new (ImplRegex        *regex,
+                     GRegexMatchFlags  match_options,
+                     const char       *string,
+                     gssize            string_len)
+{
+       ImplMatchInfo *match_info;
+
+       g_assert (regex != NULL);
+       g_assert (string != NULL);
+
+       /* Resolve the negative "use strlen()" sentinel before range-checking:
+        * comparing a negative gssize with strlen()'s unsigned result converts
+        * it to a huge value, so the check would fail for every such caller. */
+       if (string_len < 0)
+       {
+               string_len = strlen (string);
+       }
+       else
+       {
+               g_assert ((gsize)string_len <= strlen (string));
+       }
+
+       match_info = g_slice_new0 (ImplMatchInfo);
+       match_info->regex = impl_regex_ref (regex);
+       match_info->match_flags = regex->match_flags | translate_match_flags (match_options);
+       match_info->start_pos = -1;
+       match_info->n_groups = -1;
+       match_info->string = string;
+       match_info->string_len = string_len;
+       match_info->match_data = pcre2_match_data_create_from_pattern (regex->code, NULL);
+
+       if (match_info->match_data == NULL)
+       {
+               g_error ("Failed to allocate match data");
+       }
+
+       /* The offset vector lives inside match_data; it is not freed separately. */
+       match_info->offsets = pcre2_get_ovector_pointer (match_info->match_data);
+
+       return match_info;
+}
+
+/* Releases @match_info together with its match data and its reference on the
+ * regex. A %NULL @match_info is accepted and does nothing. */
 void
 impl_match_info_free (ImplMatchInfo *match_info)
 {
-       g_clear_pointer (&match_info->match_info, g_match_info_free);
-       g_slice_free (ImplMatchInfo, match_info);
+       if (match_info != NULL)
+       {
+               g_clear_pointer (&match_info->match_data, pcre2_match_data_free);
+               /* Drops the reference taken in impl_match_info_new(). */
+               g_clear_pointer (&match_info->regex, impl_regex_unref);
+               /* The subject string is not freed here; only the fields are reset
+                * before the struct goes back to the slice allocator. */
+               match_info->string = NULL;
+               match_info->string_len = 0;
+               match_info->compile_flags = 0;
+               match_info->match_flags = 0;
+               match_info->n_groups = 0;
+               match_info->start_pos = 0;
+               match_info->offsets = NULL;
+               g_slice_free (ImplMatchInfo, match_info);
+       }
 }
 
 gboolean
@@ -127,51 +270,58 @@ impl_regex_match (const ImplRegex   *regex,
                   ImplMatchInfo    **match_info)
 {
        g_return_val_if_fail (regex != NULL, FALSE);
-       g_return_val_if_fail (regex->re != NULL, FALSE);
+       g_return_val_if_fail (regex->code != NULL, FALSE);
+       g_return_val_if_fail (string != NULL, FALSE);
 
-       if (match_info != NULL)
-       {
-               *match_info = impl_match_info_new (regex);
-       }
-
-       return g_regex_match (regex->re,
-                             string,
-                             match_options,
-                             match_info ? &(*match_info)->match_info : NULL);
+       return impl_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL);
 }
 
 char *
 impl_match_info_fetch (const ImplMatchInfo *match_info,
                        int                  match_num)
 {
+       int begin =  -1;
+       int end =  -1;
+
        g_return_val_if_fail (match_info != NULL, NULL);
+       g_return_val_if_fail (match_info->string != NULL, NULL);
+       g_return_val_if_fail (match_info->offsets != NULL, NULL);
 
-       return g_match_info_fetch (match_info->match_info, match_num);
+       if (match_info->start_pos < match_info->string_len)
+       {
+               if (impl_match_info_fetch_pos (match_info, match_num, &begin, &end))
+               {
+                       if (begin >= 0 && end >= 0)
+                       {
+                               return g_strndup (match_info->string + begin, end - begin);
+                       }
+               }
+       }
+
+       return NULL;
 }
 
 char *
 impl_match_info_fetch_named (const ImplMatchInfo *match_info,
                              const char          *name)
 {
+       int begin = -1;
+       int end = -1;
+
        g_return_val_if_fail (match_info != NULL, NULL);
 
-       return g_match_info_fetch_named (match_info->match_info, name);
-}
+       if (match_info->start_pos < match_info->string_len)
+       {
+               if (impl_match_info_fetch_named_pos (match_info, name, &begin, &end))
+               {
+                       if (begin >= 0 && end >= 0)
+                       {
+                               return g_strndup (match_info->string + begin, end - begin);
+                       }
+               }
+       }
 
-static gboolean
-wrapper_eval (const GMatchInfo *match_info,
-              GString          *result,
-              gpointer          user_data)
-{
-       struct {
-               ImplRegexEvalCallback callback;
-               gpointer user_data;
-       } *wrapper = user_data;
-       ImplMatchInfo wrapped = {
-               .match_info = (GMatchInfo *)match_info,
-       };
-
-       return wrapper->callback (&wrapped, result, wrapper->user_data);
+       return NULL;
 }
 
 char *
@@ -184,58 +334,98 @@ impl_regex_replace_eval (const ImplRegex        *regex,
                          gpointer                user_data,
                          GError                **error)
 {
-       struct {
-               ImplRegexEvalCallback callback;
-               gpointer user_data;
-       } wrapper;
+       ImplMatchInfo *match_info;
+       GString *out_string;
+       gboolean done;
+       gsize prev_begin;
+       gsize str_pos;
 
        g_return_val_if_fail (regex != NULL, NULL);
-       g_return_val_if_fail (regex->re != NULL, NULL);
-
-       wrapper.callback = eval;
-       wrapper.user_data = user_data;
-
-       return g_regex_replace_eval (regex->re,
-                                    string,
-                                    string_len,
-                                    start_position,
-                                    match_options,
-                                    wrapper_eval,
-                                    &wrapper,
-                                    error);
+       g_return_val_if_fail (regex->code != NULL, NULL);
+       g_return_val_if_fail (start_position >= 0, NULL);
+
+       g_error ("++++++ Replace eval\n");
+
+       if (string_len < 0)
+       {
+               string_len = strlen (string);
+       }
+
+       match_info = NULL;
+
+       if (!impl_regex_match_full (regex, string, string_len, start_position, match_options, &match_info, error))
+       {
+               impl_match_info_free (match_info);
+               return g_strndup (string, string_len);
+       }
+
+       g_assert (match_info != NULL);
+       g_assert (match_info->n_groups > 0);
+
+       str_pos = 0;
+       out_string = g_string_sized_new (string_len);
+       done = FALSE;
+
+       while (!done && impl_match_info_matches (match_info))
+       {
+               prev_begin = match_info->offsets[0];
+               g_string_append_len (out_string, string + str_pos, prev_begin - str_pos);
+               str_pos = match_info->offsets[1];
+
+               done = eval (match_info, out_string, user_data);
+
+               if (!impl_match_info_next (match_info, NULL))
+               {
+                       break;
+               }
+       }
+
+       g_string_append_len (out_string,
+                            string + str_pos,
+                            string_len - str_pos);
+
+       impl_match_info_free (match_info);
+
+       return g_string_free (out_string, FALSE);
 }
 
 gboolean
 impl_regex_match_full (const ImplRegex   *regex,
                        const char        *string,
                        gssize             string_len,
-                       int                start_position,
+                       gsize              start_position,
                        GRegexMatchFlags   match_options,
                        ImplMatchInfo    **match_info,
                        GError           **error)
 {
-       GMatchInfo *wrapped = NULL;
-       gboolean ret;
+       ImplMatchInfo *local_match_info = NULL;
+       gboolean ret = FALSE;
 
        g_return_val_if_fail (regex != NULL, FALSE);
-       g_return_val_if_fail (regex->re != NULL, FALSE);
+       g_return_val_if_fail (regex->code != NULL, FALSE);
+       g_return_val_if_fail (match_options == 0, FALSE);
+       g_return_val_if_fail (string != NULL, FALSE);
+
+       if (string_len < 0)
+       {
+               string_len = strlen (string);
+       }
+
+       local_match_info = impl_match_info_new ((ImplRegex *)regex, match_options, string, string_len);
+
+       local_match_info->start_pos = start_position;
+       local_match_info->offsets[0] = start_position;
+       local_match_info->offsets[1] = start_position;
 
-       ret = g_regex_match_full (regex->re,
-                                 string,
-                                 string_len,
-                                 start_position,
-                                 match_options,
-                                 &wrapped,
-                                 error);
+       ret = impl_match_info_next (local_match_info, error);
 
        if (match_info != NULL)
        {
-               *match_info = g_slice_new0 (ImplMatchInfo);
-               (*match_info)->match_info = wrapped;
+               *match_info = g_steal_pointer (&local_match_info);
        }
        else
        {
-               g_match_info_free (wrapped);
+               impl_match_info_free (local_match_info);
        }
 
        return ret;
@@ -248,9 +438,22 @@ impl_match_info_fetch_pos (const ImplMatchInfo *match_info,
                            int                 *end_pos)
 {
        g_return_val_if_fail (match_info != NULL, FALSE);
-       g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+       g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+       g_return_val_if_fail (match_num >= 0, FALSE);
+       g_return_val_if_fail (match_info->offsets != NULL, FALSE);
 
-       return g_match_info_fetch_pos (match_info->match_info, match_num, start_pos, end_pos);
+       if (match_num >= match_info->n_groups)
+       {
+               return FALSE;
+       }
+
+       if (start_pos)
+               *start_pos = match_info->offsets[0];
+
+       if (end_pos)
+               *end_pos = match_info->offsets[1];
+
+       return TRUE;
 }
 
 gboolean
@@ -259,8 +462,107 @@ impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
                                  int                 *start_pos,
                                  int                 *end_pos)
 {
+       PCRE2_SPTR tabptr;
+
        g_return_val_if_fail (match_info != NULL, FALSE);
-       g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+       g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+       g_return_val_if_fail (match_info->regex != NULL, FALSE);
+       g_return_val_if_fail (start_pos != NULL, FALSE);
+       g_return_val_if_fail (end_pos != NULL, FALSE);
+
+       tabptr = match_info->regex->name_table;
+
+       for (gsize i = 0; i < match_info->regex->name_count; i++)
+       {
+               PCRE2_SIZE n = (tabptr[0] << 8) | tabptr[1];
+
+               if (g_strcmp0 (name, (const char *)(tabptr+2)) == 0)
+               {
+                       return impl_match_info_fetch_pos (match_info, n, start_pos, end_pos);
+               }
+
+               tabptr += match_info->regex->name_entry_size;
+       }
+
+       return FALSE;
+}
+
+/*
+ * impl_match_info_matches:
+ * @match_info: a match state
+ *
+ * Returns: %TRUE when the stored group count is not negative, i.e. the last
+ *   impl_match_info_next() call recorded a non-negative pcre2_match() result.
+ */
+gboolean
+impl_match_info_matches (const ImplMatchInfo *match_info)
+{
+       g_return_val_if_fail (match_info != NULL, FALSE);
+
+       return !(match_info->n_groups < 0);
+}
+
+/*
+ * impl_match_info_next:
+ * @match_info: the match state to advance
+ * @error: (nullable): return location for a #GError
+ *
+ * Runs pcre2_match() on the stored string starting at start_pos, records its
+ * return value in n_groups and moves start_pos forward for the next call.
+ *
+ * Returns: the result of impl_match_info_matches() after the attempt; %FALSE
+ *   (with @error set) when start_pos is already at or past string_len or
+ *   when set_regex_error() classifies the pcre2_match() result as an error.
+ */
+gboolean
+impl_match_info_next (ImplMatchInfo  *match_info,
+                      GError        **error)
+{
+       gssize prev_end;
+       gssize prev_begin;
+       int rc;
+
+       g_return_val_if_fail (match_info != NULL, FALSE);
+       g_return_val_if_fail (match_info->regex != NULL, FALSE);
+       g_return_val_if_fail (match_info->regex->code != NULL, FALSE);
+
+       match_info->n_groups = -1;
+
+again:
+       /* Nothing left to scan. A zero-length string always takes this branch,
+        * so it is never handed to pcre2_match(). */
+       if (match_info->start_pos >= match_info->string_len)
+       {
+               g_set_error_literal (error,
+                                    G_REGEX_ERROR,
+                                    G_REGEX_ERROR_MATCH,
+                                    "No matches");
+               return FALSE;
+       }
+
+       /* Snapshot of the offsets as they were before this attempt. */
+       prev_begin = match_info->offsets[0];
+       prev_end = match_info->offsets[1];
+
+       rc = pcre2_match (match_info->regex->code,
+                         (PCRE2_SPTR)match_info->string,
+                         (PCRE2_SIZE)match_info->string_len,
+                         match_info->start_pos,
+                         match_info->match_flags,
+                         match_info->match_data,
+                         NULL);
+
+       if (set_regex_error (error, rc))
+       {
+               /* Park start_pos past the end so later calls stop at the bounds
+                * check above. */
+               match_info->n_groups = -1;
+               match_info->start_pos = match_info->string_len + 1;
+               return FALSE;
+       }
+
+       /* The reported end equals the position the search started from: step one
+        * UTF-8 character past the previous end instead of staying in place. */
+       if (match_info->start_pos == match_info->offsets[1])
+       {
+               const char *next = g_utf8_next_char (match_info->string + prev_end);
+
+               /* NOTE(review): start_pos has not changed since the bounds check
+                * after `again:`, so this test looks unreachable -- confirm which
+                * value was meant to be compared (perhaps next - string). */
+               if (match_info->start_pos > match_info->string_len)
+               {
+                       match_info->start_pos = match_info->string_len + 1;
+                       match_info->n_groups = -1;
+                       return FALSE;
+               }
+
+               match_info->start_pos = next - match_info->string;
+       }
+       else
+       {
+               match_info->start_pos = match_info->offsets[1];
+       }
+
+       /* NOTE(review): n_groups was reset to -1 on entry and is only assigned
+        * below, so `n_groups >= 0` looks always false here and the retry
+        * unreachable -- confirm the intended duplicate-match check. */
+       if (match_info->n_groups >= 0 &&
+           prev_begin == match_info->offsets[0] &&
+           prev_end == match_info->offsets[1])
+       {
+               goto again;
+       }
+
+       match_info->n_groups = rc;
 
-       return g_match_info_fetch_named_pos (match_info->match_info, name, start_pos, end_pos);
+       return impl_match_info_matches (match_info);
 }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]