[gtksourceview/wip/chergert/pcre2] pcre2: start on pcre2 implementation
- From: Christian Hergert <chergert src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gtksourceview/wip/chergert/pcre2] pcre2: start on pcre2 implementation
- Date: Tue, 29 Sep 2020 22:43:58 +0000 (UTC)
commit 5f1a80358f71a960f8bdd26dbd122e97f44c6d67
Author: Christian Hergert <chergert redhat com>
Date: Fri Sep 25 10:23:18 2020 -0700
pcre2: start on pcre2 implementation
gtksourceview/gtksourceregex.c | 4 +-
gtksourceview/implregex-private.h | 6 +-
gtksourceview/implregex.c | 488 ++++++++++++++++++++++++++++++--------
3 files changed, 401 insertions(+), 97 deletions(-)
---
diff --git a/gtksourceview/gtksourceregex.c b/gtksourceview/gtksourceregex.c
index 80d334a2..dc36c1ac 100644
--- a/gtksourceview/gtksourceregex.c
+++ b/gtksourceview/gtksourceregex.c
@@ -354,8 +354,8 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex,
gint *start_pos_p, /* byte offsets */
gint *end_pos_p) /* byte offsets */
{
- gint start_pos;
- gint end_pos;
+ gint start_pos = -1;
+ gint end_pos = -1;
g_assert (regex->resolved);
diff --git a/gtksourceview/implregex-private.h b/gtksourceview/implregex-private.h
index da52474e..2b8424bc 100644
--- a/gtksourceview/implregex-private.h
+++ b/gtksourceview/implregex-private.h
@@ -41,6 +41,7 @@ gboolean impl_regex_match (const ImplRegex *regex,
const char *string,
GRegexMatchFlags match_options,
ImplMatchInfo **match_info);
+ImplRegex *impl_regex_ref (ImplRegex *regex);
void impl_regex_unref (ImplRegex *regex);
void impl_match_info_free (ImplMatchInfo *match_info);
char *impl_match_info_fetch (const ImplMatchInfo *match_info,
@@ -58,7 +59,7 @@ char *impl_regex_replace_eval (const ImplRegex *regex,
gboolean impl_regex_match_full (const ImplRegex *regex,
const char *string,
gssize string_len,
- int start_position,
+ gsize start_position,
GRegexMatchFlags match_options,
ImplMatchInfo **match_info,
GError **error);
@@ -70,6 +71,9 @@ gboolean impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
const char *name,
int *start_pos,
int *end_pos);
+gboolean impl_match_info_matches (const ImplMatchInfo *match_info);
+gboolean impl_match_info_next (ImplMatchInfo *match_info,
+ GError **error);
const char *impl_regex_get_pattern (const ImplRegex *regex);
G_END_DECLS
diff --git a/gtksourceview/implregex.c b/gtksourceview/implregex.c
index 56a12799..3490945b 100644
--- a/gtksourceview/implregex.c
+++ b/gtksourceview/implregex.c
@@ -21,46 +21,100 @@
#include "config.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <pcre2.h>
+#include <string.h>
+
#include "implregex-private.h"
+#define IS_PCRE_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
+
struct _ImplRegex
{
- int ref_count;
- char *pattern;
- GRegex *re;
+ int ref_count;
+ char *pattern;
+ gsize compile_flags;
+ gsize match_flags;
+ pcre2_compile_context *context;
+ pcre2_code *code;
+ PCRE2_SPTR name_table;
+ int name_count;
+ int name_entry_size;
};
struct _ImplMatchInfo
{
- GMatchInfo *match_info;
+ gsize compile_flags;
+ gsize match_flags;
+ ImplRegex *regex;
+ const char *string;
+ gsize string_len;
+ pcre2_match_data *match_data;
+ PCRE2_SIZE *offsets;
+ int n_groups;
+ gsize start_pos;
};
-#if 0
-static void
-set_regex_error (GError **error,
- int errnum)
+static gsize
+translate_compile_flags (GRegexCompileFlags flags)
{
- guchar errstr[128];
+ gsize ret = PCRE2_ZERO_TERMINATED;
+
+ if (flags & G_REGEX_RAW)
+ ret |= PCRE2_NO_UTF_CHECK;
+ else
+ ret |= PCRE2_UTF;
+
+ if (flags & G_REGEX_ANCHORED)
+ ret |= PCRE2_ANCHORED;
+
+ if (flags & G_REGEX_CASELESS)
+ ret |= PCRE2_CASELESS;
+
+ if (flags & G_REGEX_NEWLINE_LF)
+ ret |= PCRE2_NEWLINE_LF;
- pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
- errstr[sizeof errstr - 1] = 0;
+ if (flags & G_REGEX_NEWLINE_CR)
+ ret |= PCRE2_NEWLINE_CR;
- g_set_error_literal (error,
- G_REGEX_ERROR,
- G_REGEX_ERROR_COMPILE,
- (const gchar *)errstr);
+ return ret;
}
-#endif
-static ImplMatchInfo *
-impl_match_info_new (const ImplRegex *regex)
+static gsize
+translate_match_flags (GRegexMatchFlags flags)
{
- ImplMatchInfo *match_info;
+ gsize ret = 0;
- match_info = g_slice_new0 (ImplMatchInfo);
- match_info->match_info = NULL;
+ if (flags & G_REGEX_MATCH_ANCHORED)
+ ret |= PCRE2_ANCHORED;
- return match_info;
+ return ret;
+}
+
+static gboolean
+set_regex_error (GError **error,
+ int errnum)
+{
+ if (!IS_PCRE_ERROR (errnum))
+ {
+ return FALSE;
+ }
+
+ if (error != NULL)
+ {
+ guchar errstr[128];
+
+ pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
+ errstr[sizeof errstr - 1] = 0;
+
+ g_set_error_literal (error,
+ G_REGEX_ERROR,
+ G_REGEX_ERROR_COMPILE,
+ (const gchar *)errstr);
+ }
+
+ return TRUE;
}
ImplRegex *
@@ -69,22 +123,52 @@ impl_regex_new (const char *pattern,
GRegexMatchFlags match_options,
GError **error)
{
- GRegex *re;
+ pcre2_compile_context *context;
+ pcre2_code *code;
ImplRegex *regex;
+ PCRE2_SIZE erroffset;
+ int errnumber = 0;
g_return_val_if_fail (pattern != NULL, NULL);
- re = g_regex_new (pattern, compile_options, match_options, error);
+ context = pcre2_compile_context_create (NULL);
- if (re == NULL)
+ regex = g_slice_new0 (ImplRegex);
+ regex->ref_count = 1;
+ regex->context = context;
+ regex->pattern = g_strdup (pattern);
+ regex->compile_flags = translate_compile_flags (compile_options);
+ regex->match_flags = translate_match_flags (match_options);
+
+ if (regex->compile_flags & PCRE2_NEWLINE_LF)
+ pcre2_set_newline (context, PCRE2_NEWLINE_LF);
+ else if (regex->compile_flags & PCRE2_NEWLINE_CR)
+ pcre2_set_newline (context, PCRE2_NEWLINE_CR);
+
+ regex->code = pcre2_compile ((PCRE2_SPTR)pattern,
+ regex->compile_flags,
+ regex->match_flags,
+ &errnumber,
+ &erroffset,
+ context);
+
+ if (set_regex_error (error, errnumber))
{
+ impl_regex_unref (regex);
return NULL;
}
- regex = g_slice_new0 (ImplRegex);
- regex->ref_count = 1;
- regex->pattern = g_strdup (pattern);
- regex->re = re;
+ pcre2_pattern_info (code, PCRE2_INFO_NAMECOUNT, &regex->name_count);
+
+ if (regex->name_count > 0)
+ {
+ (void)pcre2_pattern_info (code,
+ PCRE2_INFO_NAMEENTRYSIZE,
+ &regex->name_entry_size);
+ (void)pcre2_pattern_info (code,
+ PCRE2_INFO_NAMETABLE,
+ &regex->name_table);
+ }
return regex;
}
@@ -97,6 +181,17 @@ impl_regex_get_pattern (const ImplRegex *regex)
return regex->pattern;
}
+ImplRegex *
+impl_regex_ref (ImplRegex *regex)
+{
+ g_return_val_if_fail (regex != NULL, NULL);
+ g_return_val_if_fail (regex->ref_count > 0, NULL);
+
+ regex->ref_count++;
+
+ return regex;
+}
+
void
impl_regex_unref (ImplRegex *regex)
{
@@ -108,16 +203,64 @@ impl_regex_unref (ImplRegex *regex)
if (regex->ref_count == 0)
{
g_clear_pointer (&regex->pattern, g_free);
- g_clear_pointer (&regex->re, g_regex_unref);
+ g_clear_pointer (&regex->code, pcre2_code_free);
+ g_clear_pointer (&regex->context, pcre2_compile_context_free);
g_slice_free (ImplRegex, regex);
}
}
+static ImplMatchInfo *
+impl_match_info_new (ImplRegex *regex,
+ GRegexMatchFlags match_options,
+ const char *string,
+ gssize string_len)
+{
+ ImplMatchInfo *match_info;
+
+ g_assert (regex != NULL);
+ g_assert (string != NULL);
+ g_assert (string_len <= strlen (string));
+
+ if (string_len < 0)
+ {
+ string_len = strlen (string);
+ }
+
+ match_info = g_slice_new0 (ImplMatchInfo);
+ match_info->regex = impl_regex_ref (regex);
+ match_info->match_flags = regex->match_flags | translate_match_flags (match_options);
+ match_info->start_pos = -1;
+ match_info->n_groups = -1;
+ match_info->string = string;
+ match_info->string_len = string_len;
+ match_info->match_data = pcre2_match_data_create_from_pattern (regex->code, NULL);
+
+ if (match_info->match_data == NULL)
+ {
+ g_error ("Failed to allocate match data");
+ }
+
+ match_info->offsets = pcre2_get_ovector_pointer (match_info->match_data);
+
+ return match_info;
+}
+
void
impl_match_info_free (ImplMatchInfo *match_info)
{
- g_clear_pointer (&match_info->match_info, g_match_info_free);
- g_slice_free (ImplMatchInfo, match_info);
+ if (match_info != NULL)
+ {
+ g_clear_pointer (&match_info->match_data, pcre2_match_data_free);
+ g_clear_pointer (&match_info->regex, impl_regex_unref);
+ match_info->string = NULL;
+ match_info->string_len = 0;
+ match_info->compile_flags = 0;
+ match_info->match_flags = 0;
+ match_info->n_groups = 0;
+ match_info->start_pos = 0;
+ match_info->offsets = NULL;
+ g_slice_free (ImplMatchInfo, match_info);
+ }
}
gboolean
@@ -127,51 +270,58 @@ impl_regex_match (const ImplRegex *regex,
ImplMatchInfo **match_info)
{
g_return_val_if_fail (regex != NULL, FALSE);
- g_return_val_if_fail (regex->re != NULL, FALSE);
+ g_return_val_if_fail (regex->code != NULL, FALSE);
+ g_return_val_if_fail (string != NULL, FALSE);
- if (match_info != NULL)
- {
- *match_info = impl_match_info_new (regex);
- }
-
- return g_regex_match (regex->re,
- string,
- match_options,
- match_info ? &(*match_info)->match_info : NULL);
+ return impl_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL);
}
char *
impl_match_info_fetch (const ImplMatchInfo *match_info,
int match_num)
{
+ int begin = -1;
+ int end = -1;
+
g_return_val_if_fail (match_info != NULL, NULL);
+ g_return_val_if_fail (match_info->string != NULL, NULL);
+ g_return_val_if_fail (match_info->offsets != NULL, NULL);
- return g_match_info_fetch (match_info->match_info, match_num);
+ if (match_info->start_pos < match_info->string_len)
+ {
+ if (impl_match_info_fetch_pos (match_info, match_num, &begin, &end))
+ {
+ if (begin >= 0 && end >= 0)
+ {
+ return g_strndup (match_info->string + begin, end - begin);
+ }
+ }
+ }
+
+ return NULL;
}
char *
impl_match_info_fetch_named (const ImplMatchInfo *match_info,
const char *name)
{
+ int begin = -1;
+ int end = -1;
+
g_return_val_if_fail (match_info != NULL, NULL);
- return g_match_info_fetch_named (match_info->match_info, name);
-}
+ if (match_info->start_pos < match_info->string_len)
+ {
+ if (impl_match_info_fetch_named_pos (match_info, name, &begin, &end))
+ {
+ if (begin >= 0 && end >= 0)
+ {
+ return g_strndup (match_info->string + begin, end - begin);
+ }
+ }
+ }
-static gboolean
-wrapper_eval (const GMatchInfo *match_info,
- GString *result,
- gpointer user_data)
-{
- struct {
- ImplRegexEvalCallback callback;
- gpointer user_data;
- } *wrapper = user_data;
- ImplMatchInfo wrapped = {
- .match_info = (GMatchInfo *)match_info,
- };
-
- return wrapper->callback (&wrapped, result, wrapper->user_data);
+ return NULL;
}
char *
@@ -184,58 +334,98 @@ impl_regex_replace_eval (const ImplRegex *regex,
gpointer user_data,
GError **error)
{
- struct {
- ImplRegexEvalCallback callback;
- gpointer user_data;
- } wrapper;
+ ImplMatchInfo *match_info;
+ GString *out_string;
+ gboolean done;
+ gsize prev_begin;
+ gsize str_pos;
g_return_val_if_fail (regex != NULL, NULL);
- g_return_val_if_fail (regex->re != NULL, NULL);
-
- wrapper.callback = eval;
- wrapper.user_data = user_data;
-
- return g_regex_replace_eval (regex->re,
- string,
- string_len,
- start_position,
- match_options,
- wrapper_eval,
- &wrapper,
- error);
+ g_return_val_if_fail (regex->code != NULL, NULL);
+ g_return_val_if_fail (start_position >= 0, NULL);
+
+ g_error ("++++++ Replace eval\n");
+
+ if (string_len < 0)
+ {
+ string_len = strlen (string);
+ }
+
+ match_info = NULL;
+
+ if (!impl_regex_match_full (regex, string, string_len, start_position, match_options, &match_info, error))
+ {
+ impl_match_info_free (match_info);
+ return g_strndup (string, string_len);
+ }
+
+ g_assert (match_info != NULL);
+ g_assert (match_info->n_groups > 0);
+
+ str_pos = 0;
+ out_string = g_string_sized_new (string_len);
+ done = FALSE;
+
+ while (!done && impl_match_info_matches (match_info))
+ {
+ prev_begin = match_info->offsets[0];
+ g_string_append_len (out_string, string + str_pos, prev_begin - str_pos);
+ str_pos = match_info->offsets[1];
+
+ done = eval (match_info, out_string, user_data);
+
+ if (!impl_match_info_next (match_info, NULL))
+ {
+ break;
+ }
+ }
+
+ g_string_append_len (out_string,
+ string + str_pos,
+ string_len - str_pos);
+
+ impl_match_info_free (match_info);
+
+ return g_string_free (out_string, FALSE);
}
gboolean
impl_regex_match_full (const ImplRegex *regex,
const char *string,
gssize string_len,
- int start_position,
+ gsize start_position,
GRegexMatchFlags match_options,
ImplMatchInfo **match_info,
GError **error)
{
- GMatchInfo *wrapped = NULL;
- gboolean ret;
+ ImplMatchInfo *local_match_info = NULL;
+ gboolean ret = FALSE;
g_return_val_if_fail (regex != NULL, FALSE);
- g_return_val_if_fail (regex->re != NULL, FALSE);
+ g_return_val_if_fail (regex->code != NULL, FALSE);
+ g_return_val_if_fail (match_options == 0, FALSE);
+ g_return_val_if_fail (string != NULL, FALSE);
+
+ if (string_len < 0)
+ {
+ string_len = strlen (string);
+ }
+
+ local_match_info = impl_match_info_new ((ImplRegex *)regex, match_options, string, string_len);
+
+ local_match_info->start_pos = start_position;
+ local_match_info->offsets[0] = start_position;
+ local_match_info->offsets[1] = start_position;
- ret = g_regex_match_full (regex->re,
- string,
- string_len,
- start_position,
- match_options,
- &wrapped,
- error);
+ ret = impl_match_info_next (local_match_info, error);
if (match_info != NULL)
{
- *match_info = g_slice_new0 (ImplMatchInfo);
- (*match_info)->match_info = wrapped;
+ *match_info = g_steal_pointer (&local_match_info);
}
else
{
- g_match_info_free (wrapped);
+ impl_match_info_free (local_match_info);
}
return ret;
@@ -248,9 +438,22 @@ impl_match_info_fetch_pos (const ImplMatchInfo *match_info,
int *end_pos)
{
g_return_val_if_fail (match_info != NULL, FALSE);
- g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+ g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+ g_return_val_if_fail (match_num >= 0, FALSE);
+ g_return_val_if_fail (match_info->offsets != NULL, FALSE);
- return g_match_info_fetch_pos (match_info->match_info, match_num, start_pos, end_pos);
+ if (match_num >= match_info->n_groups)
+ {
+ return FALSE;
+ }
+
+ if (start_pos)
+ *start_pos = match_info->offsets[0];
+
+ if (end_pos)
+ *end_pos = match_info->offsets[1];
+
+ return TRUE;
}
gboolean
@@ -259,8 +462,105 @@ impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
int *start_pos,
int *end_pos)
{
+ PCRE2_SPTR tabptr;
+
g_return_val_if_fail (match_info != NULL, FALSE);
- g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+ g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+ g_return_val_if_fail (match_info->regex != NULL, FALSE);
+ g_return_val_if_fail (start_pos != NULL, FALSE);
+ g_return_val_if_fail (end_pos != NULL, FALSE);
+
+ tabptr = match_info->regex->name_table;
+
+ for (gsize i = 0; i < match_info->regex->name_count; i++)
+ {
+ PCRE2_SIZE n = (tabptr[0] << 8) | tabptr[1];
+
+ if (g_strcmp0 (name, (const char *)(tabptr+2)) == 0)
+ {
+ return impl_match_info_fetch_pos (match_info, n, start_pos, end_pos);
+ }
+
+ tabptr += match_info->regex->name_entry_size;
+ }
+
+ return FALSE;
+}
+
+gboolean
+impl_match_info_matches (const ImplMatchInfo *match_info)
+{
+ g_return_val_if_fail (match_info != NULL, FALSE);
+
+ return match_info->n_groups >= 0;
+}
+
+gboolean
+impl_match_info_next (ImplMatchInfo *match_info,
+ GError **error)
+{
+ gssize prev_end;
+ gssize prev_begin;
+ int rc;
+
+ g_return_val_if_fail (match_info != NULL, FALSE);
+ g_return_val_if_fail (match_info->regex != NULL, FALSE);
+ g_return_val_if_fail (match_info->regex->code != NULL, FALSE);
+
+ match_info->n_groups = -1;
+
+again:
+ if (match_info->start_pos >= match_info->string_len)
+ {
+ g_set_error_literal (error,
+ G_REGEX_ERROR,
+ G_REGEX_ERROR_MATCH,
+ "No matches");
+ return FALSE;
+ }
+
+ prev_begin = match_info->offsets[0];
+ prev_end = match_info->offsets[1];
+
+ rc = pcre2_match (match_info->regex->code,
+ (PCRE2_SPTR)match_info->string,
+ (PCRE2_SIZE)match_info->string_len,
+ match_info->start_pos,
+ match_info->match_flags,
+ match_info->match_data,
+ NULL);
+
+ if (set_regex_error (error, rc))
+ {
+ return FALSE;
+ }
+
+ if (match_info->start_pos == match_info->offsets[1])
+ {
+ const char *next = g_utf8_next_char (match_info->string + prev_end);
+
+ if (match_info->start_pos > match_info->string_len)
+ {
+ match_info->start_pos = match_info->string_len + 1;
+ match_info->n_groups = -1;
+ return FALSE;
+ }
+
+ match_info->start_pos = next - match_info->string;
+ }
+ else
+ {
+ match_info->start_pos = match_info->offsets[1];
+ }
+
+ if (match_info->n_groups >= 0 &&
+ prev_begin == match_info->offsets[0] &&
+ prev_end == match_info->offsets[1])
+ {
+ goto again;
+ }
+
+ match_info->n_groups = rc;
- return g_match_info_fetch_named_pos (match_info->match_info, name, start_pos, end_pos);
+ return match_info->n_groups > 0;
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]