[gtk-doc] scan: Workaround for pathological regex parsing
- From: Stefan Sauer <stefkost src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gtk-doc] scan: Workaround for pathological regex parsing
- Date: Mon, 24 Apr 2017 17:37:19 +0000 (UTC)
commit a03aab7519d612c2851052b7e1e9a9e7a0650600
Author: Jussi Pakkanen <jpakkane gmail com>
Date: Sun Apr 23 21:48:21 2017 +0300
scan: Workaround for pathological regex parsing
Replace consecutive spaces with one to avoid bad performance for a particular regex.
As pointed out in the upstream Python issue discussion, avoid matching empty strings.
This happens when no --ignore-decorators are given. This change alone already halves
the time for the regex.
https://bugzilla.gnome.org/show_bug.cgi?id=781569
gtkdoc/scan.py | 56 +++++++++++++++++++++++++++++++++++---------------------
1 files changed, 35 insertions(+), 21 deletions(-)
---
diff --git a/gtkdoc/scan.py b/gtkdoc/scan.py
index c325f71..aa6b96f 100644
--- a/gtkdoc/scan.py
+++ b/gtkdoc/scan.py
@@ -299,19 +299,32 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
logging.info('no decl: %s', line.strip())
+ # Regular expression m17 below invokes pathological behaviour in
+ # Python's regex parser. In Perl it is fast. This hack makes the
+ # slowdown go away, but there has not been a thorough investigation
+ # why that is.
+ #
+ # https://bugzilla.gnome.org/show_bug.cgi?id=781569
+ line17 = line.replace('  ', ' ')
+
+ # avoid generating regex with |'' (matching no string)
+ ignore_decorators = ''
+ if options.ignore_decorators:
+ ignore_decorators = '|' + options.ignore_decorators
+
m = re.search(r'^\s*#\s*define\s+(\w+)', line)
- # $1 $3 $4
$5
+ # $1 $3 $4 $5
m2 = re.search(
r'^\s*typedef\s+((const\s+|G_CONST_RETURN\s+)?\w+)(\s+const)?\s*(\**)\s*\(\*\s*(\w+)\)\s*\(', line)
- # $1 $3 $4 $5
+ # $1 $3 $4 $5
m3 =
re.search(r'^\s*((const\s+|G_CONST_RETURN\s+)?\w+)(\s+const)?\s*(\**)\s*\(\*\s*(\w+)\)\s*\(', line)
# $1 $2
m4 = re.search(r'^\s*(\**)\s*\(\*\s*(\w+)\)\s*\(', line)
# $1 $3
m5 = re.search(r'^\s*typedef\s*((const\s+|G_CONST_RETURN\s+)?\w+)(\s+const)?\s*', previous_line)
- # $1
$3 $4 $5
+ # $1 $3
$4 $5
m6 = re.search(
-
r'^\s*(?:\b(?:extern|G_INLINE_FUNC|%s)\s*)*((const\s+|G_CONST_RETURN\s+)?\w+)(\s+const)?\s*(\**)\s*\(\*\s*(\w+)\)\s*\('
% options.ignore_decorators, line)
+
r'^\s*(?:\b(?:extern|G_INLINE_FUNC%s)\s*)*((const\s+|G_CONST_RETURN\s+)?\w+)(\s+const)?\s*(\**)\s*\(\*\s*(\w+)\)\s*\('
% ignore_decorators, line)
m7 = re.search(r'^\s*enum\s+_?(\w+)\s+\{', line)
m8 = re.search(r'^\s*typedef\s+enum', line)
m9 = re.search(r'^\s*typedef\s+(struct|union)\s+_(\w+)\s+\2\s*;', line)
@@ -320,16 +333,17 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
m12 = re.search(r'^\s*typedef\s+(?:struct|union)\s+\w+[\s\*]+(\w+)\s*;', line)
m13 = re.search(r'^\s*(G_GNUC_EXTENSION\s+)?typedef\s+(.+[\s\*])(\w+)(\s*\[[^\]]+\])*\s*;', line)
m14 = re.search(
-
r'^\s*(extern|[A-Za-z_]+VAR|%s)\s+((const\s+|signed\s+|unsigned\s+|long\s+|short\s+)*\w+)(\s+\*+|\*+|\s)\s*(const\s+)*([A-Za-z]\w*)\s*;'
% options.ignore_decorators, line)
+
r'^\s*(extern|[A-Za-z_]+VAR%s)\s+((const\s+|signed\s+|unsigned\s+|long\s+|short\s+)*\w+)(\s+\*+|\*+|\s)\s*(const\s+)*([A-Za-z]\w*)\s*;'
% ignore_decorators, line)
m15 = re.search(
r'^\s*((const\s+|signed\s+|unsigned\s+|long\s+|short\s+)*\w+)(\s+\*+|\*+|\s)\s*(const\s+)*([A-Za-z]\w*)\s*\=',
line)
m16 = re.search(r'.*G_DECLARE_(FINAL_TYPE|DERIVABLE_TYPE|INTERFACE)\s*\(', line)
- # $1
$2
$3
+ # private functions
+ # $1
$2
$3
m17 = re.search(
-
r'^\s*(?:\b(?:extern|G_INLINE_FUNC|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s+|\*)+(?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*(_[A-Za-z]\w*)\s*\('
% options.ignore_decorators, line)
- # $1
$2
$3
+
r'^\s*(?:\b(?:extern|G_INLINE_FUNC%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s+|\*)+(?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*(_[A-Za-z]\w*)\s*\('
% ignore_decorators, line17)
+ # $1
$2
$3
m18 = re.search(
-
r'^\s*(?:\b(?:extern|G_INLINE_FUNC|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s+|\*)+(?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*([A-Za-z]\w*)\s*\('
% options.ignore_decorators, line)
+
r'^\s*(?:\b(?:extern|G_INLINE_FUNC%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s+|\*)+(?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*([A-Za-z]\w*)\s*\('
% ignore_decorators, line)
m19 = re.search(r'^\s*([A-Za-z]\w*)\s*\(', line)
m20 = re.search(r'^\s*\(', line)
m21 = re.search(r'^\s*struct\s+_?(\w+)', line)
@@ -532,8 +546,8 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
if not previous_line.strip().startswith('G_INLINE_FUNC'):
if not previous_line_words or previous_line_words[0] != 'static':
# $1
$2
- pm =
re.search(r'^\s*(?:\b(?:extern|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*$'
%
- options.ignore_decorators, previous_line)
+ pm =
re.search(r'^\s*(?:\b(?:extern%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*$'
%
+ ignore_decorators, previous_line)
if pm:
ret_type = pm.group(1)
if pm.group(2):
@@ -544,9 +558,9 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
logging.info('skip block after inline function')
# now we we need to skip a whole { } block
skip_block = 1
- #
$1 $2
- pm =
re.search(r'^\s*(?:\b(?:extern|static|inline|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*$'
%
- options.ignore_decorators, previous_line)
+ # $1
$2
+ pm =
re.search(r'^\s*(?:\b(?:extern|static|inline%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*$'
%
+ ignore_decorators, previous_line)
if pm:
ret_type = pm.group(1)
if pm.group(2):
@@ -558,9 +572,9 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
logging.info('skip block after inline function')
# now we we need to skip a whole { } block
skip_block = 1
- #
$1 $2
- pm =
re.search(r'^\s*(?:\b(?:extern|G_INLINE_FUNC|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*$'
%
- options.ignore_decorators, previous_line)
+ # $1
$2
+ pm =
re.search(r'^\s*(?:\b(?:extern|G_INLINE_FUNC%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|long\s+|short\s+|struct\s+|union\s+|enum\s+)*\w+)((?:\s*(?:\*+|\bconst\b|\bG_CONST_RETURN\b))*)\s*$'
%
+ ignore_decorators, previous_line)
if pm:
ret_type = pm.group(1)
if pm.group(2):
@@ -573,9 +587,9 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
elif m20:
decl = line[m20.end():]
pm = re.search(
-
r'^\s*(?:\b(?:extern|G_INLINE_FUNC|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|enum\s+)*\w+)(\s+\*+|\*+|\s)\s*([A-Za-z]\w*)\s*$'
% options.ignore_decorators, previous_line)
- ppm =
re.search(r'^\s*(?:\b(?:extern|G_INLINE_FUNC|%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|struct\s+|union\s+|enum\s+)*\w+(?:\**\s+\**(?:const|G_CONST_RETURN))?(?:\s+|\s*\*+))\s*$'
%
- options.ignore_decorators, pre_previous_line)
+
r'^\s*(?:\b(?:extern|G_INLINE_FUNC%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|enum\s+)*\w+)(\s+\*+|\*+|\s)\s*([A-Za-z]\w*)\s*$'
% ignore_decorators, previous_line)
+ ppm =
re.search(r'^\s*(?:\b(?:extern|G_INLINE_FUNC%s)\s*)*((?:const\s+|G_CONST_RETURN\s+|signed\s+|unsigned\s+|struct\s+|union\s+|enum\s+)*\w+(?:\**\s+\**(?:const|G_CONST_RETURN))?(?:\s+|\s*\*+))\s*$'
%
+ ignore_decorators, pre_previous_line)
if pm:
ret_type = pm.group(1) + ' ' + pm.group(2)
symbol = pm.group(3)
@@ -671,7 +685,7 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
# Note that sometimes functions end in ') G_GNUC_PRINTF (2, 3);' or
# ') __attribute__ (...);'.
if in_declaration == 'function':
- regex = r'\)\s*(G_GNUC_.*|.*DEPRECATED.*|%s\s*|__attribute__\s*\(.*\)\s*)*;.*$' %
options.ignore_decorators
+ regex = r'\)\s*(G_GNUC_.*|.*DEPRECATED.*%s\s*|__attribute__\s*\(.*\)\s*)*;.*$' %
ignore_decorators
pm = re.search(regex, decl, flags=re.MULTILINE)
if pm:
logging.info('scrubbing:[%s]', decl)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]