[gtk-doc/gtk-doc-for-gtk4: 22/23] scan: Fix handling ignored headers

From: Matthias Clasen <matthiasc src gnome org>
To: commits-list gnome org
Cc:
Subject: [gtk-doc/gtk-doc-for-gtk4: 22/23] scan: Fix handling ignored headers
Date: Sat, 11 Apr 2020 03:03:27 +0000 (UTC)
commit 424e5c8e1bb30eff6faf557940feb126b7165e23
Author: Emmanuele Bassi <ebassi gnome org>
Date:   Wed Mar 4 15:53:52 2020 +0000

    scan: Fix handling ignored headers
    
    Ignored headers are typically listed as relative paths from the
    top-level source directories passed to gtkdoc-scan; for instance:
    
      --ignore-headers=foo.h a/bar.h b/baz.h
    
    The code in gtkdoc-scan will store this as a flat string, and do regular
    expression matching on each header file it scans.
    
    The current code, based on regular expression matching, fails to deal
    with relative paths, which means that, of the three headers listed in
    the example above, only `foo.h` will be detected; additionally, even if
    we remove the relative path fragment from `bar.h` and `baz.h`, the
    regular expression will not match files inside sub-directories, because
    each source directory and header is validated as a full path, instead
    of a relative one.
    
    Since all ignored headers are relative to the top-level source
    directories, we can create a list of absolute paths, and match each path
    we are scanning against them; this removes the brittle regular
    expression approach, and correctly matches ignored headers inside
    sub-directories.

 gtkdoc/scan.py | 70 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 41 insertions(+), 29 deletions(-)
---
diff --git a/gtkdoc/scan.py b/gtkdoc/scan.py
index 4b5b8a9..87809d2 100644
--- a/gtkdoc/scan.py
+++ b/gtkdoc/scan.py
@@ -179,14 +179,23 @@ def Run(options):
     get_types = []
 
     # do not read files twice; checking it here permits to give both srcdir and
-    # builddir as --source-dir without fear of duplicities
+    # builddir as --source-dir without fear of duplicates
     seen_headers = {}
 
+    # split all the ignored files and directories
+    ignored_headers = options.ignore_headers and options.ignore_headers.split(' ') or []
+
     for file in options.headers:
-        ScanHeader(file, section_list, decl_list, get_types, seen_headers, options)
+        # We assume that the specific headers we have been told to scan
+        # are not going to be placed in a blocklist
+        ScanHeader(file, section_list, decl_list, get_types, seen_headers, [], options)
 
     for dir in options.source_dir:
-        ScanHeaders(dir, section_list, decl_list, get_types, seen_headers, options)
+        # Ignored headers are relative to a source directory; since we
+        # pass the full path of each header to ScanHeader(), we need to
+        # build a list of absolute paths from the ignored headers list
+        src_ignored_headers = [os.path.join(dir, x) for x in ignored_headers]
+        ScanHeaders(dir, section_list, decl_list, get_types, seen_headers, src_ignored_headers, options)
 
     with open(new_decl_list, 'w', encoding='utf-8') as f:
         for section in sorted(section_list.keys()):
@@ -330,13 +339,17 @@ def InitScanner(options):
         ]
 
 
-def ScanHeaders(source_dir, section_list, decl_list, get_types, seen_headers, options):
+def ScanHeaders(source_dir, section_list, decl_list, get_types, seen_headers, ignored_headers, options):
     """Scans a directory tree looking for header files.
 
     Args:
       source_dir (str): the directory to scan.
       section_list (dict): map of section to filenames.
+      decl_list (list): list of declarations
+      get_types (list): list of symbols that have a get_type function
       seen_headers (set): set to avoid scanning headers twice
+      ignored_headers (list): list of ignored headers
+      options: command line options
     """
 
     logging.info('Scanning source directory: %s', source_dir)
@@ -344,26 +357,29 @@ def ScanHeaders(source_dir, section_list, decl_list, get_types, seen_headers, op
     # This array holds any subdirectories found.
     subdirs = []
 
-    for file in sorted(os.listdir(source_dir)):
-        if file.startswith('.'):
+    for filename in sorted(os.listdir(source_dir)):
+        if filename.startswith('.'):
             continue
-        fullname = os.path.join(source_dir, file)
+        fullname = os.path.join(source_dir, filename)
         if os.path.isdir(fullname):
-            subdirs.append(file)
-        elif file.endswith('.h'):
+            subdirs.append(fullname)
+        elif filename.endswith('.h'):
+            if fullname in ignored_headers:
+                logging.info(f"File {fullname} matches ignored headers")
+                continue
             ScanHeader(fullname, section_list, decl_list, get_types,
-                       seen_headers, options)
+                       seen_headers, ignored_headers, options)
 
     # Now recursively scan the subdirectories.
-    for dir in subdirs:
-        matchstr = r'(\s|^)' + re.escape(dir) + r'(\s|$)'
-        if re.search(matchstr, options.ignore_headers):
+    for d in subdirs:
+        if d in ignored_headers:
+            logging.info(f"Directory {d} matches ignored headers")
             continue
-        ScanHeaders(os.path.join(source_dir, dir), section_list, decl_list,
-                    get_types, seen_headers, options)
+        ScanHeaders(d, section_list, decl_list,
+                    get_types, seen_headers, ignored_headers, options)
 
 
-def ScanHeader(input_file, section_list, decl_list, get_types, seen_headers, options):
+def ScanHeader(input_file, section_list, decl_list, get_types, seen_headers, ignored_headers, options):
     """Scan a header file for doc commants.
 
     Look for doc comments and extract them. Parse each doc comments and the
@@ -373,9 +389,17 @@ def ScanHeader(input_file, section_list, decl_list, get_types, seen_headers, opt
       input_file (str): the header file to scan.
       section_list (dict): a map of section per filename
       decl_list (list): a list of declarations
+      get_types (list): list of symbols that have a get_type function
       seen_headers (set): set to avoid scanning headers twice
+      ignored_headers (list): a list of ignored headers
+      options: command line options
     """
 
+    # Skip ignored headers
+    if input_file in ignored_headers:
+        logging.info(f"File {input_file} matches ignored headers")
+        return
+
     # Don't scan headers twice
     canonical_input_file = os.path.realpath(input_file)
     if canonical_input_file in seen_headers:
@@ -386,18 +410,6 @@ def ScanHeader(input_file, section_list, decl_list, get_types, seen_headers, opt
 
     file_basename = os.path.split(input_file)[1][:-2]  # filename ends in .h
 
-    # Check if the basename is in the list of headers to ignore.
-    matchstr = r'(\s|^)' + re.escape(file_basename) + r'\.h(\s|$)'
-    if re.search(matchstr, options.ignore_headers):
-        logging.info('File ignored: %s', input_file)
-        return
-
-    # Check if the full name is in the list of headers to ignore.
-    matchstr = r'(\s|^)' + re.escape(input_file) + r'(\s|$)'
-    if re.search(matchstr, options.ignore_headers):
-        logging.info('File ignored: %s', input_file)
-        return
-
     if not os.path.exists(input_file):
         logging.warning('File does not exist: %s', input_file)
         return
@@ -427,7 +439,7 @@ def ScanHeaderContent(input_lines, decl_list, get_types, options):
     Args:
       input_lines (list):
       decl_list (list): symbols declarations
-      get_types (list): lst of symbols that have a get_type function
+      get_types (list): list of symbols that have a get_type function
       options: commandline options
 
     Returns:
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]